mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-08 22:12:56 +00:00
Measure cross-AZ traffic in safekeepers (#3806)
Create `safekeeper_pg_io_bytes_total` metric to track total amount of bytes written/read in a postgres connections to safekeepers. This metric has the following labels: - `client_az` – availability zone of the connection initiator, or `"unknown"` - `sk_az` – availability zone of the safekeeper, or `"unknown"` - `app_name` – `application_name` of the postgres client - `dir` – data direction, either `"read"` or `"write"` - `same_az` – `"true"`, `"false"` or `"unknown"`. Can be derived from `client_az` and `sk_az`, exists purely for convenience. This is implemented by passing availability zone in the connection string, like this: `-c tenant_id=AAA timeline_id=BBB availability-zone=AZ-1`. Update ansible deployment scripts to add availability_zone argument to safekeeper and pageserver in systemd service files.
This commit is contained in:
committed by
GitHub
parent
768c8d9972
commit
b067378d0d
@@ -3,6 +3,7 @@ import shutil
|
||||
import time
|
||||
from contextlib import closing
|
||||
from datetime import datetime
|
||||
from itertools import chain
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
@@ -87,6 +88,7 @@ def test_tenants_normal_work(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder):
|
||||
neon_env_builder.num_safekeepers = 3
|
||||
neon_env_builder.pageserver_config_override = "availability_zone='test_ps_az'"
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
tenant_1, _ = env.neon_cli.create_tenant()
|
||||
@@ -122,6 +124,17 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder):
|
||||
ps_metrics = all_metrics[0]
|
||||
sk_metrics = all_metrics[1:]
|
||||
|
||||
# Find all metrics among all safekeepers, accepts the same arguments as query_all()
|
||||
def query_all_safekeepers(name, filter):
|
||||
return list(
|
||||
chain.from_iterable(
|
||||
map(
|
||||
lambda sk: sk.query_all(name, filter),
|
||||
sk_metrics,
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
ttids = [
|
||||
{"tenant_id": str(tenant_1), "timeline_id": str(timeline_1)},
|
||||
{"tenant_id": str(tenant_2), "timeline_id": str(timeline_2)},
|
||||
@@ -162,6 +175,40 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder):
|
||||
f"process_start_time_seconds (UTC): {datetime.fromtimestamp(metrics.query_one('process_start_time_seconds').value)}"
|
||||
)
|
||||
|
||||
for io_direction in ["read", "write"]:
|
||||
# Querying all metrics for number of bytes read/written by pageserver in another AZ
|
||||
io_metrics = query_all_safekeepers(
|
||||
"safekeeper_pg_io_bytes_total",
|
||||
{
|
||||
"app_name": "pageserver",
|
||||
"client_az": "test_ps_az",
|
||||
"dir": io_direction,
|
||||
"same_az": "false",
|
||||
},
|
||||
)
|
||||
total_bytes = sum(int(metric.value) for metric in io_metrics)
|
||||
log.info(f"Pageserver {io_direction} bytes from another AZ: {total_bytes}")
|
||||
# We expect some bytes to be read/written, to make sure metrics are working
|
||||
assert total_bytes > 0
|
||||
|
||||
# Test (a subset of) safekeeper global metrics
|
||||
for sk_m in sk_metrics:
|
||||
# Test that every safekeeper has read some bytes
|
||||
assert any(
|
||||
map(
|
||||
lambda x: x.value > 0,
|
||||
sk_m.query_all("safekeeper_pg_io_bytes_total", {"dir": "read"}),
|
||||
)
|
||||
), f"{sk_m.name} has not read bytes"
|
||||
|
||||
# Test that every safekeeper has written some bytes
|
||||
assert any(
|
||||
map(
|
||||
lambda x: x.value > 0,
|
||||
sk_m.query_all("safekeeper_pg_io_bytes_total", {"dir": "write"}),
|
||||
)
|
||||
), f"{sk_m.name} has not written bytes"
|
||||
|
||||
# Test (a subset of) pageserver global metrics
|
||||
for metric in PAGESERVER_GLOBAL_METRICS:
|
||||
ps_samples = ps_metrics.query_all(metric, {})
|
||||
|
||||
Reference in New Issue
Block a user