Measure cross-AZ traffic in safekeepers (#3806)

Create `safekeeper_pg_io_bytes_total` metric to track total amount of
bytes written/read in a postgres connections to safekeepers. This metric
has the following labels:
- `client_az` – availability zone of the connection initiator, or
`"unknown"`
- `sk_az` – availability zone of the safekeeper, or `"unknown"`
- `app_name` – `application_name` of the postgres client
- `dir` – data direction, either `"read"` or `"write"`
- `same_az` – `"true"`, `"false"` or `"unknown"`. Can be derived from
`client_az` and `sk_az`, exists purely for convenience.

This is implemented by passing availability zone in the connection
string, like this: `-c tenant_id=AAA timeline_id=BBB
availability-zone=AZ-1`.

Update ansible deployment scripts to add availability_zone argument
to safekeeper and pageserver in systemd service files.
This commit is contained in:
Arthur Petukhovsky
2023-03-16 16:24:01 +02:00
committed by GitHub
parent 768c8d9972
commit b067378d0d
13 changed files with 263 additions and 17 deletions

View File

@@ -3,6 +3,7 @@ import shutil
import time
from contextlib import closing
from datetime import datetime
from itertools import chain
from pathlib import Path
from typing import List
@@ -87,6 +88,7 @@ def test_tenants_normal_work(neon_env_builder: NeonEnvBuilder):
def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder):
neon_env_builder.num_safekeepers = 3
neon_env_builder.pageserver_config_override = "availability_zone='test_ps_az'"
env = neon_env_builder.init_start()
tenant_1, _ = env.neon_cli.create_tenant()
@@ -122,6 +124,17 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder):
ps_metrics = all_metrics[0]
sk_metrics = all_metrics[1:]
# Find all metrics among all safekeepers, accepts the same arguments as query_all()
def query_all_safekeepers(name, filter):
return list(
chain.from_iterable(
map(
lambda sk: sk.query_all(name, filter),
sk_metrics,
)
)
)
ttids = [
{"tenant_id": str(tenant_1), "timeline_id": str(timeline_1)},
{"tenant_id": str(tenant_2), "timeline_id": str(timeline_2)},
@@ -162,6 +175,40 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder):
f"process_start_time_seconds (UTC): {datetime.fromtimestamp(metrics.query_one('process_start_time_seconds').value)}"
)
for io_direction in ["read", "write"]:
# Querying all metrics for number of bytes read/written by pageserver in another AZ
io_metrics = query_all_safekeepers(
"safekeeper_pg_io_bytes_total",
{
"app_name": "pageserver",
"client_az": "test_ps_az",
"dir": io_direction,
"same_az": "false",
},
)
total_bytes = sum(int(metric.value) for metric in io_metrics)
log.info(f"Pageserver {io_direction} bytes from another AZ: {total_bytes}")
# We expect some bytes to be read/written, to make sure metrics are working
assert total_bytes > 0
# Test (a subset of) safekeeper global metrics
for sk_m in sk_metrics:
# Test that every safekeeper has read some bytes
assert any(
map(
lambda x: x.value > 0,
sk_m.query_all("safekeeper_pg_io_bytes_total", {"dir": "read"}),
)
), f"{sk_m.name} has not read bytes"
# Test that every safekeeper has written some bytes
assert any(
map(
lambda x: x.value > 0,
sk_m.query_all("safekeeper_pg_io_bytes_total", {"dir": "write"}),
)
), f"{sk_m.name} has not written bytes"
# Test (a subset of) pageserver global metrics
for metric in PAGESERVER_GLOBAL_METRICS:
ps_samples = ps_metrics.query_all(metric, {})