mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-07 05:22:56 +00:00
Add Safekeeper metrics tests (#746)
* zenith_fixtures.py: add SafekeeperHttpClient.get_metrics() * Ensure that `collect_lsn` and `flush_lsn`'s reported values look reasonable in `test_many_timelines`
This commit is contained in:
@@ -3,13 +3,17 @@ import random
|
|||||||
import time
|
import time
|
||||||
import os
|
import os
|
||||||
import subprocess
|
import subprocess
|
||||||
|
import sys
|
||||||
|
import threading
|
||||||
import uuid
|
import uuid
|
||||||
|
|
||||||
from contextlib import closing
|
from contextlib import closing
|
||||||
|
from dataclasses import dataclass, field
|
||||||
from multiprocessing import Process, Value
|
from multiprocessing import Process, Value
|
||||||
from fixtures.zenith_fixtures import PgBin, ZenithEnv, ZenithEnvBuilder
|
from fixtures.zenith_fixtures import PgBin, ZenithEnv, ZenithEnvBuilder
|
||||||
from fixtures.utils import lsn_to_hex, mkdir_if_needed
|
from fixtures.utils import lsn_to_hex, mkdir_if_needed
|
||||||
from fixtures.log_helper import log
|
from fixtures.log_helper import log
|
||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||||
|
|
||||||
@@ -34,13 +38,22 @@ def test_normal_work(zenith_env_builder: ZenithEnvBuilder):
|
|||||||
assert cur.fetchone() == (5000050000, )
|
assert cur.fetchone() == (5000050000, )
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class BranchMetrics:
|
||||||
|
name: str
|
||||||
|
latest_valid_lsn: int
|
||||||
|
# One entry per each Safekeeper, order is the same
|
||||||
|
flush_lsns: List[int] = field(default_factory=list)
|
||||||
|
commit_lsns: List[int] = field(default_factory=list)
|
||||||
|
|
||||||
|
|
||||||
# Run page server and multiple acceptors, and multiple compute nodes running
|
# Run page server and multiple acceptors, and multiple compute nodes running
|
||||||
# against different timelines.
|
# against different timelines.
|
||||||
def test_many_timelines(zenith_env_builder: ZenithEnvBuilder):
|
def test_many_timelines(zenith_env_builder: ZenithEnvBuilder):
|
||||||
zenith_env_builder.num_safekeepers = 3
|
zenith_env_builder.num_safekeepers = 3
|
||||||
env = zenith_env_builder.init()
|
env = zenith_env_builder.init()
|
||||||
|
|
||||||
n_timelines = 2
|
n_timelines = 3
|
||||||
|
|
||||||
branches = ["test_wal_acceptors_many_timelines_{}".format(tlin) for tlin in range(n_timelines)]
|
branches = ["test_wal_acceptors_many_timelines_{}".format(tlin) for tlin in range(n_timelines)]
|
||||||
|
|
||||||
@@ -50,21 +63,114 @@ def test_many_timelines(zenith_env_builder: ZenithEnvBuilder):
|
|||||||
env.zenith_cli(["branch", branch, "main"])
|
env.zenith_cli(["branch", branch, "main"])
|
||||||
pgs.append(env.postgres.create_start(branch))
|
pgs.append(env.postgres.create_start(branch))
|
||||||
|
|
||||||
|
tenant_id = uuid.UUID(env.initial_tenant)
|
||||||
|
|
||||||
|
def collect_metrics(message: str) -> List[BranchMetrics]:
|
||||||
|
with env.pageserver.http_client() as pageserver_http:
|
||||||
|
branch_details = [
|
||||||
|
pageserver_http.branch_detail(tenant_id=tenant_id, name=branch)
|
||||||
|
for branch in branches
|
||||||
|
]
|
||||||
|
# All changes visible to pageserver (latest_valid_lsn) should be
|
||||||
|
# confirmed by safekeepers first. As we cannot atomically get
|
||||||
|
# state of both pageserver and safekeepers, we should start with
|
||||||
|
# pageserver. Looking at outdated data from pageserver is ok.
|
||||||
|
# Asking safekeepers first is not ok because new commits may arrive
|
||||||
|
# to both safekeepers and pageserver after we've already obtained
|
||||||
|
# safekeepers' state, it will look contradictory.
|
||||||
|
sk_metrics = [sk.http_client().get_metrics() for sk in env.safekeepers]
|
||||||
|
|
||||||
|
branch_metrics = []
|
||||||
|
with env.pageserver.http_client() as pageserver_http:
|
||||||
|
for branch_detail in branch_details:
|
||||||
|
timeline_id: str = branch_detail["timeline_id"]
|
||||||
|
|
||||||
|
m = BranchMetrics(
|
||||||
|
name=branch_detail["name"],
|
||||||
|
latest_valid_lsn=branch_detail["latest_valid_lsn"],
|
||||||
|
)
|
||||||
|
for sk_m in sk_metrics:
|
||||||
|
m.flush_lsns.append(sk_m.flush_lsn_inexact[timeline_id])
|
||||||
|
m.commit_lsns.append(sk_m.commit_lsn_inexact[timeline_id])
|
||||||
|
|
||||||
|
for flush_lsn, commit_lsn in zip(m.flush_lsns, m.commit_lsns):
|
||||||
|
# Invariant. May be < when transaction is in progress.
|
||||||
|
assert commit_lsn <= flush_lsn
|
||||||
|
# We only call collect_metrics() after a transaction is confirmed by
|
||||||
|
# the compute node, which only happens after a consensus of safekeepers
|
||||||
|
# has confirmed the transaction. We assume majority consensus here.
|
||||||
|
assert (2 * sum(m.latest_valid_lsn <= lsn
|
||||||
|
for lsn in m.flush_lsns) > zenith_env_builder.num_safekeepers)
|
||||||
|
assert (2 * sum(m.latest_valid_lsn <= lsn
|
||||||
|
for lsn in m.commit_lsns) > zenith_env_builder.num_safekeepers)
|
||||||
|
branch_metrics.append(m)
|
||||||
|
log.info(f"{message}: {branch_metrics}")
|
||||||
|
return branch_metrics
|
||||||
|
|
||||||
|
# TODO: https://github.com/zenithdb/zenith/issues/809
|
||||||
|
# collect_metrics("before CREATE TABLE")
|
||||||
|
|
||||||
# Do everything in different loops to have actions on different timelines
|
# Do everything in different loops to have actions on different timelines
|
||||||
# interleaved.
|
# interleaved.
|
||||||
# create schema
|
# create schema
|
||||||
for pg in pgs:
|
for pg in pgs:
|
||||||
pg.safe_psql("CREATE TABLE t(key int primary key, value text)")
|
pg.safe_psql("CREATE TABLE t(key int primary key, value text)")
|
||||||
|
init_m = collect_metrics("after CREATE TABLE")
|
||||||
|
|
||||||
# Populate data
|
# Populate data for 2/3 branches
|
||||||
for pg in pgs:
|
class MetricsChecker(threading.Thread):
|
||||||
|
def __init__(self) -> None:
|
||||||
|
super().__init__(daemon=True)
|
||||||
|
self.should_stop = threading.Event()
|
||||||
|
self.exception: Optional[BaseException] = None
|
||||||
|
|
||||||
|
def run(self) -> None:
|
||||||
|
try:
|
||||||
|
while not self.should_stop.is_set():
|
||||||
|
collect_metrics("during INSERT INTO")
|
||||||
|
time.sleep(1)
|
||||||
|
except:
|
||||||
|
log.error("MetricsChecker's thread failed, the test will be failed on .stop() call",
|
||||||
|
exc_info=True)
|
||||||
|
# We want to preserve traceback as well as the exception
|
||||||
|
exc_type, exc_value, exc_tb = sys.exc_info()
|
||||||
|
assert exc_type
|
||||||
|
e = exc_type(exc_value)
|
||||||
|
e.__traceback__ = exc_tb
|
||||||
|
self.exception = e
|
||||||
|
|
||||||
|
def stop(self) -> None:
|
||||||
|
self.should_stop.set()
|
||||||
|
self.join()
|
||||||
|
if self.exception:
|
||||||
|
raise self.exception
|
||||||
|
|
||||||
|
metrics_checker = MetricsChecker()
|
||||||
|
metrics_checker.start()
|
||||||
|
|
||||||
|
for pg in pgs[:-1]:
|
||||||
pg.safe_psql("INSERT INTO t SELECT generate_series(1,100000), 'payload'")
|
pg.safe_psql("INSERT INTO t SELECT generate_series(1,100000), 'payload'")
|
||||||
|
|
||||||
# Check data
|
metrics_checker.stop()
|
||||||
for pg in pgs:
|
|
||||||
|
collect_metrics("after INSERT INTO")
|
||||||
|
|
||||||
|
# Check data for 2/3 branches
|
||||||
|
for pg in pgs[:-1]:
|
||||||
res = pg.safe_psql("SELECT sum(key) FROM t")
|
res = pg.safe_psql("SELECT sum(key) FROM t")
|
||||||
assert res[0] == (5000050000, )
|
assert res[0] == (5000050000, )
|
||||||
|
|
||||||
|
final_m = collect_metrics("after SELECT")
|
||||||
|
# Assume that LSNs (a) behave similarly in all branches; and (b) INSERT INTO alters LSN significantly.
|
||||||
|
# Also assume that safekeepers will not be significantly out of sync in this test.
|
||||||
|
middle_lsn = (init_m[0].latest_valid_lsn + final_m[0].latest_valid_lsn) // 2
|
||||||
|
assert max(init_m[0].flush_lsns) < middle_lsn < min(final_m[0].flush_lsns)
|
||||||
|
assert max(init_m[0].commit_lsns) < middle_lsn < min(final_m[0].commit_lsns)
|
||||||
|
assert max(init_m[1].flush_lsns) < middle_lsn < min(final_m[1].flush_lsns)
|
||||||
|
assert max(init_m[1].commit_lsns) < middle_lsn < min(final_m[1].commit_lsns)
|
||||||
|
assert max(init_m[2].flush_lsns) <= min(final_m[2].flush_lsns) < middle_lsn
|
||||||
|
assert max(init_m[2].commit_lsns) <= min(final_m[2].commit_lsns) < middle_lsn
|
||||||
|
|
||||||
|
|
||||||
# Check that dead minority doesn't prevent the commits: execute insert n_inserts
|
# Check that dead minority doesn't prevent the commits: execute insert n_inserts
|
||||||
# times, with fault_probability chance of getting a wal acceptor down or up
|
# times, with fault_probability chance of getting a wal acceptor down or up
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass, field
|
||||||
from cached_property import cached_property
|
from cached_property import cached_property
|
||||||
import asyncpg
|
import asyncpg
|
||||||
import os
|
import os
|
||||||
@@ -11,6 +11,7 @@ import jwt
|
|||||||
import json
|
import json
|
||||||
import psycopg2
|
import psycopg2
|
||||||
import pytest
|
import pytest
|
||||||
|
import re
|
||||||
import shutil
|
import shutil
|
||||||
import socket
|
import socket
|
||||||
import subprocess
|
import subprocess
|
||||||
@@ -715,7 +716,7 @@ class ZenithPageserver(PgProtocol):
|
|||||||
def __exit__(self, exc_type, exc, tb):
|
def __exit__(self, exc_type, exc, tb):
|
||||||
self.stop(True)
|
self.stop(True)
|
||||||
|
|
||||||
def http_client(self, auth_token: Optional[str] = None):
|
def http_client(self, auth_token: Optional[str] = None) -> ZenithPageserverHttpClient:
|
||||||
return ZenithPageserverHttpClient(
|
return ZenithPageserverHttpClient(
|
||||||
port=self.service_port.http,
|
port=self.service_port.http,
|
||||||
auth_token=auth_token,
|
auth_token=auth_token,
|
||||||
@@ -1084,7 +1085,7 @@ class Safekeeper:
|
|||||||
assert isinstance(res, dict)
|
assert isinstance(res, dict)
|
||||||
return res
|
return res
|
||||||
|
|
||||||
def http_client(self):
|
def http_client(self) -> SafekeeperHttpClient:
|
||||||
return SafekeeperHttpClient(port=self.port.http)
|
return SafekeeperHttpClient(port=self.port.http)
|
||||||
|
|
||||||
|
|
||||||
@@ -1094,6 +1095,14 @@ class SafekeeperTimelineStatus:
|
|||||||
flush_lsn: str
|
flush_lsn: str
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class SafekeeperMetrics:
|
||||||
|
# These are metrics from Prometheus which uses float64 internally.
|
||||||
|
# As a consequence, values may differ from real original int64s.
|
||||||
|
flush_lsn_inexact: Dict[str, int] = field(default_factory=dict)
|
||||||
|
commit_lsn_inexact: Dict[str, int] = field(default_factory=dict)
|
||||||
|
|
||||||
|
|
||||||
class SafekeeperHttpClient(requests.Session):
|
class SafekeeperHttpClient(requests.Session):
|
||||||
def __init__(self, port: int) -> None:
|
def __init__(self, port: int) -> None:
|
||||||
super().__init__()
|
super().__init__()
|
||||||
@@ -1109,6 +1118,22 @@ class SafekeeperHttpClient(requests.Session):
|
|||||||
return SafekeeperTimelineStatus(acceptor_epoch=resj['acceptor_state']['epoch'],
|
return SafekeeperTimelineStatus(acceptor_epoch=resj['acceptor_state']['epoch'],
|
||||||
flush_lsn=resj['flush_lsn'])
|
flush_lsn=resj['flush_lsn'])
|
||||||
|
|
||||||
|
def get_metrics(self) -> SafekeeperMetrics:
|
||||||
|
request_result = self.get(f"http://localhost:{self.port}/metrics")
|
||||||
|
request_result.raise_for_status()
|
||||||
|
all_metrics_text = request_result.text
|
||||||
|
|
||||||
|
metrics = SafekeeperMetrics()
|
||||||
|
for match in re.finditer(r'^safekeeper_flush_lsn{ztli="([0-9a-f]+)"} (\S+)$',
|
||||||
|
all_metrics_text,
|
||||||
|
re.MULTILINE):
|
||||||
|
metrics.flush_lsn_inexact[match.group(1)] = int(match.group(2))
|
||||||
|
for match in re.finditer(r'^safekeeper_commit_lsn{ztli="([0-9a-f]+)"} (\S+)$',
|
||||||
|
all_metrics_text,
|
||||||
|
re.MULTILINE):
|
||||||
|
metrics.commit_lsn_inexact[match.group(1)] = int(match.group(2))
|
||||||
|
return metrics
|
||||||
|
|
||||||
|
|
||||||
def get_test_output_dir(request: Any) -> str:
|
def get_test_output_dir(request: Any) -> str:
|
||||||
""" Compute the working directory for an individual test. """
|
""" Compute the working directory for an individual test. """
|
||||||
|
|||||||
Reference in New Issue
Block a user