from __future__ import annotations

import enum
import os
import shutil
from enum import StrEnum
from logging import debug
from pathlib import Path
from typing import TYPE_CHECKING, cast

# Docs are available at https://jsonnet.org/ref/bindings.html#python_api
import _jsonnet
import pytest
import requests
import yaml

from fixtures.log_helper import log
from fixtures.metrics import parse_metrics
from fixtures.paths import BASE_DIR, COMPUTE_CONFIG_DIR
from fixtures.utils import wait_until

if TYPE_CHECKING:
    from collections.abc import Callable
    from types import TracebackType
    from typing import Self, TypedDict

    from fixtures.endpoint.http import EndpointHttpClient
    from fixtures.neon_fixtures import NeonEnv
    from fixtures.pg_version import PgVersion
    from fixtures.port_distributor import PortDistributor
    from prometheus_client.samples import Sample

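    # These TypedDicts mirror (a subset of) the sql_exporter collector file
    # schema; they only exist for type checking, which is why they live under
    # TYPE_CHECKING and are only referenced via string casts below.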
    class Metric(TypedDict):
        metric_name: str
        type: str
        help: str
        key_labels: list[str] | None
        values: list[str] | None
        query: str | None
        query_ref: str | None

    class Collector(TypedDict):
        collector_name: str
        metrics: list[Metric]
        queries: list[Query] | None

    class Query(TypedDict):
        query_name: str
        query: str


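# Contents of previously imported files, keyed by resolved path, so repeated
# Jsonnet evaluations don't re-read the same files from disk.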
JSONNET_IMPORT_CACHE: dict[str, bytes] = {}
JSONNET_PATH: list[Path] = [BASE_DIR / "compute" / "jsonnet", COMPUTE_CONFIG_DIR]


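# Import resolution: the importing file's directory is searched first, then
# each entry of JSONNET_PATH in order.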
def __import_callback(dir: str, rel: str) -> tuple[str, bytes]:
    """
    dir: The directory of the Jsonnet file which tried to import a file
    rel: The actual import path from Jsonnet
    """
    if not rel:
        raise RuntimeError("Empty filename")

    full_path: str | None = None
    if os.path.isabs(rel):
        full_path = rel
    else:
        for p in (dir, *JSONNET_PATH):
            assert isinstance(p, str | Path), "for mypy"
            full_path = os.path.join(p, rel)

            assert isinstance(full_path, str), "for mypy"
            if not os.path.exists(full_path):
                full_path = None
                continue

            break

    if not full_path:
        raise RuntimeError(f"Could not resolve import ({rel}) in {dir}")

    if os.path.isdir(full_path):
        raise RuntimeError(f"Attempted to import directory: {full_path}")

    if full_path not in JSONNET_IMPORT_CACHE:
        with open(full_path, encoding="utf-8") as f:
            JSONNET_IMPORT_CACHE[full_path] = f.read().encode()

    return full_path, JSONNET_IMPORT_CACHE[full_path]


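# Thin wrapper around _jsonnet.evaluate_file(). Note the difference between
# the two kinds of variables: ext_vars are read inside a template via
# std.extVar("name"), while tla_vars are bound to the template's top-level
# function parameters.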
def jsonnet_evaluate_file(
    jsonnet_file: str | Path,
    ext_vars: str | dict[str, str] | None = None,
    tla_vars: str | dict[str, str] | None = None,
) -> str:
    return cast(
        "str",
        _jsonnet.evaluate_file(
            str(jsonnet_file),
            ext_vars=ext_vars,
            tla_vars=tla_vars,
            import_callback=__import_callback,
        ),
    )


def evaluate_collector(jsonnet_file: Path, pg_version: PgVersion) -> str:
    return jsonnet_evaluate_file(jsonnet_file, ext_vars={"pg_version": str(pg_version)})


def evaluate_config(
    jsonnet_file: Path, collector_name: str, collector_file: str | Path, connstr: str
) -> str:
    return jsonnet_evaluate_file(
        jsonnet_file,
        tla_vars={
            "collector_name": collector_name,
            "collector_file": str(collector_file),
            "connection_string": connstr,
        },
    )


@enum.unique
class SqlExporterProcess(StrEnum):
    COMPUTE = "compute"
    AUTOSCALING = "autoscaling"


@pytest.mark.parametrize(
    "collector_name",
    ["neon_collector", "neon_collector_autoscaling"],
    ids=[SqlExporterProcess.COMPUTE, SqlExporterProcess.AUTOSCALING],
)
def test_sql_exporter_metrics_smoke(
    pg_version: PgVersion,
    neon_simple_env: NeonEnv,
    compute_config_dir: Path,
    collector_name: str,
):
    """
    This is a smoke test to ensure the metrics SQL queries for sql_exporter
    work without errors.
    """
    env = neon_simple_env

    endpoint = env.endpoints.create("main")
    endpoint.respec(skip_pg_catalog_updates=False)
    endpoint.start()

    # Extract all the SQL queries from the sql_exporter config files, and run
    # them.
    collector = cast(
        "Collector",
        yaml.safe_load(
            jsonnet_evaluate_file(
                str(compute_config_dir / f"{collector_name}.jsonnet"),
                ext_vars={"pg_version": pg_version},
            )
        ),
    )

    for metric in collector["metrics"]:
        query = metric.get("query")
        if query is not None:
            log.info("Checking query for metric %s in %s", metric["metric_name"], collector_name)
            endpoint.safe_psql(query)

    queries = collector.get("queries")
    if queries is not None:
        # This variable is named q because mypy cannot tell it apart from the
        # query above.
        #
        # query: str | None
        # q: Query
        for q in queries:
            log.info("Checking query %s in %s", q["query_name"], collector_name)
            endpoint.safe_psql(q["query"])


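# Base class for the two ways of running sql_exporter in tests: natively when
# a binary is available on PATH, or in a container otherwise (see SQL_EXPORTER
# below). Subclasses implement start() and stop(); both work as context
# managers.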
class SqlExporterRunner:
    def __init__(self, test_output_dir: Path, sql_exporter_port: int) -> None:
        self._log_file_name = test_output_dir / "sql_exporter.stderr"
        self._sql_exporter_port = sql_exporter_port

        log.info(f"Starting sql_exporter at http://localhost:{self._sql_exporter_port}")

    def start(self) -> None:
        raise NotImplementedError()

    def stop(self) -> None:
        raise NotImplementedError()

    def __enter__(self) -> Self:
        self.start()

        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc: BaseException | None,
        tb: TracebackType | None,
    ):
        self.stop()


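# Prefer a sql_exporter binary from PATH; fall back to running the container
# image if none is installed.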
SQL_EXPORTER = shutil.which("sql_exporter")

if SQL_EXPORTER is None:
    from testcontainers.core.container import DockerContainer
    from testcontainers.core.waiting_utils import wait_for_logs
    from typing_extensions import override

    class SqlExporterContainer(DockerContainer):  # type: ignore
        def __init__(
            self, logs_dir: Path, config_file: Path, collector_file: Path, port: int
        ) -> None:
            # NOTE: Keep the version the same as in
            # compute/compute-node.Dockerfile and build-tools/Dockerfile.
            #
            # The "host" network mode allows sql_exporter to talk to the
            # endpoint which is running on the host.
            super().__init__("docker.io/burningalchemist/sql_exporter:0.17.3", network_mode="host")

            self.__logs_dir = logs_dir
            self.__port = port

            config_file_name = config_file.name
            collector_file_name = collector_file.name

            self.with_command(f"-config.file=/etc/{config_file_name} -web.listen-address=:{port}")

            container_config_file = f"/etc/{config_file_name}"
            container_collector_file = f"/etc/{collector_file_name}"
            log.info(
                "Mapping %s to %s in sql_exporter container", config_file, container_config_file
            )
            log.info(
                "Mapping %s to %s in sql_exporter container",
                collector_file,
                container_collector_file,
            )

            # NOTE: z allows Podman to work with SELinux. Please don't change it.
            # Ideally this would be a ro (read-only) mount, but I couldn't seem
            # to get it to work.
            self.with_volume_mapping(str(config_file), container_config_file, "z")
            self.with_volume_mapping(str(collector_file), container_collector_file, "z")

        def start(self) -> Self:
            super().start()

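            # wait_for_logs() treats the pattern as a regular expression, so
            # the IPv6 brackets in the address must be escaped.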
log.info("Waiting for sql_exporter to be ready")
|
|
wait_for_logs(
|
|
self,
|
|
rf'msg="Listening on" address=\[::\]:{self.__port}',
|
|
timeout=5,
|
|
)
|
|
|
|
return self
|
|
|
|
    class SqlExporterContainerRunner(SqlExporterRunner):
        def __init__(
            self,
            test_output_dir: Path,
            config_file: Path,
            collector_file: Path,
            sql_exporter_port: int,
        ) -> None:
            super().__init__(test_output_dir, sql_exporter_port)

            self.__container = SqlExporterContainer(
                test_output_dir, config_file, collector_file, sql_exporter_port
            )

        @override
        def start(self) -> None:
            self.__container.start()

        @override
        def stop(self) -> None:
            try:
                # sql_exporter doesn't print anything to stdout
                with open(self._log_file_name, "w", encoding="utf-8") as f:
                    f.write(self.__container.get_logs()[1].decode())
            except Exception:
                log.exception("Failed to write sql_exporter logs")

            # Stop the container *after* getting the logs
            self.__container.stop()

else:
    import subprocess
    import time
    from signal import Signals

    from typing_extensions import override

    if TYPE_CHECKING:
        from collections.abc import Mapping

    class SqlExporterNativeRunner(SqlExporterRunner):
        def __init__(
            self,
            test_output_dir: Path,
            config_file: Path,
            collector_file: Path,
            sql_exporter_port: int,
        ) -> None:
            super().__init__(test_output_dir, sql_exporter_port)

            self.__config_file = config_file
            self.__collector_file = collector_file
            self.__proc: subprocess.Popen[str]

        @override
        def start(self) -> None:
            assert SQL_EXPORTER is not None

            log_file = open(self._log_file_name, "w", encoding="utf-8")
            self.__proc = subprocess.Popen(
                [
                    os.path.realpath(SQL_EXPORTER),
                    f"-config.file={self.__config_file}",
                    f"-web.listen-address=:{self._sql_exporter_port}",
                ],
                # If PGSERVICEFILE is set, sql_exporter won't launch.
                env=cast("Mapping[str, str]", {}),
                stderr=log_file,
                bufsize=0,
                text=True,
            )

            log.info("Waiting for sql_exporter to be ready")

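            # sql_exporter logs to stderr, which is redirected to our log
            # file, so readiness is detected by tailing that file until the
            # "Listening on" line appears (or the 5 s deadline passes).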
            with open(self._log_file_name, encoding="utf-8") as f:
                started = time.time()
                while True:
                    if time.time() - started > 5:
                        self.__proc.kill()
                        raise RuntimeError("sql_exporter did not start up properly")

                    line = f.readline()
                    if not line:
                        time.sleep(0.5)
                        continue

                    if f'msg="Listening on" address=[::]:{self._sql_exporter_port}' in line:
                        break

        @override
        def stop(self) -> None:
            self.__proc.send_signal(Signals.SIGINT)
            self.__proc.wait()


@pytest.mark.parametrize(
    "exporter",
    [SqlExporterProcess.COMPUTE, SqlExporterProcess.AUTOSCALING],
)
def test_sql_exporter_metrics_e2e(
    pg_version: PgVersion,
    neon_simple_env: NeonEnv,
    test_output_dir: Path,
    compute_config_dir: Path,
    exporter: SqlExporterProcess,
    port_distributor: PortDistributor,
):
    """
    This is a full E2E test of the sql_exporter setup to make sure it works
    without error.

    If you use Podman instead of Docker, you may run into issues. If you run
    rootful Podman, you may need to add a ~/.testcontainers.properties file
    with the following content:

        ryuk.container.privileged=true

    If you are not running rootful Podman, set the following environment
    variable:

        TESTCONTAINERS_RYUK_DISABLED=true

    Note that you will need the Podman socket to be running. On a
    systemd-based system, starting it will look something like:

        # Use `enable --now` to start the socket on login and immediately.
        systemctl --user start podman.socket

    Whether you use the user service manager or the system service manager is
    up to you, but it may have implications for the ryuk-related steps above.
    Note that you may also need the docker(1) Podman frontend. I am unsure if
    the docker Python package supports Podman natively.
    """
    env = neon_simple_env

    endpoint = env.endpoints.create("main")
    endpoint.respec(skip_pg_catalog_updates=False)
    endpoint.start()

    if exporter == SqlExporterProcess.COMPUTE:
        stem_suffix = ""
    elif exporter == SqlExporterProcess.AUTOSCALING:
        stem_suffix = "_autoscaling"

    # Write the collector file
    collector_file = test_output_dir / f"neon_collector{stem_suffix}.yml"
    with open(collector_file, "w", encoding="utf-8") as o:
        collector = evaluate_collector(
            compute_config_dir / f"neon_collector{stem_suffix}.jsonnet", pg_version
        )
        o.write(collector)

    conn_options = endpoint.conn_options()
    pg_host = conn_options["host"]
    pg_port = conn_options["port"]
    pg_user = conn_options["user"]
    pg_dbname = conn_options["dbname"]
    pg_application_name = f"sql_exporter{stem_suffix}"
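    # sslmode=disable because the test endpoint does not serve TLS, and
    # pgaudit.log=none (a session-level setting) is presumably there to keep
    # the exporter's polling queries out of the audit log.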
    connstr = f"postgresql://{pg_user}@{pg_host}:{pg_port}/{pg_dbname}?sslmode=disable&application_name={pg_application_name}&pgaudit.log=none"

    def escape_go_filepath_match_characters(s: str) -> str:
        """
        Unfortunately sql_exporter doesn't use plain file paths, so we need to
        escape special characters. pytest encodes the parameters of a test
        using [ and ], so we need to escape them with backslashes.

        See https://pkg.go.dev/path/filepath#Match.
        """
        return s.replace("[", r"\[").replace("]", r"\]")

    # Write the config file
    config_file = test_output_dir / f"sql_exporter{stem_suffix}.yml"
    with open(config_file, "w", encoding="utf-8") as o:
        config = evaluate_config(
            compute_config_dir / "sql_exporter.jsonnet",
            collector_name=collector_file.stem,
            collector_file=escape_go_filepath_match_characters(str(collector_file))
            if SQL_EXPORTER
            else collector_file.name,
            connstr=connstr,
        )
        o.write(config)

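    # Both runners implement the same context-manager protocol, so the test
    # body only needs to pick the class: native when a binary is on PATH,
    # container otherwise. One successful scrape of /metrics is the pass
    # criterion.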
    sql_exporter_port = port_distributor.get_port()
    with (SqlExporterNativeRunner if SQL_EXPORTER else SqlExporterContainerRunner)(
        test_output_dir, config_file, collector_file, sql_exporter_port
    ) as _runner:
        resp = requests.get(f"http://localhost:{sql_exporter_port}/metrics")
        resp.raise_for_status()


def test_perf_counters(neon_simple_env: NeonEnv):
    """
    Test compute metrics, exposed in the neon_backend_perf_counters and
    neon_perf_counters views
    """
    env = neon_simple_env
    endpoint = env.endpoints.create_start("main")

    conn = endpoint.connect()
    cur = conn.cursor()

    # We don't check that the values make sense, this is just a very
    # basic check that the server doesn't crash or something like that.
    #
    # 1.5 is the minimum version to contain these views.
    cur.execute("CREATE EXTENSION neon VERSION '1.5'")
    cur.execute("set neon.monitor_query_exec_time = on")
    cur.execute("SELECT * FROM neon_perf_counters")
    cur.execute("SELECT * FROM neon_backend_perf_counters")
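    # With query time monitoring enabled, the two SELECT * statements above
    # should be the only queries this backend has timed so far, hence the
    # expected counter value of 2.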
    cur.execute(
        "select value from neon_backend_perf_counters where metric='query_time_seconds_count' and pid=pg_backend_pid()"
    )
    assert cur.fetchall()[0][0] == 2


def collect_metric(
    client: EndpointHttpClient,
    name: str,
    filter: dict[str, str],
    predicate: Callable[[list[Sample]], bool],
) -> Callable[[], list[Sample]]:
    """
    Call this function as the first argument to wait_until().
    """

    def __collect_metric() -> list[Sample]:
        resp = client.metrics()
        debug("Metrics: %s", resp)
        m = parse_metrics(resp)
        samples = m.query_all(name, filter)
        debug("Samples: %s", samples)
        assert predicate(samples), "predicate failed"
        return samples

    return __collect_metric


def test_compute_installed_extensions_metric(neon_simple_env: NeonEnv):
    """
    Test that the compute_installed_extensions metric reports accurate
    results. Note that currently this metric is only gathered on compute
    start. We install the neon extension into a database other than postgres
    because compute_ctl runs `ALTER EXTENSION neon UPDATE` in the postgres
    database during Postgres startup, creating a race condition.
    """
    DB_NAME = "test"

    env = neon_simple_env

    endpoint = env.endpoints.create_start("main")
    endpoint.safe_psql(f"CREATE DATABASE {DB_NAME}")

    # The metric is only gathered on compute start, so restart to check that
    # plpgsql is now in 3 databases instead of its regular 2 (template1 and
    # postgres).
    endpoint.stop()
    endpoint.start()

    client = endpoint.http_client()

    def __has_plpgsql(samples: list[Sample]) -> bool:
        """
        Check that plpgsql is installed in the template1, postgres, and test
        databases.
        """
        return len(samples) == 1 and samples[0].value == 3

    wait_until(
        collect_metric(
            client,
            "compute_installed_extensions",
            {"extension_name": "plpgsql", "version": "1.0", "owned_by_superuser": "1"},
            __has_plpgsql,
        ),
        name="compute_installed_extensions",
    )

    # Install the neon extension, so we can check for it on the restart.
    endpoint.safe_psql("CREATE EXTENSION neon VERSION '1.0'", dbname=DB_NAME)

    # The metric is only gathered on compute start, so restart to check if the
    # neon extension will now be there.
    endpoint.stop()
    endpoint.start()

    client = endpoint.http_client()

    def __has_neon(samples: list[Sample]) -> bool:
        return len(samples) == 1 and samples[0].value == 1

    wait_until(
        collect_metric(
            client,
            "compute_installed_extensions",
            {"extension_name": "neon", "version": "1.0", "owned_by_superuser": "1"},
            __has_neon,
        ),
        name="compute_installed_extensions",
    )

    # Double-check that we also still have plpgsql.
    wait_until(
        collect_metric(
            client,
            "compute_installed_extensions",
            {"extension_name": "plpgsql", "version": "1.0", "owned_by_superuser": "1"},
            __has_plpgsql,
        ),
        name="compute_installed_extensions",
    )