neon/test_runner/regress/test_compute_metrics.py

from __future__ import annotations

import enum
import os
import shutil
from enum import StrEnum
from logging import debug
from pathlib import Path
from typing import TYPE_CHECKING, cast

# Docs are available at https://jsonnet.org/ref/bindings.html#python_api
import _jsonnet
import pytest
import requests
import yaml
from fixtures.log_helper import log
from fixtures.metrics import parse_metrics
from fixtures.paths import BASE_DIR, COMPUTE_CONFIG_DIR
from fixtures.utils import wait_until

if TYPE_CHECKING:
    from collections.abc import Callable
    from types import TracebackType
    from typing import Self, TypedDict

    from fixtures.endpoint.http import EndpointHttpClient
    from fixtures.neon_fixtures import NeonEnv
    from fixtures.pg_version import PgVersion
    from fixtures.port_distributor import PortDistributor
    from prometheus_client.samples import Sample

    class Metric(TypedDict):
        metric_name: str
        type: str
        help: str
        key_labels: list[str] | None
        values: list[str] | None
        query: str | None
        query_ref: str | None

    class Collector(TypedDict):
        collector_name: str
        metrics: list[Metric]
        queries: list[Query] | None

    class Query(TypedDict):
        query_name: str
        query: str


JSONNET_IMPORT_CACHE: dict[str, bytes] = {}
JSONNET_PATH: list[Path] = [BASE_DIR / "compute" / "jsonnet", COMPUTE_CONFIG_DIR]


def __import_callback(dir: str, rel: str) -> tuple[str, bytes]:
    """
    dir: The directory of the Jsonnet file which tried to import a file
    rel: The actual import path from Jsonnet
    """
    if not rel:
        raise RuntimeError("Empty filename")

    full_path: str | None = None
    if os.path.isabs(rel):
        full_path = rel
    else:
        for p in (dir, *JSONNET_PATH):
            assert isinstance(p, str | Path), "for mypy"
            full_path = os.path.join(p, rel)

            assert isinstance(full_path, str), "for mypy"
            if not os.path.exists(full_path):
                full_path = None
                continue

            break

        if not full_path:
            raise RuntimeError(f"Could not resolve import ({rel}) in {dir}")

    if os.path.isdir(full_path):
        raise RuntimeError(f"Attempted to import directory: {full_path}")

    if full_path not in JSONNET_IMPORT_CACHE:
        with open(full_path, encoding="utf-8") as f:
            JSONNET_IMPORT_CACHE[full_path] = f.read().encode()

    return full_path, JSONNET_IMPORT_CACHE[full_path]


def jsonnet_evaluate_file(
    jsonnet_file: str | Path,
    ext_vars: str | dict[str, str] | None = None,
    tla_vars: str | dict[str, str] | None = None,
) -> str:
    return cast(
        "str",
        _jsonnet.evaluate_file(
            str(jsonnet_file),
            ext_vars=ext_vars,
            tla_vars=tla_vars,
            import_callback=__import_callback,
        ),
    )


def evaluate_collector(jsonnet_file: Path, pg_version: PgVersion) -> str:
    return jsonnet_evaluate_file(jsonnet_file, ext_vars={"pg_version": str(pg_version)})


def evaluate_config(
    jsonnet_file: Path, collector_name: str, collector_file: str | Path, connstr: str
) -> str:
    return jsonnet_evaluate_file(
        jsonnet_file,
        tla_vars={
            "collector_name": collector_name,
            "collector_file": str(collector_file),
            "connection_string": connstr,
        },
    )


@enum.unique
class SqlExporterProcess(StrEnum):
    COMPUTE = "compute"
    AUTOSCALING = "autoscaling"


@pytest.mark.parametrize(
    "collector_name",
    ["neon_collector", "neon_collector_autoscaling"],
    ids=[SqlExporterProcess.COMPUTE, SqlExporterProcess.AUTOSCALING],
)
def test_sql_exporter_metrics_smoke(
    pg_version: PgVersion,
    neon_simple_env: NeonEnv,
    compute_config_dir: Path,
    collector_name: str,
):
    """
    This is a smoke test to ensure the metrics SQL queries for sql_exporter
    work without errors.
    """
    env = neon_simple_env

    endpoint = env.endpoints.create("main")
    endpoint.respec(skip_pg_catalog_updates=False)
    endpoint.start()

    # Extract all the SQL queries from the sql_exporter config files, and run
    # them.
    collector = cast(
        "Collector",
        yaml.safe_load(
            jsonnet_evaluate_file(
                str(compute_config_dir / f"{collector_name}.jsonnet"),
                ext_vars={"pg_version": pg_version},
            )
        ),
    )

    for metric in collector["metrics"]:
        query = metric.get("query")
        if query is not None:
            log.info("Checking query for metric %s in %s", metric["metric_name"], collector_name)
            endpoint.safe_psql(query)

    queries = collector.get("queries")
    if queries is not None:
        # This variable is named q because mypy is too silly to understand it is
        # different from the query above.
        #
        # query: Optional[str]
        # q: Metric
        for q in queries:
            log.info("Checking query %s in %s", q["query_name"], collector_name)
            endpoint.safe_psql(q["query"])


class SqlExporterRunner:
    def __init__(self, test_output_dir: Path, sql_exporter_port: int) -> None:
        self._log_file_name = test_output_dir / "sql_exporter.stderr"
        self._sql_exporter_port = sql_exporter_port

        log.info(f"Starting sql_exporter at http://localhost:{self._sql_exporter_port}")

    def start(self) -> None:
        raise NotImplementedError()

    def stop(self) -> None:
        raise NotImplementedError()

    def __enter__(self) -> Self:
        self.start()

        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc: BaseException | None,
        tb: TracebackType | None,
    ):
        self.stop()


SQL_EXPORTER = shutil.which("sql_exporter")

if SQL_EXPORTER is None:
    from testcontainers.core.container import DockerContainer
    from testcontainers.core.waiting_utils import wait_for_logs
    from typing_extensions import override

    class SqlExporterContainer(DockerContainer):  # type: ignore
        def __init__(
            self, logs_dir: Path, config_file: Path, collector_file: Path, port: int
        ) -> None:
            # NOTE: Keep the version the same as in
            # compute/compute-node.Dockerfile and build-tools/Dockerfile.
            #
            # The "host" network mode allows sql_exporter to talk to the
            # endpoint which is running on the host.
            super().__init__("docker.io/burningalchemist/sql_exporter:0.17.3", network_mode="host")

            self.__logs_dir = logs_dir
            self.__port = port

            config_file_name = config_file.name
            collector_file_name = collector_file.name

            self.with_command(f"-config.file=/etc/{config_file_name} -web.listen-address=:{port}")

            container_config_file = f"/etc/{config_file_name}"
            container_collector_file = f"/etc/{collector_file_name}"
            log.info(
                "Mapping %s to %s in sql_exporter container", config_file, container_config_file
            )
            log.info(
                "Mapping %s to %s in sql_exporter container",
                collector_file,
                container_collector_file,
            )

            # NOTE: z allows Podman to work with SELinux. Please don't change it.
            # Ideally this would be a ro (read-only) mount, but I couldn't seem to
            # get it to work.
            self.with_volume_mapping(str(config_file), container_config_file, "z")
            self.with_volume_mapping(str(collector_file), container_collector_file, "z")

        def start(self) -> Self:
            super().start()

            log.info("Waiting for sql_exporter to be ready")
            wait_for_logs(
                self,
                rf'msg="Listening on" address=\[::\]:{self.__port}',
                timeout=5,
            )

            return self

    class SqlExporterContainerRunner(SqlExporterRunner):
        def __init__(
            self,
            test_output_dir: Path,
            config_file: Path,
            collector_file: Path,
            sql_exporter_port: int,
        ) -> None:
            super().__init__(test_output_dir, sql_exporter_port)

            self.__container = SqlExporterContainer(
                test_output_dir, config_file, collector_file, sql_exporter_port
            )

        @override
        def start(self) -> None:
            self.__container.start()

        @override
        def stop(self) -> None:
            try:
                # sql_exporter doesn't print anything to stdout
                with open(self._log_file_name, "w", encoding="utf-8") as f:
                    f.write(self.__container.get_logs()[1].decode())
            except Exception:
                log.exception("Failed to write sql_exporter logs")

            # Stop the container *after* getting the logs
            self.__container.stop()

else:
    import subprocess
    import time
    from signal import Signals

    from typing_extensions import override

    if TYPE_CHECKING:
        from collections.abc import Mapping

    class SqlExporterNativeRunner(SqlExporterRunner):
        def __init__(
            self,
            test_output_dir: Path,
            config_file: Path,
            collector_file: Path,
            sql_exporter_port: int,
        ) -> None:
            super().__init__(test_output_dir, sql_exporter_port)

            self.__config_file = config_file
            self.__collector_file = collector_file
            self.__proc: subprocess.Popen[str]

        @override
        def start(self) -> None:
            assert SQL_EXPORTER is not None

            log_file = open(self._log_file_name, "w", encoding="utf-8")
            self.__proc = subprocess.Popen(
                [
                    os.path.realpath(SQL_EXPORTER),
                    f"-config.file={self.__config_file}",
                    f"-web.listen-address=:{self._sql_exporter_port}",
                ],
                # If PGSERVICEFILE is set, sql_exporter won't launch.
                env=cast("Mapping[str, str]", {}),
                stderr=log_file,
                bufsize=0,
                text=True,
            )

            log.info("Waiting for sql_exporter to be ready")

            with open(self._log_file_name, encoding="utf-8") as f:
                started = time.time()
                while True:
                    if time.time() - started > 5:
                        self.__proc.kill()
                        raise RuntimeError("sql_exporter did not start up properly")

                    line = f.readline()
                    if not line:
                        time.sleep(0.5)
                        continue

                    if f'msg="Listening on" address=[::]:{self._sql_exporter_port}' in line:
                        break

        @override
        def stop(self) -> None:
            self.__proc.send_signal(Signals.SIGINT)
            self.__proc.wait()


@pytest.mark.parametrize(
    "exporter",
    [SqlExporterProcess.COMPUTE, SqlExporterProcess.AUTOSCALING],
)
def test_sql_exporter_metrics_e2e(
    pg_version: PgVersion,
    neon_simple_env: NeonEnv,
    test_output_dir: Path,
    compute_config_dir: Path,
    exporter: SqlExporterProcess,
    port_distributor: PortDistributor,
):
    """
    This is a full E2E test of the sql_exporter setup to make sure it works
    without error.

    If you use Podman instead of Docker, you may run into issues. If you run
    rootful Podman, you may need to add a ~/.testcontainers.properties file
    with the following content:

        ryuk.container.privileged=true

    If you are not running rootful Podman, set the following environment
    variable:

        TESTCONTAINERS_RYUK_DISABLED=true

    Note that you will need the Podman socket to be running. On a systemd-based
    system, that command will look something like:

        # Use `enable --now` to start the socket on login and immediately.
        systemctl --user start podman.socket

    Whether you use the user service manager or the system service manager is
    up to you, but may have implications on the above ryuk related steps. Note
    that you may also need the docker(1) Podman frontend. I am unsure if the
    docker Python package supports Podman natively.
    """
    env = neon_simple_env

    endpoint = env.endpoints.create("main")
    endpoint.respec(skip_pg_catalog_updates=False)
    endpoint.start()

    if exporter == SqlExporterProcess.COMPUTE:
        stem_suffix = ""
    elif exporter == SqlExporterProcess.AUTOSCALING:
        stem_suffix = "_autoscaling"

    # Write the collector file
    collector_file = test_output_dir / f"neon_collector{stem_suffix}.yml"
    with open(collector_file, "w", encoding="utf-8") as o:
        collector = evaluate_collector(
            compute_config_dir / f"neon_collector{stem_suffix}.jsonnet", pg_version
        )
        o.write(collector)

    conn_options = endpoint.conn_options()
    pg_host = conn_options["host"]
    pg_port = conn_options["port"]
    pg_user = conn_options["user"]
    pg_dbname = conn_options["dbname"]
    pg_application_name = f"sql_exporter{stem_suffix}"
    connstr = f"postgresql://{pg_user}@{pg_host}:{pg_port}/{pg_dbname}?sslmode=disable&application_name={pg_application_name}&pgaudit.log=none"

    def escape_go_filepath_match_characters(s: str) -> str:
        """
        Unfortunately sql_exporter doesn't use plain file paths, so we need to
        escape special characters. pytest encodes the parameters of a test using
        [ and ], so we need to escape them with backslashes.
        See https://pkg.go.dev/path/filepath#Match.
        """
        return s.replace("[", r"\[").replace("]", r"\]")

    # Write the config file
    config_file = test_output_dir / f"sql_exporter{stem_suffix}.yml"
    with open(config_file, "w", encoding="utf-8") as o:
        config = evaluate_config(
            compute_config_dir / "sql_exporter.jsonnet",
            collector_name=collector_file.stem,
            collector_file=escape_go_filepath_match_characters(str(collector_file))
            if SQL_EXPORTER
            else collector_file.name,
            connstr=connstr,
        )
        o.write(config)

    sql_exporter_port = port_distributor.get_port()
    with (SqlExporterNativeRunner if SQL_EXPORTER else SqlExporterContainerRunner)(
        test_output_dir, config_file, collector_file, sql_exporter_port
    ) as _runner:
        resp = requests.get(f"http://localhost:{sql_exporter_port}/metrics")
        resp.raise_for_status()


def test_perf_counters(neon_simple_env: NeonEnv):
    """
    Test compute metrics, exposed in the neon_backend_perf_counters and
    neon_perf_counters views
    """
    env = neon_simple_env
    endpoint = env.endpoints.create_start("main")

    conn = endpoint.connect()
    cur = conn.cursor()

    # We don't check that the values make sense, this is just a very
    # basic check that the server doesn't crash or something like that.
    #
    # 1.5 is the minimum version to contain these views.
    cur.execute("CREATE EXTENSION neon VERSION '1.5'")
    cur.execute("set neon.monitor_query_exec_time = on")
    cur.execute("SELECT * FROM neon_perf_counters")
    cur.execute("SELECT * FROM neon_backend_perf_counters")
    cur.execute(
        "select value from neon_backend_perf_counters where metric='query_time_seconds_count' and pid=pg_backend_pid()"
    )
    assert cur.fetchall()[0][0] == 2


def collect_metric(
    client: EndpointHttpClient,
    name: str,
    filter: dict[str, str],
    predicate: Callable[[list[Sample]], bool],
) -> Callable[[], list[Sample]]:
    """
    Call this function as the first argument to wait_until().
    """

    def __collect_metric() -> list[Sample]:
        resp = client.metrics()
        debug("Metrics: %s", resp)
        m = parse_metrics(resp)
        samples = m.query_all(name, filter)
        debug("Samples: %s", samples)
        assert predicate(samples), "predicate failed"
        return samples

    return __collect_metric


def test_compute_installed_extensions_metric(neon_simple_env: NeonEnv):
    """
    Test that the compute_installed_extensions properly reports accurate
    results. Important to note that currently this metric is only gathered on
    compute start. We install the neon extension into a database other than
    postgres because compute_ctl will run `ALTER EXTENSION neon UPDATE` during
    Postgres startup in the postgres database, creating a race condition.
    """
    DB_NAME = "test"

    env = neon_simple_env

    endpoint = env.endpoints.create_start("main")
    endpoint.safe_psql(f"CREATE DATABASE {DB_NAME}")

    # The metric is only gathered on compute start, so restart to check that
    # plpgsql is now in 3 databases, instead of its regular 2, template1 and
    # postgres.
    endpoint.stop()
    endpoint.start()

    client = endpoint.http_client()

    def __has_plpgsql(samples: list[Sample]) -> bool:
        """
        Check that plpgsql is installed in the template1, postgres, and test
        databases
        """
        return len(samples) == 1 and samples[0].value == 3

    wait_until(
        collect_metric(
            client,
            "compute_installed_extensions",
            {"extension_name": "plpgsql", "version": "1.0", "owned_by_superuser": "1"},
            __has_plpgsql,
        ),
        name="compute_installed_extensions",
    )

    # Install the neon extension, so we can check for it on the restart.
    endpoint.safe_psql("CREATE EXTENSION neon VERSION '1.0'", dbname=DB_NAME)

    # The metric is only gathered on compute start, so restart to check if the
    # neon extension will now be there.
    endpoint.stop()
    endpoint.start()

    client = endpoint.http_client()

    def __has_neon(samples: list[Sample]) -> bool:
        return len(samples) == 1 and samples[0].value == 1

    wait_until(
        collect_metric(
            client,
            "compute_installed_extensions",
            {"extension_name": "neon", "version": "1.0", "owned_by_superuser": "1"},
            __has_neon,
        ),
        name="compute_installed_extensions",
    )

    # Double check that we also still have plpgsql
    wait_until(
        collect_metric(
            client,
            "compute_installed_extensions",
            {"extension_name": "plpgsql", "version": "1.0", "owned_by_superuser": "1"},
            __has_plpgsql,
        ),
        name="compute_installed_extensions",
    )