Introduce built-in Prometheus exporter to the Postgres extension (#12591)

Currently, the exporter exposes the same LFC metrics that are exposed by
the "autoscaling" sql_exporter in the docker image. With this, we can
remove the dedicated sql_exporter instance. (Actually doing the removal
is left as a TODO until this is rolled out to production and we have
changed autoscaling-agent to fetch the metrics from this new endpoint.)

The exporter runs as a Postgres background worker process. This is
extracted from the Rust communicator rewrite project, which will use the
same worker process for much more, to handle the communications with the
pageservers. For now, though, it merely handles the metrics requests.

In the future, we will add more metrics, and perhaps even APIs to
control the running Postgres instance.

The exporter listens on a Unix Domain socket within the Postgres data
directory. A Unix Domain socket is a bit unconventional, but it has some
advantages:

- Permissions are taken care of. Only processes that can access the data
directory, and therefore already have full access to the running
Postgres instance, can connect to it.

- No need to allocate and manage a new port number for the listener.

It has some downsides too: it's not immediately accessible from the
outside world, and the functions for working with Unix Domain sockets
are more low-level than those for TCP sockets (see the symlink hack in
`postgres_metrics_client.rs`, for example).

To expose the metrics from the local Unix Domain socket to the
autoscaling agent, introduce a new '/autoscaling_metrics' endpoint in
compute_ctl's HTTP server. Currently it merely forwards the request
to the Postgres instance, but we could add rate limiting and access
control there in the future.

---------

Co-authored-by: Conrad Ludgate <conrad@neon.tech>
This commit is contained in:
Heikki Linnakangas
2025-07-22 15:00:20 +03:00
committed by GitHub
parent 88bc06f148
commit 8bb45fd5da
28 changed files with 1256 additions and 26 deletions

View File

@@ -66,6 +66,12 @@ class EndpointHttpClient(requests.Session):
res.raise_for_status()
return res.json()
def autoscaling_metrics(self) -> str:
    """
    Fetch the raw response body of the '/autoscaling_metrics' endpoint.

    Issues a GET request to localhost on `self.external_port`, raises if
    the response carries an HTTP error status (via `raise_for_status`),
    and returns the response body as text without parsing it. The body is
    also logged at debug level to help diagnose test failures.
    """
    res = self.get(f"http://localhost:{self.external_port}/autoscaling_metrics")
    res.raise_for_status()
    body = res.text
    log.debug("raw compute metrics: %s", body)
    return body
def prewarm_lfc_status(self) -> dict[str, str]:
res = self.get(self.prewarm_url)
res.raise_for_status()

View File

@@ -5793,6 +5793,7 @@ SKIP_FILES = frozenset(
"postmaster.pid",
"pg_control",
"pg_dynshmem",
"neon-communicator.socket",
)
)

View File

@@ -0,0 +1,54 @@
from __future__ import annotations
import os
from typing import TYPE_CHECKING
import pytest
import requests
import requests_unixsocket # type: ignore [import-untyped]
from fixtures.metrics import parse_metrics
if TYPE_CHECKING:
from fixtures.neon_fixtures import NeonEnv
NEON_COMMUNICATOR_SOCKET_NAME = "neon-communicator.socket"
def test_communicator_metrics(neon_simple_env: NeonEnv):
    """
    Test the communicator's built-in HTTP prometheus exporter

    The test starts an endpoint, connects to the exporter's Unix domain
    socket in the endpoint's data directory, and checks that:

    - '/metrics' answers with 200 and contains the LFC hit/miss metrics,
    - '/debug/panic' drops the connection without sending a response,
    - '/metrics' keeps working after the panic.
    """
    env = neon_simple_env
    endpoint = env.endpoints.create("main")
    endpoint.start()

    metrics_url = f"http+unix://{NEON_COMMUNICATOR_SOCKET_NAME}/metrics"
    panic_url = f"http+unix://{NEON_COMMUNICATOR_SOCKET_NAME}/debug/panic"

    def check_metrics(session: requests.Session) -> None:
        # Quick test that the endpoint returned something expected. (We don't
        # validate that the metrics returned are sensible.)
        r = session.get(metrics_url)
        assert r.status_code == 200, f"got response {r.status_code}: {r.text}"
        m = parse_metrics(r.text)
        m.query_one("lfc_hits")
        m.query_one("lfc_misses")

    # Change current directory to the data directory, so that we can use
    # a short relative path to refer to the socket. (Unix domain socket paths
    # are limited to roughly 100 characters.) The working directory is
    # process-wide state, so restore it afterwards to avoid affecting
    # whatever runs next in this process.
    original_cwd = os.getcwd()
    os.chdir(str(endpoint.pgdata_dir))
    try:
        # Use the session as a context manager so that its connections are
        # released even if an assertion fails.
        with requests_unixsocket.Session() as session:
            check_metrics(session)

            # Test panic handling. The /debug/panic endpoint raises a Rust
            # panic. It's expected to unwind and drop the HTTP connection
            # without response, but not kill the process or the server.
            #
            # Only the request itself goes inside the `raises` block: it is
            # expected to raise, so any statement after it would never run.
            with pytest.raises(
                requests.ConnectionError, match="Remote end closed connection without response"
            ):
                session.get(panic_url)

            # Test that subsequent requests after the panic still work.
            check_metrics(session)
    finally:
        os.chdir(original_cwd)

View File

@@ -197,7 +197,7 @@ def test_create_snapshot(
shutil.copytree(
test_output_dir,
new_compatibility_snapshot_dir,
ignore=shutil.ignore_patterns("pg_dynshmem"),
ignore=shutil.ignore_patterns("pg_dynshmem", "neon-communicator.socket"),
)
log.info(f"Copied new compatibility snapshot dir to: {new_compatibility_snapshot_dir}")

View File

@@ -6,6 +6,7 @@ from typing import TYPE_CHECKING
import pytest
from fixtures.log_helper import log
from fixtures.metrics import parse_metrics
from fixtures.utils import USE_LFC, query_scalar
if TYPE_CHECKING:
@@ -75,10 +76,24 @@ WITH (fillfactor='100');
cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid = 104242")
cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid = 204242")
# verify working set size after some index access of a few select pages only
blocks = query_scalar(cur, "select approximate_working_set_size(true)")
blocks = query_scalar(cur, "select approximate_working_set_size(false)")
log.info(f"working set size after some index access of a few select pages only {blocks}")
assert blocks < 20
# Also test the metrics from the /autoscaling_metrics endpoint
autoscaling_metrics = endpoint.http_client().autoscaling_metrics()
log.debug(f"Raw metrics: {autoscaling_metrics}")
m = parse_metrics(autoscaling_metrics)
http_estimate = m.query_one(
"lfc_approximate_working_set_size_windows",
{
"duration_seconds": "60",
},
).value
log.info(f"http estimate: {http_estimate}, blocks: {blocks}")
assert http_estimate > 0 and http_estimate < 20
@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
def test_sliding_working_set_approximation(neon_simple_env: NeonEnv):