Files
neon/test_runner/fixtures/endpoint/http.py
Heikki Linnakangas 8bb45fd5da Introduce built-in Prometheus exporter to the Postgres extension (#12591)
Currently, the exporter exposes the same LFC metrics that are exposed by
the "autoscaling" sql_exporter in the docker image. With this, we can
remove the dedicated sql_exporter instance. (Actually doing the removal
is left as a TODO until this is rolled out to production and we have
changed autoscaling-agent to fetch the metrics from this new endpoint.)

The exporter runs as a Postgres background worker process. This is
extracted from the Rust communicator rewrite project, which will use the
same worker process for much more, to handle the communications with the
pageservers. For now, though, it merely handles the metrics requests.

In the future, we will add more metrics, and perhaps even APIs to
control the running Postgres instance.

The exporter listens on a Unix Domain socket within the Postgres data
directory. A Unix Domain socket is a bit unconventional, but it has some
advantages:

- Permissions are taken care of. Only processes that can access the data
directory, and therefore already have full access to the running
Postgres instance, can connect to it.

- No need to allocate and manage a new port number for the listener

It has some downsides too: it's not immediately accessible from the
outside world, and the functions to work with Unix Domain sockets are
more low-level than TCP sockets (see the symlink hack in
`postgres_metrics_client.rs`, for example).

To expose the metrics from the local Unix Domain Socket to the
autoscaling agent, introduce a new '/autoscaling_metrics' endpoint in
the compute_ctl's HTTP server. Currently it merely forwards the request
to the Postgres instance, but we could add rate limiting and access
control there in the future.

---------

Co-authored-by: Conrad Ludgate <conrad@neon.tech>
2025-07-22 12:00:20 +00:00

181 lines
5.6 KiB
Python

from __future__ import annotations
import urllib.parse
from enum import StrEnum
from typing import TYPE_CHECKING, Any, final
import requests
from requests.adapters import HTTPAdapter
from requests.auth import AuthBase
from requests.exceptions import ReadTimeout
from typing_extensions import override
from fixtures.log_helper import log
from fixtures.utils import wait_until
if TYPE_CHECKING:
from requests import PreparedRequest
COMPUTE_AUDIENCE = "compute"
"""
The value to place in the `aud` claim.
"""
@final
class ComputeClaimsScope(StrEnum):
ADMIN = "compute_ctl:admin"
@final
class BearerAuth(AuthBase):
"""
Auth implementation for bearer authorization in HTTP requests through the
requests HTTP client library.
"""
def __init__(self, jwt: str):
self.__jwt = jwt
@override
def __call__(self, request: PreparedRequest) -> PreparedRequest:
request.headers["Authorization"] = "Bearer " + self.__jwt
return request
@final
class EndpointHttpClient(requests.Session):
def __init__(
self,
external_port: int,
internal_port: int,
jwt: str,
):
super().__init__()
self.external_port: int = external_port
self.internal_port: int = internal_port
self.auth = BearerAuth(jwt)
self.mount("http://", HTTPAdapter())
self.prewarm_url = f"http://localhost:{external_port}/lfc/prewarm"
self.offload_url = f"http://localhost:{external_port}/lfc/offload"
def dbs_and_roles(self):
res = self.get(f"http://localhost:{self.external_port}/dbs_and_roles", auth=self.auth)
res.raise_for_status()
return res.json()
def autoscaling_metrics(self):
res = self.get(f"http://localhost:{self.external_port}/autoscaling_metrics")
res.raise_for_status()
log.debug("raw compute metrics: %s", res.text)
return res.text
def prewarm_lfc_status(self) -> dict[str, str]:
res = self.get(self.prewarm_url)
res.raise_for_status()
json: dict[str, str] = res.json()
return json
def prewarm_lfc(self, from_endpoint_id: str | None = None):
params = {"from_endpoint": from_endpoint_id} if from_endpoint_id else dict()
self.post(self.prewarm_url, params=params).raise_for_status()
self.prewarm_lfc_wait()
def prewarm_lfc_wait(self):
def prewarmed():
json = self.prewarm_lfc_status()
status, err = json["status"], json.get("error")
assert status == "completed", f"{status}, {err=}"
wait_until(prewarmed, timeout=60)
def offload_lfc_status(self) -> dict[str, str]:
res = self.get(self.offload_url)
res.raise_for_status()
json: dict[str, str] = res.json()
return json
def offload_lfc(self):
self.post(self.offload_url).raise_for_status()
self.offload_lfc_wait()
def offload_lfc_wait(self):
def offloaded():
json = self.offload_lfc_status()
status, err = json["status"], json.get("error")
assert status == "completed", f"{status}, {err=}"
wait_until(offloaded)
def promote(self, safekeepers_lsn: dict[str, Any], disconnect: bool = False):
url = f"http://localhost:{self.external_port}/promote"
if disconnect:
try: # send first request to start promote and disconnect
self.post(url, data=safekeepers_lsn, timeout=0.001)
except ReadTimeout:
pass # wait on second request which returns on promotion finish
res = self.post(url, data=safekeepers_lsn)
res.raise_for_status()
json: dict[str, str] = res.json()
return json
def database_schema(self, database: str):
res = self.get(
f"http://localhost:{self.external_port}/database_schema?database={urllib.parse.quote(database, safe='')}",
auth=self.auth,
)
res.raise_for_status()
return res.text
def extensions(self, extension: str, version: str, database: str):
body = {
"extension": extension,
"version": version,
"database": database,
}
res = self.post(f"http://localhost:{self.internal_port}/extensions", json=body)
res.raise_for_status()
return res.json()
def set_role_grants(self, database: str, role: str, schema: str, privileges: list[str]):
res = self.post(
f"http://localhost:{self.internal_port}/grants",
json={"database": database, "schema": schema, "role": role, "privileges": privileges},
)
res.raise_for_status()
return res.json()
def metrics(self) -> str:
res = self.get(f"http://localhost:{self.external_port}/metrics")
res.raise_for_status()
log.debug("raw compute metrics: %s", res.text)
return res.text
# Current compute status.
def status(self):
res = self.get(f"http://localhost:{self.external_port}/status", auth=self.auth)
res.raise_for_status()
return res.json()
# Compute startup-related metrics.
def metrics_json(self):
res = self.get(f"http://localhost:{self.external_port}/metrics.json", auth=self.auth)
res.raise_for_status()
return res.json()
def configure_failpoints(self, *args: tuple[str, str]) -> None:
body: list[dict[str, str]] = []
for fp in args:
body.append(
{
"name": fp[0],
"action": fp[1],
}
)
res = self.post(f"http://localhost:{self.internal_port}/failpoints", json=body)
res.raise_for_status()