Files
neon/test_runner/fixtures/endpoint/http.py
Heikki Linnakangas 70fdd75c89 Introduce built-in Prometheus exporter to the Postgres extension
Currently, the exporter exposes the same LFC metrics that are exposed
by the "autoscaling" sql_exporter in the docker image. With this, we
can remove the dedicated sql_exporter instance. But that's left as a
TODO until this is rolled out to production and we have changed
autoscaling-agent to fetch the metrics from this new endpoint.

The exporter runs as a Postgres background worker process. This is
extracted from the Rust communicator rewrite project, which will use
the same worker process for much more, to handle the communications
with the pageservers. For now, though, it merely handles the metrics
requests.

In the future, we will add more metrics, and perhaps even APIs to
control the running Postgres instance.

The exporter listens on a Unix Domain socket within the Postgres data
directory. A Unix Domain socket is a bit inconventional, but it has
some advantages:

- Permissions are taken care of. Only processes that can access the
  data directory, and therefore already have full access to the
  running Postgres instance, can connect to it.

- No need to allocate and manage a new port number for the listener

It has some downsides too: it's not immediately accessible from the
outside world, and the functions to work with Unix Domain sockets are
more low-level than TCP sockets (see the symlink hack in
`postgres_metrics_client.rs`, for example).

To expose the metrics from the local Unix Domain Socket to the
autoscaling agent, introduce a new '/autoscaling_metrics' endpoint in
the compute_ctl's HTTP server. Currently it merely forwards the
request to the Postgres instance, but we could add rate limiting and
access control there in the future.
2025-07-15 11:50:31 +03:00

181 lines
5.6 KiB
Python

from __future__ import annotations
import urllib.parse
from enum import StrEnum
from typing import TYPE_CHECKING, Any, final
import requests
from requests.adapters import HTTPAdapter
from requests.auth import AuthBase
from requests.exceptions import ReadTimeout
from typing_extensions import override
from fixtures.log_helper import log
from fixtures.utils import wait_until
if TYPE_CHECKING:
from requests import PreparedRequest
COMPUTE_AUDIENCE = "compute"
"""
The value to place in the `aud` claim.
"""
@final
class ComputeClaimsScope(StrEnum):
ADMIN = "compute_ctl:admin"
@final
class BearerAuth(AuthBase):
"""
Auth implementation for bearer authorization in HTTP requests through the
requests HTTP client library.
"""
def __init__(self, jwt: str):
self.__jwt = jwt
@override
def __call__(self, request: PreparedRequest) -> PreparedRequest:
request.headers["Authorization"] = "Bearer " + self.__jwt
return request
@final
class EndpointHttpClient(requests.Session):
def __init__(
self,
external_port: int,
internal_port: int,
jwt: str,
):
super().__init__()
self.external_port: int = external_port
self.internal_port: int = internal_port
self.auth = BearerAuth(jwt)
self.mount("http://", HTTPAdapter())
self.prewarm_url = f"http://localhost:{external_port}/lfc/prewarm"
self.offload_url = f"http://localhost:{external_port}/lfc/offload"
def dbs_and_roles(self):
res = self.get(f"http://localhost:{self.external_port}/dbs_and_roles", auth=self.auth)
res.raise_for_status()
return res.json()
def autoscaling_metrics(self):
res = self.get(f"http://localhost:{self.external_port}/autoscaling_metrics")
res.raise_for_status()
log.debug("raw compute metrics: %s", res.text)
return res.text
def prewarm_lfc_status(self) -> dict[str, str]:
res = self.get(self.prewarm_url)
res.raise_for_status()
json: dict[str, str] = res.json()
return json
def prewarm_lfc(self, from_endpoint_id: str | None = None):
params = {"from_endpoint": from_endpoint_id} if from_endpoint_id else dict()
self.post(self.prewarm_url, params=params).raise_for_status()
self.prewarm_lfc_wait()
def prewarm_lfc_wait(self):
def prewarmed():
json = self.prewarm_lfc_status()
status, err = json["status"], json.get("error")
assert status == "completed", f"{status}, {err=}"
wait_until(prewarmed, timeout=60)
def offload_lfc_status(self) -> dict[str, str]:
res = self.get(self.offload_url)
res.raise_for_status()
json: dict[str, str] = res.json()
return json
def offload_lfc(self):
self.post(self.offload_url).raise_for_status()
self.offload_lfc_wait()
def offload_lfc_wait(self):
def offloaded():
json = self.offload_lfc_status()
status, err = json["status"], json.get("error")
assert status == "completed", f"{status}, {err=}"
wait_until(offloaded)
def promote(self, safekeepers_lsn: dict[str, Any], disconnect: bool = False):
url = f"http://localhost:{self.external_port}/promote"
if disconnect:
try: # send first request to start promote and disconnect
self.post(url, data=safekeepers_lsn, timeout=0.001)
except ReadTimeout:
pass # wait on second request which returns on promotion finish
res = self.post(url, data=safekeepers_lsn)
res.raise_for_status()
json: dict[str, str] = res.json()
return json
def database_schema(self, database: str):
res = self.get(
f"http://localhost:{self.external_port}/database_schema?database={urllib.parse.quote(database, safe='')}",
auth=self.auth,
)
res.raise_for_status()
return res.text
def extensions(self, extension: str, version: str, database: str):
body = {
"extension": extension,
"version": version,
"database": database,
}
res = self.post(f"http://localhost:{self.internal_port}/extensions", json=body)
res.raise_for_status()
return res.json()
def set_role_grants(self, database: str, role: str, schema: str, privileges: list[str]):
res = self.post(
f"http://localhost:{self.internal_port}/grants",
json={"database": database, "schema": schema, "role": role, "privileges": privileges},
)
res.raise_for_status()
return res.json()
def metrics(self) -> str:
res = self.get(f"http://localhost:{self.external_port}/metrics")
res.raise_for_status()
log.debug("raw compute metrics: %s", res.text)
return res.text
# Current compute status.
def status(self):
res = self.get(f"http://localhost:{self.external_port}/status", auth=self.auth)
res.raise_for_status()
return res.json()
# Compute startup-related metrics.
def metrics_json(self):
res = self.get(f"http://localhost:{self.external_port}/metrics.json", auth=self.auth)
res.raise_for_status()
return res.json()
def configure_failpoints(self, *args: tuple[str, str]) -> None:
body: list[dict[str, str]] = []
for fp in args:
body.append(
{
"name": fp[0],
"action": fp[1],
}
)
res = self.post(f"http://localhost:{self.internal_port}/failpoints", json=body)
res.raise_for_status()