Merge branch 'main' into amasterov/random-ops-add-snapshots

# Conflicts:
#	test_runner/random_ops/test_random_ops.py
This commit is contained in:
Alexey Masterov
2025-07-23 10:58:56 +02:00
308 changed files with 13538 additions and 3075 deletions

View File

@@ -66,6 +66,12 @@ class EndpointHttpClient(requests.Session):
res.raise_for_status()
return res.json()
def autoscaling_metrics(self):
    """
    Fetch the raw body of the endpoint's ``/autoscaling_metrics`` page.

    Issues a GET against ``localhost`` on ``self.external_port``, raises on a
    non-success HTTP status, logs the body at debug level and returns it as text.
    """
    url = f"http://localhost:{self.external_port}/autoscaling_metrics"
    response = self.get(url)
    response.raise_for_status()
    body = response.text
    log.debug("raw compute metrics: %s", body)
    return body
def prewarm_lfc_status(self) -> dict[str, str]:
res = self.get(self.prewarm_url)
res.raise_for_status()

View File

@@ -24,6 +24,7 @@ def connection_parameters_to_env(params: dict[str, str]) -> dict[str, str]:
# Some API calls not yet implemented.
# You may want to copy not-yet-implemented methods from the PR https://github.com/neondatabase/neon/pull/11305
@final
class NeonAPI:
def __init__(self, neon_api_key: str, neon_api_base_url: str):
self.__neon_api_key = neon_api_key
@@ -171,7 +172,7 @@ class NeonAPI:
protected: bool | None = None,
archived: bool | None = None,
init_source: str | None = None,
add_endpoint=True,
add_endpoint: bool = True,
) -> dict[str, Any]:
data: dict[str, Any] = {}
if add_endpoint:

View File

@@ -400,6 +400,7 @@ class NeonLocalCli(AbstractNeonCli):
timeout_in_seconds: int | None = None,
instance_id: int | None = None,
base_port: int | None = None,
handle_ps_local_disk_loss: bool | None = None,
):
cmd = ["storage_controller", "start"]
if timeout_in_seconds is not None:
@@ -408,6 +409,10 @@ class NeonLocalCli(AbstractNeonCli):
cmd.append(f"--instance-id={instance_id}")
if base_port is not None:
cmd.append(f"--base-port={base_port}")
if handle_ps_local_disk_loss is not None:
cmd.append(
f"--handle-ps-local-disk-loss={'true' if handle_ps_local_disk_loss else 'false'}"
)
return self.raw_cli(cmd)
def storage_controller_stop(self, immediate: bool, instance_id: int | None = None):
@@ -503,6 +508,7 @@ class NeonLocalCli(AbstractNeonCli):
pageserver_id: int | None = None,
allow_multiple=False,
update_catalog: bool = False,
privileged_role_name: str | None = None,
) -> subprocess.CompletedProcess[str]:
args = [
"endpoint",
@@ -534,6 +540,8 @@ class NeonLocalCli(AbstractNeonCli):
args.extend(["--allow-multiple"])
if update_catalog:
args.extend(["--update-catalog"])
if privileged_role_name is not None:
args.extend(["--privileged-role-name", privileged_role_name])
res = self.raw_cli(args)
res.check_returncode()

View File

@@ -728,7 +728,7 @@ class NeonEnvBuilder:
# NB: neon_local rewrites postgresql.conf on each start based on neon_local config. No need to patch it.
# However, in this new NeonEnv, the pageservers and safekeepers listen on different ports, and the storage
# controller will currently reject re-attach requests from them because the NodeMetadata isn't identical.
# So, from_repo_dir patches up the the storcon database.
# So, from_repo_dir patches up the storcon database.
patch_script_path = self.repo_dir / "storage_controller_db.startup.sql"
assert not patch_script_path.exists()
patch_script = ""
@@ -1938,9 +1938,12 @@ class NeonStorageController(MetricsGetter, LogUtils):
timeout_in_seconds: int | None = None,
instance_id: int | None = None,
base_port: int | None = None,
handle_ps_local_disk_loss: bool | None = None,
) -> Self:
assert not self.running
self.env.neon_cli.storage_controller_start(timeout_in_seconds, instance_id, base_port)
self.env.neon_cli.storage_controller_start(
timeout_in_seconds, instance_id, base_port, handle_ps_local_disk_loss
)
self.running = True
return self
@@ -2119,11 +2122,14 @@ class NeonStorageController(MetricsGetter, LogUtils):
headers=self.headers(TokenScope.ADMIN),
)
def node_delete(self, node_id, force: bool = False):
    """
    Ask the storage controller to delete a node.

    Sends an admin-scoped PUT to ``/control/v1/node/{node_id}/delete``.

    :param node_id: identifier of the node to delete.
    :param force: when true, ``?force=true`` is appended to the request URL.
    """
    log.info(f"node_delete({node_id})")
    query = f"{self.api}/control/v1/node/{node_id}/delete"
    if force:
        query += "?force=true"
    # Exactly one URL argument: the (optionally force-qualified) query built above.
    self.request(
        "PUT",
        query,
        headers=self.headers(TokenScope.ADMIN),
    )
@@ -2835,10 +2841,13 @@ class NeonProxiedStorageController(NeonStorageController):
timeout_in_seconds: int | None = None,
instance_id: int | None = None,
base_port: int | None = None,
handle_ps_local_disk_loss: bool | None = None,
) -> Self:
assert instance_id is not None and base_port is not None
self.env.neon_cli.storage_controller_start(timeout_in_seconds, instance_id, base_port)
self.env.neon_cli.storage_controller_start(
timeout_in_seconds, instance_id, base_port, handle_ps_local_disk_loss
)
self.instances[instance_id] = {"running": True}
self.running = True
@@ -4118,6 +4127,294 @@ class NeonAuthBroker:
self._popen.kill()
class NeonLocalProxy(LogUtils):
    """
    An object managing a local_proxy instance for rest broker testing.
    The local_proxy serves as a direct connection to VanillaPostgres.

    Usable as a context manager: leaving the ``with`` block calls ``stop()``.
    Entering the block does NOT start the process; call ``start()`` explicitly.
    """

    def __init__(
        self,
        neon_binpath: Path,
        test_output_dir: Path,
        http_port: int,
        metrics_port: int,
        vanilla_pg: VanillaPostgres,
        config_path: Path | None = None,
    ):
        self.neon_binpath = neon_binpath
        self.test_output_dir = test_output_dir
        self.http_port = http_port
        self.metrics_port = metrics_port
        self.vanilla_pg = vanilla_pg
        # Default config location lives next to the other test artifacts.
        self.config_path = config_path or (test_output_dir / "local_proxy.json")
        self.host = "127.0.0.1"
        self.running = False
        self.logfile = test_output_dir / "local_proxy.log"
        self._popen: subprocess.Popen[bytes] | None = None
        super().__init__(logfile=self.logfile)

    def start(self) -> Self:
        """
        Spawn the local_proxy binary and block until it answers HTTP requests.

        Starts ``vanilla_pg`` first if it is not already running. Must be called
        at most once per instance (asserted).
        """
        assert self._popen is None
        assert not self.running

        # Ensure vanilla_pg is running
        if not self.vanilla_pg.is_running():
            self.vanilla_pg.start()

        args = [
            str(self.neon_binpath / "local_proxy"),
            "--http",
            f"{self.host}:{self.http_port}",
            "--metrics",
            f"{self.host}:{self.metrics_port}",
            "--postgres",
            f"127.0.0.1:{self.vanilla_pg.default_options['port']}",
            "--config-path",
            str(self.config_path),
            "--disable-pg-session-jwt",
        ]

        # The child process gets its own copy of the descriptor at spawn time, so
        # the parent's handle is closed right away instead of being leaked.
        with open(self.logfile, "w") as logfile:
            self._popen = subprocess.Popen(args, stdout=logfile, stderr=logfile)
        self.running = True
        self._wait_until_ready()
        return self

    def stop(self) -> Self:
        """Terminate the process if it is running; escalate to kill after 5 seconds."""
        if self._popen is not None and self.running:
            self._popen.terminate()
            try:
                self._popen.wait(timeout=5)
            except subprocess.TimeoutExpired:
                log.warning("failed to gracefully terminate local_proxy; killing")
                self._popen.kill()
            self.running = False
        return self

    def get_binary_version(self) -> str:
        """Get the version string of the local_proxy binary"""
        try:
            result = subprocess.run(
                [str(self.neon_binpath / "local_proxy"), "--version"],
                capture_output=True,
                text=True,
                timeout=10,
            )
            return result.stdout.strip()
        # NOTE(review): run() is called without check=True, so CalledProcessError
        # is not expected here; in practice only the timeout yields "".
        except (subprocess.TimeoutExpired, subprocess.CalledProcessError):
            return ""

    @backoff.on_exception(backoff.expo, requests.exceptions.RequestException, max_time=10)
    def _wait_until_ready(self):
        """Probe the HTTP port until it responds; fail immediately if the process died."""
        assert self._popen and self._popen.poll() is None, (
            "Local proxy exited unexpectedly. Check test log."
        )
        # A per-request timeout keeps a wedged socket from stalling the retry loop;
        # a timeout is a RequestException and is therefore retried by backoff.
        requests.get(f"http://{self.host}:{self.http_port}/metrics", timeout=1)

    def get_metrics(self) -> str:
        """Return the text served at ``/metrics`` on the metrics port."""
        response = requests.get(f"http://{self.host}:{self.metrics_port}/metrics", timeout=5)
        return response.text

    def assert_no_errors(self):
        """Assert that none of the disallowed patterns occur in the proxy log."""
        # Define allowed error patterns for local_proxy
        allowed_errors: list[str] = [
            # Add patterns as needed
        ]

        not_allowed = [
            "error",
            "panic",
            "failed",
        ]

        for na in not_allowed:
            if na not in allowed_errors:
                assert not self.log_contains(na), f"Found disallowed error pattern: {na}"

    def __enter__(self) -> Self:
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        traceback: TracebackType | None,
    ):
        self.stop()
class NeonRestBrokerProxy(LogUtils):
    """
    An object managing a proxy instance configured as both auth broker and rest broker.
    This is the main proxy binary with --is-auth-broker and --is-rest-broker flags.

    Usable as a context manager: leaving the ``with`` block calls ``stop()``.
    Entering the block does NOT start the process; call ``start()`` explicitly.
    """

    def __init__(
        self,
        neon_binpath: Path,
        test_output_dir: Path,
        wss_port: int,
        http_port: int,
        mgmt_port: int,
        config_path: Path | None = None,
    ):
        self.neon_binpath = neon_binpath
        self.test_output_dir = test_output_dir
        self.wss_port = wss_port
        self.http_port = http_port
        self.mgmt_port = mgmt_port
        # Default config location lives next to the other test artifacts.
        self.config_path = config_path or (test_output_dir / "rest_broker_proxy.json")
        self.host = "127.0.0.1"
        self.running = False
        self.logfile = test_output_dir / "rest_broker_proxy.log"
        self._popen: subprocess.Popen[Any] | None = None
        # Initialise the LogUtils base the same way NeonLocalProxy does.
        super().__init__(logfile=self.logfile)

    def start(self) -> Self:
        """
        Spawn the proxy in auth-broker + rest-broker mode and wait until it accepts
        HTTPS connections on the WSS port.

        A self-signed certificate/key pair is generated with ``openssl`` in the test
        output directory unless both files already exist. Calling ``start()`` on an
        already running instance is a no-op.
        """
        if self.running:
            return self

        # Generate self-signed TLS certificates
        cert_path = self.test_output_dir / "server.crt"
        key_path = self.test_output_dir / "server.key"

        if not cert_path.exists() or not key_path.exists():
            # No function-local `import subprocess` here: it would make the name
            # local to start() and leave it unbound whenever this branch is skipped.
            log.info("Generating self-signed TLS certificate for rest broker")
            subprocess.run(
                [
                    "openssl",
                    "req",
                    "-new",
                    "-x509",
                    "-days",
                    "365",
                    "-nodes",
                    "-text",
                    "-out",
                    str(cert_path),
                    "-keyout",
                    str(key_path),
                    "-subj",
                    "/CN=*.local.neon.build",
                ],
                check=True,
            )

        log.info(
            f"Starting rest broker proxy on WSS port {self.wss_port}, HTTP port {self.http_port}"
        )

        cmd = [
            str(self.neon_binpath / "proxy"),
            "-c",
            str(cert_path),
            "-k",
            str(key_path),
            "--is-auth-broker",
            "true",
            "--is-rest-broker",
            "true",
            "--wss",
            f"{self.host}:{self.wss_port}",
            "--http",
            f"{self.host}:{self.http_port}",
            "--mgmt",
            f"{self.host}:{self.mgmt_port}",
            "--auth-backend",
            "local",
            "--config-path",
            str(self.config_path),
        ]

        log.info(f"Starting rest broker proxy with command: {' '.join(cmd)}")

        with open(self.logfile, "w") as logfile:
            self._popen = subprocess.Popen(
                cmd,
                stdout=logfile,
                stderr=subprocess.STDOUT,
                cwd=self.test_output_dir,
                env={
                    **os.environ,
                    "RUST_LOG": "info",
                    "LOGFMT": "text",
                    "OTEL_SDK_DISABLED": "true",
                },
            )

        self.running = True
        self._wait_until_ready()
        return self

    def stop(self) -> Self:
        """Terminate the process if it is running; escalate to kill after 10 seconds."""
        if not self.running:
            return self

        log.info("Stopping rest broker proxy")

        if self._popen is not None:
            self._popen.terminate()
            try:
                self._popen.wait(timeout=10)
            except subprocess.TimeoutExpired:
                log.warning("failed to gracefully terminate rest broker proxy; killing")
                self._popen.kill()

        self.running = False
        return self

    def get_binary_version(self) -> str:
        """Return the stripped stdout of ``proxy --version``; raises if the command fails."""
        cmd = [str(self.neon_binpath / "proxy"), "--version"]
        res = subprocess.run(cmd, capture_output=True, text=True, check=True)
        return res.stdout.strip()

    @backoff.on_exception(backoff.expo, requests.exceptions.RequestException, max_time=10)
    def _wait_until_ready(self):
        """Probe the WSS port over HTTPS until it responds; fail fast if the process died."""
        # An AssertionError is not retried by backoff, so a crashed proxy is reported
        # immediately instead of after the full retry window.
        assert self._popen and self._popen.poll() is None, (
            "Rest broker proxy exited unexpectedly. Check test log."
        )
        # Check if the WSS port is ready using a simple HTTPS request
        # REST API is served on the WSS port with HTTPS
        requests.get(f"https://{self.host}:{self.wss_port}/", timeout=1, verify=False)
        # Any response (even error) means the server is up - we just need to connect

    def get_metrics(self) -> str:
        """Return the text served at ``/metrics``; raises on a non-success HTTP status."""
        # Metrics are still on the HTTP port
        response = requests.get(f"http://{self.host}:{self.http_port}/metrics", timeout=5)
        response.raise_for_status()
        return response.text

    def assert_no_errors(self):
        """
        Scan the proxy log and raise ``AssertionError`` on the first line that
        contains ``ERROR`` or ``FATAL`` and none of the allowed patterns.
        """
        # Define allowed error patterns for rest broker proxy
        allowed_errors = [
            "connection closed before message completed",
            "connection reset by peer",
            "broken pipe",
            "client disconnected",
            "Authentication failed",
            "connection timed out",
            "no connection available",
            "Pool dropped",
        ]

        with open(self.logfile) as f:
            for line in f:
                if "ERROR" in line or "FATAL" in line:
                    if not any(allowed in line for allowed in allowed_errors):
                        raise AssertionError(
                            f"Found error in rest broker proxy log: {line.strip()}"
                        )

    def __enter__(self) -> Self:
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        traceback: TracebackType | None,
    ):
        self.stop()
@pytest.fixture(scope="function")
def link_proxy(
port_distributor: PortDistributor, neon_binpath: Path, test_output_dir: Path
@@ -4200,6 +4497,81 @@ def static_proxy(
yield proxy
@pytest.fixture(scope="function")
def local_proxy(
    vanilla_pg: VanillaPostgres,
    port_distributor: PortDistributor,
    neon_binpath: Path,
    test_output_dir: Path,
) -> Iterator[NeonLocalProxy]:
    """
    Yield a started NeonLocalProxy wired to the vanilla postgres instance.

    The HTTP and metrics ports are taken from the port distributor, in that order.
    The proxy is stopped when the fixture is torn down.
    """
    # Bring up vanilla_pg first (no database bootstrapping is done here).
    vanilla_pg.start()

    proxy = NeonLocalProxy(
        neon_binpath=neon_binpath,
        test_output_dir=test_output_dir,
        http_port=port_distributor.get_port(),
        metrics_port=port_distributor.get_port(),
        vanilla_pg=vanilla_pg,
    )
    with proxy:
        proxy.start()
        yield proxy
@pytest.fixture(scope="function")
def local_proxy_fixed_port(
    vanilla_pg: VanillaPostgres,
    neon_binpath: Path,
    test_output_dir: Path,
) -> Iterator[NeonLocalProxy]:
    """
    Yield a started NeonLocalProxy listening on the hardcoded HTTP port 7432.

    Metrics are served on 7433. The proxy is stopped when the fixture is torn down.
    """
    # Bring up vanilla_pg first (no database bootstrapping is done here).
    vanilla_pg.start()

    proxy = NeonLocalProxy(
        neon_binpath=neon_binpath,
        test_output_dir=test_output_dir,
        # The hardcoded port that the rest broker proxy expects.
        http_port=7432,
        # Metrics need a port distinct from the HTTP one.
        metrics_port=7433,
        vanilla_pg=vanilla_pg,
    )
    with proxy:
        proxy.start()
        yield proxy
@pytest.fixture(scope="function")
def rest_broker_proxy(
    port_distributor: PortDistributor,
    neon_binpath: Path,
    test_output_dir: Path,
) -> Iterator[NeonRestBrokerProxy]:
    """
    Yield a started NeonRestBrokerProxy (auth broker + rest broker in one process).

    The WSS, HTTP and management ports are taken from the port distributor, in
    that order. The proxy is stopped when the fixture is torn down.
    """
    proxy = NeonRestBrokerProxy(
        neon_binpath=neon_binpath,
        test_output_dir=test_output_dir,
        wss_port=port_distributor.get_port(),
        http_port=port_distributor.get_port(),
        mgmt_port=port_distributor.get_port(),
    )
    with proxy:
        proxy.start()
        yield proxy
@pytest.fixture(scope="function")
def neon_authorize_jwk() -> jwk.JWK:
kid = str(uuid.uuid4())
@@ -4324,6 +4696,7 @@ class Endpoint(PgProtocol, LogUtils):
pageserver_id: int | None = None,
allow_multiple: bool = False,
update_catalog: bool = False,
privileged_role_name: str | None = None,
) -> Self:
"""
Create a new Postgres endpoint.
@@ -4351,6 +4724,7 @@ class Endpoint(PgProtocol, LogUtils):
pageserver_id=pageserver_id,
allow_multiple=allow_multiple,
update_catalog=update_catalog,
privileged_role_name=privileged_role_name,
)
path = Path("endpoints") / self.endpoint_id / "pgdata"
self.pgdata_dir = self.env.repo_dir / path
@@ -4800,6 +5174,7 @@ class EndpointFactory:
config_lines: list[str] | None = None,
pageserver_id: int | None = None,
update_catalog: bool = False,
privileged_role_name: str | None = None,
) -> Endpoint:
ep = Endpoint(
self.env,
@@ -4823,6 +5198,7 @@ class EndpointFactory:
config_lines=config_lines,
pageserver_id=pageserver_id,
update_catalog=update_catalog,
privileged_role_name=privileged_role_name,
)
def stop_all(self, fail_on_error=True) -> Self:
@@ -5417,6 +5793,7 @@ SKIP_FILES = frozenset(
"postmaster.pid",
"pg_control",
"pg_dynshmem",
"neon-communicator.socket",
)
)

View File

@@ -152,6 +152,8 @@ DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [
".*reconciler.*neon_local error.*",
# Tenant rate limits may fire in tests that submit lots of API requests.
".*tenant \\S+ is rate limited.*",
# Reconciliations may get stuck/delayed e.g. in chaos tests.
".*background_reconcile: Shard reconciliation is stuck.*",
]

View File

@@ -847,7 +847,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
return res_json
def timeline_lsn_lease(
self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, lsn: Lsn
self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, lsn: Lsn, **kwargs
):
data = {
"lsn": str(lsn),
@@ -857,6 +857,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
res = self.post(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/lsn_lease",
json=data,
**kwargs,
)
self.verbose_error(res)
res_json = res.json()

View File

@@ -741,3 +741,29 @@ def shared_buffers_for_max_cu(max_cu: float) -> str:
sharedBuffersMb = int(max(128, (1023 + maxBackends * 256) / 1024))
sharedBuffers = int(sharedBuffersMb * 1024 / 8)
return str(sharedBuffers)
def skip_if_proxy_lacks_rest_broker(reason: str = "proxy was built without 'rest_broker' feature"):
    """
    Build a ``pytest.mark.skipif`` marker that skips when the proxy binary does not
    advertise ``--is-rest-broker`` in its ``--help`` output.

    The binary directory is ``$NEON_BIN`` when set and non-empty, otherwise
    ``<repo root>/target/$BUILD_TYPE`` (``debug`` by default). A missing binary, a
    failing or timed-out ``--help`` run, or an absent flag all count as "feature
    missing". The probe runs when this function is called, not when the test runs.
    """
    # Locate the binaries the same way the neon_binpath fixture does.
    neon_bin_override = os.environ.get("NEON_BIN")
    if neon_bin_override:
        bin_dir = Path(neon_bin_override)
    else:
        repo_root = Path(__file__).parents[2]  # Same as BASE_DIR in paths.py
        bin_dir = repo_root / "target" / os.environ.get("BUILD_TYPE", "debug")

    proxy_path = bin_dir / "proxy"

    feature_present = False
    if proxy_path.exists():
        try:
            completed = subprocess.run(
                [str(proxy_path), "--help"],
                capture_output=True,
                text=True,
                check=True,
                timeout=10,
            )
        except (subprocess.CalledProcessError, subprocess.TimeoutExpired, FileNotFoundError):
            feature_present = False
        else:
            feature_present = "--is-rest-broker" in completed.stdout

    return pytest.mark.skipif(not feature_present, reason=reason)