Merge branch 'main' into amasterov/random-ops-add-snapshots
# Conflicts:
#   test_runner/random_ops/test_random_ops.py
@@ -66,6 +66,12 @@ class EndpointHttpClient(requests.Session):
res.raise_for_status()
return res.json()

def autoscaling_metrics(self):
res = self.get(f"http://localhost:{self.external_port}/autoscaling_metrics")
res.raise_for_status()
log.debug("raw compute metrics: %s", res.text)
return res.text

def prewarm_lfc_status(self) -> dict[str, str]:
res = self.get(self.prewarm_url)
res.raise_for_status()
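A minimal usage sketch for the new client methods (assumes an `endpoint` fixture from neon_fixtures; the variable names are illustrative, not part of this diff):

    client = endpoint.http_client()       # EndpointHttpClient bound to this compute
    raw = client.autoscaling_metrics()    # raw Prometheus text exposed for the autoscaling agent
    status = client.prewarm_lfc_status()  # dict describing the LFC prewarm state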

@@ -24,6 +24,7 @@ def connection_parameters_to_env(params: dict[str, str]) -> dict[str, str]:

# Some API calls not yet implemented.
# You may want to copy not-yet-implemented methods from the PR https://github.com/neondatabase/neon/pull/11305
@final
class NeonAPI:
def __init__(self, neon_api_key: str, neon_api_base_url: str):
self.__neon_api_key = neon_api_key
@@ -171,7 +172,7 @@ class NeonAPI:
protected: bool | None = None,
archived: bool | None = None,
init_source: str | None = None,
add_endpoint=True,
add_endpoint: bool = True,
) -> dict[str, Any]:
data: dict[str, Any] = {}
if add_endpoint:

@@ -400,6 +400,7 @@ class NeonLocalCli(AbstractNeonCli):
timeout_in_seconds: int | None = None,
instance_id: int | None = None,
base_port: int | None = None,
handle_ps_local_disk_loss: bool | None = None,
):
cmd = ["storage_controller", "start"]
if timeout_in_seconds is not None:
@@ -408,6 +409,10 @@ class NeonLocalCli(AbstractNeonCli):
cmd.append(f"--instance-id={instance_id}")
if base_port is not None:
cmd.append(f"--base-port={base_port}")
if handle_ps_local_disk_loss is not None:
cmd.append(
f"--handle-ps-local-disk-loss={'true' if handle_ps_local_disk_loss else 'false'}"
)
return self.raw_cli(cmd)

def storage_controller_stop(self, immediate: bool, instance_id: int | None = None):
@@ -503,6 +508,7 @@ class NeonLocalCli(AbstractNeonCli):
pageserver_id: int | None = None,
allow_multiple=False,
update_catalog: bool = False,
privileged_role_name: str | None = None,
) -> subprocess.CompletedProcess[str]:
args = [
"endpoint",
@@ -534,6 +540,8 @@ class NeonLocalCli(AbstractNeonCli):
args.extend(["--allow-multiple"])
if update_catalog:
args.extend(["--update-catalog"])
if privileged_role_name is not None:
args.extend(["--privileged-role-name", privileged_role_name])

res = self.raw_cli(args)
res.check_returncode()

@@ -728,7 +728,7 @@ class NeonEnvBuilder:
# NB: neon_local rewrites postgresql.conf on each start based on neon_local config. No need to patch it.
# However, in this new NeonEnv, the pageservers and safekeepers listen on different ports, and the storage
# controller will currently reject re-attach requests from them because the NodeMetadata isn't identical.
# So, from_repo_dir patches up the the storcon database.
# So, from_repo_dir patches up the storcon database.
patch_script_path = self.repo_dir / "storage_controller_db.startup.sql"
assert not patch_script_path.exists()
patch_script = ""
@@ -1938,9 +1938,12 @@ class NeonStorageController(MetricsGetter, LogUtils):
timeout_in_seconds: int | None = None,
instance_id: int | None = None,
base_port: int | None = None,
handle_ps_local_disk_loss: bool | None = None,
) -> Self:
assert not self.running
self.env.neon_cli.storage_controller_start(timeout_in_seconds, instance_id, base_port)
self.env.neon_cli.storage_controller_start(
timeout_in_seconds, instance_id, base_port, handle_ps_local_disk_loss
)
self.running = True
return self

@@ -2119,11 +2122,14 @@ class NeonStorageController(MetricsGetter, LogUtils):
headers=self.headers(TokenScope.ADMIN),
)

def node_delete(self, node_id):
def node_delete(self, node_id, force: bool = False):
log.info(f"node_delete({node_id})")
query = f"{self.api}/control/v1/node/{node_id}/delete"
if force:
query += "?force=true"
self.request(
"PUT",
f"{self.api}/control/v1/node/{node_id}/delete",
query,
headers=self.headers(TokenScope.ADMIN),
)
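For context, a sketch of how a test might drive the two deletion paths after this change (hypothetical call sites, mirroring the test changes further below):

    env.storage_controller.node_delete(node_id, force=True)  # PUT .../control/v1/node/{id}/delete?force=true
    env.storage_controller.node_delete(node_id)              # same endpoint, graceful (no force flag)
    env.storage_controller.node_delete_old(node_id)          # legacy deletion API still used by older tests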

@@ -2835,10 +2841,13 @@ class NeonProxiedStorageController(NeonStorageController):
timeout_in_seconds: int | None = None,
instance_id: int | None = None,
base_port: int | None = None,
handle_ps_local_disk_loss: bool | None = None,
) -> Self:
assert instance_id is not None and base_port is not None

self.env.neon_cli.storage_controller_start(timeout_in_seconds, instance_id, base_port)
self.env.neon_cli.storage_controller_start(
timeout_in_seconds, instance_id, base_port, handle_ps_local_disk_loss
)
self.instances[instance_id] = {"running": True}

self.running = True
@@ -4118,6 +4127,294 @@ class NeonAuthBroker:
self._popen.kill()


class NeonLocalProxy(LogUtils):
"""
An object managing a local_proxy instance for rest broker testing.
The local_proxy serves as a direct connection to VanillaPostgres.
"""

def __init__(
self,
neon_binpath: Path,
test_output_dir: Path,
http_port: int,
metrics_port: int,
vanilla_pg: VanillaPostgres,
config_path: Path | None = None,
):
self.neon_binpath = neon_binpath
self.test_output_dir = test_output_dir
self.http_port = http_port
self.metrics_port = metrics_port
self.vanilla_pg = vanilla_pg
self.config_path = config_path or (test_output_dir / "local_proxy.json")
self.host = "127.0.0.1"
self.running = False
self.logfile = test_output_dir / "local_proxy.log"
self._popen: subprocess.Popen[bytes] | None = None
super().__init__(logfile=self.logfile)

def start(self) -> Self:
assert self._popen is None
assert not self.running

# Ensure vanilla_pg is running
if not self.vanilla_pg.is_running():
self.vanilla_pg.start()

args = [
str(self.neon_binpath / "local_proxy"),
"--http",
f"{self.host}:{self.http_port}",
"--metrics",
f"{self.host}:{self.metrics_port}",
"--postgres",
f"127.0.0.1:{self.vanilla_pg.default_options['port']}",
"--config-path",
str(self.config_path),
"--disable-pg-session-jwt",
]

logfile = open(self.logfile, "w")
self._popen = subprocess.Popen(args, stdout=logfile, stderr=logfile)
self.running = True
self._wait_until_ready()
return self

def stop(self) -> Self:
if self._popen is not None and self.running:
self._popen.terminate()
try:
self._popen.wait(timeout=5)
except subprocess.TimeoutExpired:
log.warning("failed to gracefully terminate local_proxy; killing")
self._popen.kill()
self.running = False
return self

def get_binary_version(self) -> str:
"""Get the version string of the local_proxy binary"""
try:
result = subprocess.run(
[str(self.neon_binpath / "local_proxy"), "--version"],
capture_output=True,
text=True,
timeout=10,
)
return result.stdout.strip()
except (subprocess.TimeoutExpired, subprocess.CalledProcessError):
return ""

@backoff.on_exception(backoff.expo, requests.exceptions.RequestException, max_time=10)
def _wait_until_ready(self):
assert self._popen and self._popen.poll() is None, (
"Local proxy exited unexpectedly. Check test log."
)
requests.get(f"http://{self.host}:{self.http_port}/metrics")

def get_metrics(self) -> str:
response = requests.get(f"http://{self.host}:{self.metrics_port}/metrics")
return response.text

def assert_no_errors(self):
# Define allowed error patterns for local_proxy
allowed_errors = [
# Add patterns as needed
]
not_allowed = [
"error",
"panic",
"failed",
]

for na in not_allowed:
if na not in allowed_errors:
assert not self.log_contains(na), f"Found disallowed error pattern: {na}"

def __enter__(self) -> Self:
return self

def __exit__(
self,
exc_type: type[BaseException] | None,
exc_value: BaseException | None,
traceback: TracebackType | None,
):
self.stop()
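A minimal sketch of how the class is meant to be driven (the `local_proxy` fixture further below does essentially this; the port numbers here are illustrative):

    with NeonLocalProxy(
        neon_binpath=neon_binpath,
        test_output_dir=test_output_dir,
        http_port=10800,
        metrics_port=10801,
        vanilla_pg=vanilla_pg,
    ) as proxy:
        proxy.start()               # spawns local_proxy and waits for /metrics to answer
        print(proxy.get_metrics())  # Prometheus text from the metrics port
    # __exit__ calls stop(), which terminates (or kills) the child process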


class NeonRestBrokerProxy(LogUtils):
"""
An object managing a proxy instance configured as both auth broker and rest broker.
This is the main proxy binary with --is-auth-broker and --is-rest-broker flags.
"""

def __init__(
self,
neon_binpath: Path,
test_output_dir: Path,
wss_port: int,
http_port: int,
mgmt_port: int,
config_path: Path | None = None,
):
self.neon_binpath = neon_binpath
self.test_output_dir = test_output_dir
self.wss_port = wss_port
self.http_port = http_port
self.mgmt_port = mgmt_port
self.config_path = config_path or (test_output_dir / "rest_broker_proxy.json")
self.host = "127.0.0.1"
self.running = False
self.logfile = test_output_dir / "rest_broker_proxy.log"
self._popen: subprocess.Popen[Any] | None = None

def start(self) -> Self:
if self.running:
return self

# Generate self-signed TLS certificates
cert_path = self.test_output_dir / "server.crt"
key_path = self.test_output_dir / "server.key"

if not cert_path.exists() or not key_path.exists():
import subprocess

log.info("Generating self-signed TLS certificate for rest broker")
subprocess.run(
[
"openssl",
"req",
"-new",
"-x509",
"-days",
"365",
"-nodes",
"-text",
"-out",
str(cert_path),
"-keyout",
str(key_path),
"-subj",
"/CN=*.local.neon.build",
],
check=True,
)

log.info(
f"Starting rest broker proxy on WSS port {self.wss_port}, HTTP port {self.http_port}"
)

cmd = [
str(self.neon_binpath / "proxy"),
"-c",
str(cert_path),
"-k",
str(key_path),
"--is-auth-broker",
"true",
"--is-rest-broker",
"true",
"--wss",
f"{self.host}:{self.wss_port}",
"--http",
f"{self.host}:{self.http_port}",
"--mgmt",
f"{self.host}:{self.mgmt_port}",
"--auth-backend",
"local",
"--config-path",
str(self.config_path),
]

log.info(f"Starting rest broker proxy with command: {' '.join(cmd)}")

with open(self.logfile, "w") as logfile:
self._popen = subprocess.Popen(
cmd,
stdout=logfile,
stderr=subprocess.STDOUT,
cwd=self.test_output_dir,
env={
**os.environ,
"RUST_LOG": "info",
"LOGFMT": "text",
"OTEL_SDK_DISABLED": "true",
},
)

self.running = True
self._wait_until_ready()
return self

def stop(self) -> Self:
if not self.running:
return self

log.info("Stopping rest broker proxy")

if self._popen is not None:
self._popen.terminate()
try:
self._popen.wait(timeout=10)
except subprocess.TimeoutExpired:
log.warning("failed to gracefully terminate rest broker proxy; killing")
self._popen.kill()

self.running = False
return self

def get_binary_version(self) -> str:
cmd = [str(self.neon_binpath / "proxy"), "--version"]
res = subprocess.run(cmd, capture_output=True, text=True, check=True)
return res.stdout.strip()

@backoff.on_exception(backoff.expo, requests.exceptions.RequestException, max_time=10)
def _wait_until_ready(self):
# Check if the WSS port is ready using a simple HTTPS request
# REST API is served on the WSS port with HTTPS
requests.get(f"https://{self.host}:{self.wss_port}/", timeout=1, verify=False)
# Any response (even error) means the server is up - we just need to connect

def get_metrics(self) -> str:
# Metrics are still on the HTTP port
response = requests.get(f"http://{self.host}:{self.http_port}/metrics", timeout=5)
response.raise_for_status()
return response.text

def assert_no_errors(self):
# Define allowed error patterns for rest broker proxy
allowed_errors = [
"connection closed before message completed",
"connection reset by peer",
"broken pipe",
"client disconnected",
"Authentication failed",
"connection timed out",
"no connection available",
"Pool dropped",
]

with open(self.logfile) as f:
for line in f:
if "ERROR" in line or "FATAL" in line:
if not any(allowed in line for allowed in allowed_errors):
raise AssertionError(
f"Found error in rest broker proxy log: {line.strip()}"
)

def __enter__(self) -> Self:
return self

def __exit__(
self,
exc_type: type[BaseException] | None,
exc_value: BaseException | None,
traceback: TracebackType | None,
):
self.stop()


@pytest.fixture(scope="function")
def link_proxy(
port_distributor: PortDistributor, neon_binpath: Path, test_output_dir: Path
@@ -4200,6 +4497,81 @@ def static_proxy(
yield proxy


@pytest.fixture(scope="function")
def local_proxy(
vanilla_pg: VanillaPostgres,
port_distributor: PortDistributor,
neon_binpath: Path,
test_output_dir: Path,
) -> Iterator[NeonLocalProxy]:
"""Local proxy that connects directly to vanilla postgres for rest broker testing."""

# Start vanilla_pg without database bootstrapping
vanilla_pg.start()

http_port = port_distributor.get_port()
metrics_port = port_distributor.get_port()

with NeonLocalProxy(
neon_binpath=neon_binpath,
test_output_dir=test_output_dir,
http_port=http_port,
metrics_port=metrics_port,
vanilla_pg=vanilla_pg,
) as proxy:
proxy.start()
yield proxy


@pytest.fixture(scope="function")
def local_proxy_fixed_port(
vanilla_pg: VanillaPostgres,
neon_binpath: Path,
test_output_dir: Path,
) -> Iterator[NeonLocalProxy]:
"""Local proxy that connects directly to vanilla postgres on the hardcoded port 7432."""

# Start vanilla_pg without database bootstrapping
vanilla_pg.start()

# Use the hardcoded port that the rest broker proxy expects
http_port = 7432
metrics_port = 7433  # Use a different port for metrics

with NeonLocalProxy(
neon_binpath=neon_binpath,
test_output_dir=test_output_dir,
http_port=http_port,
metrics_port=metrics_port,
vanilla_pg=vanilla_pg,
) as proxy:
proxy.start()
yield proxy


@pytest.fixture(scope="function")
def rest_broker_proxy(
port_distributor: PortDistributor,
neon_binpath: Path,
test_output_dir: Path,
) -> Iterator[NeonRestBrokerProxy]:
"""Rest broker proxy that handles both auth broker and rest broker functionality."""

wss_port = port_distributor.get_port()
http_port = port_distributor.get_port()
mgmt_port = port_distributor.get_port()

with NeonRestBrokerProxy(
neon_binpath=neon_binpath,
test_output_dir=test_output_dir,
wss_port=wss_port,
http_port=http_port,
mgmt_port=mgmt_port,
) as proxy:
proxy.start()
yield proxy


@pytest.fixture(scope="function")
def neon_authorize_jwk() -> jwk.JWK:
kid = str(uuid.uuid4())
@@ -4324,6 +4696,7 @@ class Endpoint(PgProtocol, LogUtils):
pageserver_id: int | None = None,
allow_multiple: bool = False,
update_catalog: bool = False,
privileged_role_name: str | None = None,
) -> Self:
"""
Create a new Postgres endpoint.
@@ -4351,6 +4724,7 @@ class Endpoint(PgProtocol, LogUtils):
pageserver_id=pageserver_id,
allow_multiple=allow_multiple,
update_catalog=update_catalog,
privileged_role_name=privileged_role_name,
)
path = Path("endpoints") / self.endpoint_id / "pgdata"
self.pgdata_dir = self.env.repo_dir / path
@@ -4800,6 +5174,7 @@ class EndpointFactory:
config_lines: list[str] | None = None,
pageserver_id: int | None = None,
update_catalog: bool = False,
privileged_role_name: str | None = None,
) -> Endpoint:
ep = Endpoint(
self.env,
@@ -4823,6 +5198,7 @@ class EndpointFactory:
config_lines=config_lines,
pageserver_id=pageserver_id,
update_catalog=update_catalog,
privileged_role_name=privileged_role_name,
)

def stop_all(self, fail_on_error=True) -> Self:
@@ -5417,6 +5793,7 @@ SKIP_FILES = frozenset(
"postmaster.pid",
"pg_control",
"pg_dynshmem",
"neon-communicator.socket",
)
)


@@ -152,6 +152,8 @@ DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [
".*reconciler.*neon_local error.*",
# Tenant rate limits may fire in tests that submit lots of API requests.
".*tenant \\S+ is rate limited.*",
# Reconciliations may get stuck/delayed e.g. in chaos tests.
".*background_reconcile: Shard reconciliation is stuck.*",
]


@@ -847,7 +847,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
return res_json

def timeline_lsn_lease(
self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, lsn: Lsn
self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, lsn: Lsn, **kwargs
):
data = {
"lsn": str(lsn),
@@ -857,6 +857,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
res = self.post(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/lsn_lease",
json=data,
**kwargs,
)
self.verbose_error(res)
res_json = res.json()
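A hedged sketch of what the new **kwargs enables (the timeout value and the `ps_http` client variable are illustrative, not part of this diff):

    # Extra keyword arguments are forwarded to self.post(), e.g. a per-call timeout:
    ps_http.timeline_lsn_lease(tenant_id, timeline_id, lsn, timeout=30)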

@@ -741,3 +741,29 @@ def shared_buffers_for_max_cu(max_cu: float) -> str:
sharedBuffersMb = int(max(128, (1023 + maxBackends * 256) / 1024))
sharedBuffers = int(sharedBuffersMb * 1024 / 8)
return str(sharedBuffers)
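For intuition, a worked example of the arithmetic above (maxBackends is derived from max_cu earlier in the function, which this hunk does not show; 100 is an assumed value):

    # maxBackends = 100 (assumed)
    # sharedBuffersMb = int(max(128, (1023 + 100 * 256) / 1024)) = int(max(128, 26.0)) = 128
    # sharedBuffers   = int(128 * 1024 / 8) = 16384   # returned in 8 kB pages, i.e. 128 MB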


def skip_if_proxy_lacks_rest_broker(reason: str = "proxy was built without 'rest_broker' feature"):
# Determine the binary path using the same logic as neon_binpath fixture
def has_rest_broker_feature():
# Find the neon binaries
if env_neon_bin := os.environ.get("NEON_BIN"):
binpath = Path(env_neon_bin)
else:
base_dir = Path(__file__).parents[2]  # Same as BASE_DIR in paths.py
build_type = os.environ.get("BUILD_TYPE", "debug")
binpath = base_dir / "target" / build_type

proxy_bin = binpath / "proxy"
if not proxy_bin.exists():
return False

try:
cmd = [str(proxy_bin), "--help"]
result = subprocess.run(cmd, capture_output=True, text=True, check=True, timeout=10)
help_output = result.stdout
return "--is-rest-broker" in help_output
except (subprocess.CalledProcessError, subprocess.TimeoutExpired, FileNotFoundError):
return False

return pytest.mark.skipif(not has_rest_broker_feature(), reason=reason)
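A typical way to apply the marker, mirroring how test_rest_broker.py below uses it (the test name and fixture list here are illustrative):

    @skip_if_proxy_lacks_rest_broker()
    def test_something_rest(rest_broker_proxy):
        ...  # only runs when `proxy --help` advertises --is-rest-broker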

@@ -73,6 +73,11 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
".*Local notification hook failed.*",
".*Marking shard.*for notification retry.*",
".*Failed to notify compute.*",
# As an optimization, the storage controller kicks the downloads on the secondary
# after the shard split. However, secondaries are created async, so it's possible
# that the intent state was modified, but the actual secondary hasn't been created,
# which results in an error.
".*Error calling secondary download after shard split.*",
]
)


@@ -711,6 +711,9 @@ def test_api_random(
# To not go to the past where pgbench tables do not exist
time.sleep(1)
project.min_time = datetime.now(UTC)
# To not go to the past where pgbench tables do not exist
time.sleep(1)
project.min_time = datetime.now(UTC)
for _ in range(num_operations):
log.info("Starting action #%s", _ + 1)
while not do_action(

@@ -24,10 +24,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder):
[
".*get_values_reconstruct_data for layer .*",
".*could not find data for key.*",
".*is not active. Current state: Broken.*",
".*will not become active. Current state: Broken.*",
".*failed to load metadata.*",
".*load failed.*load local timeline.*",
".*: layer load failed, assuming permanent failure:.*",
".*failed to get checkpoint bytes.*",
".*failed to get control bytes.*",

test_runner/regress/test_communicator_metrics_exporter.py (new file, 54 lines)
@@ -0,0 +1,54 @@
from __future__ import annotations

import os
from typing import TYPE_CHECKING

import pytest
import requests
import requests_unixsocket  # type: ignore [import-untyped]
from fixtures.metrics import parse_metrics

if TYPE_CHECKING:
from fixtures.neon_fixtures import NeonEnv

NEON_COMMUNICATOR_SOCKET_NAME = "neon-communicator.socket"


def test_communicator_metrics(neon_simple_env: NeonEnv):
"""
Test the communicator's built-in HTTP prometheus exporter
"""
env = neon_simple_env

endpoint = env.endpoints.create("main")
endpoint.start()

# Change current directory to the data directory, so that we can use
# a short relative path to refer to the socket. (There's a 100 char
# limitation on the path.)
os.chdir(str(endpoint.pgdata_dir))
session = requests_unixsocket.Session()
r = session.get(f"http+unix://{NEON_COMMUNICATOR_SOCKET_NAME}/metrics")
assert r.status_code == 200, f"got response {r.status_code}: {r.text}"

# quick test that the endpoint returned something expected. (We don't validate
# that the metrics returned are sensible.)
m = parse_metrics(r.text)
m.query_one("lfc_hits")
m.query_one("lfc_misses")

# Test panic handling. The /debug/panic endpoint raises a Rust panic. It's
# expected to unwind and drop the HTTP connection without response, but not
# kill the process or the server.
with pytest.raises(
requests.ConnectionError, match="Remote end closed connection without response"
):
r = session.get(f"http+unix://{NEON_COMMUNICATOR_SOCKET_NAME}/debug/panic")
assert r.status_code == 500

# Test that subsequent requests after the panic still work.
r = session.get(f"http+unix://{NEON_COMMUNICATOR_SOCKET_NAME}/metrics")
assert r.status_code == 200, f"got response {r.status_code}: {r.text}"
m = parse_metrics(r.text)
m.query_one("lfc_hits")
m.query_one("lfc_misses")

@@ -687,7 +687,7 @@ def test_sharding_compaction(
for _i in range(0, 10):
# Each of these does some writes then a checkpoint: because we set image_creation_threshold to 1,
# these should result in image layers each time we write some data into a shard, and also shards
# recieving less data hitting their "empty image layer" path (wherre they should skip writing the layer,
# receiving less data hitting their "empty image layer" path (where they should skip writing the layer,
# rather than asserting)
workload.churn_rows(64)


@@ -187,19 +187,21 @@ def test_create_snapshot(
env.pageserver.stop()
env.storage_controller.stop()

# Directory `compatibility_snapshot_dir` is uploaded to S3 in a workflow, keep the name in sync with it
compatibility_snapshot_dir = (
# Directory `new_compatibility_snapshot_dir` is uploaded to S3 in a workflow, keep the name in sync with it
new_compatibility_snapshot_dir = (
top_output_dir / f"compatibility_snapshot_pg{pg_version.v_prefixed}"
)
if compatibility_snapshot_dir.exists():
shutil.rmtree(compatibility_snapshot_dir)
if new_compatibility_snapshot_dir.exists():
shutil.rmtree(new_compatibility_snapshot_dir)

shutil.copytree(
test_output_dir,
compatibility_snapshot_dir,
ignore=shutil.ignore_patterns("pg_dynshmem"),
new_compatibility_snapshot_dir,
ignore=shutil.ignore_patterns("pg_dynshmem", "neon-communicator.socket"),
)

log.info(f"Copied new compatibility snapshot dir to: {new_compatibility_snapshot_dir}")


# check_neon_works does recovery from WAL => the compatibility snapshot's WAL is old => will log this warning
ingest_lag_log_line = ".*ingesting record with timestamp lagging more than wait_lsn_timeout.*"
@@ -218,6 +220,7 @@ def test_backward_compatibility(
"""
Test that the new binaries can read old data
"""
log.info(f"Using snapshot dir at {compatibility_snapshot_dir}")
neon_env_builder.num_safekeepers = 3
env = neon_env_builder.from_repo_dir(compatibility_snapshot_dir / "repo")
env.pageserver.allowed_errors.append(ingest_lag_log_line)
@@ -242,7 +245,6 @@ def test_forward_compatibility(
test_output_dir: Path,
top_output_dir: Path,
pg_version: PgVersion,
compatibility_snapshot_dir: Path,
compute_reconfigure_listener: ComputeReconfigure,
):
"""
@@ -266,8 +268,14 @@ def test_forward_compatibility(
neon_env_builder.neon_binpath = neon_env_builder.compatibility_neon_binpath
neon_env_builder.pg_distrib_dir = neon_env_builder.compatibility_pg_distrib_dir

# Note that we are testing with new data, so we should use `new_compatibility_snapshot_dir`, which is created by test_create_snapshot.
new_compatibility_snapshot_dir = (
top_output_dir / f"compatibility_snapshot_pg{pg_version.v_prefixed}"
)

log.info(f"Using snapshot dir at {new_compatibility_snapshot_dir}")
env = neon_env_builder.from_repo_dir(
compatibility_snapshot_dir / "repo",
new_compatibility_snapshot_dir / "repo",
)
# there may be an arbitrary number of unrelated tests run between create_snapshot and here
env.pageserver.allowed_errors.append(ingest_lag_log_line)
@@ -296,7 +304,7 @@ def test_forward_compatibility(
check_neon_works(
env,
test_output_dir=test_output_dir,
sql_dump_path=compatibility_snapshot_dir / "dump.sql",
sql_dump_path=new_compatibility_snapshot_dir / "dump.sql",
repo_dir=env.repo_dir,
)


@@ -50,11 +50,15 @@ def test_feature_flag(neon_env_builder: NeonEnvBuilder):
)["result"]
)

env.endpoints.create_start("main")  # trigger basebackup
env.pageserver.http_client().force_refresh_feature_flag(env.initial_tenant)

# Check if the properties exist
result = env.pageserver.http_client().evaluate_feature_flag_multivariate(
env.initial_tenant, "test-feature-flag"
)

assert "tenant_remote_size_mb" in result["properties"]
assert "tenant_db_count_max" in result["properties"]
assert "tenant_rel_count_max" in result["properties"]
assert "tenant_id" in result["properties"]

test_runner/regress/test_hcc_handling_ps_data_loss.py (new file, 47 lines)
@@ -0,0 +1,47 @@
import shutil

from fixtures.neon_fixtures import NeonEnvBuilder
from fixtures.utils import query_scalar


def test_hcc_handling_ps_data_loss(
neon_env_builder: NeonEnvBuilder,
):
"""
Test that following a pageserver local data loss event, the system can recover automatically (i.e.
rehydrating the restarted pageserver from remote storage) without manual intervention. The
pageserver indicates to the storage controller that it has restarted without any local tenant
data in its "reattach" request and the storage controller uses this information to detect the
data loss condition and reconfigure the pageserver as necessary.
"""
env = neon_env_builder.init_configs()
env.broker.start()
env.storage_controller.start(handle_ps_local_disk_loss=True)
env.pageserver.start()
for sk in env.safekeepers:
sk.start()

# create new nenant
tenant_id, _ = env.create_tenant(shard_count=4)

endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
with endpoint.cursor() as cur:
cur.execute("SELECT pg_logical_emit_message(false, 'neon-test', 'between inserts')")
cur.execute("CREATE DATABASE testdb")

with endpoint.cursor(dbname="testdb") as cur:
cur.execute("CREATE TABLE tbl_one_hundred_rows AS SELECT generate_series(1,100)")
endpoint.stop()

# Kill the pageserver, remove the `tenants/` directory, and restart. This simulates a pageserver
# that restarted with the same ID but has lost all its local disk data.
env.pageserver.stop(immediate=True)
shutil.rmtree(env.pageserver.tenant_dir())
env.pageserver.start()

# Test that the endpoint can start and query the database after the pageserver restarts. This
# indirectly tests that the pageserver was able to rehydrate the tenant data it lost from remote
# storage automatically.
endpoint.start()
with endpoint.cursor(dbname="testdb") as cur:
assert query_scalar(cur, "SELECT count(*) FROM tbl_one_hundred_rows") == 100

@@ -6,6 +6,7 @@ from typing import TYPE_CHECKING

import pytest
from fixtures.log_helper import log
from fixtures.metrics import parse_metrics
from fixtures.utils import USE_LFC, query_scalar

if TYPE_CHECKING:
@@ -75,10 +76,24 @@ WITH (fillfactor='100');
cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid = 104242")
cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid = 204242")
# verify working set size after some index access of a few select pages only
blocks = query_scalar(cur, "select approximate_working_set_size(true)")
blocks = query_scalar(cur, "select approximate_working_set_size(false)")
log.info(f"working set size after some index access of a few select pages only {blocks}")
assert blocks < 20

# Also test the metrics from the /autoscaling_metrics endpoint
autoscaling_metrics = endpoint.http_client().autoscaling_metrics()
log.debug(f"Raw metrics: {autoscaling_metrics}")
m = parse_metrics(autoscaling_metrics)

http_estimate = m.query_one(
"lfc_approximate_working_set_size_windows",
{
"duration_seconds": "60",
},
).value
log.info(f"http estimate: {http_estimate}, blocks: {blocks}")
assert http_estimate > 0 and http_estimate < 20


@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
def test_sliding_working_set_approximation(neon_simple_env: NeonEnv):

@@ -103,3 +103,90 @@ def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion):
query = "DROP SUBSCRIPTION sub CASCADE"
log.info(f"Dropping subscription: {query}")
cur.execute(query)


def test_privileged_role_override(neon_simple_env: NeonEnv, pg_version: PgVersion):
"""
Test that we can override the privileged role for an endpoint and when we do it,
everything is correctly bootstrapped inside Postgres and we don't have neon_superuser
role in the database.
"""
PRIVILEGED_ROLE_NAME = "my_superuser"

env = neon_simple_env
env.create_branch("test_privileged_role_override")
ep = env.endpoints.create(
"test_privileged_role_override",
privileged_role_name=PRIVILEGED_ROLE_NAME,
update_catalog=True,
)

ep.start()

ep.wait_for_migrations()

member_roles = [
"pg_read_all_data",
"pg_write_all_data",
"pg_monitor",
"pg_signal_backend",
]

non_member_roles = [
"pg_execute_server_program",
"pg_read_server_files",
"pg_write_server_files",
]

role_attributes = {
"rolsuper": False,
"rolinherit": True,
"rolcreaterole": True,
"rolcreatedb": True,
"rolcanlogin": False,
"rolreplication": True,
"rolconnlimit": -1,
"rolbypassrls": True,
}

if pg_version >= PgVersion.V15:
non_member_roles.append("pg_checkpoint")

if pg_version >= PgVersion.V16:
member_roles.append("pg_create_subscription")
non_member_roles.append("pg_use_reserved_connections")

with ep.cursor() as cur:
cur.execute(f"SELECT rolname FROM pg_roles WHERE rolname = '{PRIVILEGED_ROLE_NAME}'")
assert cur.fetchall()[0][0] == PRIVILEGED_ROLE_NAME

cur.execute("SELECT rolname FROM pg_roles WHERE rolname = 'neon_superuser'")
assert len(cur.fetchall()) == 0

cur.execute("SHOW neon.privileged_role_name")
assert cur.fetchall()[0][0] == PRIVILEGED_ROLE_NAME

# check PRIVILEGED_ROLE_NAME role is created
cur.execute(f"select * from pg_roles where rolname = '{PRIVILEGED_ROLE_NAME}'")
assert cur.fetchone() is not None

# check PRIVILEGED_ROLE_NAME role has the correct member roles
for role in member_roles:
cur.execute(f"SELECT pg_has_role('{PRIVILEGED_ROLE_NAME}', '{role}', 'member')")
assert cur.fetchone() == (True,), (
f"Role {role} should be a member of {PRIVILEGED_ROLE_NAME}"
)

for role in non_member_roles:
cur.execute(f"SELECT pg_has_role('{PRIVILEGED_ROLE_NAME}', '{role}', 'member')")
assert cur.fetchone() == (False,), (
f"Role {role} should not be a member of {PRIVILEGED_ROLE_NAME}"
)

# check PRIVILEGED_ROLE_NAME role has the correct role attributes
for attr, val in role_attributes.items():
cur.execute(f"SELECT {attr} FROM pg_roles WHERE rolname = '{PRIVILEGED_ROLE_NAME}'")
curr_val = cur.fetchone()
assert curr_val == (val,), (
f"Role attribute {attr} should be {val} instead of {curr_val}"
)

@@ -246,9 +246,9 @@ def test_total_size_limit(neon_env_builder: NeonEnvBuilder):

system_memory = psutil.virtual_memory().total

# The smallest total size limit we can configure is 1/1024th of the system memory (e.g. 128MB on
# a system with 128GB of RAM). We will then write enough data to violate this limit.
max_dirty_data = 128 * 1024 * 1024
# The smallest total size limit we can configure is 1/1024th of the system memory (e.g. 256MB on
# a system with 256GB of RAM). We will then write enough data to violate this limit.
max_dirty_data = 256 * 1024 * 1024
ephemeral_bytes_per_memory_kb = (max_dirty_data * 1024) // system_memory
assert ephemeral_bytes_per_memory_kb > 0

@@ -272,7 +272,7 @@ def test_total_size_limit(neon_env_builder: NeonEnvBuilder):
timeline_count = 10

# This is about 2MiB of data per timeline
entries_per_timeline = 100_000
entries_per_timeline = 200_000

last_flush_lsns = asyncio.run(workload(env, tenant_conf, timeline_count, entries_per_timeline))
wait_until_pageserver_is_caught_up(env, last_flush_lsns)

@@ -3,6 +3,7 @@
#
from __future__ import annotations

import time
from concurrent.futures import ThreadPoolExecutor
from typing import TYPE_CHECKING, Any, cast

@@ -356,6 +357,81 @@ def test_sql_regress(
post_checks(env, test_output_dir, DBNAME, endpoint)


def test_max_wal_rate(neon_simple_env: NeonEnv):
"""
Test the databricks.max_wal_mb_per_second GUC and how it affects WAL rate
limiting.
"""
env = neon_simple_env

DBNAME = "regression"
superuser_name = "databricks_superuser"

# Connect to postgres and create a database called "regression".
endpoint = env.endpoints.create_start(
"main",
config_lines=[
# we need this option because default max_cluster_size < 0 will disable throttling completely
"neon.max_cluster_size=10GB",
],
)

endpoint.safe_psql_many(
[
f"CREATE ROLE {superuser_name}",
f"CREATE DATABASE {DBNAME}",
"CREATE EXTENSION neon",
]
)

endpoint.safe_psql("CREATE TABLE usertable (YCSB_KEY INT, FIELD0 TEXT);", dbname=DBNAME)

# Write ~1 MB data.
with endpoint.cursor(dbname=DBNAME) as cur:
for _ in range(0, 1000):
cur.execute("INSERT INTO usertable SELECT random(), repeat('a', 1000);")

# No backpressure
tuples = endpoint.safe_psql("SELECT backpressure_throttling_time();")
assert tuples[0][0] == 0, "Backpressure throttling detected"

# 0 MB/s max_wal_rate. WAL proposer can still push some WALs but will be super slow.
endpoint.safe_psql_many(
[
"ALTER SYSTEM SET databricks.max_wal_mb_per_second = 0;",
"SELECT pg_reload_conf();",
]
)

# Write ~10 KB data should hit backpressure.
with endpoint.cursor(dbname=DBNAME) as cur:
cur.execute("SET databricks.max_wal_mb_per_second = 0;")
for _ in range(0, 10):
cur.execute("INSERT INTO usertable SELECT random(), repeat('a', 1000);")

tuples = endpoint.safe_psql("SELECT backpressure_throttling_time();")
assert tuples[0][0] > 0, "No backpressure throttling detected"

# 1 MB/s max_wal_rate.
endpoint.safe_psql_many(
[
"ALTER SYSTEM SET databricks.max_wal_mb_per_second = 1;",
"SELECT pg_reload_conf();",
]
)

# Write 10 MB data.
with endpoint.cursor(dbname=DBNAME) as cur:
start = int(time.time())
for _ in range(0, 10000):
cur.execute("INSERT INTO usertable SELECT random(), repeat('a', 1000);")

end = int(time.time())
assert end - start >= 10, (
"Throttling should cause the previous inserts to take greater than or equal to 10 seconds"
)


@skip_in_debug_build("only run with release build")
@pytest.mark.parametrize("reldir_type", ["v1", "v2"])
def test_tx_abort_with_many_relations(

test_runner/regress/test_rest_broker.py (new file, 137 lines)
@@ -0,0 +1,137 @@
|
||||
import json
|
||||
import signal
|
||||
import time
|
||||
|
||||
import requests
|
||||
from fixtures.utils import skip_if_proxy_lacks_rest_broker
|
||||
from jwcrypto import jwt
|
||||
|
||||
|
||||
@skip_if_proxy_lacks_rest_broker()
|
||||
def test_rest_broker_happy(
|
||||
local_proxy_fixed_port, rest_broker_proxy, vanilla_pg, neon_authorize_jwk, httpserver
|
||||
):
|
||||
"""Test REST API endpoint using local_proxy and rest_broker_proxy."""
|
||||
|
||||
# Use the fixed port local proxy
|
||||
local_proxy = local_proxy_fixed_port
|
||||
|
||||
# Create the required roles for PostgREST authentication
|
||||
vanilla_pg.safe_psql("CREATE ROLE authenticator LOGIN")
|
||||
vanilla_pg.safe_psql("CREATE ROLE authenticated")
|
||||
vanilla_pg.safe_psql("CREATE ROLE anon")
|
||||
vanilla_pg.safe_psql("GRANT authenticated TO authenticator")
|
||||
vanilla_pg.safe_psql("GRANT anon TO authenticator")
|
||||
|
||||
# Create the pgrst schema and configuration function required by the rest broker
|
||||
vanilla_pg.safe_psql("CREATE SCHEMA IF NOT EXISTS pgrst")
|
||||
vanilla_pg.safe_psql("""
|
||||
CREATE OR REPLACE FUNCTION pgrst.pre_config()
|
||||
RETURNS VOID AS $$
|
||||
SELECT
|
||||
set_config('pgrst.db_schemas', 'test', true)
|
||||
, set_config('pgrst.db_aggregates_enabled', 'true', true)
|
||||
, set_config('pgrst.db_anon_role', 'anon', true)
|
||||
, set_config('pgrst.jwt_aud', '', true)
|
||||
, set_config('pgrst.jwt_secret', '', true)
|
||||
, set_config('pgrst.jwt_role_claim_key', '."role"', true)
|
||||
|
||||
$$ LANGUAGE SQL;
|
||||
""")
|
||||
vanilla_pg.safe_psql("GRANT USAGE ON SCHEMA pgrst TO authenticator")
|
||||
vanilla_pg.safe_psql("GRANT EXECUTE ON ALL FUNCTIONS IN SCHEMA pgrst TO authenticator")
|
||||
|
||||
# Bootstrap the database with test data
|
||||
vanilla_pg.safe_psql("CREATE SCHEMA IF NOT EXISTS test")
|
||||
vanilla_pg.safe_psql("""
|
||||
CREATE TABLE IF NOT EXISTS test.items (
|
||||
id SERIAL PRIMARY KEY,
|
||||
name TEXT NOT NULL
|
||||
)
|
||||
""")
|
||||
vanilla_pg.safe_psql("INSERT INTO test.items (name) VALUES ('test_item')")
|
||||
|
||||
# Grant access to the test schema for the authenticated role
|
||||
vanilla_pg.safe_psql("GRANT USAGE ON SCHEMA test TO authenticated")
|
||||
vanilla_pg.safe_psql("GRANT SELECT ON ALL TABLES IN SCHEMA test TO authenticated")
|
||||
|
||||
# Set up HTTP server to serve JWKS (like static_auth_broker)
|
||||
# Generate public key from the JWK
|
||||
public_key = neon_authorize_jwk.export_public(as_dict=True)
|
||||
|
||||
# Set up the httpserver to serve the JWKS
|
||||
httpserver.expect_request("/.well-known/jwks.json").respond_with_json({"keys": [public_key]})
|
||||
|
||||
# Create JWKS configuration for the rest broker proxy
|
||||
jwks_config = {
|
||||
"jwks": [
|
||||
{
|
||||
"id": "1",
|
||||
"role_names": ["authenticator", "authenticated", "anon"],
|
||||
"jwks_url": httpserver.url_for("/.well-known/jwks.json"),
|
||||
"provider_name": "foo",
|
||||
"jwt_audience": None,
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
# Write the JWKS config to the config file that rest_broker_proxy expects
|
||||
config_file = rest_broker_proxy.config_path
|
||||
with open(config_file, "w") as f:
|
||||
json.dump(jwks_config, f)
|
||||
|
||||
# Write the same config to the local_proxy config file
|
||||
local_config_file = local_proxy.config_path
|
||||
with open(local_config_file, "w") as f:
|
||||
json.dump(jwks_config, f)
|
||||
|
||||
# Signal both proxies to reload their config
|
||||
if rest_broker_proxy._popen is not None:
|
||||
rest_broker_proxy._popen.send_signal(signal.SIGHUP)
|
||||
if local_proxy._popen is not None:
|
||||
local_proxy._popen.send_signal(signal.SIGHUP)
|
||||
# Wait a bit for config to reload
|
||||
time.sleep(0.5)
|
||||
|
||||
# Generate a proper JWT token using the JWK (similar to test_auth_broker.py)
|
||||
token = jwt.JWT(
|
||||
header={"kid": neon_authorize_jwk.key_id, "alg": "RS256"},
|
||||
claims={
|
||||
"sub": "user",
|
||||
"role": "authenticated", # role that's in role_names
|
||||
"exp": 9999999999, # expires far in the future
|
||||
"iat": 1000000000, # issued at
|
||||
},
|
||||
)
|
||||
token.make_signed_token(neon_authorize_jwk)
|
||||
|
||||
# Debug: Print the JWT claims and config for troubleshooting
|
||||
print(f"JWT claims: {token.claims}")
|
||||
print(f"JWT header: {token.header}")
|
||||
print(f"Config file contains: {jwks_config}")
|
||||
print(f"Public key kid: {public_key.get('kid')}")
|
||||
|
||||
# Test REST API call - following SUBZERO.md pattern
|
||||
# REST API is served on the WSS port with HTTPS and includes database name
|
||||
# ep-purple-glitter-adqior4l-pooler.c-2.us-east-1.aws.neon.tech
|
||||
url = f"https://foo.apirest.c-2.local.neon.build:{rest_broker_proxy.wss_port}/postgres/rest/v1/items"
|
||||
|
||||
response = requests.get(
|
||||
url,
|
||||
headers={
|
||||
"Authorization": f"Bearer {token.serialize()}",
|
||||
},
|
||||
params={"id": "eq.1", "select": "name"},
|
||||
verify=False, # Skip SSL verification for self-signed certs
|
||||
)
|
||||
|
||||
print(f"Response status: {response.status_code}")
|
||||
print(f"Response headers: {response.headers}")
|
||||
print(f"Response body: {response.text}")
|
||||
|
||||
# For now, let's just check that we get some response
|
||||
# We can refine the assertions once we see what the actual response looks like
|
||||
assert response.status_code in [200] # Any response means the proxies are working
|
||||
|
||||
# check the response body
|
||||
assert response.json() == [{"name": "test_item"}]
|
||||
@@ -3,11 +3,22 @@ from __future__ import annotations
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import pytest
|
||||
import requests
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import StorageControllerApiException
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from fixtures.neon_fixtures import NeonEnvBuilder
|
||||
|
||||
# TODO(diko): pageserver spams with various errors during safekeeper migration.
|
||||
# Fix the code so it handles the migration better.
|
||||
ALLOWED_PAGESERVER_ERRORS = [
|
||||
".*Timeline .* was cancelled and cannot be used anymore.*",
|
||||
".*Timeline .* has been deleted.*",
|
||||
".*Timeline .* was not found in global map.*",
|
||||
".*wal receiver task finished with an error.*",
|
||||
]
|
||||
|
||||
|
||||
def test_safekeeper_migration_simple(neon_env_builder: NeonEnvBuilder):
|
||||
"""
|
||||
@@ -24,16 +35,7 @@ def test_safekeeper_migration_simple(neon_env_builder: NeonEnvBuilder):
|
||||
"timeline_safekeeper_count": 1,
|
||||
}
|
||||
env = neon_env_builder.init_start()
|
||||
# TODO(diko): pageserver spams with various errors during safekeeper migration.
|
||||
# Fix the code so it handles the migration better.
|
||||
env.pageserver.allowed_errors.extend(
|
||||
[
|
||||
".*Timeline .* was cancelled and cannot be used anymore.*",
|
||||
".*Timeline .* has been deleted.*",
|
||||
".*Timeline .* was not found in global map.*",
|
||||
".*wal receiver task finished with an error.*",
|
||||
]
|
||||
)
|
||||
env.pageserver.allowed_errors.extend(ALLOWED_PAGESERVER_ERRORS)
|
||||
|
||||
ep = env.endpoints.create("main", tenant_id=env.initial_tenant)
|
||||
|
||||
@@ -42,15 +44,23 @@ def test_safekeeper_migration_simple(neon_env_builder: NeonEnvBuilder):
|
||||
assert len(mconf["sk_set"]) == 1
|
||||
assert mconf["generation"] == 1
|
||||
|
||||
current_sk = mconf["sk_set"][0]
|
||||
|
||||
ep.start(safekeeper_generation=1, safekeepers=mconf["sk_set"])
|
||||
ep.safe_psql("CREATE EXTENSION neon_test_utils;")
|
||||
ep.safe_psql("CREATE TABLE t(a int)")
|
||||
|
||||
expected_gen = 1
|
||||
|
||||
for active_sk in range(1, 4):
|
||||
env.storage_controller.migrate_safekeepers(
|
||||
env.initial_tenant, env.initial_timeline, [active_sk]
|
||||
)
|
||||
|
||||
if active_sk != current_sk:
|
||||
expected_gen += 2
|
||||
current_sk = active_sk
|
||||
|
||||
other_sks = [sk for sk in range(1, 4) if sk != active_sk]
|
||||
|
||||
for sk in other_sks:
|
||||
@@ -65,9 +75,6 @@ def test_safekeeper_migration_simple(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
assert ep.safe_psql("SELECT * FROM t") == [(i,) for i in range(1, 4)]
|
||||
|
||||
# 1 initial generation + 2 migrations on each loop iteration.
|
||||
expected_gen = 1 + 2 * 3
|
||||
|
||||
mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline)
|
||||
assert mconf["generation"] == expected_gen
|
||||
|
||||
@@ -113,3 +120,79 @@ def test_new_sk_set_validation(neon_env_builder: NeonEnvBuilder):
|
||||
env.storage_controller.safekeeper_scheduling_policy(decom_sk, "Decomissioned")
|
||||
|
||||
expect_fail([sk_set[0], decom_sk], "decomissioned")
|
||||
|
||||
|
||||
def test_safekeeper_migration_common_set_failpoints(neon_env_builder: NeonEnvBuilder):
|
||||
"""
|
||||
Test that safekeeper migration handles failures well.
|
||||
|
||||
Two main conditions are checked:
|
||||
1. safekeeper migration handler can be retried on different failures.
|
||||
2. writes do not stuck if sk_set and new_sk_set have a quorum in common.
|
||||
"""
|
||||
neon_env_builder.num_safekeepers = 4
|
||||
neon_env_builder.storage_controller_config = {
|
||||
"timelines_onto_safekeepers": True,
|
||||
"timeline_safekeeper_count": 3,
|
||||
}
|
||||
env = neon_env_builder.init_start()
|
||||
env.pageserver.allowed_errors.extend(ALLOWED_PAGESERVER_ERRORS)
|
||||
|
||||
mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline)
|
||||
assert len(mconf["sk_set"]) == 3
|
||||
assert mconf["generation"] == 1
|
||||
|
||||
ep = env.endpoints.create("main", tenant_id=env.initial_tenant)
|
||||
ep.start(safekeeper_generation=1, safekeepers=mconf["sk_set"])
|
||||
ep.safe_psql("CREATE EXTENSION neon_test_utils;")
|
||||
ep.safe_psql("CREATE TABLE t(a int)")
|
||||
|
||||
excluded_sk = mconf["sk_set"][-1]
|
||||
added_sk = [sk.id for sk in env.safekeepers if sk.id not in mconf["sk_set"]][0]
|
||||
new_sk_set = mconf["sk_set"][:-1] + [added_sk]
|
||||
log.info(f"migrating sk set from {mconf['sk_set']} to {new_sk_set}")
|
||||
|
||||
failpoints = [
|
||||
"sk-migration-after-step-3",
|
||||
"sk-migration-after-step-4",
|
||||
"sk-migration-after-step-5",
|
||||
"sk-migration-after-step-7",
|
||||
"sk-migration-after-step-8",
|
||||
"sk-migration-step-9-after-set-membership",
|
||||
"sk-migration-step-9-mid-exclude",
|
||||
"sk-migration-step-9-after-exclude",
|
||||
"sk-migration-after-step-9",
|
||||
]
|
||||
|
||||
for i, fp in enumerate(failpoints):
|
||||
env.storage_controller.configure_failpoints((fp, "return(1)"))
|
||||
|
||||
with pytest.raises(StorageControllerApiException, match=f"failpoint {fp}"):
|
||||
env.storage_controller.migrate_safekeepers(
|
||||
env.initial_tenant, env.initial_timeline, new_sk_set
|
||||
)
|
||||
ep.safe_psql(f"INSERT INTO t VALUES ({i})")
|
||||
|
||||
env.storage_controller.configure_failpoints((fp, "off"))
|
||||
|
||||
# No failpoints, migration should succeed.
|
||||
env.storage_controller.migrate_safekeepers(env.initial_tenant, env.initial_timeline, new_sk_set)
|
||||
|
||||
mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline)
|
||||
assert mconf["new_sk_set"] is None
|
||||
assert mconf["sk_set"] == new_sk_set
|
||||
assert mconf["generation"] == 3
|
||||
|
||||
ep.clear_buffers()
|
||||
assert ep.safe_psql("SELECT * FROM t") == [(i,) for i in range(len(failpoints))]
|
||||
assert ep.safe_psql("SHOW neon.safekeepers")[0][0].startswith("g#3:")
|
||||
|
||||
# Check that we didn't forget to remove the timeline on the excluded safekeeper.
|
||||
with pytest.raises(requests.exceptions.HTTPError) as exc:
|
||||
env.safekeepers[excluded_sk - 1].http_client().timeline_status(
|
||||
env.initial_tenant, env.initial_timeline
|
||||
)
|
||||
assert exc.value.response.status_code == 404
|
||||
assert (
|
||||
f"timeline {env.initial_tenant}/{env.initial_timeline} deleted" in exc.value.response.text
|
||||
)
|
||||
|
||||
@@ -1810,6 +1810,8 @@ def test_sharding_backpressure(neon_env_builder: NeonEnvBuilder):
|
||||
"config_lines": [
|
||||
# Tip: set to 100MB to make the test fail
|
||||
"max_replication_write_lag=1MB",
|
||||
# Hadron: Need to set max_cluster_size to some value to enable any backpressure at all.
|
||||
"neon.max_cluster_size=1GB",
|
||||
],
|
||||
# We need `neon` extension for calling backpressure functions,
|
||||
# this flag instructs `compute_ctl` to pre-install it.
|
||||
|
||||
@@ -12,7 +12,7 @@ from typing import TYPE_CHECKING
|
||||
import fixtures.utils
|
||||
import pytest
|
||||
from fixtures.auth_tokens import TokenScope
|
||||
from fixtures.common_types import TenantId, TenantShardId, TimelineId
|
||||
from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import (
|
||||
DEFAULT_AZ_ID,
|
||||
@@ -47,6 +47,7 @@ from fixtures.utils import (
|
||||
wait_until,
|
||||
)
|
||||
from fixtures.workload import Workload
|
||||
from requests.adapters import HTTPAdapter
|
||||
from urllib3 import Retry
|
||||
from werkzeug.wrappers.response import Response
|
||||
|
||||
@@ -72,6 +73,12 @@ def get_node_shard_counts(env: NeonEnv, tenant_ids):
|
||||
return counts
|
||||
|
||||
|
||||
class DeletionAPIKind(Enum):
|
||||
OLD = "old"
|
||||
FORCE = "force"
|
||||
GRACEFUL = "graceful"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(**fixtures.utils.allpairs_versions())
|
||||
def test_storage_controller_smoke(
|
||||
neon_env_builder: NeonEnvBuilder, compute_reconfigure_listener: ComputeReconfigure, combination
|
||||
@@ -990,7 +997,7 @@ def test_storage_controller_compute_hook_retry(
|
||||
|
||||
|
||||
@run_only_on_default_postgres("postgres behavior is not relevant")
|
||||
def test_storage_controller_compute_hook_keep_failing(
|
||||
def test_storage_controller_compute_hook_stuck_reconciles(
|
||||
httpserver: HTTPServer,
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
httpserver_listen_address: ListenAddress,
|
||||
@@ -1040,7 +1047,7 @@ def test_storage_controller_compute_hook_keep_failing(
|
||||
env.storage_controller.allowed_errors.append(NOTIFY_BLOCKED_LOG)
|
||||
env.storage_controller.allowed_errors.extend(NOTIFY_FAILURE_LOGS)
|
||||
env.storage_controller.allowed_errors.append(".*Keeping extra secondaries.*")
|
||||
env.storage_controller.allowed_errors.append(".*Shard reconciliation is keep-failing.*")
|
||||
env.storage_controller.allowed_errors.append(".*Shard reconciliation is stuck.*")
|
||||
env.storage_controller.node_configure(banned_tenant_ps.id, {"availability": "Offline"})
|
||||
|
||||
# Migrate all allowed tenant shards to the first alive pageserver
|
||||
@@ -1055,7 +1062,7 @@ def test_storage_controller_compute_hook_keep_failing(
|
||||
|
||||
# Make some reconcile_all calls to trigger optimizations
|
||||
# RECONCILE_COUNT must be greater than storcon's MAX_CONSECUTIVE_RECONCILIATION_ERRORS
|
||||
RECONCILE_COUNT = 12
|
||||
RECONCILE_COUNT = 20
|
||||
for i in range(RECONCILE_COUNT):
|
||||
try:
|
||||
n = env.storage_controller.reconcile_all()
|
||||
@@ -1068,6 +1075,8 @@ def test_storage_controller_compute_hook_keep_failing(
|
||||
assert banned_descr["shards"][0]["is_pending_compute_notification"] is True
|
||||
time.sleep(2)
|
||||
|
||||
env.storage_controller.assert_log_contains(".*Shard reconciliation is stuck.*")
|
||||
|
||||
# Check that the allowed tenant shards are optimized due to affinity rules
|
||||
locations = alive_pageservers[0].http_client().tenant_list_locations()["tenant_shards"]
|
||||
not_optimized_shard_count = 0
@@ -2572,9 +2581,11 @@ def test_background_operation_cancellation(neon_env_builder: NeonEnvBuilder):


@pytest.mark.parametrize("while_offline", [True, False])
@pytest.mark.parametrize("deletion_api", [DeletionAPIKind.OLD, DeletionAPIKind.FORCE])
def test_storage_controller_node_deletion(
neon_env_builder: NeonEnvBuilder,
while_offline: bool,
deletion_api: DeletionAPIKind,
):
"""
Test that deleting a node works & properly reschedules everything that was on the node.
@@ -2598,6 +2609,8 @@ def test_storage_controller_node_deletion(
assert env.storage_controller.reconcile_all() == 0

victim = env.pageservers[-1]
if deletion_api == DeletionAPIKind.FORCE and not while_offline:
victim.allowed_errors.append(".*request was dropped before completing.*")

# The procedure a human would follow is:
# 1. Mark pageserver scheduling=pause
@@ -2621,7 +2634,12 @@ def test_storage_controller_node_deletion(
wait_until(assert_shards_migrated)

log.info(f"Deleting pageserver {victim.id}")
env.storage_controller.node_delete_old(victim.id)
if deletion_api == DeletionAPIKind.FORCE:
env.storage_controller.node_delete(victim.id, force=True)
elif deletion_api == DeletionAPIKind.OLD:
env.storage_controller.node_delete_old(victim.id)
else:
raise AssertionError(f"Invalid deletion API: {deletion_api}")

if not while_offline:

@@ -2634,7 +2652,15 @@ def test_storage_controller_node_deletion(
wait_until(assert_victim_evacuated)

# The node should be gone from the list API
assert victim.id not in [n["id"] for n in env.storage_controller.node_list()]
def assert_node_is_gone():
assert victim.id not in [n["id"] for n in env.storage_controller.node_list()]

if deletion_api == DeletionAPIKind.FORCE:
wait_until(assert_node_is_gone)
elif deletion_api == DeletionAPIKind.OLD:
assert_node_is_gone()
else:
raise AssertionError(f"Invalid deletion API: {deletion_api}")

# No tenants should refer to the node in their intent
for tenant_id in tenant_ids:
@@ -2656,7 +2682,11 @@ def test_storage_controller_node_deletion(
env.storage_controller.consistency_check()
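
Note: the deletion_api branching repeated in the tests above can be read as one small dispatch: FORCE goes through the new node_delete(force=True) endpoint, which (judging by the test) returns before the node is actually gone, while OLD is the legacy synchronous node_delete_old. A hypothetical helper, shown only as a sketch (delete_node is not an existing fixture):

def delete_node(storage_controller, node_id: int, deletion_api: DeletionAPIKind) -> None:
    # FORCE appears to be asynchronous (the request may be dropped before
    # completion), OLD is the legacy, synchronous API.
    if deletion_api == DeletionAPIKind.FORCE:
        storage_controller.node_delete(node_id, force=True)
    elif deletion_api == DeletionAPIKind.OLD:
        storage_controller.node_delete_old(node_id)
    else:
        raise AssertionError(f"Invalid deletion API: {deletion_api}")

This is also why the FORCE path polls with wait_until(assert_node_is_gone) while the OLD path asserts immediately.
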


def test_storage_controller_node_delete_cancellation(neon_env_builder: NeonEnvBuilder):
@pytest.mark.parametrize("deletion_api", [DeletionAPIKind.FORCE, DeletionAPIKind.GRACEFUL])
def test_storage_controller_node_delete_cancellation(
neon_env_builder: NeonEnvBuilder,
deletion_api: DeletionAPIKind,
):
neon_env_builder.num_pageservers = 3
neon_env_builder.num_azs = 3
env = neon_env_builder.init_configs()
@@ -2680,12 +2710,16 @@ def test_storage_controller_node_delete_cancellation(neon_env_builder: NeonEnvBu
assert len(nodes) == 3

env.storage_controller.configure_failpoints(("sleepy-delete-loop", "return(10000)"))
env.storage_controller.configure_failpoints(("delete-node-after-reconciles-spawned", "pause"))

ps_id_to_delete = env.pageservers[0].id

env.storage_controller.warm_up_all_secondaries()

assert deletion_api in [DeletionAPIKind.FORCE, DeletionAPIKind.GRACEFUL]
force = deletion_api == DeletionAPIKind.FORCE
env.storage_controller.retryable_node_operation(
lambda ps_id: env.storage_controller.node_delete(ps_id),
lambda ps_id: env.storage_controller.node_delete(ps_id, force),
ps_id_to_delete,
max_attempts=3,
backoff=2,
@@ -2701,6 +2735,8 @@ def test_storage_controller_node_delete_cancellation(neon_env_builder: NeonEnvBu

env.storage_controller.cancel_node_delete(ps_id_to_delete)

env.storage_controller.configure_failpoints(("delete-node-after-reconciles-spawned", "off"))

env.storage_controller.poll_node_status(
ps_id_to_delete,
PageserverAvailability.ACTIVE,
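
Note: the failpoint bracketing above (pause the delete loop, cancel the deletion, then switch the failpoint off) is a recurring pattern in these tests. A hypothetical convenience wrapper around the configure_failpoints call the test already uses, shown only as a sketch:

from contextlib import contextmanager

@contextmanager
def paused_failpoint(storage_controller, name: str):
    # Pause the named failpoint for the duration of the block, then always
    # clear it, even if the body raises.
    storage_controller.configure_failpoints((name, "pause"))
    try:
        yield
    finally:
        storage_controller.configure_failpoints((name, "off"))

Usage would look like: with paused_failpoint(env.storage_controller, "delete-node-after-reconciles-spawned"): issue and cancel the deletion inside the block.
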
@@ -3252,7 +3288,10 @@ def test_storage_controller_ps_restarted_during_drain(neon_env_builder: NeonEnvB
wait_until(reconfigure_node_again)


def test_ps_unavailable_after_delete(neon_env_builder: NeonEnvBuilder):
@pytest.mark.parametrize("deletion_api", [DeletionAPIKind.OLD, DeletionAPIKind.FORCE])
def test_ps_unavailable_after_delete(
neon_env_builder: NeonEnvBuilder, deletion_api: DeletionAPIKind
):
neon_env_builder.num_pageservers = 3

env = neon_env_builder.init_start()
@@ -3265,10 +3304,16 @@ def test_ps_unavailable_after_delete(neon_env_builder: NeonEnvBuilder):
assert_nodes_count(3)

ps = env.pageservers[0]
env.storage_controller.node_delete_old(ps.id)

# After deletion, the node count must be reduced
assert_nodes_count(2)
if deletion_api == DeletionAPIKind.FORCE:
ps.allowed_errors.append(".*request was dropped before completing.*")
env.storage_controller.node_delete(ps.id, force=True)
wait_until(lambda: assert_nodes_count(2))
elif deletion_api == DeletionAPIKind.OLD:
env.storage_controller.node_delete_old(ps.id)
assert_nodes_count(2)
else:
raise AssertionError(f"Invalid deletion API: {deletion_api}")

# Running pageserver CLI init in a separate thread
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
@@ -4814,3 +4859,103 @@ def test_storage_controller_migrate_with_pageserver_restart(
"shards": [{"node_id": int(secondary.id), "shard_number": 0}],
"preferred_az": DEFAULT_AZ_ID,
}


@run_only_on_default_postgres("PG version is not important for this test")
def test_storage_controller_forward_404(neon_env_builder: NeonEnvBuilder):
"""
Ensures that the storage controller correctly forwards 404s and converts some of them
into 503s before forwarding to the client.
"""
neon_env_builder.num_pageservers = 2
neon_env_builder.num_azs = 2

env = neon_env_builder.init_start()
env.storage_controller.allowed_errors.append(".*Reconcile error.*")
env.storage_controller.allowed_errors.append(".*Timed out.*")

env.storage_controller.tenant_policy_update(env.initial_tenant, {"placement": {"Attached": 1}})
env.storage_controller.reconcile_until_idle()

# 404s on tenants and timelines are forwarded as-is when reconciler is not running.

# Access a non-existing timeline -> 404
with pytest.raises(PageserverApiException) as e:
env.storage_controller.pageserver_api().timeline_detail(
env.initial_tenant, TimelineId.generate()
)
assert e.value.status_code == 404
with pytest.raises(PageserverApiException) as e:
env.storage_controller.pageserver_api().timeline_lsn_lease(
env.initial_tenant, TimelineId.generate(), Lsn(0)
)
assert e.value.status_code == 404

# Access a non-existing tenant when reconciler is not running -> 404
with pytest.raises(PageserverApiException) as e:
env.storage_controller.pageserver_api().timeline_detail(
TenantId.generate(), env.initial_timeline
)
assert e.value.status_code == 404
with pytest.raises(PageserverApiException) as e:
env.storage_controller.pageserver_api().timeline_lsn_lease(
TenantId.generate(), env.initial_timeline, Lsn(0)
)
assert e.value.status_code == 404

# Normal requests should succeed
detail = env.storage_controller.pageserver_api().timeline_detail(
env.initial_tenant, env.initial_timeline
)
last_record_lsn = Lsn(detail["last_record_lsn"])
env.storage_controller.pageserver_api().timeline_lsn_lease(
env.initial_tenant, env.initial_timeline, last_record_lsn
)

# Get into a situation where the intent state is not the same as the observed state.
describe = env.storage_controller.tenant_describe(env.initial_tenant)["shards"][0]
current_primary = describe["node_attached"]
current_secondary = describe["node_secondary"][0]
assert current_primary != current_secondary

# Pause the reconciler so that the generation number won't be updated.
env.storage_controller.configure_failpoints(
("reconciler-live-migrate-post-generation-inc", "pause")
)

# Do the migration in another thread; the request will be dropped as we don't wait.
shard_zero = TenantShardId(env.initial_tenant, 0, 0)
concurrent.futures.ThreadPoolExecutor(max_workers=1).submit(
env.storage_controller.tenant_shard_migrate,
shard_zero,
current_secondary,
StorageControllerMigrationConfig(override_scheduler=True),
)
# Not the best way to do this; ideally we would wait until the migration has started.
time.sleep(1)
placement = env.storage_controller.get_tenants_placement()[str(shard_zero)]
assert placement["observed"] != placement["intent"]
assert placement["observed"]["attached"] == current_primary
assert placement["intent"]["attached"] == current_secondary

# Now we issue requests that would cause a 404 again
retry_strategy = Retry(total=0)
adapter = HTTPAdapter(max_retries=retry_strategy)

no_retry_api = env.storage_controller.pageserver_api()
no_retry_api.mount("http://", adapter)
no_retry_api.mount("https://", adapter)

# As intent state != observed state, a tenant-not-found error should return 503,
# so that the client can retry once we've successfully migrated.
with pytest.raises(PageserverApiException) as e:
no_retry_api.timeline_detail(env.initial_tenant, TimelineId.generate())
assert e.value.status_code == 503, f"unexpected status code and error: {e.value}"
with pytest.raises(PageserverApiException) as e:
no_retry_api.timeline_lsn_lease(env.initial_tenant, TimelineId.generate(), Lsn(0))
assert e.value.status_code == 503, f"unexpected status code and error: {e.value}"

# Unblock reconcile operations
env.storage_controller.configure_failpoints(
("reconciler-live-migrate-post-generation-inc", "off")
)
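
Note: the 404-to-503 conversion exists so that a caller can retry while the shard is mid-migration instead of treating the miss as permanent. A minimal sketch of the client-side loop this enables, using plain requests calls (not a fixture from this commit):

import time
import requests

def get_retrying_on_503(url: str, attempts: int = 10, backoff_s: float = 1.0) -> requests.Response:
    # 503 means "the controller knows the shard is moving, try again shortly";
    # any other status (including 404) is returned to the caller as final.
    resp = requests.get(url)
    for _ in range(attempts - 1):
        if resp.status_code != 503:
            break
        time.sleep(backoff_s)
        resp = requests.get(url)
    return resp
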

@@ -76,7 +76,6 @@ def test_tenants_normal_work(neon_env_builder: NeonEnvBuilder):
neon_env_builder.num_safekeepers = 3

env = neon_env_builder.init_start()
"""Tests tenants with and without wal acceptors"""
tenant_1, _ = env.create_tenant()
tenant_2, _ = env.create_tenant()


@@ -2788,7 +2788,8 @@ def test_timeline_disk_usage_limit(neon_env_builder: NeonEnvBuilder):

# Wait for the error message to appear in the compute log
def error_logged():
return endpoint.log_contains("WAL storage utilization exceeds configured limit") is not None
if endpoint.log_contains("WAL storage utilization exceeds configured limit") is None:
raise Exception("Expected error message not found in compute log yet")

wait_until(error_logged)
log.info("Found expected error message in compute log, resuming.")
@@ -2822,3 +2823,87 @@ def test_timeline_disk_usage_limit(neon_env_builder: NeonEnvBuilder):
cur.execute("select count(*) from t")
# 2000 rows from first insert + 1000 from last insert
assert cur.fetchone() == (3000,)


def test_global_disk_usage_limit(neon_env_builder: NeonEnvBuilder):
"""
Similar to `test_timeline_disk_usage_limit`, but tests that the global disk usage circuit breaker
also works as expected. The test scenario:
1. Create a timeline and endpoint.
2. Mock high disk usage via a failpoint.
3. Write data to the timeline so that disk usage exceeds the limit.
4. Verify that the writes hang and the expected error message appears in the compute log.
5. Mock low disk usage via the failpoint.
6. Verify that the hanging writes unblock and we can continue to write as normal.
"""
neon_env_builder.num_safekeepers = 1
remote_storage_kind = s3_storage()
neon_env_builder.enable_safekeeper_remote_storage(remote_storage_kind)

env = neon_env_builder.init_start()

env.create_branch("test_global_disk_usage_limit")
endpoint = env.endpoints.create_start("test_global_disk_usage_limit")

with closing(endpoint.connect()) as conn:
with conn.cursor() as cur:
cur.execute("create table t2(key int, value text)")

for sk in env.safekeepers:
sk.stop().start(
extra_opts=["--global-disk-check-interval=1s", "--max-global-disk-usage-ratio=0.8"]
)

# Set the failpoint to have the disk usage check return u64::MAX, which definitely exceeds the practical
# limits in the test environment.
for sk in env.safekeepers:
sk.http_client().configure_failpoints(
[("sk-global-disk-usage", "return(18446744073709551615)")]
)

# Wait until the global disk usage limit watcher trips the circuit breaker.
def error_logged_in_sk():
for sk in env.safekeepers:
if sk.log_contains("Global disk usage exceeded limit") is None:
raise Exception("Expected error message not found in safekeeper log yet")

wait_until(error_logged_in_sk)

def run_hanging_insert_global():
with closing(endpoint.connect()) as bg_conn:
with bg_conn.cursor() as bg_cur:
# This should generate more than 1KiB of WAL
bg_cur.execute("insert into t2 select generate_series(1,2000), 'payload'")

bg_thread_global = threading.Thread(target=run_hanging_insert_global)
bg_thread_global.start()

def error_logged_in_compute():
if endpoint.log_contains("Global disk usage exceeded limit") is None:
raise Exception("Expected error message not found in compute log yet")

wait_until(error_logged_in_compute)
log.info("Found the expected error message in compute log, resuming.")

time.sleep(2)
assert bg_thread_global.is_alive(), "Global hanging insert unblocked prematurely!"

# Make the disk usage check always return 0 through the failpoint to simulate the disk pressure easing.
# The SKs should resume accepting WAL writes without restarting.
for sk in env.safekeepers:
sk.http_client().configure_failpoints([("sk-global-disk-usage", "return(0)")])

bg_thread_global.join(timeout=120)
assert not bg_thread_global.is_alive(), "Hanging global insert did not complete after restart"
log.info("Global hanging insert unblocked.")

# Verify that we can continue to write as normal and we don't have obvious data corruption
# following the recovery.
with closing(endpoint.connect()) as conn:
with conn.cursor() as cur:
cur.execute("insert into t2 select generate_series(2001,3000), 'payload'")

with closing(endpoint.connect()) as conn:
with conn.cursor() as cur:
cur.execute("select count(*) from t2")
assert cur.fetchone() == (3000,)
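
Note: the failpoint value used earlier in this test, return(18446744073709551615), is u64::MAX; a quick check of the arithmetic:

# 2**64 - 1 is the largest value a u64 can hold, so the mocked disk usage
# always exceeds whatever limit the safekeepers compute in the test environment.
assert 2**64 - 1 == 18446744073709551615
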

test_runner/sql_regress/expected/neon-spgist.out (new file, 9 lines)
@@ -0,0 +1,9 @@
-- Test unlogged build of SPGIST index (no "Page evicted with zero LSN" error)
create table spgist_point_tbl(id int4, p point);
create index spgist_point_idx on spgist_point_tbl using spgist(p) with (fillfactor = 25);
insert into spgist_point_tbl (id, p) select g, point(g*10, g*10) from generate_series(1, 10000) g;
insert into spgist_point_tbl (id, p) select g, point(g*10, g*10) from generate_series(1, 10000) g;
insert into spgist_point_tbl (id, p) select g+100000, point(g*10+1, g*10+1) from generate_series(1, 10000) g;
vacuum spgist_point_tbl;
insert into spgist_point_tbl (id, p) select g+100000, point(g*10+1, g*10+1) from generate_series(1, 10000) g;
checkpoint;

@@ -9,5 +9,6 @@ test: neon-rel-truncate
test: neon-clog
test: neon-test-utils
test: neon-vacuum-full
test: neon-event-triggers
test: neon-subxacts
test: neon-spgist
test: neon-event-triggers

test_runner/sql_regress/sql/neon-spgist.sql (new file, 10 lines)
@@ -0,0 +1,10 @@
-- Test unlogged build of SPGIST index (no "Page evicted with zero LSN" error)
create table spgist_point_tbl(id int4, p point);
create index spgist_point_idx on spgist_point_tbl using spgist(p) with (fillfactor = 25);
insert into spgist_point_tbl (id, p) select g, point(g*10, g*10) from generate_series(1, 10000) g;
insert into spgist_point_tbl (id, p) select g, point(g*10, g*10) from generate_series(1, 10000) g;
insert into spgist_point_tbl (id, p) select g+100000, point(g*10+1, g*10+1) from generate_series(1, 10000) g;

vacuum spgist_point_tbl;
insert into spgist_point_tbl (id, p) select g+100000, point(g*10+1, g*10+1) from generate_series(1, 10000) g;
checkpoint;