Merge commit '108f7ec54' into problame/standby-horizon-leases

Christian Schwarz
2025-08-06 17:55:56 +02:00
59 changed files with 2030 additions and 356 deletions

View File

@@ -66,6 +66,12 @@ class EndpointHttpClient(requests.Session):
res.raise_for_status()
return res.json()
def autoscaling_metrics(self):
res = self.get(f"http://localhost:{self.external_port}/autoscaling_metrics")
res.raise_for_status()
log.debug("raw compute metrics: %s", res.text)
return res.text
def prewarm_lfc_status(self) -> dict[str, str]:
res = self.get(self.prewarm_url)
res.raise_for_status()

View File

@@ -24,6 +24,7 @@ def connection_parameters_to_env(params: dict[str, str]) -> dict[str, str]:
# Some API calls not yet implemented.
# You may want to copy not-yet-implemented methods from the PR https://github.com/neondatabase/neon/pull/11305
@final
class NeonAPI:
def __init__(self, neon_api_key: str, neon_api_base_url: str):
self.__neon_api_key = neon_api_key
@@ -170,7 +171,7 @@ class NeonAPI:
protected: bool | None = None,
archived: bool | None = None,
init_source: str | None = None,
add_endpoint=True,
add_endpoint: bool = True,
) -> dict[str, Any]:
data: dict[str, Any] = {}
if add_endpoint:

View File

@@ -400,6 +400,7 @@ class NeonLocalCli(AbstractNeonCli):
timeout_in_seconds: int | None = None,
instance_id: int | None = None,
base_port: int | None = None,
handle_ps_local_disk_loss: bool | None = None,
):
cmd = ["storage_controller", "start"]
if timeout_in_seconds is not None:
@@ -408,6 +409,10 @@ class NeonLocalCli(AbstractNeonCli):
cmd.append(f"--instance-id={instance_id}")
if base_port is not None:
cmd.append(f"--base-port={base_port}")
if handle_ps_local_disk_loss is not None:
cmd.append(
f"--handle-ps-local-disk-loss={'true' if handle_ps_local_disk_loss else 'false'}"
)
return self.raw_cli(cmd)
def storage_controller_stop(self, immediate: bool, instance_id: int | None = None):

View File

@@ -1940,9 +1940,12 @@ class NeonStorageController(MetricsGetter, LogUtils):
timeout_in_seconds: int | None = None,
instance_id: int | None = None,
base_port: int | None = None,
handle_ps_local_disk_loss: bool | None = None,
) -> Self:
assert not self.running
self.env.neon_cli.storage_controller_start(timeout_in_seconds, instance_id, base_port)
self.env.neon_cli.storage_controller_start(
timeout_in_seconds, instance_id, base_port, handle_ps_local_disk_loss
)
self.running = True
return self
@@ -2840,10 +2843,13 @@ class NeonProxiedStorageController(NeonStorageController):
timeout_in_seconds: int | None = None,
instance_id: int | None = None,
base_port: int | None = None,
handle_ps_local_disk_loss: bool | None = None,
) -> Self:
assert instance_id is not None and base_port is not None
self.env.neon_cli.storage_controller_start(timeout_in_seconds, instance_id, base_port)
self.env.neon_cli.storage_controller_start(
timeout_in_seconds, instance_id, base_port, handle_ps_local_disk_loss
)
self.instances[instance_id] = {"running": True}
self.running = True
@@ -5799,6 +5805,7 @@ SKIP_FILES = frozenset(
"postmaster.pid",
"pg_control",
"pg_dynshmem",
"neon-communicator.socket",
)
)

View File

@@ -0,0 +1,54 @@
from __future__ import annotations
import os
from typing import TYPE_CHECKING
import pytest
import requests
import requests_unixsocket # type: ignore [import-untyped]
from fixtures.metrics import parse_metrics
if TYPE_CHECKING:
from fixtures.neon_fixtures import NeonEnv
NEON_COMMUNICATOR_SOCKET_NAME = "neon-communicator.socket"
def test_communicator_metrics(neon_simple_env: NeonEnv):
"""
Test the communicator's built-in HTTP prometheus exporter
"""
env = neon_simple_env
endpoint = env.endpoints.create("main")
endpoint.start()
# Change current directory to the data directory, so that we can use
# a short relative path to refer to the socket. (There's a 100 char
# limitation on the path.)
os.chdir(str(endpoint.pgdata_dir))
session = requests_unixsocket.Session()
r = session.get(f"http+unix://{NEON_COMMUNICATOR_SOCKET_NAME}/metrics")
assert r.status_code == 200, f"got response {r.status_code}: {r.text}"
# quick test that the endpoint returned something expected. (We don't validate
# that the metrics returned are sensible.)
m = parse_metrics(r.text)
m.query_one("lfc_hits")
m.query_one("lfc_misses")
# Test panic handling. The /debug/panic endpoint raises a Rust panic. It's
# expected to unwind and drop the HTTP connection without response, but not
# kill the process or the server.
with pytest.raises(
requests.ConnectionError, match="Remote end closed connection without response"
):
r = session.get(f"http+unix://{NEON_COMMUNICATOR_SOCKET_NAME}/debug/panic")
assert r.status_code == 500
# Test that subsequent requests after the panic still work.
r = session.get(f"http+unix://{NEON_COMMUNICATOR_SOCKET_NAME}/metrics")
assert r.status_code == 200, f"got response {r.status_code}: {r.text}"
m = parse_metrics(r.text)
m.query_one("lfc_hits")
m.query_one("lfc_misses")

View File

@@ -197,7 +197,7 @@ def test_create_snapshot(
shutil.copytree(
test_output_dir,
new_compatibility_snapshot_dir,
ignore=shutil.ignore_patterns("pg_dynshmem"),
ignore=shutil.ignore_patterns("pg_dynshmem", "neon-communicator.socket"),
)
log.info(f"Copied new compatibility snapshot dir to: {new_compatibility_snapshot_dir}")

View File

@@ -0,0 +1,47 @@
import shutil
from fixtures.neon_fixtures import NeonEnvBuilder
from fixtures.utils import query_scalar
def test_hcc_handling_ps_data_loss(
neon_env_builder: NeonEnvBuilder,
):
"""
Test that following a pageserver local data loss event, the system can recover automatically (i.e.
by rehydrating the restarted pageserver from remote storage) without manual intervention. The
pageserver indicates in its "reattach" request to the storage controller that it has restarted
without any local tenant data, and the storage controller uses this information to detect the
data loss condition and reconfigure the pageserver as necessary.
"""
env = neon_env_builder.init_configs()
env.broker.start()
env.storage_controller.start(handle_ps_local_disk_loss=True)
env.pageserver.start()
for sk in env.safekeepers:
sk.start()
# create a new tenant
tenant_id, _ = env.create_tenant(shard_count=4)
endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
with endpoint.cursor() as cur:
cur.execute("SELECT pg_logical_emit_message(false, 'neon-test', 'between inserts')")
cur.execute("CREATE DATABASE testdb")
with endpoint.cursor(dbname="testdb") as cur:
cur.execute("CREATE TABLE tbl_one_hundred_rows AS SELECT generate_series(1,100)")
endpoint.stop()
# Kill the pageserver, remove the `tenants/` directory, and restart. This simulates a pageserver
# that restarted with the same ID but has lost all its local disk data.
env.pageserver.stop(immediate=True)
shutil.rmtree(env.pageserver.tenant_dir())
env.pageserver.start()
# Test that the endpoint can start and query the database after the pageserver restarts. This
# indirectly tests that the pageserver was able to rehydrate the tenant data it lost from remote
# storage automatically.
endpoint.start()
with endpoint.cursor(dbname="testdb") as cur:
assert query_scalar(cur, "SELECT count(*) FROM tbl_one_hundred_rows") == 100
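The recovery flow described in the docstring can be pictured roughly as follows. This is a conceptual sketch with hypothetical names (ReattachRequest, ControllerSketch, on_reattach), not the storage controller's actual code; it only mirrors the behavior the test relies on: when the flag behind --handle-ps-local-disk-loss is set and a reattaching pageserver reports an empty local disk, the controller schedules reconciles so the node is re-attached and rehydrated from remote storage.
from __future__ import annotations
from dataclasses import dataclass, field
@dataclass
class ReattachRequest:
    node_id: int
    # Hypothetical flag: the restarting pageserver reports that its local
    # tenants/ directory was empty on startup.
    empty_local_disk: bool
@dataclass
class ControllerSketch:
    # Mirrors the --handle-ps-local-disk-loss flag passed to `storage_controller start`.
    handle_ps_local_disk_loss: bool
    # Tenant shards the controller believes are attached to each node.
    attached: dict[int, list[str]] = field(default_factory=dict)
    reconcile_queue: list[str] = field(default_factory=list)
    def on_reattach(self, req: ReattachRequest) -> None:
        # On detected local data loss, schedule reconciles that re-send location
        # configs so the node re-attaches its shards from remote storage.
        if self.handle_ps_local_disk_loss and req.empty_local_disk:
            self.reconcile_queue.extend(self.attached.get(req.node_id, []))
# e.g. ControllerSketch(True, {1: ["tenant-shard-0003"]}).on_reattach(ReattachRequest(1, True))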

View File

@@ -6,6 +6,7 @@ from typing import TYPE_CHECKING
import pytest
from fixtures.log_helper import log
from fixtures.metrics import parse_metrics
from fixtures.utils import USE_LFC, query_scalar
if TYPE_CHECKING:
@@ -75,10 +76,24 @@ WITH (fillfactor='100');
cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid = 104242")
cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid = 204242")
# verify working set size after some index access of a few select pages only
blocks = query_scalar(cur, "select approximate_working_set_size(true)")
blocks = query_scalar(cur, "select approximate_working_set_size(false)")
log.info(f"working set size after some index access of a few select pages only {blocks}")
assert blocks < 20
# Also test the metrics from the /autoscaling_metrics endpoint
autoscaling_metrics = endpoint.http_client().autoscaling_metrics()
log.debug(f"Raw metrics: {autoscaling_metrics}")
m = parse_metrics(autoscaling_metrics)
http_estimate = m.query_one(
"lfc_approximate_working_set_size_windows",
{
"duration_seconds": "60",
},
).value
log.info(f"http estimate: {http_estimate}, blocks: {blocks}")
assert http_estimate > 0 and http_estimate < 20
@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
def test_sliding_working_set_approximation(neon_simple_env: NeonEnv):

View File

@@ -3,11 +3,22 @@ from __future__ import annotations
from typing import TYPE_CHECKING
import pytest
import requests
from fixtures.log_helper import log
from fixtures.neon_fixtures import StorageControllerApiException
if TYPE_CHECKING:
from fixtures.neon_fixtures import NeonEnvBuilder
# TODO(diko): pageserver spams with various errors during safekeeper migration.
# Fix the code so it handles the migration better.
ALLOWED_PAGESERVER_ERRORS = [
".*Timeline .* was cancelled and cannot be used anymore.*",
".*Timeline .* has been deleted.*",
".*Timeline .* was not found in global map.*",
".*wal receiver task finished with an error.*",
]
def test_safekeeper_migration_simple(neon_env_builder: NeonEnvBuilder):
"""
@@ -24,16 +35,7 @@ def test_safekeeper_migration_simple(neon_env_builder: NeonEnvBuilder):
"timeline_safekeeper_count": 1,
}
env = neon_env_builder.init_start()
# TODO(diko): pageserver spams with various errors during safekeeper migration.
# Fix the code so it handles the migration better.
env.pageserver.allowed_errors.extend(
[
".*Timeline .* was cancelled and cannot be used anymore.*",
".*Timeline .* has been deleted.*",
".*Timeline .* was not found in global map.*",
".*wal receiver task finished with an error.*",
]
)
env.pageserver.allowed_errors.extend(ALLOWED_PAGESERVER_ERRORS)
ep = env.endpoints.create("main", tenant_id=env.initial_tenant)
@@ -42,15 +44,23 @@ def test_safekeeper_migration_simple(neon_env_builder: NeonEnvBuilder):
assert len(mconf["sk_set"]) == 1
assert mconf["generation"] == 1
current_sk = mconf["sk_set"][0]
ep.start(safekeeper_generation=1, safekeepers=mconf["sk_set"])
ep.safe_psql("CREATE EXTENSION neon_test_utils;")
ep.safe_psql("CREATE TABLE t(a int)")
expected_gen = 1
for active_sk in range(1, 4):
env.storage_controller.migrate_safekeepers(
env.initial_tenant, env.initial_timeline, [active_sk]
)
if active_sk != current_sk:
expected_gen += 2
current_sk = active_sk
other_sks = [sk for sk in range(1, 4) if sk != active_sk]
for sk in other_sks:
@@ -65,9 +75,6 @@ def test_safekeeper_migration_simple(neon_env_builder: NeonEnvBuilder):
assert ep.safe_psql("SELECT * FROM t") == [(i,) for i in range(1, 4)]
# 1 initial generation + 2 migrations on each loop iteration.
expected_gen = 1 + 2 * 3
mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline)
assert mconf["generation"] == expected_gen
@@ -113,3 +120,79 @@ def test_new_sk_set_validation(neon_env_builder: NeonEnvBuilder):
env.storage_controller.safekeeper_scheduling_policy(decom_sk, "Decomissioned")
expect_fail([sk_set[0], decom_sk], "decomissioned")
def test_safekeeper_migration_common_set_failpoints(neon_env_builder: NeonEnvBuilder):
"""
Test that safekeeper migration handles failures well.
Two main conditions are checked:
1. The safekeeper migration handler can be retried after different kinds of failures.
2. Writes do not get stuck if sk_set and new_sk_set have a quorum in common.
"""
neon_env_builder.num_safekeepers = 4
neon_env_builder.storage_controller_config = {
"timelines_onto_safekeepers": True,
"timeline_safekeeper_count": 3,
}
env = neon_env_builder.init_start()
env.pageserver.allowed_errors.extend(ALLOWED_PAGESERVER_ERRORS)
mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline)
assert len(mconf["sk_set"]) == 3
assert mconf["generation"] == 1
ep = env.endpoints.create("main", tenant_id=env.initial_tenant)
ep.start(safekeeper_generation=1, safekeepers=mconf["sk_set"])
ep.safe_psql("CREATE EXTENSION neon_test_utils;")
ep.safe_psql("CREATE TABLE t(a int)")
excluded_sk = mconf["sk_set"][-1]
added_sk = [sk.id for sk in env.safekeepers if sk.id not in mconf["sk_set"]][0]
new_sk_set = mconf["sk_set"][:-1] + [added_sk]
log.info(f"migrating sk set from {mconf['sk_set']} to {new_sk_set}")
failpoints = [
"sk-migration-after-step-3",
"sk-migration-after-step-4",
"sk-migration-after-step-5",
"sk-migration-after-step-7",
"sk-migration-after-step-8",
"sk-migration-step-9-after-set-membership",
"sk-migration-step-9-mid-exclude",
"sk-migration-step-9-after-exclude",
"sk-migration-after-step-9",
]
for i, fp in enumerate(failpoints):
env.storage_controller.configure_failpoints((fp, "return(1)"))
with pytest.raises(StorageControllerApiException, match=f"failpoint {fp}"):
env.storage_controller.migrate_safekeepers(
env.initial_tenant, env.initial_timeline, new_sk_set
)
ep.safe_psql(f"INSERT INTO t VALUES ({i})")
env.storage_controller.configure_failpoints((fp, "off"))
# No failpoints, migration should succeed.
env.storage_controller.migrate_safekeepers(env.initial_tenant, env.initial_timeline, new_sk_set)
mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline)
assert mconf["new_sk_set"] is None
assert mconf["sk_set"] == new_sk_set
assert mconf["generation"] == 3
ep.clear_buffers()
assert ep.safe_psql("SELECT * FROM t") == [(i,) for i in range(len(failpoints))]
assert ep.safe_psql("SHOW neon.safekeepers")[0][0].startswith("g#3:")
# Check that we didn't forget to remove the timeline on the excluded safekeeper.
with pytest.raises(requests.exceptions.HTTPError) as exc:
env.safekeepers[excluded_sk - 1].http_client().timeline_status(
env.initial_tenant, env.initial_timeline
)
assert exc.value.response.status_code == 404
assert (
f"timeline {env.initial_tenant}/{env.initial_timeline} deleted" in exc.value.response.text
)
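For context on the final generation assertion (mconf["generation"] == 3): the tests above treat each migration that changes the safekeeper set as bumping the timeline generation by 2, which is what the expected_gen += 2 bookkeeping in test_safekeeper_migration_simple encodes. A worked check of the arithmetic for this test (the two-step reading, a joint configuration followed by finalizing the new set, is an assumption; the numbers come from the assertions themselves):
# Generation arithmetic behind the assertion above (illustrative only).
initial_generation = 1       # generation right after timeline creation
successful_migrations = 1    # this test completes exactly one safekeeper set change
expected_generation = initial_generation + 2 * successful_migrations
assert expected_generation == 3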