Accept primary compute spec in /promote, promotion corner cases testing (#12574)

https://github.com/neondatabase/cloud/issues/19011 - Accept `ComputeSpec` in `/promote` instead of just passing safekeepers and LSN. Update API spec. - Add corner case tests for promotion when promotion or prewarm fails (using failpoints) - Print root error for prewarm and promotion in status handlers
2026-05-31 20:10:38 +00:00 · 2025-07-23 21:11:34 +01:00
parent 9e6ca2932f
commit a56afee269
10 changed files with 242 additions and 86 deletions
--- a/test_runner/fixtures/endpoint/http.py
+++ b/test_runner/fixtures/endpoint/http.py
@@ -87,9 +87,10 @@ class EndpointHttpClient(requests.Session):
        def prewarmed():
            json = self.prewarm_lfc_status()
            status, err = json["status"], json.get("error")
-            assert status == "completed", f"{status}, {err=}"
+            assert status in ["failed", "completed", "skipped"], f"{status}, {err=}"

        wait_until(prewarmed, timeout=60)
+        assert self.prewarm_lfc_status()["status"] != "failed"

    def offload_lfc_status(self) -> dict[str, str]:
        res = self.get(self.offload_url)
@@ -105,19 +106,19 @@ class EndpointHttpClient(requests.Session):
        def offloaded():
            json = self.offload_lfc_status()
            status, err = json["status"], json.get("error")
-            assert status == "completed", f"{status}, {err=}"
+            assert status in ["failed", "completed"], f"{status}, {err=}"

        wait_until(offloaded)
+        assert self.offload_lfc_status()["status"] != "failed"

-    def promote(self, safekeepers_lsn: dict[str, Any], disconnect: bool = False):
+    def promote(self, promote_spec: dict[str, Any], disconnect: bool = False):
        url = f"http://localhost:{self.external_port}/promote"
        if disconnect:
            try:  # send first request to start promote and disconnect
-                self.post(url, data=safekeepers_lsn, timeout=0.001)
+                self.post(url, json=promote_spec, timeout=0.001)
            except ReadTimeout:
                pass  # wait on second request which returns on promotion finish
-        res = self.post(url, data=safekeepers_lsn)
-        res.raise_for_status()
+        res = self.post(url, json=promote_spec)
        json: dict[str, str] = res.json()
        return json

--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -4794,9 +4794,10 @@ class Endpoint(PgProtocol, LogUtils):
                    m = re.search(r"=\s*(\S+)", line)
                    assert m is not None, f"malformed config line {line}"
                    size = m.group(1)
-                    assert size_to_bytes(size) >= size_to_bytes("1MB"), (
-                        "LFC size cannot be set less than 1MB"
-                    )
+                    if size_to_bytes(size) > 0:
+                        assert size_to_bytes(size) >= size_to_bytes("1MB"), (
+                            "LFC size cannot be set less than 1MB"
+                        )
            lfc_path_escaped = str(lfc_path).replace("'", "''")
            config_lines = [
                f"neon.file_cache_path = '{lfc_path_escaped}'",
@@ -4951,6 +4952,10 @@ class Endpoint(PgProtocol, LogUtils):
            log.debug(json.dumps(dict(data_dict, **kwargs)))
            json.dump(dict(data_dict, **kwargs), file, indent=4)

+    def get_compute_spec(self) -> dict[str, Any]:
+        out = json.loads((Path(self.endpoint_path()) / "config.json").read_text())["spec"]
+        return cast("dict[str, Any]", out)
+
    def respec_deep(self, **kwargs: Any) -> None:
        """
        Update the endpoint.json file taking into account nested keys.
--- a/test_runner/regress/test_lfc_prewarm.py
+++ b/test_runner/regress/test_lfc_prewarm.py
@@ -164,6 +164,25 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, method: PrewarmMethod):
    check_prewarmed(method, client, desired)


+@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
+def test_lfc_prewarm_empty(neon_simple_env: NeonEnv):
+    """
+    Test there are no errors when trying to offload or prewarm endpoint without cache using compute_ctl.
+    Endpoint without cache is simulated by turning off LFC manually, but in cloud/ setup this is
+    also reproduced on fresh endpoints
+    """
+    env = neon_simple_env
+    ep = env.endpoints.create_start("main", config_lines=["neon.file_cache_size_limit=0"])
+    client = ep.http_client()
+    conn = ep.connect()
+    cur = conn.cursor()
+    cur.execute("create schema neon; create extension neon with schema neon")
+    method = PrewarmMethod.COMPUTE_CTL
+    offload_lfc(method, client, cur)
+    prewarm_endpoint(method, client, cur, None)
+    assert client.prewarm_lfc_status()["status"] == "skipped"
+
+
 # autoprewarm isn't needed as we prewarm manually
 WORKLOAD_VALUES = METHOD_VALUES[:-1]
 WORKLOAD_IDS = METHOD_IDS[:-1]
--- a/test_runner/regress/test_replica_promotes.py
+++ b/test_runner/regress/test_replica_promotes.py
@@ -90,6 +90,7 @@ def test_replica_promote(neon_simple_env: NeonEnv, method: PromoteMethod):
        secondary_cur.execute("select count(*) from t")
        assert secondary_cur.fetchone() == (100,)

+    primary_spec = primary.get_compute_spec()
    primary_endpoint_id = primary.endpoint_id
    stop_and_check_lsn(primary, expected_primary_lsn)

@@ -99,10 +100,9 @@ def test_replica_promote(neon_simple_env: NeonEnv, method: PromoteMethod):
    if method == PromoteMethod.COMPUTE_CTL:
        client = secondary.http_client()
        client.prewarm_lfc(primary_endpoint_id)
-        # control plane knows safekeepers, simulate it by querying primary
        assert (lsn := primary.terminate_flush_lsn)
-        safekeepers_lsn = {"safekeepers": safekeepers, "wal_flush_lsn": lsn}
-        assert client.promote(safekeepers_lsn)["status"] == "completed"
+        promote_spec = {"spec": primary_spec, "wal_flush_lsn": str(lsn)}
+        assert client.promote(promote_spec)["status"] == "completed"
    else:
        promo_cur.execute(f"alter system set neon.safekeepers='{safekeepers}'")
        promo_cur.execute("select pg_reload_conf()")
@@ -131,21 +131,35 @@ def test_replica_promote(neon_simple_env: NeonEnv, method: PromoteMethod):

        lsn_triple = get_lsn_triple(new_primary_cur)
        log.info(f"Secondary: LSN after workload is {lsn_triple}")
-        expected_promoted_lsn = Lsn(lsn_triple[2])
+        expected_lsn = Lsn(lsn_triple[2])

    with secondary.connect() as conn, conn.cursor() as new_primary_cur:
        new_primary_cur.execute("select payload from t")
        assert new_primary_cur.fetchall() == [(it,) for it in range(1, 201)]

    if method == PromoteMethod.COMPUTE_CTL:
-        # compute_ctl's /promote switches replica type to Primary so it syncs
-        # safekeepers on finish
-        stop_and_check_lsn(secondary, expected_promoted_lsn)
+        # compute_ctl's /promote switches replica type to Primary so it syncs safekeepers on finish
+        stop_and_check_lsn(secondary, expected_lsn)
    else:
-        # on testing postgres, we don't update replica type, secondaries don't
-        # sync so lsn should be None
+        # on testing postgres, we don't update replica type, secondaries don't sync so lsn should be None
        stop_and_check_lsn(secondary, None)

+    if method == PromoteMethod.COMPUTE_CTL:
+        secondary.stop()
+        # In production, compute ultimately receives new compute spec from cplane.
+        secondary.respec(mode="Primary")
+        secondary.start()
+
+        with secondary.connect() as conn, conn.cursor() as new_primary_cur:
+            new_primary_cur.execute(
+                "INSERT INTO t (payload) SELECT generate_series(101, 200) RETURNING payload"
+            )
+            assert new_primary_cur.fetchall() == [(it,) for it in range(101, 201)]
+            lsn_triple = get_lsn_triple(new_primary_cur)
+            log.info(f"Secondary: LSN after restart and workload is {lsn_triple}")
+            expected_lsn = Lsn(lsn_triple[2])
+        stop_and_check_lsn(secondary, expected_lsn)
+
    primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary2")

    with primary.connect() as new_primary, new_primary.cursor() as new_primary_cur:
@@ -154,10 +168,11 @@ def test_replica_promote(neon_simple_env: NeonEnv, method: PromoteMethod):
        log.info(f"New primary: Boot LSN is {lsn_triple}")

        new_primary_cur.execute("select count(*) from t")
-        assert new_primary_cur.fetchone() == (200,)
+        compute_ctl_count = 100 * (method == PromoteMethod.COMPUTE_CTL)
+        assert new_primary_cur.fetchone() == (200 + compute_ctl_count,)
        new_primary_cur.execute("INSERT INTO t (payload) SELECT generate_series(201, 300)")
        new_primary_cur.execute("select count(*) from t")
-        assert new_primary_cur.fetchone() == (300,)
+        assert new_primary_cur.fetchone() == (300 + compute_ctl_count,)
    stop_and_check_lsn(primary, expected_primary_lsn)


@@ -175,18 +190,91 @@ def test_replica_promote_handler_disconnects(neon_simple_env: NeonEnv):
        cur.execute("create schema neon;create extension neon with schema neon")
        cur.execute("create table t(pk bigint GENERATED ALWAYS AS IDENTITY, payload integer)")
        cur.execute("INSERT INTO t(payload) SELECT generate_series(1, 100)")
-        cur.execute("show neon.safekeepers")
-        safekeepers = cur.fetchall()[0][0]

    primary.http_client().offload_lfc()
+    primary_spec = primary.get_compute_spec()
    primary_endpoint_id = primary.endpoint_id
    primary.stop(mode="immediate-terminate")
    assert (lsn := primary.terminate_flush_lsn)

    client = secondary.http_client()
    client.prewarm_lfc(primary_endpoint_id)
-    safekeepers_lsn = {"safekeepers": safekeepers, "wal_flush_lsn": lsn}
-    assert client.promote(safekeepers_lsn, disconnect=True)["status"] == "completed"
+    promote_spec = {"spec": primary_spec, "wal_flush_lsn": str(lsn)}
+    assert client.promote(promote_spec, disconnect=True)["status"] == "completed"
+
+    with secondary.connect() as conn, conn.cursor() as cur:
+        cur.execute("select count(*) from t")
+        assert cur.fetchone() == (100,)
+        cur.execute("INSERT INTO t (payload) SELECT generate_series(101, 200) RETURNING payload")
+        cur.execute("select count(*) from t")
+        assert cur.fetchone() == (200,)
+
+
+@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
+def test_replica_promote_fails(neon_simple_env: NeonEnv):
+    """
+    Test that if a /promote route fails, we can safely start primary back
+    """
+    env: NeonEnv = neon_simple_env
+    primary: Endpoint = env.endpoints.create_start(branch_name="main", endpoint_id="primary")
+    secondary: Endpoint = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary")
+    secondary.stop()
+    secondary.start(env={"FAILPOINTS": "compute-promotion=return(0)"})
+
+    with primary.connect() as conn, conn.cursor() as cur:
+        cur.execute("create schema neon;create extension neon with schema neon")
+        cur.execute("create table t(pk bigint GENERATED ALWAYS AS IDENTITY, payload integer)")
+        cur.execute("INSERT INTO t(payload) SELECT generate_series(1, 100)")
+
+    primary.http_client().offload_lfc()
+    primary_spec = primary.get_compute_spec()
+    primary_endpoint_id = primary.endpoint_id
+    primary.stop(mode="immediate-terminate")
+    assert (lsn := primary.terminate_flush_lsn)
+
+    client = secondary.http_client()
+    client.prewarm_lfc(primary_endpoint_id)
+    promote_spec = {"spec": primary_spec, "wal_flush_lsn": str(lsn)}
+    assert client.promote(promote_spec)["status"] == "failed"
+    secondary.stop()
+
+    primary.start()
+    with primary.connect() as conn, conn.cursor() as cur:
+        cur.execute("select count(*) from t")
+        assert cur.fetchone() == (100,)
+        cur.execute("INSERT INTO t (payload) SELECT generate_series(101, 200) RETURNING payload")
+        cur.execute("select count(*) from t")
+        assert cur.fetchone() == (200,)
+
+
+@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
+def test_replica_promote_prewarm_fails(neon_simple_env: NeonEnv):
+    """
+    Test that if /lfc/prewarm route fails, we are able to promote
+    """
+    env: NeonEnv = neon_simple_env
+    primary: Endpoint = env.endpoints.create_start(branch_name="main", endpoint_id="primary")
+    secondary: Endpoint = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary")
+    secondary.stop()
+    secondary.start(env={"FAILPOINTS": "compute-prewarm=return(0)"})
+
+    with primary.connect() as conn, conn.cursor() as cur:
+        cur.execute("create schema neon;create extension neon with schema neon")
+        cur.execute("create table t(pk bigint GENERATED ALWAYS AS IDENTITY, payload integer)")
+        cur.execute("INSERT INTO t(payload) SELECT generate_series(1, 100)")
+
+    primary.http_client().offload_lfc()
+    primary_spec = primary.get_compute_spec()
+    primary_endpoint_id = primary.endpoint_id
+    primary.stop(mode="immediate-terminate")
+    assert (lsn := primary.terminate_flush_lsn)
+
+    client = secondary.http_client()
+    with pytest.raises(AssertionError):
+        client.prewarm_lfc(primary_endpoint_id)
+    assert client.prewarm_lfc_status()["status"] == "failed"
+    promote_spec = {"spec": primary_spec, "wal_flush_lsn": str(lsn)}
+    assert client.promote(promote_spec)["status"] == "completed"

    with secondary.connect() as conn, conn.cursor() as cur:
        cur.execute("select count(*) from t")