mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-31 20:10:38 +00:00
Accept primary compute spec in /promote, promotion corner cases testing (#12574)
https://github.com/neondatabase/cloud/issues/19011 - Accept `ComputeSpec` in `/promote` instead of just passing safekeepers and LSN. Update API spec - Add corner case tests for promotion when promotion or perwarm fails (using failpoints) - Print root error for prewarm and promotion in status handlers
This commit is contained in:
@@ -87,9 +87,10 @@ class EndpointHttpClient(requests.Session):
|
||||
def prewarmed():
|
||||
json = self.prewarm_lfc_status()
|
||||
status, err = json["status"], json.get("error")
|
||||
assert status == "completed", f"{status}, {err=}"
|
||||
assert status in ["failed", "completed", "skipped"], f"{status}, {err=}"
|
||||
|
||||
wait_until(prewarmed, timeout=60)
|
||||
assert self.prewarm_lfc_status()["status"] != "failed"
|
||||
|
||||
def offload_lfc_status(self) -> dict[str, str]:
|
||||
res = self.get(self.offload_url)
|
||||
@@ -105,19 +106,19 @@ class EndpointHttpClient(requests.Session):
|
||||
def offloaded():
|
||||
json = self.offload_lfc_status()
|
||||
status, err = json["status"], json.get("error")
|
||||
assert status == "completed", f"{status}, {err=}"
|
||||
assert status in ["failed", "completed"], f"{status}, {err=}"
|
||||
|
||||
wait_until(offloaded)
|
||||
assert self.offload_lfc_status()["status"] != "failed"
|
||||
|
||||
def promote(self, safekeepers_lsn: dict[str, Any], disconnect: bool = False):
|
||||
def promote(self, promote_spec: dict[str, Any], disconnect: bool = False):
|
||||
url = f"http://localhost:{self.external_port}/promote"
|
||||
if disconnect:
|
||||
try: # send first request to start promote and disconnect
|
||||
self.post(url, data=safekeepers_lsn, timeout=0.001)
|
||||
self.post(url, json=promote_spec, timeout=0.001)
|
||||
except ReadTimeout:
|
||||
pass # wait on second request which returns on promotion finish
|
||||
res = self.post(url, data=safekeepers_lsn)
|
||||
res.raise_for_status()
|
||||
res = self.post(url, json=promote_spec)
|
||||
json: dict[str, str] = res.json()
|
||||
return json
|
||||
|
||||
|
||||
@@ -4794,9 +4794,10 @@ class Endpoint(PgProtocol, LogUtils):
|
||||
m = re.search(r"=\s*(\S+)", line)
|
||||
assert m is not None, f"malformed config line {line}"
|
||||
size = m.group(1)
|
||||
assert size_to_bytes(size) >= size_to_bytes("1MB"), (
|
||||
"LFC size cannot be set less than 1MB"
|
||||
)
|
||||
if size_to_bytes(size) > 0:
|
||||
assert size_to_bytes(size) >= size_to_bytes("1MB"), (
|
||||
"LFC size cannot be set less than 1MB"
|
||||
)
|
||||
lfc_path_escaped = str(lfc_path).replace("'", "''")
|
||||
config_lines = [
|
||||
f"neon.file_cache_path = '{lfc_path_escaped}'",
|
||||
@@ -4951,6 +4952,10 @@ class Endpoint(PgProtocol, LogUtils):
|
||||
log.debug(json.dumps(dict(data_dict, **kwargs)))
|
||||
json.dump(dict(data_dict, **kwargs), file, indent=4)
|
||||
|
||||
def get_compute_spec(self) -> dict[str, Any]:
|
||||
out = json.loads((Path(self.endpoint_path()) / "config.json").read_text())["spec"]
|
||||
return cast("dict[str, Any]", out)
|
||||
|
||||
def respec_deep(self, **kwargs: Any) -> None:
|
||||
"""
|
||||
Update the endpoint.json file taking into account nested keys.
|
||||
|
||||
@@ -164,6 +164,25 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, method: PrewarmMethod):
|
||||
check_prewarmed(method, client, desired)
|
||||
|
||||
|
||||
@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
|
||||
def test_lfc_prewarm_empty(neon_simple_env: NeonEnv):
|
||||
"""
|
||||
Test there are no errors when trying to offload or prewarm endpoint without cache using compute_ctl.
|
||||
Endpoint without cache is simulated by turning off LFC manually, but in cloud/ setup this is
|
||||
also reproduced on fresh endpoints
|
||||
"""
|
||||
env = neon_simple_env
|
||||
ep = env.endpoints.create_start("main", config_lines=["neon.file_cache_size_limit=0"])
|
||||
client = ep.http_client()
|
||||
conn = ep.connect()
|
||||
cur = conn.cursor()
|
||||
cur.execute("create schema neon; create extension neon with schema neon")
|
||||
method = PrewarmMethod.COMPUTE_CTL
|
||||
offload_lfc(method, client, cur)
|
||||
prewarm_endpoint(method, client, cur, None)
|
||||
assert client.prewarm_lfc_status()["status"] == "skipped"
|
||||
|
||||
|
||||
# autoprewarm isn't needed as we prewarm manually
|
||||
WORKLOAD_VALUES = METHOD_VALUES[:-1]
|
||||
WORKLOAD_IDS = METHOD_IDS[:-1]
|
||||
|
||||
@@ -90,6 +90,7 @@ def test_replica_promote(neon_simple_env: NeonEnv, method: PromoteMethod):
|
||||
secondary_cur.execute("select count(*) from t")
|
||||
assert secondary_cur.fetchone() == (100,)
|
||||
|
||||
primary_spec = primary.get_compute_spec()
|
||||
primary_endpoint_id = primary.endpoint_id
|
||||
stop_and_check_lsn(primary, expected_primary_lsn)
|
||||
|
||||
@@ -99,10 +100,9 @@ def test_replica_promote(neon_simple_env: NeonEnv, method: PromoteMethod):
|
||||
if method == PromoteMethod.COMPUTE_CTL:
|
||||
client = secondary.http_client()
|
||||
client.prewarm_lfc(primary_endpoint_id)
|
||||
# control plane knows safekeepers, simulate it by querying primary
|
||||
assert (lsn := primary.terminate_flush_lsn)
|
||||
safekeepers_lsn = {"safekeepers": safekeepers, "wal_flush_lsn": lsn}
|
||||
assert client.promote(safekeepers_lsn)["status"] == "completed"
|
||||
promote_spec = {"spec": primary_spec, "wal_flush_lsn": str(lsn)}
|
||||
assert client.promote(promote_spec)["status"] == "completed"
|
||||
else:
|
||||
promo_cur.execute(f"alter system set neon.safekeepers='{safekeepers}'")
|
||||
promo_cur.execute("select pg_reload_conf()")
|
||||
@@ -131,21 +131,35 @@ def test_replica_promote(neon_simple_env: NeonEnv, method: PromoteMethod):
|
||||
|
||||
lsn_triple = get_lsn_triple(new_primary_cur)
|
||||
log.info(f"Secondary: LSN after workload is {lsn_triple}")
|
||||
expected_promoted_lsn = Lsn(lsn_triple[2])
|
||||
expected_lsn = Lsn(lsn_triple[2])
|
||||
|
||||
with secondary.connect() as conn, conn.cursor() as new_primary_cur:
|
||||
new_primary_cur.execute("select payload from t")
|
||||
assert new_primary_cur.fetchall() == [(it,) for it in range(1, 201)]
|
||||
|
||||
if method == PromoteMethod.COMPUTE_CTL:
|
||||
# compute_ctl's /promote switches replica type to Primary so it syncs
|
||||
# safekeepers on finish
|
||||
stop_and_check_lsn(secondary, expected_promoted_lsn)
|
||||
# compute_ctl's /promote switches replica type to Primary so it syncs safekeepers on finish
|
||||
stop_and_check_lsn(secondary, expected_lsn)
|
||||
else:
|
||||
# on testing postgres, we don't update replica type, secondaries don't
|
||||
# sync so lsn should be None
|
||||
# on testing postgres, we don't update replica type, secondaries don't sync so lsn should be None
|
||||
stop_and_check_lsn(secondary, None)
|
||||
|
||||
if method == PromoteMethod.COMPUTE_CTL:
|
||||
secondary.stop()
|
||||
# In production, compute ultimately receives new compute spec from cplane.
|
||||
secondary.respec(mode="Primary")
|
||||
secondary.start()
|
||||
|
||||
with secondary.connect() as conn, conn.cursor() as new_primary_cur:
|
||||
new_primary_cur.execute(
|
||||
"INSERT INTO t (payload) SELECT generate_series(101, 200) RETURNING payload"
|
||||
)
|
||||
assert new_primary_cur.fetchall() == [(it,) for it in range(101, 201)]
|
||||
lsn_triple = get_lsn_triple(new_primary_cur)
|
||||
log.info(f"Secondary: LSN after restart and workload is {lsn_triple}")
|
||||
expected_lsn = Lsn(lsn_triple[2])
|
||||
stop_and_check_lsn(secondary, expected_lsn)
|
||||
|
||||
primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary2")
|
||||
|
||||
with primary.connect() as new_primary, new_primary.cursor() as new_primary_cur:
|
||||
@@ -154,10 +168,11 @@ def test_replica_promote(neon_simple_env: NeonEnv, method: PromoteMethod):
|
||||
log.info(f"New primary: Boot LSN is {lsn_triple}")
|
||||
|
||||
new_primary_cur.execute("select count(*) from t")
|
||||
assert new_primary_cur.fetchone() == (200,)
|
||||
compute_ctl_count = 100 * (method == PromoteMethod.COMPUTE_CTL)
|
||||
assert new_primary_cur.fetchone() == (200 + compute_ctl_count,)
|
||||
new_primary_cur.execute("INSERT INTO t (payload) SELECT generate_series(201, 300)")
|
||||
new_primary_cur.execute("select count(*) from t")
|
||||
assert new_primary_cur.fetchone() == (300,)
|
||||
assert new_primary_cur.fetchone() == (300 + compute_ctl_count,)
|
||||
stop_and_check_lsn(primary, expected_primary_lsn)
|
||||
|
||||
|
||||
@@ -175,18 +190,91 @@ def test_replica_promote_handler_disconnects(neon_simple_env: NeonEnv):
|
||||
cur.execute("create schema neon;create extension neon with schema neon")
|
||||
cur.execute("create table t(pk bigint GENERATED ALWAYS AS IDENTITY, payload integer)")
|
||||
cur.execute("INSERT INTO t(payload) SELECT generate_series(1, 100)")
|
||||
cur.execute("show neon.safekeepers")
|
||||
safekeepers = cur.fetchall()[0][0]
|
||||
|
||||
primary.http_client().offload_lfc()
|
||||
primary_spec = primary.get_compute_spec()
|
||||
primary_endpoint_id = primary.endpoint_id
|
||||
primary.stop(mode="immediate-terminate")
|
||||
assert (lsn := primary.terminate_flush_lsn)
|
||||
|
||||
client = secondary.http_client()
|
||||
client.prewarm_lfc(primary_endpoint_id)
|
||||
safekeepers_lsn = {"safekeepers": safekeepers, "wal_flush_lsn": lsn}
|
||||
assert client.promote(safekeepers_lsn, disconnect=True)["status"] == "completed"
|
||||
promote_spec = {"spec": primary_spec, "wal_flush_lsn": str(lsn)}
|
||||
assert client.promote(promote_spec, disconnect=True)["status"] == "completed"
|
||||
|
||||
with secondary.connect() as conn, conn.cursor() as cur:
|
||||
cur.execute("select count(*) from t")
|
||||
assert cur.fetchone() == (100,)
|
||||
cur.execute("INSERT INTO t (payload) SELECT generate_series(101, 200) RETURNING payload")
|
||||
cur.execute("select count(*) from t")
|
||||
assert cur.fetchone() == (200,)
|
||||
|
||||
|
||||
@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
|
||||
def test_replica_promote_fails(neon_simple_env: NeonEnv):
|
||||
"""
|
||||
Test that if a /promote route fails, we can safely start primary back
|
||||
"""
|
||||
env: NeonEnv = neon_simple_env
|
||||
primary: Endpoint = env.endpoints.create_start(branch_name="main", endpoint_id="primary")
|
||||
secondary: Endpoint = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary")
|
||||
secondary.stop()
|
||||
secondary.start(env={"FAILPOINTS": "compute-promotion=return(0)"})
|
||||
|
||||
with primary.connect() as conn, conn.cursor() as cur:
|
||||
cur.execute("create schema neon;create extension neon with schema neon")
|
||||
cur.execute("create table t(pk bigint GENERATED ALWAYS AS IDENTITY, payload integer)")
|
||||
cur.execute("INSERT INTO t(payload) SELECT generate_series(1, 100)")
|
||||
|
||||
primary.http_client().offload_lfc()
|
||||
primary_spec = primary.get_compute_spec()
|
||||
primary_endpoint_id = primary.endpoint_id
|
||||
primary.stop(mode="immediate-terminate")
|
||||
assert (lsn := primary.terminate_flush_lsn)
|
||||
|
||||
client = secondary.http_client()
|
||||
client.prewarm_lfc(primary_endpoint_id)
|
||||
promote_spec = {"spec": primary_spec, "wal_flush_lsn": str(lsn)}
|
||||
assert client.promote(promote_spec)["status"] == "failed"
|
||||
secondary.stop()
|
||||
|
||||
primary.start()
|
||||
with primary.connect() as conn, conn.cursor() as cur:
|
||||
cur.execute("select count(*) from t")
|
||||
assert cur.fetchone() == (100,)
|
||||
cur.execute("INSERT INTO t (payload) SELECT generate_series(101, 200) RETURNING payload")
|
||||
cur.execute("select count(*) from t")
|
||||
assert cur.fetchone() == (200,)
|
||||
|
||||
|
||||
@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
|
||||
def test_replica_promote_prewarm_fails(neon_simple_env: NeonEnv):
|
||||
"""
|
||||
Test that if /lfc/prewarm route fails, we are able to promote
|
||||
"""
|
||||
env: NeonEnv = neon_simple_env
|
||||
primary: Endpoint = env.endpoints.create_start(branch_name="main", endpoint_id="primary")
|
||||
secondary: Endpoint = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary")
|
||||
secondary.stop()
|
||||
secondary.start(env={"FAILPOINTS": "compute-prewarm=return(0)"})
|
||||
|
||||
with primary.connect() as conn, conn.cursor() as cur:
|
||||
cur.execute("create schema neon;create extension neon with schema neon")
|
||||
cur.execute("create table t(pk bigint GENERATED ALWAYS AS IDENTITY, payload integer)")
|
||||
cur.execute("INSERT INTO t(payload) SELECT generate_series(1, 100)")
|
||||
|
||||
primary.http_client().offload_lfc()
|
||||
primary_spec = primary.get_compute_spec()
|
||||
primary_endpoint_id = primary.endpoint_id
|
||||
primary.stop(mode="immediate-terminate")
|
||||
assert (lsn := primary.terminate_flush_lsn)
|
||||
|
||||
client = secondary.http_client()
|
||||
with pytest.raises(AssertionError):
|
||||
client.prewarm_lfc(primary_endpoint_id)
|
||||
assert client.prewarm_lfc_status()["status"] == "failed"
|
||||
promote_spec = {"spec": primary_spec, "wal_flush_lsn": str(lsn)}
|
||||
assert client.promote(promote_spec)["status"] == "completed"
|
||||
|
||||
with secondary.connect() as conn, conn.cursor() as cur:
|
||||
cur.execute("select count(*) from t")
|
||||
|
||||
Reference in New Issue
Block a user