Merge commit '108f7ec54' into problame/standby-horizon-leases

Christian Schwarz
2025-08-06 17:55:56 +02:00
59 changed files with 2030 additions and 356 deletions

View File

@@ -66,6 +66,12 @@ class EndpointHttpClient(requests.Session):
res.raise_for_status()
return res.json()
def autoscaling_metrics(self):
res = self.get(f"http://localhost:{self.external_port}/autoscaling_metrics")
res.raise_for_status()
log.debug("raw compute metrics: %s", res.text)
return res.text
def prewarm_lfc_status(self) -> dict[str, str]:
res = self.get(self.prewarm_url)
res.raise_for_status()

View File

@@ -24,6 +24,7 @@ def connection_parameters_to_env(params: dict[str, str]) -> dict[str, str]:
# Some API calls not yet implemented.
# You may want to copy not-yet-implemented methods from the PR https://github.com/neondatabase/neon/pull/11305
@final
class NeonAPI:
def __init__(self, neon_api_key: str, neon_api_base_url: str):
self.__neon_api_key = neon_api_key
@@ -170,7 +171,7 @@ class NeonAPI:
protected: bool | None = None,
archived: bool | None = None,
init_source: str | None = None,
add_endpoint=True,
add_endpoint: bool = True,
) -> dict[str, Any]:
data: dict[str, Any] = {}
if add_endpoint:

View File

@@ -400,6 +400,7 @@ class NeonLocalCli(AbstractNeonCli):
timeout_in_seconds: int | None = None,
instance_id: int | None = None,
base_port: int | None = None,
handle_ps_local_disk_loss: bool | None = None,
):
cmd = ["storage_controller", "start"]
if timeout_in_seconds is not None:
@@ -408,6 +409,10 @@ class NeonLocalCli(AbstractNeonCli):
cmd.append(f"--instance-id={instance_id}")
if base_port is not None:
cmd.append(f"--base-port={base_port}")
if handle_ps_local_disk_loss is not None:
cmd.append(
f"--handle-ps-local-disk-loss={'true' if handle_ps_local_disk_loss else 'false'}"
)
return self.raw_cli(cmd)
def storage_controller_stop(self, immediate: bool, instance_id: int | None = None):

View File

@@ -1940,9 +1940,12 @@ class NeonStorageController(MetricsGetter, LogUtils):
timeout_in_seconds: int | None = None,
instance_id: int | None = None,
base_port: int | None = None,
handle_ps_local_disk_loss: bool | None = None,
) -> Self:
assert not self.running
self.env.neon_cli.storage_controller_start(timeout_in_seconds, instance_id, base_port)
self.env.neon_cli.storage_controller_start(
timeout_in_seconds, instance_id, base_port, handle_ps_local_disk_loss
)
self.running = True
return self
@@ -2840,10 +2843,13 @@ class NeonProxiedStorageController(NeonStorageController):
timeout_in_seconds: int | None = None,
instance_id: int | None = None,
base_port: int | None = None,
handle_ps_local_disk_loss: bool | None = None,
) -> Self:
assert instance_id is not None and base_port is not None
self.env.neon_cli.storage_controller_start(timeout_in_seconds, instance_id, base_port)
self.env.neon_cli.storage_controller_start(
timeout_in_seconds, instance_id, base_port, handle_ps_local_disk_loss
)
self.instances[instance_id] = {"running": True}
self.running = True
@@ -5799,6 +5805,7 @@ SKIP_FILES = frozenset(
"postmaster.pid",
"pg_control",
"pg_dynshmem",
"neon-communicator.socket",
)
)

View File

@@ -0,0 +1,54 @@
from __future__ import annotations
import os
from typing import TYPE_CHECKING
import pytest
import requests
import requests_unixsocket # type: ignore [import-untyped]
from fixtures.metrics import parse_metrics
if TYPE_CHECKING:
from fixtures.neon_fixtures import NeonEnv
NEON_COMMUNICATOR_SOCKET_NAME = "neon-communicator.socket"
def test_communicator_metrics(neon_simple_env: NeonEnv):
"""
Test the communicator's built-in HTTP prometheus exporter
"""
env = neon_simple_env
endpoint = env.endpoints.create("main")
endpoint.start()
# Change current directory to the data directory, so that we can use
# a short relative path to refer to the socket. (There's a 100 char
# limitation on the path.)
os.chdir(str(endpoint.pgdata_dir))
session = requests_unixsocket.Session()
r = session.get(f"http+unix://{NEON_COMMUNICATOR_SOCKET_NAME}/metrics")
assert r.status_code == 200, f"got response {r.status_code}: {r.text}"
# quick test that the endpoint returned something expected. (We don't validate
# that the metrics returned are sensible.)
m = parse_metrics(r.text)
m.query_one("lfc_hits")
m.query_one("lfc_misses")
# Test panic handling. The /debug/panic endpoint raises a Rust panic. It's
# expected to unwind and drop the HTTP connection without response, but not
# kill the process or the server.
with pytest.raises(
requests.ConnectionError, match="Remote end closed connection without response"
):
r = session.get(f"http+unix://{NEON_COMMUNICATOR_SOCKET_NAME}/debug/panic")
assert r.status_code == 500
# Test that subsequent requests after the panic still work.
r = session.get(f"http+unix://{NEON_COMMUNICATOR_SOCKET_NAME}/metrics")
assert r.status_code == 200, f"got response {r.status_code}: {r.text}"
m = parse_metrics(r.text)
m.query_one("lfc_hits")
m.query_one("lfc_misses")

View File

@@ -197,7 +197,7 @@ def test_create_snapshot(
shutil.copytree(
test_output_dir,
new_compatibility_snapshot_dir,
ignore=shutil.ignore_patterns("pg_dynshmem"),
ignore=shutil.ignore_patterns("pg_dynshmem", "neon-communicator.socket"),
)
log.info(f"Copied new compatibility snapshot dir to: {new_compatibility_snapshot_dir}")

View File

@@ -0,0 +1,47 @@
import shutil
from fixtures.neon_fixtures import NeonEnvBuilder
from fixtures.utils import query_scalar
def test_hcc_handling_ps_data_loss(
neon_env_builder: NeonEnvBuilder,
):
"""
Test that following a pageserver local data loss event, the system can recover automatically (i.e.
by rehydrating the restarted pageserver from remote storage) without manual intervention. The
pageserver indicates in its "reattach" request to the storage controller that it has restarted
without any local tenant data, and the storage controller uses this information to detect the
data loss condition and reconfigure the pageserver as necessary.
"""
env = neon_env_builder.init_configs()
env.broker.start()
env.storage_controller.start(handle_ps_local_disk_loss=True)
env.pageserver.start()
for sk in env.safekeepers:
sk.start()
# create a new tenant
tenant_id, _ = env.create_tenant(shard_count=4)
endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
with endpoint.cursor() as cur:
cur.execute("SELECT pg_logical_emit_message(false, 'neon-test', 'between inserts')")
cur.execute("CREATE DATABASE testdb")
with endpoint.cursor(dbname="testdb") as cur:
cur.execute("CREATE TABLE tbl_one_hundred_rows AS SELECT generate_series(1,100)")
endpoint.stop()
# Kill the pageserver, remove the `tenants/` directory, and restart. This simulates a pageserver
# that restarted with the same ID but has lost all its local disk data.
env.pageserver.stop(immediate=True)
shutil.rmtree(env.pageserver.tenant_dir())
env.pageserver.start()
# Test that the endpoint can start and query the database after the pageserver restarts. This
# indirectly tests that the pageserver was able to rehydrate the tenant data it lost from remote
# storage automatically.
endpoint.start()
with endpoint.cursor(dbname="testdb") as cur:
assert query_scalar(cur, "SELECT count(*) FROM tbl_one_hundred_rows") == 100
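The recovery flow described in the docstring can be pictured roughly as follows. This is a conceptual sketch with hypothetical names (ReattachRequest, ControllerSketch, on_reattach), not the storage controller's actual code; it only mirrors the behavior the test relies on: when the flag behind --handle-ps-local-disk-loss is set and a reattaching pageserver reports an empty local disk, the controller schedules reconciles so the node is re-attached and rehydrated from remote storage.
from __future__ import annotations
from dataclasses import dataclass, field
@dataclass
class ReattachRequest:
    node_id: int
    # Hypothetical flag: the restarting pageserver reports that its local
    # tenants/ directory was empty on startup.
    empty_local_disk: bool
@dataclass
class ControllerSketch:
    # Mirrors the --handle-ps-local-disk-loss flag passed to `storage_controller start`.
    handle_ps_local_disk_loss: bool
    # Tenant shards the controller believes are attached to each node.
    attached: dict[int, list[str]] = field(default_factory=dict)
    reconcile_queue: list[str] = field(default_factory=list)
    def on_reattach(self, req: ReattachRequest) -> None:
        # On detected local data loss, schedule reconciles that re-send location
        # configs so the node re-attaches its shards from remote storage.
        if self.handle_ps_local_disk_loss and req.empty_local_disk:
            self.reconcile_queue.extend(self.attached.get(req.node_id, []))
# e.g. ControllerSketch(True, {1: ["tenant-shard-0003"]}).on_reattach(ReattachRequest(1, True))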

View File

@@ -6,6 +6,7 @@ from typing import TYPE_CHECKING
import pytest
from fixtures.log_helper import log
from fixtures.metrics import parse_metrics
from fixtures.utils import USE_LFC, query_scalar
if TYPE_CHECKING:
@@ -75,10 +76,24 @@ WITH (fillfactor='100');
cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid = 104242")
cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid = 204242")
# verify working set size after some index access of a few select pages only
blocks = query_scalar(cur, "select approximate_working_set_size(true)")
blocks = query_scalar(cur, "select approximate_working_set_size(false)")
log.info(f"working set size after some index access of a few select pages only {blocks}")
assert blocks < 20
# Also test the metrics from the /autoscaling_metrics endpoint
autoscaling_metrics = endpoint.http_client().autoscaling_metrics()
log.debug(f"Raw metrics: {autoscaling_metrics}")
m = parse_metrics(autoscaling_metrics)
http_estimate = m.query_one(
"lfc_approximate_working_set_size_windows",
{
"duration_seconds": "60",
},
).value
log.info(f"http estimate: {http_estimate}, blocks: {blocks}")
assert http_estimate > 0 and http_estimate < 20
@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
def test_sliding_working_set_approximation(neon_simple_env: NeonEnv):

View File

@@ -3,11 +3,22 @@ from __future__ import annotations
from typing import TYPE_CHECKING
import pytest
import requests
from fixtures.log_helper import log
from fixtures.neon_fixtures import StorageControllerApiException
if TYPE_CHECKING:
from fixtures.neon_fixtures import NeonEnvBuilder
# TODO(diko): pageserver spams with various errors during safekeeper migration.
# Fix the code so it handles the migration better.
ALLOWED_PAGESERVER_ERRORS = [
".*Timeline .* was cancelled and cannot be used anymore.*",
".*Timeline .* has been deleted.*",
".*Timeline .* was not found in global map.*",
".*wal receiver task finished with an error.*",
]
def test_safekeeper_migration_simple(neon_env_builder: NeonEnvBuilder):
"""
@@ -24,16 +35,7 @@ def test_safekeeper_migration_simple(neon_env_builder: NeonEnvBuilder):
"timeline_safekeeper_count": 1,
}
env = neon_env_builder.init_start()
# TODO(diko): pageserver spams with various errors during safekeeper migration.
# Fix the code so it handles the migration better.
env.pageserver.allowed_errors.extend(
[
".*Timeline .* was cancelled and cannot be used anymore.*",
".*Timeline .* has been deleted.*",
".*Timeline .* was not found in global map.*",
".*wal receiver task finished with an error.*",
]
)
env.pageserver.allowed_errors.extend(ALLOWED_PAGESERVER_ERRORS)
ep = env.endpoints.create("main", tenant_id=env.initial_tenant)
@@ -42,15 +44,23 @@ def test_safekeeper_migration_simple(neon_env_builder: NeonEnvBuilder):
assert len(mconf["sk_set"]) == 1
assert mconf["generation"] == 1
current_sk = mconf["sk_set"][0]
ep.start(safekeeper_generation=1, safekeepers=mconf["sk_set"])
ep.safe_psql("CREATE EXTENSION neon_test_utils;")
ep.safe_psql("CREATE TABLE t(a int)")
expected_gen = 1
for active_sk in range(1, 4):
env.storage_controller.migrate_safekeepers(
env.initial_tenant, env.initial_timeline, [active_sk]
)
if active_sk != current_sk:
expected_gen += 2
current_sk = active_sk
other_sks = [sk for sk in range(1, 4) if sk != active_sk]
for sk in other_sks:
@@ -65,9 +75,6 @@ def test_safekeeper_migration_simple(neon_env_builder: NeonEnvBuilder):
assert ep.safe_psql("SELECT * FROM t") == [(i,) for i in range(1, 4)]
# 1 initial generation + 2 migrations on each loop iteration.
expected_gen = 1 + 2 * 3
mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline)
assert mconf["generation"] == expected_gen
@@ -113,3 +120,79 @@ def test_new_sk_set_validation(neon_env_builder: NeonEnvBuilder):
env.storage_controller.safekeeper_scheduling_policy(decom_sk, "Decomissioned")
expect_fail([sk_set[0], decom_sk], "decomissioned")
def test_safekeeper_migration_common_set_failpoints(neon_env_builder: NeonEnvBuilder):
"""
Test that safekeeper migration handles failures well.
Two main conditions are checked:
1. The safekeeper migration handler can be retried after different kinds of failures.
2. Writes do not get stuck if sk_set and new_sk_set have a quorum in common.
"""
neon_env_builder.num_safekeepers = 4
neon_env_builder.storage_controller_config = {
"timelines_onto_safekeepers": True,
"timeline_safekeeper_count": 3,
}
env = neon_env_builder.init_start()
env.pageserver.allowed_errors.extend(ALLOWED_PAGESERVER_ERRORS)
mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline)
assert len(mconf["sk_set"]) == 3
assert mconf["generation"] == 1
ep = env.endpoints.create("main", tenant_id=env.initial_tenant)
ep.start(safekeeper_generation=1, safekeepers=mconf["sk_set"])
ep.safe_psql("CREATE EXTENSION neon_test_utils;")
ep.safe_psql("CREATE TABLE t(a int)")
excluded_sk = mconf["sk_set"][-1]
added_sk = [sk.id for sk in env.safekeepers if sk.id not in mconf["sk_set"]][0]
new_sk_set = mconf["sk_set"][:-1] + [added_sk]
log.info(f"migrating sk set from {mconf['sk_set']} to {new_sk_set}")
failpoints = [
"sk-migration-after-step-3",
"sk-migration-after-step-4",
"sk-migration-after-step-5",
"sk-migration-after-step-7",
"sk-migration-after-step-8",
"sk-migration-step-9-after-set-membership",
"sk-migration-step-9-mid-exclude",
"sk-migration-step-9-after-exclude",
"sk-migration-after-step-9",
]
for i, fp in enumerate(failpoints):
env.storage_controller.configure_failpoints((fp, "return(1)"))
with pytest.raises(StorageControllerApiException, match=f"failpoint {fp}"):
env.storage_controller.migrate_safekeepers(
env.initial_tenant, env.initial_timeline, new_sk_set
)
ep.safe_psql(f"INSERT INTO t VALUES ({i})")
env.storage_controller.configure_failpoints((fp, "off"))
# No failpoints, migration should succeed.
env.storage_controller.migrate_safekeepers(env.initial_tenant, env.initial_timeline, new_sk_set)
mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline)
assert mconf["new_sk_set"] is None
assert mconf["sk_set"] == new_sk_set
assert mconf["generation"] == 3
ep.clear_buffers()
assert ep.safe_psql("SELECT * FROM t") == [(i,) for i in range(len(failpoints))]
assert ep.safe_psql("SHOW neon.safekeepers")[0][0].startswith("g#3:")
# Check that we didn't forget to remove the timeline on the excluded safekeeper.
with pytest.raises(requests.exceptions.HTTPError) as exc:
env.safekeepers[excluded_sk - 1].http_client().timeline_status(
env.initial_tenant, env.initial_timeline
)
assert exc.value.response.status_code == 404
assert (
f"timeline {env.initial_tenant}/{env.initial_timeline} deleted" in exc.value.response.text
)
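For context on the final generation assertion (mconf["generation"] == 3): the tests above treat each migration that changes the safekeeper set as bumping the timeline generation by 2, which is what the expected_gen += 2 bookkeeping in test_safekeeper_migration_simple encodes. A worked check of the arithmetic for this test (the two-step reading, a joint configuration followed by finalizing the new set, is an assumption; the numbers come from the assertions themselves):
# Generation arithmetic behind the assertion above (illustrative only).
initial_generation = 1       # generation right after timeline creation
successful_migrations = 1    # this test completes exactly one safekeeper set change
expected_generation = initial_generation + 2 * successful_migrations
assert expected_generation == 3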