Merge commit '108f7ec54' into problame/standby-horizon-leases
@@ -66,6 +66,12 @@ class EndpointHttpClient(requests.Session):
         res.raise_for_status()
         return res.json()

+    def autoscaling_metrics(self):
+        res = self.get(f"http://localhost:{self.external_port}/autoscaling_metrics")
+        res.raise_for_status()
+        log.debug("raw compute metrics: %s", res.text)
+        return res.text
+
     def prewarm_lfc_status(self) -> dict[str, str]:
         res = self.get(self.prewarm_url)
         res.raise_for_status()
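For context, a hedged sketch of how a test can consume this new helper, matching the usage added to test_lfc_working_set_approximation further down (endpoint is a started Endpoint fixture, assumed here):

    # Hedged usage sketch: fetch the raw Prometheus text and look up one sample.
    from fixtures.metrics import parse_metrics

    raw = endpoint.http_client().autoscaling_metrics()
    m = parse_metrics(raw)
    sample = m.query_one(
        "lfc_approximate_working_set_size_windows", {"duration_seconds": "60"}
    )
    assert sample.value >= 0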
@@ -24,6 +24,7 @@ def connection_parameters_to_env(params: dict[str, str]) -> dict[str, str]:

 # Some API calls not yet implemented.
 # You may want to copy not-yet-implemented methods from the PR https://github.com/neondatabase/neon/pull/11305
+@final
 class NeonAPI:
     def __init__(self, neon_api_key: str, neon_api_base_url: str):
         self.__neon_api_key = neon_api_key
@@ -170,7 +171,7 @@ class NeonAPI:
         protected: bool | None = None,
         archived: bool | None = None,
         init_source: str | None = None,
-        add_endpoint=True,
+        add_endpoint: bool = True,
     ) -> dict[str, Any]:
         data: dict[str, Any] = {}
         if add_endpoint:
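A short note on the @final addition: it is purely a static-typing marker with no runtime effect; a minimal sketch of what it enforces:

    # Minimal sketch: @final forbids subclassing at type-check time only.
    from typing import final

    @final
    class Sealed:
        pass

    class Broken(Sealed):  # mypy/pyright flag this: cannot subclass a final class
        pass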
@@ -400,6 +400,7 @@ class NeonLocalCli(AbstractNeonCli):
         timeout_in_seconds: int | None = None,
         instance_id: int | None = None,
         base_port: int | None = None,
+        handle_ps_local_disk_loss: bool | None = None,
     ):
         cmd = ["storage_controller", "start"]
         if timeout_in_seconds is not None:
@@ -408,6 +409,10 @@ class NeonLocalCli(AbstractNeonCli):
             cmd.append(f"--instance-id={instance_id}")
         if base_port is not None:
             cmd.append(f"--base-port={base_port}")
+        if handle_ps_local_disk_loss is not None:
+            cmd.append(
+                f"--handle-ps-local-disk-loss={'true' if handle_ps_local_disk_loss else 'false'}"
+            )
         return self.raw_cli(cmd)

     def storage_controller_stop(self, immediate: bool, instance_id: int | None = None):
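A hedged round-trip sketch of the new flag's serialization (build_start_cmd is an illustrative stand-in for the logic above, not a real helper):

    # Illustrative stand-in for the serialization logic in storage_controller_start.
    def build_start_cmd(handle_ps_local_disk_loss: bool | None) -> list[str]:
        cmd = ["storage_controller", "start"]
        if handle_ps_local_disk_loss is not None:
            cmd.append(
                f"--handle-ps-local-disk-loss={'true' if handle_ps_local_disk_loss else 'false'}"
            )
        return cmd

    assert build_start_cmd(True)[-1] == "--handle-ps-local-disk-loss=true"
    assert build_start_cmd(None) == ["storage_controller", "start"]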
@@ -1940,9 +1940,12 @@ class NeonStorageController(MetricsGetter, LogUtils):
         timeout_in_seconds: int | None = None,
         instance_id: int | None = None,
         base_port: int | None = None,
+        handle_ps_local_disk_loss: bool | None = None,
     ) -> Self:
         assert not self.running
-        self.env.neon_cli.storage_controller_start(timeout_in_seconds, instance_id, base_port)
+        self.env.neon_cli.storage_controller_start(
+            timeout_in_seconds, instance_id, base_port, handle_ps_local_disk_loss
+        )
         self.running = True
         return self

@@ -2840,10 +2843,13 @@ class NeonProxiedStorageController(NeonStorageController):
         timeout_in_seconds: int | None = None,
         instance_id: int | None = None,
         base_port: int | None = None,
+        handle_ps_local_disk_loss: bool | None = None,
     ) -> Self:
         assert instance_id is not None and base_port is not None

-        self.env.neon_cli.storage_controller_start(timeout_in_seconds, instance_id, base_port)
+        self.env.neon_cli.storage_controller_start(
+            timeout_in_seconds, instance_id, base_port, handle_ps_local_disk_loss
+        )
         self.instances[instance_id] = {"running": True}

         self.running = True
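End to end, the new parameter threads from the test fixtures through NeonLocalCli down to the storage_controller binary; a hedged sketch of the test-facing call (neon_env_builder is the standard fixture, as used by test_hcc_handling_ps_data_loss below):

    # Hedged usage sketch; mirrors the new test below.
    env = neon_env_builder.init_configs()
    env.broker.start()
    env.storage_controller.start(handle_ps_local_disk_loss=True)
    env.pageserver.start()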
@@ -5799,6 +5805,7 @@ SKIP_FILES = frozenset(
         "postmaster.pid",
         "pg_control",
         "pg_dynshmem",
+        "neon-communicator.socket",
     )
 )

test_runner/regress/test_communicator_metrics_exporter.py (new file, 54 lines)
@@ -0,0 +1,54 @@
+from __future__ import annotations
+
+import os
+from typing import TYPE_CHECKING
+
+import pytest
+import requests
+import requests_unixsocket  # type: ignore [import-untyped]
+from fixtures.metrics import parse_metrics
+
+if TYPE_CHECKING:
+    from fixtures.neon_fixtures import NeonEnv
+
+NEON_COMMUNICATOR_SOCKET_NAME = "neon-communicator.socket"
+
+
+def test_communicator_metrics(neon_simple_env: NeonEnv):
+    """
+    Test the communicator's built-in HTTP prometheus exporter
+    """
+    env = neon_simple_env
+
+    endpoint = env.endpoints.create("main")
+    endpoint.start()
+
+    # Change current directory to the data directory, so that we can use
+    # a short relative path to refer to the socket. (There's a 100 char
+    # limitation on the path.)
+    os.chdir(str(endpoint.pgdata_dir))
+    session = requests_unixsocket.Session()
+    r = session.get(f"http+unix://{NEON_COMMUNICATOR_SOCKET_NAME}/metrics")
+    assert r.status_code == 200, f"got response {r.status_code}: {r.text}"
+
+    # quick test that the endpoint returned something expected. (We don't validate
+    # that the metrics returned are sensible.)
+    m = parse_metrics(r.text)
+    m.query_one("lfc_hits")
+    m.query_one("lfc_misses")
+
+    # Test panic handling. The /debug/panic endpoint raises a Rust panic. It's
+    # expected to unwind and drop the HTTP connection without response, but not
+    # kill the process or the server.
+    with pytest.raises(
+        requests.ConnectionError, match="Remote end closed connection without response"
+    ):
+        r = session.get(f"http+unix://{NEON_COMMUNICATOR_SOCKET_NAME}/debug/panic")
+        assert r.status_code == 500
+
+    # Test that subsequent requests after the panic still work.
+    r = session.get(f"http+unix://{NEON_COMMUNICATOR_SOCKET_NAME}/metrics")
+    assert r.status_code == 200, f"got response {r.status_code}: {r.text}"
+    m = parse_metrics(r.text)
+    m.query_one("lfc_hits")
+    m.query_one("lfc_misses")
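A hedged note on the chdir trick above: requests_unixsocket places the socket path in the URL host position, so an absolute path must be percent-encoded; the short relative name sidesteps both the encoding and the ~100-char AF_UNIX path limit the comment mentions. Equivalent request with an absolute path (illustrative path, not from the test):

    # Hedged sketch: same request with an absolute, percent-encoded socket path.
    import urllib.parse

    import requests_unixsocket  # type: ignore [import-untyped]

    sock = "/long/endpoint/pgdata/neon-communicator.socket"  # illustrative
    url = f"http+unix://{urllib.parse.quote(sock, safe='')}/metrics"
    r = requests_unixsocket.Session().get(url)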
@@ -197,7 +197,7 @@ def test_create_snapshot(
     shutil.copytree(
         test_output_dir,
         new_compatibility_snapshot_dir,
-        ignore=shutil.ignore_patterns("pg_dynshmem"),
+        ignore=shutil.ignore_patterns("pg_dynshmem", "neon-communicator.socket"),
     )

     log.info(f"Copied new compatibility snapshot dir to: {new_compatibility_snapshot_dir}")
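For reference, a minimal sketch of what shutil.ignore_patterns produces: a callable that copytree invokes per directory, returning the set of names to skip:

    import shutil

    ignore = shutil.ignore_patterns("pg_dynshmem", "neon-communicator.socket")
    names = ["base", "pg_dynshmem", "neon-communicator.socket", "postgresql.conf"]
    print(ignore("/any/dir", names))  # -> {'pg_dynshmem', 'neon-communicator.socket'}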
test_runner/regress/test_hcc_handling_ps_data_loss.py (new file, 47 lines)
@@ -0,0 +1,47 @@
+import shutil
+
+from fixtures.neon_fixtures import NeonEnvBuilder
+from fixtures.utils import query_scalar
+
+
+def test_hcc_handling_ps_data_loss(
+    neon_env_builder: NeonEnvBuilder,
+):
+    """
+    Test that following a pageserver local data loss event, the system can recover automatically (i.e.
+    rehydrating the restarted pageserver from remote storage) without manual intervention. The
+    pageserver indicates to the storage controller that it has restarted without any local tenant
+    data in its "reattach" request and the storage controller uses this information to detect the
+    data loss condition and reconfigure the pageserver as necessary.
+    """
+    env = neon_env_builder.init_configs()
+    env.broker.start()
+    env.storage_controller.start(handle_ps_local_disk_loss=True)
+    env.pageserver.start()
+    for sk in env.safekeepers:
+        sk.start()
+
+    # create new tenant
+    tenant_id, _ = env.create_tenant(shard_count=4)
+
+    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
+    with endpoint.cursor() as cur:
+        cur.execute("SELECT pg_logical_emit_message(false, 'neon-test', 'between inserts')")
+        cur.execute("CREATE DATABASE testdb")
+
+    with endpoint.cursor(dbname="testdb") as cur:
+        cur.execute("CREATE TABLE tbl_one_hundred_rows AS SELECT generate_series(1,100)")
+    endpoint.stop()
+
+    # Kill the pageserver, remove the `tenants/` directory, and restart. This simulates a pageserver
+    # that restarted with the same ID but has lost all its local disk data.
+    env.pageserver.stop(immediate=True)
+    shutil.rmtree(env.pageserver.tenant_dir())
+    env.pageserver.start()
+
+    # Test that the endpoint can start and query the database after the pageserver restarts. This
+    # indirectly tests that the pageserver was able to rehydrate the tenant data it lost from remote
+    # storage automatically.
+    endpoint.start()
+    with endpoint.cursor(dbname="testdb") as cur:
+        assert query_scalar(cur, "SELECT count(*) FROM tbl_one_hundred_rows") == 100
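A hypothetical sketch of the controller-side behavior the docstring describes; every name here (ReattachRequest, empty_local_disk, node_reconcile_all) is illustrative, not the real storage controller API:

    # Hypothetical sketch only: all names are illustrative stand-ins.
    from dataclasses import dataclass

    @dataclass
    class ReattachRequest:
        node_id: int
        empty_local_disk: bool  # pageserver restarted with no local tenant data

    def on_reattach(req: ReattachRequest, handle_ps_local_disk_loss: bool) -> None:
        if handle_ps_local_disk_loss and req.empty_local_disk:
            # Data-loss condition detected: re-send location configs so the
            # pageserver rehydrates its tenants from remote storage.
            node_reconcile_all(req.node_id)

    def node_reconcile_all(node_id: int) -> None: ...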
@@ -6,6 +6,7 @@ from typing import TYPE_CHECKING

 import pytest
 from fixtures.log_helper import log
+from fixtures.metrics import parse_metrics
 from fixtures.utils import USE_LFC, query_scalar

 if TYPE_CHECKING:
@@ -75,10 +76,24 @@ WITH (fillfactor='100');
         cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid = 104242")
         cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid = 204242")
         # verify working set size after some index access of a few select pages only
-        blocks = query_scalar(cur, "select approximate_working_set_size(true)")
+        blocks = query_scalar(cur, "select approximate_working_set_size(false)")
         log.info(f"working set size after some index access of a few select pages only {blocks}")
         assert blocks < 20

+    # Also test the metrics from the /autoscaling_metrics endpoint
+    autoscaling_metrics = endpoint.http_client().autoscaling_metrics()
+    log.debug(f"Raw metrics: {autoscaling_metrics}")
+    m = parse_metrics(autoscaling_metrics)
+
+    http_estimate = m.query_one(
+        "lfc_approximate_working_set_size_windows",
+        {
+            "duration_seconds": "60",
+        },
+    ).value
+    log.info(f"http estimate: {http_estimate}, blocks: {blocks}")
+    assert http_estimate > 0 and http_estimate < 20
+

 @pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
 def test_sliding_working_set_approximation(neon_simple_env: NeonEnv):
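A hedged note on the true-to-false flip above: the function's boolean argument is, as far as the fixtures use it, a reset flag (true resets the estimator after reading), so passing false keeps the sketch state intact, which matters now that the same accesses are also checked via the 60-second window from /autoscaling_metrics. A sketch, assuming that semantics:

    # Assuming the boolean is a reset flag (hedged): read without clearing
    # state so later window-based estimates still see the same accesses.
    blocks = query_scalar(cur, "select approximate_working_set_size(false)")
    blocks_then_reset = query_scalar(cur, "select approximate_working_set_size(true)")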
@@ -3,11 +3,22 @@ from __future__ import annotations
 from typing import TYPE_CHECKING

 import pytest
+import requests
 from fixtures.log_helper import log
+from fixtures.neon_fixtures import StorageControllerApiException

 if TYPE_CHECKING:
     from fixtures.neon_fixtures import NeonEnvBuilder

+# TODO(diko): pageserver spams with various errors during safekeeper migration.
+# Fix the code so it handles the migration better.
+ALLOWED_PAGESERVER_ERRORS = [
+    ".*Timeline .* was cancelled and cannot be used anymore.*",
+    ".*Timeline .* has been deleted.*",
+    ".*Timeline .* was not found in global map.*",
+    ".*wal receiver task finished with an error.*",
+]
+

 def test_safekeeper_migration_simple(neon_env_builder: NeonEnvBuilder):
     """
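These entries are regular expressions matched against pageserver log lines; a minimal sketch of the kind of matching the fixture performs (hedged, the fixture's exact mechanics may differ):

    import re

    pattern = ".*Timeline .* was cancelled and cannot be used anymore.*"
    line = "ERROR request failed: Timeline 1234/abcd was cancelled and cannot be used anymore"
    assert re.match(pattern, line) is not None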
@@ -24,16 +35,7 @@ def test_safekeeper_migration_simple(neon_env_builder: NeonEnvBuilder):
         "timeline_safekeeper_count": 1,
     }
     env = neon_env_builder.init_start()
-    # TODO(diko): pageserver spams with various errors during safekeeper migration.
-    # Fix the code so it handles the migration better.
-    env.pageserver.allowed_errors.extend(
-        [
-            ".*Timeline .* was cancelled and cannot be used anymore.*",
-            ".*Timeline .* has been deleted.*",
-            ".*Timeline .* was not found in global map.*",
-            ".*wal receiver task finished with an error.*",
-        ]
-    )
+    env.pageserver.allowed_errors.extend(ALLOWED_PAGESERVER_ERRORS)

     ep = env.endpoints.create("main", tenant_id=env.initial_tenant)

@@ -42,15 +44,23 @@ def test_safekeeper_migration_simple(neon_env_builder: NeonEnvBuilder):
     assert len(mconf["sk_set"]) == 1
     assert mconf["generation"] == 1

+    current_sk = mconf["sk_set"][0]
+
     ep.start(safekeeper_generation=1, safekeepers=mconf["sk_set"])
     ep.safe_psql("CREATE EXTENSION neon_test_utils;")
     ep.safe_psql("CREATE TABLE t(a int)")

+    expected_gen = 1
+
     for active_sk in range(1, 4):
         env.storage_controller.migrate_safekeepers(
             env.initial_tenant, env.initial_timeline, [active_sk]
         )

+        if active_sk != current_sk:
+            expected_gen += 2
+            current_sk = active_sk
+
         other_sks = [sk for sk in range(1, 4) if sk != active_sk]

         for sk in other_sks:
@@ -65,9 +75,6 @@ def test_safekeeper_migration_simple(neon_env_builder: NeonEnvBuilder):

     assert ep.safe_psql("SELECT * FROM t") == [(i,) for i in range(1, 4)]

-    # 1 initial generation + 2 migrations on each loop iteration.
-    expected_gen = 1 + 2 * 3
-
     mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline)
     assert mconf["generation"] == expected_gen

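The removed constant assumed every loop iteration changes membership; the incremental bookkeeping added above (expected_gen += 2 per actual move) also covers the iteration where the timeline already sits on the target safekeeper. A worked example of the arithmetic, hedged on the "+2 per membership change" rule taken from the test itself:

    # Worked example (hedged): +2 per membership-changing migration, as in the test.
    def final_gen(start_sk: int) -> int:
        current_sk, gen = start_sk, 1  # generation 1 = initial configuration
        for active_sk in range(1, 4):
            if active_sk != current_sk:
                gen += 2
                current_sk = active_sk
        return gen

    assert final_gen(1) == 5  # first iteration is a no-op move
    assert final_gen(2) == final_gen(3) == 7
    # The old constant 1 + 2 * 3 == 7 was wrong whenever the timeline started on sk 1.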
@@ -113,3 +120,79 @@ def test_new_sk_set_validation(neon_env_builder: NeonEnvBuilder):
     env.storage_controller.safekeeper_scheduling_policy(decom_sk, "Decomissioned")

     expect_fail([sk_set[0], decom_sk], "decomissioned")
+
+
+def test_safekeeper_migration_common_set_failpoints(neon_env_builder: NeonEnvBuilder):
+    """
+    Test that safekeeper migration handles failures well.
+
+    Two main conditions are checked:
+    1. safekeeper migration handler can be retried on different failures.
+    2. writes do not get stuck if sk_set and new_sk_set have a quorum in common.
+    """
+    neon_env_builder.num_safekeepers = 4
+    neon_env_builder.storage_controller_config = {
+        "timelines_onto_safekeepers": True,
+        "timeline_safekeeper_count": 3,
+    }
+    env = neon_env_builder.init_start()
+    env.pageserver.allowed_errors.extend(ALLOWED_PAGESERVER_ERRORS)
+
+    mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline)
+    assert len(mconf["sk_set"]) == 3
+    assert mconf["generation"] == 1
+
+    ep = env.endpoints.create("main", tenant_id=env.initial_tenant)
+    ep.start(safekeeper_generation=1, safekeepers=mconf["sk_set"])
+    ep.safe_psql("CREATE EXTENSION neon_test_utils;")
+    ep.safe_psql("CREATE TABLE t(a int)")
+
+    excluded_sk = mconf["sk_set"][-1]
+    added_sk = [sk.id for sk in env.safekeepers if sk.id not in mconf["sk_set"]][0]
+    new_sk_set = mconf["sk_set"][:-1] + [added_sk]
+    log.info(f"migrating sk set from {mconf['sk_set']} to {new_sk_set}")
+
+    failpoints = [
+        "sk-migration-after-step-3",
+        "sk-migration-after-step-4",
+        "sk-migration-after-step-5",
+        "sk-migration-after-step-7",
+        "sk-migration-after-step-8",
+        "sk-migration-step-9-after-set-membership",
+        "sk-migration-step-9-mid-exclude",
+        "sk-migration-step-9-after-exclude",
+        "sk-migration-after-step-9",
+    ]
+
+    for i, fp in enumerate(failpoints):
+        env.storage_controller.configure_failpoints((fp, "return(1)"))
+
+        with pytest.raises(StorageControllerApiException, match=f"failpoint {fp}"):
+            env.storage_controller.migrate_safekeepers(
+                env.initial_tenant, env.initial_timeline, new_sk_set
+            )
+        ep.safe_psql(f"INSERT INTO t VALUES ({i})")
+
+        env.storage_controller.configure_failpoints((fp, "off"))
+
+    # No failpoints, migration should succeed.
+    env.storage_controller.migrate_safekeepers(env.initial_tenant, env.initial_timeline, new_sk_set)
+
+    mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline)
+    assert mconf["new_sk_set"] is None
+    assert mconf["sk_set"] == new_sk_set
+    assert mconf["generation"] == 3
+
+    ep.clear_buffers()
+    assert ep.safe_psql("SELECT * FROM t") == [(i,) for i in range(len(failpoints))]
+    assert ep.safe_psql("SHOW neon.safekeepers")[0][0].startswith("g#3:")
+
+    # Check that we didn't forget to remove the timeline on the excluded safekeeper.
+    with pytest.raises(requests.exceptions.HTTPError) as exc:
+        env.safekeepers[excluded_sk - 1].http_client().timeline_status(
+            env.initial_tenant, env.initial_timeline
+        )
+    assert exc.value.response.status_code == 404
+    assert (
+        f"timeline {env.initial_tenant}/{env.initial_timeline} deleted" in exc.value.response.text
+    )