Merge remote-tracking branch 'origin/main' into communicator-rewrite

This commit is contained in:
Heikki Linnakangas
2025-07-05 16:59:51 +03:00
141 changed files with 5475 additions and 2033 deletions

View File

@@ -57,6 +57,8 @@ class EndpointHttpClient(requests.Session):
self.auth = BearerAuth(jwt)
self.mount("http://", HTTPAdapter())
self.prewarm_url = f"http://localhost:{external_port}/lfc/prewarm"
self.offload_url = f"http://localhost:{external_port}/lfc/offload"
def dbs_and_roles(self):
res = self.get(f"http://localhost:{self.external_port}/dbs_and_roles", auth=self.auth)
@@ -64,33 +66,39 @@ class EndpointHttpClient(requests.Session):
return res.json()
def prewarm_lfc_status(self) -> dict[str, str]:
res = self.get(f"http://localhost:{self.external_port}/lfc/prewarm")
res = self.get(self.prewarm_url)
res.raise_for_status()
json: dict[str, str] = res.json()
return json
def prewarm_lfc(self, from_endpoint_id: str | None = None):
url: str = f"http://localhost:{self.external_port}/lfc/prewarm"
params = {"from_endpoint": from_endpoint_id} if from_endpoint_id else dict()
self.post(url, params=params).raise_for_status()
self.post(self.prewarm_url, params=params).raise_for_status()
self.prewarm_lfc_wait()
def prewarm_lfc_wait(self):
def prewarmed():
json = self.prewarm_lfc_status()
status, err = json["status"], json.get("error")
assert status == "completed", f"{status}, error {err}"
assert status == "completed", f"{status}, {err=}"
wait_until(prewarmed, timeout=60)
def offload_lfc(self):
url = f"http://localhost:{self.external_port}/lfc/offload"
self.post(url).raise_for_status()
def offload_lfc_status(self) -> dict[str, str]:
res = self.get(self.offload_url)
res.raise_for_status()
json: dict[str, str] = res.json()
return json
def offload_lfc(self):
self.post(self.offload_url).raise_for_status()
self.offload_lfc_wait()
def offload_lfc_wait(self):
def offloaded():
res = self.get(url)
res.raise_for_status()
json = res.json()
json = self.offload_lfc_status()
status, err = json["status"], json.get("error")
assert status == "completed", f"{status}, error {err}"
assert status == "completed", f"{status}, {err=}"
wait_until(offloaded)

View File

@@ -567,6 +567,8 @@ class NeonLocalCli(AbstractNeonCli):
basebackup_request_tries: int | None = None,
timeout: str | None = None,
env: dict[str, str] | None = None,
autoprewarm: bool = False,
offload_lfc_interval_seconds: int | None = None,
) -> subprocess.CompletedProcess[str]:
args = [
"endpoint",
@@ -592,6 +594,10 @@ class NeonLocalCli(AbstractNeonCli):
args.extend(["--create-test-user"])
if timeout is not None:
args.extend(["--start-timeout", str(timeout)])
if autoprewarm:
args.extend(["--autoprewarm"])
if offload_lfc_interval_seconds is not None:
args.extend(["--offload-lfc-interval-seconds", str(offload_lfc_interval_seconds)])
res = self.raw_cli(args, extra_env_vars)
res.check_returncode()

View File

@@ -724,15 +724,21 @@ class NeonEnvBuilder:
shutil.copytree(storcon_db_from_dir, storcon_db_to_dir, ignore=ignore_postgres_log)
assert not (storcon_db_to_dir / "postgres.log").exists()
# NB: neon_local rewrites postgresql.conf on each start based on neon_local config. No need to patch it.
# However, in this new NeonEnv, the pageservers listen on different ports, and the storage controller
# will currently reject re-attach requests from them because the NodeMetadata isn't identical.
# However, in this new NeonEnv, the pageservers and safekeepers listen on different ports, and the storage
# controller will currently reject re-attach requests from them because the NodeMetadata isn't identical.
# So, from_repo_dir patches up the the storcon database.
patch_script_path = self.repo_dir / "storage_controller_db.startup.sql"
assert not patch_script_path.exists()
patch_script = ""
for ps in self.env.pageservers:
patch_script += f"UPDATE nodes SET listen_http_port={ps.service_port.http}, listen_pg_port={ps.service_port.pg} WHERE node_id = '{ps.id}';"
patch_script += f"UPDATE nodes SET listen_http_port={ps.service_port.http}, listen_pg_port={ps.service_port.pg} WHERE node_id = '{ps.id}';\n"
for sk in self.env.safekeepers:
patch_script += f"UPDATE safekeepers SET http_port={sk.port.http}, port={sk.port.pg} WHERE id = '{sk.id}';\n"
patch_script_path.write_text(patch_script)
# Update the config with info about tenants and timelines
@@ -1861,6 +1867,7 @@ class PageserverSchedulingPolicy(StrEnum):
FILLING = "Filling"
PAUSE = "Pause"
PAUSE_FOR_RESTART = "PauseForRestart"
DELETING = "Deleting"
class StorageControllerLeadershipStatus(StrEnum):
@@ -2069,14 +2076,30 @@ class NeonStorageController(MetricsGetter, LogUtils):
headers=self.headers(TokenScope.ADMIN),
)
def node_delete(self, node_id):
log.info(f"node_delete({node_id})")
def node_delete_old(self, node_id):
log.info(f"node_delete_old({node_id})")
self.request(
"DELETE",
f"{self.api}/control/v1/node/{node_id}",
headers=self.headers(TokenScope.ADMIN),
)
def node_delete(self, node_id):
log.info(f"node_delete({node_id})")
self.request(
"PUT",
f"{self.api}/control/v1/node/{node_id}/delete",
headers=self.headers(TokenScope.ADMIN),
)
def cancel_node_delete(self, node_id):
log.info(f"cancel_node_delete({node_id})")
self.request(
"DELETE",
f"{self.api}/control/v1/node/{node_id}/delete",
headers=self.headers(TokenScope.ADMIN),
)
def tombstone_delete(self, node_id):
log.info(f"tombstone_delete({node_id})")
self.request(
@@ -4350,6 +4373,8 @@ class Endpoint(PgProtocol, LogUtils):
basebackup_request_tries: int | None = None,
timeout: str | None = None,
env: dict[str, str] | None = None,
autoprewarm: bool = False,
offload_lfc_interval_seconds: int | None = None,
) -> Self:
"""
Start the Postgres instance.
@@ -4374,6 +4399,8 @@ class Endpoint(PgProtocol, LogUtils):
basebackup_request_tries=basebackup_request_tries,
timeout=timeout,
env=env,
autoprewarm=autoprewarm,
offload_lfc_interval_seconds=offload_lfc_interval_seconds,
)
self._running.release(1)
self.log_config_value("shared_buffers")
@@ -4589,6 +4616,8 @@ class Endpoint(PgProtocol, LogUtils):
pageserver_id: int | None = None,
allow_multiple: bool = False,
basebackup_request_tries: int | None = None,
autoprewarm: bool = False,
offload_lfc_interval_seconds: int | None = None,
) -> Self:
"""
Create an endpoint, apply config, and start Postgres.
@@ -4609,6 +4638,8 @@ class Endpoint(PgProtocol, LogUtils):
pageserver_id=pageserver_id,
allow_multiple=allow_multiple,
basebackup_request_tries=basebackup_request_tries,
autoprewarm=autoprewarm,
offload_lfc_interval_seconds=offload_lfc_interval_seconds,
)
return self
@@ -4693,6 +4724,8 @@ class EndpointFactory:
remote_ext_base_url: str | None = None,
pageserver_id: int | None = None,
basebackup_request_tries: int | None = None,
autoprewarm: bool = False,
offload_lfc_interval_seconds: int | None = None,
) -> Endpoint:
ep = Endpoint(
self.env,
@@ -4714,6 +4747,8 @@ class EndpointFactory:
remote_ext_base_url=remote_ext_base_url,
pageserver_id=pageserver_id,
basebackup_request_tries=basebackup_request_tries,
autoprewarm=autoprewarm,
offload_lfc_interval_seconds=offload_lfc_interval_seconds,
)
def create(

View File

@@ -111,6 +111,7 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
".*stalling layer flushes for compaction backpressure.*",
".*layer roll waiting for flush due to compaction backpressure.*",
".*BatchSpanProcessor.*",
".*No broker updates received for a while.*",
*(
[
r".*your platform is not a supported production platform, ignoing request for O_DIRECT; this could hide alignment bugs.*"

View File

@@ -416,6 +416,8 @@ def test_duplicate_creation(neon_env_builder: NeonEnvBuilder):
# timeline creation (uploads). mask it out here to avoid flakyness.
del success_result["remote_consistent_lsn_visible"]
del repeat_result["remote_consistent_lsn_visible"]
del success_result["walreceiver_status"]
del repeat_result["walreceiver_status"]
assert repeat_result == success_result
finally:
env.pageserver.stop(immediate=True)

View File

@@ -76,6 +76,7 @@ if TYPE_CHECKING:
# export CHECK_ONDISK_DATA_COMPATIBILITY=true
# export COMPATIBILITY_NEON_BIN=neon_previous/target/${BUILD_TYPE}
# export COMPATIBILITY_POSTGRES_DISTRIB_DIR=neon_previous/pg_install
# export COMPATIBILITY_SNAPSHOT_DIR=test_output/compatibility_snapshot_pgv${DEFAULT_PG_VERSION}
#
# # Build previous version of binaries and store them somewhere:
# rm -rf pg_install target
@@ -102,6 +103,7 @@ if TYPE_CHECKING:
# export CHECK_ONDISK_DATA_COMPATIBILITY=true
# export COMPATIBILITY_NEON_BIN=neon_previous/target/${BUILD_TYPE}
# export COMPATIBILITY_POSTGRES_DISTRIB_DIR=neon_previous/pg_install
# export COMPATIBILITY_SNAPSHOT_DIR=test_output/compatibility_snapshot_pgv${DEFAULT_PG_VERSION}
# export NEON_BIN=target/${BUILD_TYPE}
# export POSTGRES_DISTRIB_DIR=pg_install
#

View File

@@ -1,34 +1,38 @@
import random
import threading
import time
from enum import Enum
from enum import StrEnum
from time import sleep
from typing import Any
import pytest
from fixtures.endpoint.http import EndpointHttpClient
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv
from fixtures.utils import USE_LFC
from fixtures.utils import USE_LFC, wait_until
from prometheus_client.parser import text_string_to_metric_families as prom_parse_impl
from psycopg2.extensions import cursor as Cursor
class LfcQueryMethod(Enum):
COMPUTE_CTL = False
POSTGRES = True
class PrewarmMethod(StrEnum):
POSTGRES = "postgres"
COMPUTE_CTL = "compute-ctl"
AUTOPREWARM = "autoprewarm"
PREWARM_LABEL = "compute_ctl_lfc_prewarm_requests_total"
OFFLOAD_LABEL = "compute_ctl_lfc_offload_requests_total"
QUERY_OPTIONS = LfcQueryMethod.POSTGRES, LfcQueryMethod.COMPUTE_CTL
PREWARM_LABEL = "compute_ctl_lfc_prewarms_total"
OFFLOAD_LABEL = "compute_ctl_lfc_offloads_total"
METHOD_VALUES = [e for e in PrewarmMethod]
METHOD_IDS = [e.value for e in PrewarmMethod]
def check_pinned_entries(cur):
def check_pinned_entries(cur: Cursor):
# some LFC buffer can be temporary locked by autovacuum or background writer
for _ in range(10):
cur.execute("select lfc_value from neon_lfc_stats where lfc_key='file_cache_chunks_pinned'")
n_pinned = cur.fetchall()[0][0]
if n_pinned == 0:
break
time.sleep(1)
sleep(1)
assert n_pinned == 0
@@ -41,21 +45,68 @@ def prom_parse(client: EndpointHttpClient) -> dict[str, float]:
}
def offload_lfc(method: PrewarmMethod, client: EndpointHttpClient, cur: Cursor) -> Any:
if method == PrewarmMethod.AUTOPREWARM:
client.offload_lfc_wait()
elif method == PrewarmMethod.COMPUTE_CTL:
status = client.prewarm_lfc_status()
assert status["status"] == "not_prewarmed"
assert "error" not in status
client.offload_lfc()
assert client.prewarm_lfc_status()["status"] == "not_prewarmed"
assert prom_parse(client) == {OFFLOAD_LABEL: 1, PREWARM_LABEL: 0}
elif method == PrewarmMethod.POSTGRES:
cur.execute("select get_local_cache_state()")
return cur.fetchall()[0][0]
else:
raise AssertionError(f"{method} not in PrewarmMethod")
def prewarm_endpoint(
method: PrewarmMethod, client: EndpointHttpClient, cur: Cursor, lfc_state: str | None
):
if method == PrewarmMethod.AUTOPREWARM:
client.prewarm_lfc_wait()
elif method == PrewarmMethod.COMPUTE_CTL:
client.prewarm_lfc()
elif method == PrewarmMethod.POSTGRES:
cur.execute("select prewarm_local_cache(%s)", (lfc_state,))
def check_prewarmed(
method: PrewarmMethod, client: EndpointHttpClient, desired_status: dict[str, str | int]
):
if method == PrewarmMethod.AUTOPREWARM:
assert client.prewarm_lfc_status() == desired_status
assert prom_parse(client)[PREWARM_LABEL] == 1
elif method == PrewarmMethod.COMPUTE_CTL:
assert client.prewarm_lfc_status() == desired_status
assert prom_parse(client) == {OFFLOAD_LABEL: 0, PREWARM_LABEL: 1}
@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
@pytest.mark.parametrize("query", QUERY_OPTIONS, ids=["postgres", "compute-ctl"])
def test_lfc_prewarm(neon_simple_env: NeonEnv, query: LfcQueryMethod):
@pytest.mark.parametrize("method", METHOD_VALUES, ids=METHOD_IDS)
def test_lfc_prewarm(neon_simple_env: NeonEnv, method: PrewarmMethod):
env = neon_simple_env
n_records = 1000000
endpoint = env.endpoints.create_start(
branch_name="main",
config_lines=[
"autovacuum = off",
"shared_buffers=1MB",
"neon.max_file_cache_size=1GB",
"neon.file_cache_size_limit=1GB",
"neon.file_cache_prewarm_limit=1000",
],
)
cfg = [
"autovacuum = off",
"shared_buffers=1MB",
"neon.max_file_cache_size=1GB",
"neon.file_cache_size_limit=1GB",
"neon.file_cache_prewarm_limit=1000",
]
offload_secs = 2
if method == PrewarmMethod.AUTOPREWARM:
endpoint = env.endpoints.create_start(
branch_name="main",
config_lines=cfg,
autoprewarm=True,
offload_lfc_interval_seconds=offload_secs,
)
else:
endpoint = env.endpoints.create_start(branch_name="main", config_lines=cfg)
pg_conn = endpoint.connect()
pg_cur = pg_conn.cursor()
@@ -69,31 +120,21 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, query: LfcQueryMethod):
lfc_cur.execute(f"insert into t (pk) values (generate_series(1,{n_records}))")
log.info(f"Inserted {n_records} rows")
http_client = endpoint.http_client()
if query is LfcQueryMethod.COMPUTE_CTL:
status = http_client.prewarm_lfc_status()
assert status["status"] == "not_prewarmed"
assert "error" not in status
http_client.offload_lfc()
assert http_client.prewarm_lfc_status()["status"] == "not_prewarmed"
assert prom_parse(http_client) == {OFFLOAD_LABEL: 1, PREWARM_LABEL: 0}
else:
pg_cur.execute("select get_local_cache_state()")
lfc_state = pg_cur.fetchall()[0][0]
client = endpoint.http_client()
lfc_state = offload_lfc(method, client, pg_cur)
endpoint.stop()
endpoint.start()
if method == PrewarmMethod.AUTOPREWARM:
endpoint.start(autoprewarm=True, offload_lfc_interval_seconds=offload_secs)
else:
endpoint.start()
pg_conn = endpoint.connect()
pg_cur = pg_conn.cursor()
lfc_conn = endpoint.connect(dbname="lfc")
lfc_cur = lfc_conn.cursor()
if query is LfcQueryMethod.COMPUTE_CTL:
http_client.prewarm_lfc()
else:
pg_cur.execute("select prewarm_local_cache(%s)", (lfc_state,))
prewarm_endpoint(method, client, pg_cur, lfc_state)
pg_cur.execute("select lfc_value from neon_lfc_stats where lfc_key='file_cache_used_pages'")
lfc_used_pages = pg_cur.fetchall()[0][0]
@@ -111,33 +152,32 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, query: LfcQueryMethod):
and prewarm_info[1] > 0
and prewarm_info[0] == prewarm_info[1] + prewarm_info[2]
)
lfc_cur.execute("select sum(pk) from t")
assert lfc_cur.fetchall()[0][0] == n_records * (n_records + 1) / 2
check_pinned_entries(pg_cur)
desired = {"status": "completed", "total": total, "prewarmed": prewarmed, "skipped": skipped}
if query is LfcQueryMethod.COMPUTE_CTL:
assert http_client.prewarm_lfc_status() == desired
assert prom_parse(http_client) == {OFFLOAD_LABEL: 0, PREWARM_LABEL: 1}
check_prewarmed(method, client, desired)
# autoprewarm isn't needed as we prewarm manually
WORKLOAD_VALUES = METHOD_VALUES[:-1]
WORKLOAD_IDS = METHOD_IDS[:-1]
@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
@pytest.mark.parametrize("query", QUERY_OPTIONS, ids=["postgres", "compute-ctl"])
def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, query: LfcQueryMethod):
@pytest.mark.parametrize("method", WORKLOAD_VALUES, ids=WORKLOAD_IDS)
def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, method: PrewarmMethod):
env = neon_simple_env
n_records = 10000
n_threads = 4
endpoint = env.endpoints.create_start(
branch_name="main",
config_lines=[
"shared_buffers=1MB",
"neon.max_file_cache_size=1GB",
"neon.file_cache_size_limit=1GB",
"neon.file_cache_prewarm_limit=1000000",
],
)
cfg = [
"shared_buffers=1MB",
"neon.max_file_cache_size=1GB",
"neon.file_cache_size_limit=1GB",
"neon.file_cache_prewarm_limit=1000000",
]
endpoint = env.endpoints.create_start(branch_name="main", config_lines=cfg)
pg_conn = endpoint.connect()
pg_cur = pg_conn.cursor()
@@ -154,12 +194,7 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, query: LfcQueryMet
log.info(f"Inserted {n_records} rows")
http_client = endpoint.http_client()
if query is LfcQueryMethod.COMPUTE_CTL:
http_client.offload_lfc()
else:
pg_cur.execute("select get_local_cache_state()")
lfc_state = pg_cur.fetchall()[0][0]
lfc_state = offload_lfc(method, http_client, pg_cur)
running = True
n_prewarms = 0
@@ -170,8 +205,8 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, query: LfcQueryMet
while running:
src = random.randint(1, n_records)
dst = random.randint(1, n_records)
lfc_cur.execute("update accounts set balance=balance-100 where id=%s", (src,))
lfc_cur.execute("update accounts set balance=balance+100 where id=%s", (dst,))
lfc_cur.execute(f"update accounts set balance=balance-100 where id={src}")
lfc_cur.execute(f"update accounts set balance=balance+100 where id={dst}")
n_transfers += 1
log.info(f"Number of transfers: {n_transfers}")
@@ -183,13 +218,7 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, query: LfcQueryMet
pg_cur.execute("select pg_reload_conf()")
pg_cur.execute("alter system set neon.file_cache_size_limit='1GB'")
pg_cur.execute("select pg_reload_conf()")
if query is LfcQueryMethod.COMPUTE_CTL:
# Same thing as prewarm_lfc(), testing other method
http_client.prewarm_lfc(endpoint.endpoint_id)
else:
pg_cur.execute("select prewarm_local_cache(%s)", (lfc_state,))
prewarm_endpoint(method, http_client, pg_cur, lfc_state)
nonlocal n_prewarms
n_prewarms += 1
log.info(f"Number of prewarms: {n_prewarms}")
@@ -203,7 +232,10 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, query: LfcQueryMet
prewarm_thread = threading.Thread(target=prewarm)
prewarm_thread.start()
time.sleep(20)
def prewarmed():
assert n_prewarms > 5
wait_until(prewarmed)
running = False
for t in workload_threads:
@@ -215,5 +247,5 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, query: LfcQueryMet
assert total_balance == 0
check_pinned_entries(pg_cur)
if query is LfcQueryMethod.COMPUTE_CTL:
if method != PrewarmMethod.POSTGRES:
assert prom_parse(http_client) == {OFFLOAD_LABEL: 1, PREWARM_LABEL: n_prewarms}

View File

@@ -180,7 +180,7 @@ def test_metric_collection(
httpserver.check()
# Check that at least one bucket output object is present, and that all
# can be decompressed and decoded.
# can be decompressed and decoded as NDJSON.
bucket_dumps = {}
assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
for dirpath, _dirs, files in os.walk(env.pageserver_remote_storage.root):
@@ -188,7 +188,13 @@ def test_metric_collection(
file_path = os.path.join(dirpath, file)
log.info(file_path)
if file.endswith(".gz"):
bucket_dumps[file_path] = json.load(gzip.open(file_path))
events = []
with gzip.open(file_path, "rt") as f:
for line in f:
line = line.strip()
if line:
events.append(json.loads(line))
bucket_dumps[file_path] = {"events": events}
assert len(bucket_dumps) >= 1
assert all("events" in data for data in bucket_dumps.values())

View File

@@ -989,6 +989,102 @@ def test_storage_controller_compute_hook_retry(
)
@run_only_on_default_postgres("postgres behavior is not relevant")
def test_storage_controller_compute_hook_keep_failing(
httpserver: HTTPServer,
neon_env_builder: NeonEnvBuilder,
httpserver_listen_address: ListenAddress,
):
neon_env_builder.num_pageservers = 4
neon_env_builder.storage_controller_config = {"use_local_compute_notifications": False}
(host, port) = httpserver_listen_address
neon_env_builder.control_plane_hooks_api = f"http://{host}:{port}"
# Set up CP handler for compute notifications
status_by_tenant: dict[TenantId, int] = {}
def handler(request: Request):
notify_request = request.json
assert notify_request is not None
status = status_by_tenant[TenantId(notify_request["tenant_id"])]
log.info(f"Notify request[{status}]: {notify_request}")
return Response(status=status)
httpserver.expect_request("/notify-attach", method="PUT").respond_with_handler(handler)
# Run neon environment
env = neon_env_builder.init_configs()
env.start()
# Create two tenants:
# - The first tenant is banned by CP and contains only one shard
# - The second tenant is allowed by CP and contains four shards
banned_tenant = TenantId.generate()
status_by_tenant[banned_tenant] = 200 # we will ban this tenant later
env.create_tenant(banned_tenant, placement_policy='{"Attached": 1}')
shard_count = 4
allowed_tenant = TenantId.generate()
status_by_tenant[allowed_tenant] = 200
env.create_tenant(allowed_tenant, shard_count=shard_count, placement_policy='{"Attached": 1}')
# Find the pageserver of the banned tenant
banned_tenant_ps = env.get_tenant_pageserver(banned_tenant)
assert banned_tenant_ps is not None
alive_pageservers = [p for p in env.pageservers if p.id != banned_tenant_ps.id]
# Stop pageserver and ban tenant to trigger failed reconciliation
status_by_tenant[banned_tenant] = 423
banned_tenant_ps.stop()
env.storage_controller.allowed_errors.append(NOTIFY_BLOCKED_LOG)
env.storage_controller.allowed_errors.extend(NOTIFY_FAILURE_LOGS)
env.storage_controller.allowed_errors.append(".*Shard reconciliation is keep-failing.*")
env.storage_controller.node_configure(banned_tenant_ps.id, {"availability": "Offline"})
# Migrate all allowed tenant shards to the first alive pageserver
# to trigger storage controller optimizations due to affinity rules
for shard_number in range(shard_count):
env.storage_controller.tenant_shard_migrate(
TenantShardId(allowed_tenant, shard_number, shard_count),
alive_pageservers[0].id,
config=StorageControllerMigrationConfig(prewarm=False, override_scheduler=True),
)
# Make some reconcile_all calls to trigger optimizations
# RECONCILE_COUNT must be greater than storcon's MAX_CONSECUTIVE_RECONCILIATION_ERRORS
RECONCILE_COUNT = 12
for i in range(RECONCILE_COUNT):
try:
n = env.storage_controller.reconcile_all()
log.info(f"Reconciliation attempt {i} finished with success: {n}")
except StorageControllerApiException as e:
assert "Control plane tenant busy" in str(e)
log.info(f"Reconciliation attempt {i} finished with failure")
banned_descr = env.storage_controller.tenant_describe(banned_tenant)
assert banned_descr["shards"][0]["is_pending_compute_notification"] is True
time.sleep(2)
# Check that the allowed tenant shards are optimized due to affinity rules
locations = alive_pageservers[0].http_client().tenant_list_locations()["tenant_shards"]
not_optimized_shard_count = 0
for loc in locations:
tsi = TenantShardId.parse(loc[0])
if tsi.tenant_id != allowed_tenant:
continue
if loc[1]["mode"] == "AttachedSingle":
not_optimized_shard_count += 1
log.info(f"Shard {tsi} seen in mode {loc[1]['mode']}")
assert not_optimized_shard_count < shard_count, "At least one shard should be optimized"
# Unban the tenant and run reconciliations
status_by_tenant[banned_tenant] = 200
env.storage_controller.reconcile_all()
banned_descr = env.storage_controller.tenant_describe(banned_tenant)
assert banned_descr["shards"][0]["is_pending_compute_notification"] is False
@run_only_on_default_postgres("this test doesn't start an endpoint")
def test_storage_controller_compute_hook_revert(
httpserver: HTTPServer,
@@ -2522,7 +2618,7 @@ def test_storage_controller_node_deletion(
wait_until(assert_shards_migrated)
log.info(f"Deleting pageserver {victim.id}")
env.storage_controller.node_delete(victim.id)
env.storage_controller.node_delete_old(victim.id)
if not while_offline:
@@ -2557,6 +2653,60 @@ def test_storage_controller_node_deletion(
env.storage_controller.consistency_check()
def test_storage_controller_node_delete_cancellation(neon_env_builder: NeonEnvBuilder):
neon_env_builder.num_pageservers = 3
neon_env_builder.num_azs = 3
env = neon_env_builder.init_configs()
env.start()
tenant_count = 12
shard_count_per_tenant = 16
tenant_ids = []
for _ in range(0, tenant_count):
tid = TenantId.generate()
tenant_ids.append(tid)
env.create_tenant(
tid, placement_policy='{"Attached":1}', shard_count=shard_count_per_tenant
)
# Sanity check: initial creations should not leave the system in an unstable scheduling state
assert env.storage_controller.reconcile_all() == 0
nodes = env.storage_controller.node_list()
assert len(nodes) == 3
env.storage_controller.configure_failpoints(("sleepy-delete-loop", "return(10000)"))
ps_id_to_delete = env.pageservers[0].id
env.storage_controller.warm_up_all_secondaries()
env.storage_controller.retryable_node_operation(
lambda ps_id: env.storage_controller.node_delete(ps_id),
ps_id_to_delete,
max_attempts=3,
backoff=2,
)
env.storage_controller.poll_node_status(
ps_id_to_delete,
PageserverAvailability.ACTIVE,
PageserverSchedulingPolicy.DELETING,
max_attempts=6,
backoff=2,
)
env.storage_controller.cancel_node_delete(ps_id_to_delete)
env.storage_controller.poll_node_status(
ps_id_to_delete,
PageserverAvailability.ACTIVE,
PageserverSchedulingPolicy.ACTIVE,
max_attempts=6,
backoff=2,
)
@pytest.mark.parametrize("shard_count", [None, 2])
def test_storage_controller_metadata_health(
neon_env_builder: NeonEnvBuilder,
@@ -3112,7 +3262,7 @@ def test_ps_unavailable_after_delete(neon_env_builder: NeonEnvBuilder):
assert_nodes_count(3)
ps = env.pageservers[0]
env.storage_controller.node_delete(ps.id)
env.storage_controller.node_delete_old(ps.id)
# After deletion, the node count must be reduced
assert_nodes_count(2)
@@ -3530,18 +3680,21 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder):
# some small tests for the scheduling policy querying and returning APIs
newest_info = target.get_safekeeper(inserted["id"])
assert newest_info
assert newest_info["scheduling_policy"] == "Pause"
target.safekeeper_scheduling_policy(inserted["id"], "Active")
newest_info = target.get_safekeeper(inserted["id"])
assert newest_info
assert newest_info["scheduling_policy"] == "Active"
# Ensure idempotency
target.safekeeper_scheduling_policy(inserted["id"], "Active")
newest_info = target.get_safekeeper(inserted["id"])
assert newest_info
assert newest_info["scheduling_policy"] == "Active"
# change back to paused again
assert (
newest_info["scheduling_policy"] == "Activating"
or newest_info["scheduling_policy"] == "Active"
)
target.safekeeper_scheduling_policy(inserted["id"], "Pause")
newest_info = target.get_safekeeper(inserted["id"])
assert newest_info
assert newest_info["scheduling_policy"] == "Pause"
# Ensure idempotency
target.safekeeper_scheduling_policy(inserted["id"], "Pause")
newest_info = target.get_safekeeper(inserted["id"])
assert newest_info
assert newest_info["scheduling_policy"] == "Pause"
# change back to active again
target.safekeeper_scheduling_policy(inserted["id"], "Active")
def storcon_heartbeat():
assert env.storage_controller.log_contains(
@@ -3554,6 +3707,57 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder):
target.safekeeper_scheduling_policy(inserted["id"], "Decomissioned")
@run_only_on_default_postgres("this is like a 'unit test' against storcon db")
def test_safekeeper_activating_to_active(neon_env_builder: NeonEnvBuilder):
env = neon_env_builder.init_configs()
env.start()
fake_id = 5
target = env.storage_controller
assert target.get_safekeeper(fake_id) is None
start_sks = target.get_safekeepers()
sk_0 = env.safekeepers[0]
body = {
"active": True,
"id": fake_id,
"created_at": "2023-10-25T09:11:25Z",
"updated_at": "2024-08-28T11:32:43Z",
"region_id": "aws-eu-central-1",
"host": "localhost",
"port": sk_0.port.pg,
"http_port": sk_0.port.http,
"https_port": None,
"version": 5957,
"availability_zone_id": "eu-central-1a",
}
target.on_safekeeper_deploy(fake_id, body)
inserted = target.get_safekeeper(fake_id)
assert inserted is not None
assert target.get_safekeepers() == start_sks + [inserted]
assert eq_safekeeper_records(body, inserted)
def safekeeper_is_active():
newest_info = target.get_safekeeper(inserted["id"])
assert newest_info
assert newest_info["scheduling_policy"] == "Active"
wait_until(safekeeper_is_active)
target.safekeeper_scheduling_policy(inserted["id"], "Activating")
wait_until(safekeeper_is_active)
# Now decomission it
target.safekeeper_scheduling_policy(inserted["id"], "Decomissioned")
def eq_safekeeper_records(a: dict[str, Any], b: dict[str, Any]) -> bool:
compared = [dict(a), dict(b)]

View File

@@ -324,7 +324,7 @@ def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder):
# it is to be in line with the deletion timestamp.. well, almost.
when = original_ancestor[2][:26]
when_ts = datetime.datetime.fromisoformat(when).replace(tzinfo=datetime.UTC)
now = datetime.datetime.utcnow().replace(tzinfo=datetime.UTC)
now = datetime.datetime.now(datetime.UTC)
assert when_ts < now
assert len(lineage.get("reparenting_history", [])) == 0
elif expected_ancestor == timeline_id:
@@ -458,19 +458,20 @@ def test_ancestor_detach_behavior_v2(neon_env_builder: NeonEnvBuilder, snapshots
env.pageserver.quiesce_tenants()
# checking the ancestor after is much faster than waiting for the endpoint not start
# checking the ancestor after is much faster than waiting for the endpoint to start
expected_result = [
("main", env.initial_timeline, None, 24576, 1),
("after", after, env.initial_timeline, 24576, 1),
("snapshot_branchpoint_old", snapshot_branchpoint_old, env.initial_timeline, 8192, 1),
("snapshot_branchpoint", snapshot_branchpoint, env.initial_timeline, 16384, 1),
("branch_to_detach", branch_to_detach, None, 16384, 1),
("earlier", earlier, env.initial_timeline, 0, 1),
# (branch_name, queried_timeline, expected_ancestor, rows, starts, read_only)
("main", env.initial_timeline, None, 24576, 1, False),
("after", after, env.initial_timeline, 24576, 1, False),
("snapshot_branchpoint_old", snapshot_branchpoint_old, env.initial_timeline, 8192, 1, True),
("snapshot_branchpoint", snapshot_branchpoint, env.initial_timeline, 16384, 1, False),
("branch_to_detach", branch_to_detach, None, 16384, 1, False),
("earlier", earlier, env.initial_timeline, 0, 1, False),
]
assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
for branch_name, queried_timeline, expected_ancestor, _, _ in expected_result:
for branch_name, queried_timeline, expected_ancestor, _, _, _ in expected_result:
details = client.timeline_detail(env.initial_tenant, queried_timeline)
ancestor_timeline_id = details["ancestor_timeline_id"]
if expected_ancestor is None:
@@ -508,13 +509,17 @@ def test_ancestor_detach_behavior_v2(neon_env_builder: NeonEnvBuilder, snapshots
assert len(lineage.get("original_ancestor", [])) == 0
assert len(lineage.get("reparenting_history", [])) == 0
for branch_name, queried_timeline, _, rows, starts in expected_result:
details = client.timeline_detail(env.initial_tenant, queried_timeline)
log.info(f"reading data from branch {branch_name}")
# specifying the lsn makes the endpoint read-only and not connect to safekeepers
for branch_name, queried_timeline, _, rows, starts, read_only in expected_result:
last_record_lsn = None
if read_only:
# specifying the lsn makes the endpoint read-only and not connect to safekeepers
details = client.timeline_detail(env.initial_tenant, queried_timeline)
last_record_lsn = Lsn(details["last_record_lsn"])
log.info(f"reading data from branch {branch_name} at {last_record_lsn}")
with env.endpoints.create(
branch_name,
lsn=Lsn(details["last_record_lsn"]),
lsn=last_record_lsn,
) as ep:
ep.start(safekeeper_generation=1)
assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows
@@ -1884,6 +1889,31 @@ def test_timeline_detach_with_aux_files_with_detach_v1(
assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn1).keys()) == set([])
def test_detach_ancestors_with_no_writes(
neon_env_builder: NeonEnvBuilder,
):
env = neon_env_builder.init_start()
endpoint = env.endpoints.create_start("main", tenant_id=env.initial_tenant)
wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline)
endpoint.safe_psql(
"SELECT pg_create_logical_replication_slot('test_slot_parent_1', 'pgoutput')"
)
wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline)
endpoint.stop()
for i in range(0, 5):
if i == 0:
ancestor_name = "main"
else:
ancestor_name = f"b{i}"
tlid = env.create_branch(f"b{i + 1}", ancestor_branch_name=ancestor_name)
client = env.pageserver.http_client()
client.detach_ancestor(tenant_id=env.initial_tenant, timeline_id=tlid)
# TODO:
# - branch near existing L1 boundary, image layers?
# - investigate: why are layers started at uneven lsn? not just after branching, but in general.

View File

@@ -2740,3 +2740,85 @@ def test_pull_timeline_partial_segment_integrity(neon_env_builder: NeonEnvBuilde
raise Exception("Uneviction did not happen on source safekeeper yet")
wait_until(unevicted)
def test_timeline_disk_usage_limit(neon_env_builder: NeonEnvBuilder):
"""
Test that the timeline disk usage circuit breaker works as expected. We test that:
1. The circuit breaker kicks in when the timeline's disk usage exceeds the configured limit,
and it causes writes to hang.
2. The hanging writes unblock when the issue resolves (by restarting the safekeeper in the
test to simulate a more realistic production troubleshooting scenario).
3. We can continue to write as normal after the issue resolves.
4. There is no data corruption throughout the test.
"""
# Set up environment with a very small disk usage limit (1KB)
neon_env_builder.num_safekeepers = 1
remote_storage_kind = s3_storage()
neon_env_builder.enable_safekeeper_remote_storage(remote_storage_kind)
# Set a very small disk usage limit (1KB)
neon_env_builder.safekeeper_extra_opts = ["--max-timeline-disk-usage-bytes=1024"]
env = neon_env_builder.init_start()
# Create a timeline and endpoint
env.create_branch("test_timeline_disk_usage_limit")
endpoint = env.endpoints.create_start("test_timeline_disk_usage_limit")
# Get the safekeeper
sk = env.safekeepers[0]
# Inject a failpoint to stop WAL backup
with sk.http_client() as http_cli:
http_cli.configure_failpoints([("backup-lsn-range-pausable", "pause")])
# Write some data that will exceed the 1KB limit. While the failpoint is active, this operation
# will hang as Postgres encounters safekeeper-returned errors and retries.
def run_hanging_insert():
with closing(endpoint.connect()) as bg_conn:
with bg_conn.cursor() as bg_cur:
# This should generate more than 1KB of WAL
bg_cur.execute("create table t(key int, value text)")
bg_cur.execute("insert into t select generate_series(1,2000), 'payload'")
# Start the inserts in a background thread
bg_thread = threading.Thread(target=run_hanging_insert)
bg_thread.start()
# Wait for the error message to appear in the compute log
def error_logged():
return endpoint.log_contains("WAL storage utilization exceeds configured limit") is not None
wait_until(error_logged)
log.info("Found expected error message in compute log, resuming.")
# Sanity check that the hanging insert is indeed still hanging. Otherwise means the circuit breaker we
# implemented didn't work as expected.
time.sleep(2)
assert bg_thread.is_alive(), (
"The hanging insert somehow unblocked without resolving the disk usage issue!"
)
log.info("Restarting the safekeeper to resume WAL backup.")
# Restart the safekeeper with defaults to both clear the failpoint and resume the larger disk usage limit.
for sk in env.safekeepers:
sk.stop().start(extra_opts=[])
# The hanging insert will now complete. Join the background thread so that we can
# verify that the insert completed successfully.
bg_thread.join(timeout=120)
assert not bg_thread.is_alive(), "Hanging insert did not complete after safekeeper restart"
log.info("Hanging insert unblocked.")
# Verify we can continue to write as normal
with closing(endpoint.connect()) as conn:
with conn.cursor() as cur:
cur.execute("insert into t select generate_series(2001,3000), 'payload'")
# Sanity check data correctness
with closing(endpoint.connect()) as conn:
with conn.cursor() as cur:
cur.execute("select count(*) from t")
# 2000 rows from first insert + 1000 from last insert
assert cur.fetchone() == (3000,)

View File

@@ -13,50 +13,6 @@ if TYPE_CHECKING:
from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder
# Checks that pageserver's walreceiver state is printed in the logs during WAL wait timeout.
# Ensures that walreceiver does not run without any data inserted and only starts after the insertion.
def test_pageserver_lsn_wait_error_start(neon_env_builder: NeonEnvBuilder):
# we assert below that the walreceiver is not active before data writes.
# with manually created timelines, it is active.
# FIXME: remove this test once we remove timelines_onto_safekeepers
neon_env_builder.storage_controller_config = {
"timelines_onto_safekeepers": False,
}
# Trigger WAL wait timeout faster
neon_env_builder.pageserver_config_override = "wait_lsn_timeout = '1s'"
env = neon_env_builder.init_start()
env.pageserver.http_client()
# In this test we force 'Timed out while waiting for WAL record error' while
# fetching basebackup and don't want any retries.
os.environ["NEON_COMPUTE_TESTING_BASEBACKUP_RETRIES"] = "1"
tenant_id, timeline_id = env.create_tenant()
expected_timeout_error = f"Timed out while waiting for WAL record at LSN {future_lsn} to arrive"
env.pageserver.allowed_errors.append(f".*{expected_timeout_error}.*")
try:
trigger_wait_lsn_timeout(env, tenant_id)
except Exception as e:
exception_string = str(e)
assert expected_timeout_error in exception_string, "Should time out during waiting for WAL"
assert "WalReceiver status: Not active" in exception_string, (
"Walreceiver should not be active before any data writes"
)
insert_test_elements(env, tenant_id, start=0, count=1_000)
try:
trigger_wait_lsn_timeout(env, tenant_id)
except Exception as e:
exception_string = str(e)
assert expected_timeout_error in exception_string, "Should time out during waiting for WAL"
assert "WalReceiver status: Not active" not in exception_string, (
"Should not be inactive anymore after INSERTs are made"
)
assert "WalReceiver status" in exception_string, "But still should have some other status"
# Checks that all active safekeepers are shown in pageserver's walreceiver state printed on WAL wait timeout.
# Kills one of the safekeepers and ensures that only the active ones are printed in the state.
def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuilder):

View File

@@ -0,0 +1,21 @@
DO $$
DECLARE
i numeric;
BEGIN
create role somebody;
FOR i IN 1..1000000 LOOP
BEGIN
IF i % 1000 = 0 THEN
alter role somebody password 'welcome';
ELSE
PERFORM 1;
END IF;
EXCEPTION WHEN OTHERS THEN
RAISE WARNING 'error';
END;
IF I = 1000000 THEN
PERFORM pg_log_backend_memory_contexts(pg_backend_pid());
END IF;
END LOOP;
END;
$$;

View File

@@ -10,3 +10,4 @@ test: neon-clog
test: neon-test-utils
test: neon-vacuum-full
test: neon-event-triggers
test: neon-subxacts

View File

@@ -0,0 +1,21 @@
DO $$
DECLARE
i numeric;
BEGIN
create role somebody;
FOR i IN 1..1000000 LOOP
BEGIN
IF i % 1000 = 0 THEN
alter role somebody password 'welcome';
ELSE
PERFORM 1;
END IF;
EXCEPTION WHEN OTHERS THEN
RAISE WARNING 'error';
END;
IF I = 1000000 THEN
PERFORM pg_log_backend_memory_contexts(pg_backend_pid());
END IF;
END LOOP;
END;
$$;