Merge remote-tracking branch 'origin/main' into problame/standby-horizon-removal-poc-rip-out

This commit is contained in:
Christian Schwarz
2025-07-09 11:10:53 +00:00
193 changed files with 8745 additions and 3023 deletions

View File

@@ -57,6 +57,8 @@ class EndpointHttpClient(requests.Session):
self.auth = BearerAuth(jwt)
self.mount("http://", HTTPAdapter())
self.prewarm_url = f"http://localhost:{external_port}/lfc/prewarm"
self.offload_url = f"http://localhost:{external_port}/lfc/offload"
def dbs_and_roles(self):
res = self.get(f"http://localhost:{self.external_port}/dbs_and_roles", auth=self.auth)
@@ -64,33 +66,39 @@ class EndpointHttpClient(requests.Session):
return res.json()
def prewarm_lfc_status(self) -> dict[str, str]:
res = self.get(f"http://localhost:{self.external_port}/lfc/prewarm")
res = self.get(self.prewarm_url)
res.raise_for_status()
json: dict[str, str] = res.json()
return json
def prewarm_lfc(self, from_endpoint_id: str | None = None):
url: str = f"http://localhost:{self.external_port}/lfc/prewarm"
params = {"from_endpoint": from_endpoint_id} if from_endpoint_id else dict()
self.post(url, params=params).raise_for_status()
self.post(self.prewarm_url, params=params).raise_for_status()
self.prewarm_lfc_wait()
def prewarm_lfc_wait(self):
def prewarmed():
json = self.prewarm_lfc_status()
status, err = json["status"], json.get("error")
assert status == "completed", f"{status}, error {err}"
assert status == "completed", f"{status}, {err=}"
wait_until(prewarmed, timeout=60)
def offload_lfc(self):
url = f"http://localhost:{self.external_port}/lfc/offload"
self.post(url).raise_for_status()
def offload_lfc_status(self) -> dict[str, str]:
res = self.get(self.offload_url)
res.raise_for_status()
json: dict[str, str] = res.json()
return json
def offload_lfc(self):
self.post(self.offload_url).raise_for_status()
self.offload_lfc_wait()
def offload_lfc_wait(self):
def offloaded():
res = self.get(url)
res.raise_for_status()
json = res.json()
json = self.offload_lfc_status()
status, err = json["status"], json.get("error")
assert status == "completed", f"{status}, error {err}"
assert status == "completed", f"{status}, {err=}"
wait_until(offloaded)
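A minimal usage sketch of the offload/prewarm round trip through this client (the `endpoint` fixture is assumed, as in the LFC tests further below; this sketch is not part of the diff):
client = endpoint.http_client()
client.offload_lfc()  # POST to /lfc/offload, then poll offload_lfc_status() until "completed"
endpoint.stop()
endpoint.start()
client.prewarm_lfc()  # POST to /lfc/prewarm, then poll prewarm_lfc_status() until "completed"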

View File

@@ -159,6 +159,9 @@ PAGESERVER_GLOBAL_METRICS: tuple[str, ...] = (
)
PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = (
# BEGIN_HADRON
"pageserver_active_storage_operations_count",
# END_HADRON
"pageserver_current_logical_size",
"pageserver_resident_physical_size",
"pageserver_io_operations_bytes_total",

View File

@@ -568,6 +568,8 @@ class NeonLocalCli(AbstractNeonCli):
timeout: str | None = None,
env: dict[str, str] | None = None,
dev: bool = False,
autoprewarm: bool = False,
offload_lfc_interval_seconds: int | None = None,
) -> subprocess.CompletedProcess[str]:
args = [
"endpoint",
@@ -593,6 +595,10 @@ class NeonLocalCli(AbstractNeonCli):
args.extend(["--create-test-user"])
if timeout is not None:
args.extend(["--start-timeout", str(timeout)])
if autoprewarm:
args.extend(["--autoprewarm"])
if offload_lfc_interval_seconds is not None:
args.extend(["--offload-lfc-interval-seconds", str(offload_lfc_interval_seconds)])
if dev:
args.extend(["--dev"])

View File

@@ -1875,6 +1875,7 @@ class PageserverSchedulingPolicy(StrEnum):
FILLING = "Filling"
PAUSE = "Pause"
PAUSE_FOR_RESTART = "PauseForRestart"
DELETING = "Deleting"
class StorageControllerLeadershipStatus(StrEnum):
@@ -2083,14 +2084,30 @@ class NeonStorageController(MetricsGetter, LogUtils):
headers=self.headers(TokenScope.ADMIN),
)
def node_delete(self, node_id):
log.info(f"node_delete({node_id})")
def node_delete_old(self, node_id):
log.info(f"node_delete_old({node_id})")
self.request(
"DELETE",
f"{self.api}/control/v1/node/{node_id}",
headers=self.headers(TokenScope.ADMIN),
)
def node_delete(self, node_id):
log.info(f"node_delete({node_id})")
self.request(
"PUT",
f"{self.api}/control/v1/node/{node_id}/delete",
headers=self.headers(TokenScope.ADMIN),
)
def cancel_node_delete(self, node_id):
log.info(f"cancel_node_delete({node_id})")
self.request(
"DELETE",
f"{self.api}/control/v1/node/{node_id}/delete",
headers=self.headers(TokenScope.ADMIN),
)
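A quick sketch of how a test drives the new graceful deletion flow versus the legacy path (the node id is illustrative; see test_storage_controller_node_delete_cancellation further below for real usage):
env.storage_controller.node_delete(node_id)  # PUT /control/v1/node/{node_id}/delete (graceful, cancellable)
env.storage_controller.cancel_node_delete(node_id)  # DELETE /control/v1/node/{node_id}/delete (cancel in-flight delete)
env.storage_controller.node_delete_old(node_id)  # DELETE /control/v1/node/{node_id} (legacy immediate delete)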
def tombstone_delete(self, node_id):
log.info(f"tombstone_delete({node_id})")
self.request(
@@ -4353,6 +4370,8 @@ class Endpoint(PgProtocol, LogUtils):
basebackup_request_tries: int | None = None,
timeout: str | None = None,
env: dict[str, str] | None = None,
autoprewarm: bool = False,
offload_lfc_interval_seconds: int | None = None,
) -> Self:
"""
Start the Postgres instance.
@@ -4377,6 +4396,8 @@ class Endpoint(PgProtocol, LogUtils):
basebackup_request_tries=basebackup_request_tries,
timeout=timeout,
env=env,
autoprewarm=autoprewarm,
offload_lfc_interval_seconds=offload_lfc_interval_seconds,
)
self._running.release(1)
self.log_config_value("shared_buffers")
@@ -4592,6 +4613,8 @@ class Endpoint(PgProtocol, LogUtils):
pageserver_id: int | None = None,
allow_multiple: bool = False,
basebackup_request_tries: int | None = None,
autoprewarm: bool = False,
offload_lfc_interval_seconds: int | None = None,
) -> Self:
"""
Create an endpoint, apply config, and start Postgres.
@@ -4612,6 +4635,8 @@ class Endpoint(PgProtocol, LogUtils):
pageserver_id=pageserver_id,
allow_multiple=allow_multiple,
basebackup_request_tries=basebackup_request_tries,
autoprewarm=autoprewarm,
offload_lfc_interval_seconds=offload_lfc_interval_seconds,
)
return self
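For illustration, the new keyword arguments thread all the way through to a test like this (the interval value is arbitrary; compare the autoprewarm branch of test_lfc_prewarm below):
endpoint = env.endpoints.create_start(branch_name="main", autoprewarm=True, offload_lfc_interval_seconds=20)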
@@ -4696,6 +4721,8 @@ class EndpointFactory:
remote_ext_base_url: str | None = None,
pageserver_id: int | None = None,
basebackup_request_tries: int | None = None,
autoprewarm: bool = False,
offload_lfc_interval_seconds: int | None = None,
) -> Endpoint:
ep = Endpoint(
self.env,
@@ -4717,6 +4744,8 @@ class EndpointFactory:
remote_ext_base_url=remote_ext_base_url,
pageserver_id=pageserver_id,
basebackup_request_tries=basebackup_request_tries,
autoprewarm=autoprewarm,
offload_lfc_interval_seconds=offload_lfc_interval_seconds,
)
def create(

View File

@@ -111,6 +111,15 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
".*stalling layer flushes for compaction backpressure.*",
".*layer roll waiting for flush due to compaction backpressure.*",
".*BatchSpanProcessor.*",
# Can happen in tests that purposely wipe pageserver "local disk" data.
".*Local data loss suspected.*",
# Too many frozen layers error is normal during intensive benchmarks
".*too many frozen layers.*",
# Transient errors when resolving tenant shards by page service
".*Fail to resolve tenant shard in attempt.*",
# Expected warnings when pageserver has not refreshed GC info yet
".*pitr LSN/interval not found, skipping force image creation LSN calculation.*",
".*No broker updates received for a while.*",
*(
[
r".*your platform is not a supported production platform, ignoing request for O_DIRECT; this could hide alignment bugs.*"

View File

@@ -112,12 +112,18 @@ class TimelineCreateRequest:
class TimelineMembershipSwitchResponse:
previous_conf: MembershipConfiguration
current_conf: MembershipConfiguration
last_log_term: int
flush_lsn: Lsn
@classmethod
def from_json(cls, d: dict[str, Any]) -> TimelineMembershipSwitchResponse:
previous_conf = MembershipConfiguration.from_json(d["previous_conf"])
current_conf = MembershipConfiguration.from_json(d["current_conf"])
return TimelineMembershipSwitchResponse(previous_conf, current_conf)
last_log_term = d["last_log_term"]
flush_lsn = Lsn(d["flush_lsn"])
return TimelineMembershipSwitchResponse(
previous_conf, current_conf, last_log_term, flush_lsn
)
class SafekeeperHttpClient(requests.Session, MetricsGetter):

View File

@@ -55,9 +55,10 @@ def test_pageserver_characterize_throughput_with_n_tenants(
@pytest.mark.parametrize("duration", [20 * 60])
@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(2048)])
# we use 1 client to characterize latencies, and 64 clients to characterize throughput/scalability
# we use 64 clients because typically for a high number of connections we recommend the connection pooler
# which by default uses 64 connections
@pytest.mark.parametrize("n_clients", [1, 64])
# we use 8 clients because we see a latency knee around 6-8 clients on im4gn.2xlarge instance type,
# which we use for this periodic test - at a cpu utilization of around 70 % - which is considered
# a good utilization for pageserver.
@pytest.mark.parametrize("n_clients", [1, 8])
@pytest.mark.parametrize("n_tenants", [1])
@pytest.mark.timeout(2400)
def test_pageserver_characterize_latencies_with_1_client_and_throughput_with_many_clients_one_tenant(
@@ -70,7 +71,13 @@ def test_pageserver_characterize_latencies_with_1_client_and_throughput_with_man
n_clients: int,
):
setup_and_run_pagebench_benchmark(
neon_env_builder, zenbenchmark, pg_bin, n_tenants, pgbench_scale, duration, n_clients
neon_env_builder,
zenbenchmark,
pg_bin,
n_tenants,
pgbench_scale,
duration,
n_clients,
)
@@ -85,7 +92,8 @@ def setup_and_run_pagebench_benchmark(
):
def record(metric, **kwargs):
zenbenchmark.record(
metric_name=f"pageserver_max_throughput_getpage_at_latest_lsn.{metric}", **kwargs
metric_name=f"pageserver_max_throughput_getpage_at_latest_lsn.{metric}",
**kwargs,
)
params: dict[str, tuple[Any, dict[str, Any]]] = {}
@@ -103,9 +111,7 @@ def setup_and_run_pagebench_benchmark(
# configure cache sizes like in prod
page_cache_size = 16384
max_file_descriptors = 500000
neon_env_builder.pageserver_config_override = (
f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}"
)
neon_env_builder.pageserver_config_override = f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}; disk_usage_based_eviction={{enabled = false}}"
tracing_config = PageserverTracingConfig(
sampling_ratio=(0, 1000),
@@ -121,7 +127,10 @@ def setup_and_run_pagebench_benchmark(
page_cache_size * 8192,
{"unit": "byte"},
),
"pageserver_config_override.max_file_descriptors": (max_file_descriptors, {"unit": ""}),
"pageserver_config_override.max_file_descriptors": (
max_file_descriptors,
{"unit": ""},
),
"pageserver_config_override.sampling_ratio": (ratio, {"unit": ""}),
}
)

View File

@@ -416,6 +416,8 @@ def test_duplicate_creation(neon_env_builder: NeonEnvBuilder):
# timeline creation (uploads). Mask it out here to avoid flakiness.
del success_result["remote_consistent_lsn_visible"]
del repeat_result["remote_consistent_lsn_visible"]
del success_result["walreceiver_status"]
del repeat_result["walreceiver_status"]
assert repeat_result == success_result
finally:
env.pageserver.stop(immediate=True)

View File

@@ -1,61 +1,122 @@
import random
import threading
import time
from enum import Enum
from enum import StrEnum
from typing import Any
import pytest
from fixtures.endpoint.http import EndpointHttpClient
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv
from fixtures.utils import USE_LFC
from fixtures.utils import USE_LFC, wait_until
from prometheus_client.parser import text_string_to_metric_families as prom_parse_impl
from psycopg2.extensions import cursor as Cursor
class LfcQueryMethod(Enum):
COMPUTE_CTL = False
POSTGRES = True
class PrewarmMethod(StrEnum):
POSTGRES = "postgres"
COMPUTE_CTL = "compute-ctl"
AUTOPREWARM = "autoprewarm"
PREWARM_LABEL = "compute_ctl_lfc_prewarm_requests_total"
OFFLOAD_LABEL = "compute_ctl_lfc_offload_requests_total"
QUERY_OPTIONS = LfcQueryMethod.POSTGRES, LfcQueryMethod.COMPUTE_CTL
PREWARM_LABEL = "compute_ctl_lfc_prewarms_total"
PREWARM_ERR_LABEL = "compute_ctl_lfc_prewarm_errors_total"
OFFLOAD_LABEL = "compute_ctl_lfc_offloads_total"
OFFLOAD_ERR_LABEL = "compute_ctl_lfc_offload_errors_total"
METHOD_VALUES = [e for e in PrewarmMethod]
METHOD_IDS = [e.value for e in PrewarmMethod]
def check_pinned_entries(cur):
# some LFC buffer can be temporary locked by autovacuum or background writer
for _ in range(10):
def check_pinned_entries(cur: Cursor):
"""
Wait until no LFC buffers are pinned
"""
def none_pinned():
cur.execute("select lfc_value from neon_lfc_stats where lfc_key='file_cache_chunks_pinned'")
n_pinned = cur.fetchall()[0][0]
if n_pinned == 0:
break
time.sleep(1)
assert n_pinned == 0
assert cur.fetchall()[0][0] == 0
wait_until(none_pinned)
def prom_parse(client: EndpointHttpClient) -> dict[str, float]:
labels = PREWARM_LABEL, OFFLOAD_LABEL, PREWARM_ERR_LABEL, OFFLOAD_ERR_LABEL
return {
sample.name: sample.value
sample.name: int(sample.value)
for family in prom_parse_impl(client.metrics())
for sample in family.samples
if sample.name in (PREWARM_LABEL, OFFLOAD_LABEL)
if sample.name in labels
}
def offload_lfc(method: PrewarmMethod, client: EndpointHttpClient, cur: Cursor) -> Any:
if method == PrewarmMethod.AUTOPREWARM:
client.offload_lfc_wait()
elif method == PrewarmMethod.COMPUTE_CTL:
status = client.prewarm_lfc_status()
assert status["status"] == "not_prewarmed"
assert "error" not in status
client.offload_lfc()
assert client.prewarm_lfc_status()["status"] == "not_prewarmed"
parsed = prom_parse(client)
desired = {OFFLOAD_LABEL: 1, PREWARM_LABEL: 0, OFFLOAD_ERR_LABEL: 0, PREWARM_ERR_LABEL: 0}
assert parsed == desired, f"{parsed=} != {desired=}"
elif method == PrewarmMethod.POSTGRES:
cur.execute("select get_local_cache_state()")
return cur.fetchall()[0][0]
else:
raise AssertionError(f"{method} not in PrewarmMethod")
def prewarm_endpoint(
method: PrewarmMethod, client: EndpointHttpClient, cur: Cursor, lfc_state: str | None
):
if method == PrewarmMethod.AUTOPREWARM:
client.prewarm_lfc_wait()
elif method == PrewarmMethod.COMPUTE_CTL:
client.prewarm_lfc()
elif method == PrewarmMethod.POSTGRES:
cur.execute("select prewarm_local_cache(%s)", (lfc_state,))
def check_prewarmed(
method: PrewarmMethod, client: EndpointHttpClient, desired_status: dict[str, str | int]
):
if method == PrewarmMethod.AUTOPREWARM:
assert client.prewarm_lfc_status() == desired_status
assert prom_parse(client)[PREWARM_LABEL] == 1
elif method == PrewarmMethod.COMPUTE_CTL:
assert client.prewarm_lfc_status() == desired_status
desired = {OFFLOAD_LABEL: 0, PREWARM_LABEL: 1, PREWARM_ERR_LABEL: 0, OFFLOAD_ERR_LABEL: 0}
assert prom_parse(client) == desired
@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
@pytest.mark.parametrize("query", QUERY_OPTIONS, ids=["postgres", "compute-ctl"])
def test_lfc_prewarm(neon_simple_env: NeonEnv, query: LfcQueryMethod):
@pytest.mark.parametrize("method", METHOD_VALUES, ids=METHOD_IDS)
def test_lfc_prewarm(neon_simple_env: NeonEnv, method: PrewarmMethod):
"""
Test that we can offload an endpoint's LFC cache to endpoint storage.
Test that we can prewarm an endpoint with the LFC state loaded from endpoint storage.
"""
env = neon_simple_env
n_records = 1000000
endpoint = env.endpoints.create_start(
branch_name="main",
config_lines=[
"autovacuum = off",
"shared_buffers=1MB",
"neon.max_file_cache_size=1GB",
"neon.file_cache_size_limit=1GB",
"neon.file_cache_prewarm_limit=1000",
],
)
cfg = [
"autovacuum = off",
"shared_buffers=1MB",
"neon.max_file_cache_size=1GB",
"neon.file_cache_size_limit=1GB",
"neon.file_cache_prewarm_limit=1000",
]
offload_secs = 2
if method == PrewarmMethod.AUTOPREWARM:
endpoint = env.endpoints.create_start(
branch_name="main",
config_lines=cfg,
autoprewarm=True,
offload_lfc_interval_seconds=offload_secs,
)
else:
endpoint = env.endpoints.create_start(branch_name="main", config_lines=cfg)
pg_conn = endpoint.connect()
pg_cur = pg_conn.cursor()
@@ -69,75 +130,64 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, query: LfcQueryMethod):
lfc_cur.execute(f"insert into t (pk) values (generate_series(1,{n_records}))")
log.info(f"Inserted {n_records} rows")
http_client = endpoint.http_client()
if query is LfcQueryMethod.COMPUTE_CTL:
status = http_client.prewarm_lfc_status()
assert status["status"] == "not_prewarmed"
assert "error" not in status
http_client.offload_lfc()
assert http_client.prewarm_lfc_status()["status"] == "not_prewarmed"
assert prom_parse(http_client) == {OFFLOAD_LABEL: 1, PREWARM_LABEL: 0}
else:
pg_cur.execute("select get_local_cache_state()")
lfc_state = pg_cur.fetchall()[0][0]
client = endpoint.http_client()
lfc_state = offload_lfc(method, client, pg_cur)
endpoint.stop()
endpoint.start()
if method == PrewarmMethod.AUTOPREWARM:
endpoint.start(autoprewarm=True, offload_lfc_interval_seconds=offload_secs)
else:
endpoint.start()
pg_conn = endpoint.connect()
pg_cur = pg_conn.cursor()
lfc_conn = endpoint.connect(dbname="lfc")
lfc_cur = lfc_conn.cursor()
if query is LfcQueryMethod.COMPUTE_CTL:
http_client.prewarm_lfc()
else:
pg_cur.execute("select prewarm_local_cache(%s)", (lfc_state,))
prewarm_endpoint(method, client, pg_cur, lfc_state)
pg_cur.execute("select lfc_value from neon_lfc_stats where lfc_key='file_cache_used_pages'")
lfc_used_pages = pg_cur.fetchall()[0][0]
log.info(f"Used LFC size: {lfc_used_pages}")
pg_cur.execute("select * from get_prewarm_info()")
prewarm_info = pg_cur.fetchall()[0]
log.info(f"Prewarm info: {prewarm_info}")
total, prewarmed, skipped, _ = prewarm_info
total, prewarmed, skipped, _ = pg_cur.fetchall()[0]
log.info(f"Prewarm info: {total=} {prewarmed=} {skipped=}")
progress = (prewarmed + skipped) * 100 // total
log.info(f"Prewarm progress: {progress}%")
assert lfc_used_pages > 10000
assert (
prewarm_info[0] > 0
and prewarm_info[1] > 0
and prewarm_info[0] == prewarm_info[1] + prewarm_info[2]
)
assert total > 0
assert prewarmed > 0
assert total == prewarmed + skipped
lfc_cur.execute("select sum(pk) from t")
assert lfc_cur.fetchall()[0][0] == n_records * (n_records + 1) / 2
check_pinned_entries(pg_cur)
desired = {"status": "completed", "total": total, "prewarmed": prewarmed, "skipped": skipped}
if query is LfcQueryMethod.COMPUTE_CTL:
assert http_client.prewarm_lfc_status() == desired
assert prom_parse(http_client) == {OFFLOAD_LABEL: 0, PREWARM_LABEL: 1}
check_prewarmed(method, client, desired)
# autoprewarm isn't needed as we prewarm manually
WORKLOAD_VALUES = METHOD_VALUES[:-1]
WORKLOAD_IDS = METHOD_IDS[:-1]
@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
@pytest.mark.parametrize("query", QUERY_OPTIONS, ids=["postgres", "compute-ctl"])
def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, query: LfcQueryMethod):
@pytest.mark.parametrize("method", WORKLOAD_VALUES, ids=WORKLOAD_IDS)
def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, method: PrewarmMethod):
"""
Test continuously prewarming the endpoint while a write-heavy workload runs in parallel
"""
env = neon_simple_env
n_records = 10000
n_threads = 4
endpoint = env.endpoints.create_start(
branch_name="main",
config_lines=[
"shared_buffers=1MB",
"neon.max_file_cache_size=1GB",
"neon.file_cache_size_limit=1GB",
"neon.file_cache_prewarm_limit=1000000",
],
)
cfg = [
"shared_buffers=1MB",
"neon.max_file_cache_size=1GB",
"neon.file_cache_size_limit=1GB",
"neon.file_cache_prewarm_limit=1000000",
]
endpoint = env.endpoints.create_start(branch_name="main", config_lines=cfg)
pg_conn = endpoint.connect()
pg_cur = pg_conn.cursor()
@@ -154,12 +204,7 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, query: LfcQueryMet
log.info(f"Inserted {n_records} rows")
http_client = endpoint.http_client()
if query is LfcQueryMethod.COMPUTE_CTL:
http_client.offload_lfc()
else:
pg_cur.execute("select get_local_cache_state()")
lfc_state = pg_cur.fetchall()[0][0]
lfc_state = offload_lfc(method, http_client, pg_cur)
running = True
n_prewarms = 0
@@ -170,8 +215,8 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, query: LfcQueryMet
while running:
src = random.randint(1, n_records)
dst = random.randint(1, n_records)
lfc_cur.execute("update accounts set balance=balance-100 where id=%s", (src,))
lfc_cur.execute("update accounts set balance=balance+100 where id=%s", (dst,))
lfc_cur.execute(f"update accounts set balance=balance-100 where id={src}")
lfc_cur.execute(f"update accounts set balance=balance+100 where id={dst}")
n_transfers += 1
log.info(f"Number of transfers: {n_transfers}")
@@ -183,13 +228,7 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, query: LfcQueryMet
pg_cur.execute("select pg_reload_conf()")
pg_cur.execute("alter system set neon.file_cache_size_limit='1GB'")
pg_cur.execute("select pg_reload_conf()")
if query is LfcQueryMethod.COMPUTE_CTL:
# Same thing as prewarm_lfc(), testing other method
http_client.prewarm_lfc(endpoint.endpoint_id)
else:
pg_cur.execute("select prewarm_local_cache(%s)", (lfc_state,))
prewarm_endpoint(method, http_client, pg_cur, lfc_state)
nonlocal n_prewarms
n_prewarms += 1
log.info(f"Number of prewarms: {n_prewarms}")
@@ -203,7 +242,10 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, query: LfcQueryMet
prewarm_thread = threading.Thread(target=prewarm)
prewarm_thread.start()
time.sleep(20)
def prewarmed():
assert n_prewarms > 5
wait_until(prewarmed)
running = False
for t in workload_threads:
@@ -215,5 +257,12 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, query: LfcQueryMet
assert total_balance == 0
check_pinned_entries(pg_cur)
if query is LfcQueryMethod.COMPUTE_CTL:
assert prom_parse(http_client) == {OFFLOAD_LABEL: 1, PREWARM_LABEL: n_prewarms}
if method == PrewarmMethod.POSTGRES:
return
desired = {
OFFLOAD_LABEL: 1,
PREWARM_LABEL: n_prewarms,
OFFLOAD_ERR_LABEL: 0,
PREWARM_ERR_LABEL: 0,
}
assert prom_parse(http_client) == desired

View File

@@ -180,7 +180,7 @@ def test_metric_collection(
httpserver.check()
# Check that at least one bucket output object is present, and that all
# can be decompressed and decoded.
# can be decompressed and decoded as NDJSON.
bucket_dumps = {}
assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
for dirpath, _dirs, files in os.walk(env.pageserver_remote_storage.root):
@@ -188,7 +188,13 @@ def test_metric_collection(
file_path = os.path.join(dirpath, file)
log.info(file_path)
if file.endswith(".gz"):
bucket_dumps[file_path] = json.load(gzip.open(file_path))
events = []
with gzip.open(file_path, "rt") as f:
for line in f:
line = line.strip()
if line:
events.append(json.loads(line))
bucket_dumps[file_path] = {"events": events}
assert len(bucket_dumps) >= 1
assert all("events" in data for data in bucket_dumps.values())

View File

@@ -27,6 +27,7 @@ def test_safekeeper_migration_simple(neon_env_builder: NeonEnvBuilder):
[
".*Timeline .* was cancelled and cannot be used anymore.*",
".*Timeline .* has been deleted.*",
".*Timeline .* was not found in global map.*",
".*wal receiver task finished with an error.*",
]
)

View File

@@ -1,8 +1,11 @@
from __future__ import annotations
import os
import random
import threading
import time
from collections import defaultdict
from threading import Event
from typing import TYPE_CHECKING, Any
import pytest
@@ -1505,6 +1508,171 @@ def test_sharding_split_failures(
env.storage_controller.consistency_check()
@pytest.mark.skip(reason="The backpressure change has not been merged yet.")
def test_back_pressure_during_split(neon_env_builder: NeonEnvBuilder):
"""
Test that backpressure can ignore new shards during a tenant split, so that if we abort the split,
PG can continue without being blocked.
"""
DBNAME = "regression"
init_shard_count = 4
neon_env_builder.num_pageservers = init_shard_count
stripe_size = 32
env = neon_env_builder.init_start(
initial_tenant_shard_count=init_shard_count, initial_tenant_shard_stripe_size=stripe_size
)
env.storage_controller.allowed_errors.extend(
[
# All split failures log a warning when they enqueue the abort operation
".*Enqueuing background abort.*",
# Tolerate any error logs that mention a failpoint
".*failpoint.*",
]
)
endpoint = env.endpoints.create(
"main",
config_lines=[
"max_replication_write_lag = 1MB",
"databricks.max_wal_mb_per_second = 1",
"neon.max_cluster_size = 10GB",
],
)
endpoint.respec(skip_pg_catalog_updates=False) # Needed for databricks_system to get created.
endpoint.start()
endpoint.safe_psql(f"CREATE DATABASE {DBNAME}")
endpoint.safe_psql("CREATE TABLE usertable ( YCSB_KEY INT, FIELD0 TEXT);")
write_done = Event()
def write_data(write_done):
while not write_done.is_set():
endpoint.safe_psql(
"INSERT INTO usertable SELECT random(), repeat('a', 1000);", log_query=False
)
log.info("write_data thread exiting")
writer_thread = threading.Thread(target=write_data, args=(write_done,))
writer_thread.start()
env.storage_controller.configure_failpoints(("shard-split-pre-complete", "return(1)"))
# split the tenant
with pytest.raises(StorageControllerApiException):
env.storage_controller.tenant_shard_split(env.initial_tenant, shard_count=16)
write_done.set()
writer_thread.join()
# writing more data to page servers after split is aborted
for _i in range(5000):
endpoint.safe_psql(
"INSERT INTO usertable SELECT random(), repeat('a', 1000);", log_query=False
)
# wait until write lag becomes 0
def check_write_lag_is_zero():
res = endpoint.safe_psql(
"""
SELECT
pg_wal_lsn_diff(pg_current_wal_flush_lsn(), received_lsn) as received_lsn_lag
FROM neon.backpressure_lsns();
""",
dbname="databricks_system",
log_query=False,
)
log.info(f"received_lsn_lag = {res[0][0]}")
assert res[0][0] == 0
wait_until(check_write_lag_is_zero)
endpoint.stop_and_destroy()
# BEGIN_HADRON
def test_shard_resolve_during_split_abort(neon_env_builder: NeonEnvBuilder):
"""
Tests that the page service is able to resolve the correct shard during a tenant split without causing query errors
"""
DBNAME = "regression"
WORKER_THREADS = 16
ROW_COUNT = 10000
init_shard_count = 4
neon_env_builder.num_pageservers = 1
stripe_size = 16
env = neon_env_builder.init_start(
initial_tenant_shard_count=init_shard_count, initial_tenant_shard_stripe_size=stripe_size
)
env.storage_controller.allowed_errors.extend(
[
# All split failures log a warning when they enqueue the abort operation
".*Enqueuing background abort.*",
# Tolerate any error logs that mention a failpoint
".*failpoint.*",
]
)
endpoint = env.endpoints.create("main")
endpoint.respec(skip_pg_catalog_updates=False) # Needed for databricks_system to get created.
endpoint.start()
endpoint.safe_psql(f"CREATE DATABASE {DBNAME}")
# generate 10MB of data
endpoint.safe_psql(
f"CREATE TABLE usertable AS SELECT s AS KEY, repeat('a', 1000) as VALUE from generate_series(1, {ROW_COUNT}) s;"
)
read_done = Event()
def read_data(read_done):
i = 0
while not read_done.is_set() or i < 10:
endpoint.safe_psql(
f"SELECT * FROM usertable where KEY = {random.randint(1, ROW_COUNT)}",
log_query=False,
)
i += 1
log.info(f"read_data thread exiting. Executed {i} queries.")
reader_threads = []
for _i in range(WORKER_THREADS):
reader_thread = threading.Thread(target=read_data, args=(read_done,))
reader_thread.start()
reader_threads.append(reader_thread)
env.storage_controller.configure_failpoints(("shard-split-pre-complete", "return(1)"))
# split the tenant
with pytest.raises(StorageControllerApiException):
env.storage_controller.tenant_shard_split(env.initial_tenant, shard_count=16)
# wait until abort is done
def check_tenant_status():
active_count = 0
for i in range(init_shard_count):
status = env.pageserver.http_client().tenant_status(
TenantShardId(env.initial_tenant, i, init_shard_count)
)
if status["state"]["slug"] == "Active":
active_count += 1
assert active_count == 4
wait_until(check_tenant_status)
read_done.set()
for thread in reader_threads:
thread.join()
endpoint.stop()
# END_HADRON
def test_sharding_backpressure(neon_env_builder: NeonEnvBuilder):
"""
Check a scenario when one of the shards is much slower than others.

View File

@@ -989,6 +989,105 @@ def test_storage_controller_compute_hook_retry(
)
@run_only_on_default_postgres("postgres behavior is not relevant")
def test_storage_controller_compute_hook_keep_failing(
httpserver: HTTPServer,
neon_env_builder: NeonEnvBuilder,
httpserver_listen_address: ListenAddress,
):
neon_env_builder.num_pageservers = 4
neon_env_builder.storage_controller_config = {"use_local_compute_notifications": False}
(host, port) = httpserver_listen_address
neon_env_builder.control_plane_hooks_api = f"http://{host}:{port}"
# Set up CP handler for compute notifications
status_by_tenant: dict[TenantId, int] = {}
def handler(request: Request):
notify_request = request.json
assert notify_request is not None
status = status_by_tenant[TenantId(notify_request["tenant_id"])]
log.info(f"Notify request[{status}]: {notify_request}")
return Response(status=status)
httpserver.expect_request("/notify-attach", method="PUT").respond_with_handler(handler)
# Run neon environment
env = neon_env_builder.init_configs()
env.start()
# Create two tenants:
# - The first tenant is banned by CP and contains only one shard
# - The second tenant is allowed by CP and contains four shards
banned_tenant = TenantId.generate()
status_by_tenant[banned_tenant] = 200 # we will ban this tenant later
env.create_tenant(banned_tenant, placement_policy='{"Attached": 1}')
shard_count = 4
allowed_tenant = TenantId.generate()
status_by_tenant[allowed_tenant] = 200
env.create_tenant(allowed_tenant, shard_count=shard_count, placement_policy='{"Attached": 1}')
# Find the pageserver of the banned tenant
banned_tenant_ps = env.get_tenant_pageserver(banned_tenant)
assert banned_tenant_ps is not None
alive_pageservers = [p for p in env.pageservers if p.id != banned_tenant_ps.id]
# Stop pageserver and ban tenant to trigger failed reconciliation
log.info(f"Banning tenant {banned_tenant} and stopping pageserver {banned_tenant_ps.id}")
status_by_tenant[banned_tenant] = 423
banned_tenant_ps.stop()
env.storage_controller.allowed_errors.append(NOTIFY_BLOCKED_LOG)
env.storage_controller.allowed_errors.extend(NOTIFY_FAILURE_LOGS)
env.storage_controller.allowed_errors.append(".*Keeping extra secondaries.*")
env.storage_controller.allowed_errors.append(".*Shard reconciliation is keep-failing.*")
env.storage_controller.node_configure(banned_tenant_ps.id, {"availability": "Offline"})
# Migrate all allowed tenant shards to the first alive pageserver
# to trigger storage controller optimizations due to affinity rules
for shard_number in range(shard_count):
log.info(f"Migrating shard {shard_number} of {allowed_tenant} to {alive_pageservers[0].id}")
env.storage_controller.tenant_shard_migrate(
TenantShardId(allowed_tenant, shard_number, shard_count),
alive_pageservers[0].id,
config=StorageControllerMigrationConfig(prewarm=False, override_scheduler=True),
)
# Make some reconcile_all calls to trigger optimizations
# RECONCILE_COUNT must be greater than storcon's MAX_CONSECUTIVE_RECONCILIATION_ERRORS
RECONCILE_COUNT = 12
for i in range(RECONCILE_COUNT):
try:
n = env.storage_controller.reconcile_all()
log.info(f"Reconciliation attempt {i} finished with success: {n}")
except StorageControllerApiException as e:
assert "Control plane tenant busy" in str(e)
log.info(f"Reconciliation attempt {i} finished with failure")
banned_descr = env.storage_controller.tenant_describe(banned_tenant)
assert banned_descr["shards"][0]["is_pending_compute_notification"] is True
time.sleep(2)
# Check that the allowed tenant shards are optimized due to affinity rules
locations = alive_pageservers[0].http_client().tenant_list_locations()["tenant_shards"]
not_optimized_shard_count = 0
for loc in locations:
tsi = TenantShardId.parse(loc[0])
if tsi.tenant_id != allowed_tenant:
continue
if loc[1]["mode"] == "AttachedSingle":
not_optimized_shard_count += 1
log.info(f"Shard {tsi} seen in mode {loc[1]['mode']}")
assert not_optimized_shard_count < shard_count, "At least one shard should be optimized"
# Unban the tenant and run reconciliations
status_by_tenant[banned_tenant] = 200
env.storage_controller.reconcile_all()
banned_descr = env.storage_controller.tenant_describe(banned_tenant)
assert banned_descr["shards"][0]["is_pending_compute_notification"] is False
@run_only_on_default_postgres("this test doesn't start an endpoint")
def test_storage_controller_compute_hook_revert(
httpserver: HTTPServer,
@@ -2522,7 +2621,7 @@ def test_storage_controller_node_deletion(
wait_until(assert_shards_migrated)
log.info(f"Deleting pageserver {victim.id}")
env.storage_controller.node_delete(victim.id)
env.storage_controller.node_delete_old(victim.id)
if not while_offline:
@@ -2557,6 +2656,60 @@ def test_storage_controller_node_deletion(
env.storage_controller.consistency_check()
def test_storage_controller_node_delete_cancellation(neon_env_builder: NeonEnvBuilder):
neon_env_builder.num_pageservers = 3
neon_env_builder.num_azs = 3
env = neon_env_builder.init_configs()
env.start()
tenant_count = 12
shard_count_per_tenant = 16
tenant_ids = []
for _ in range(0, tenant_count):
tid = TenantId.generate()
tenant_ids.append(tid)
env.create_tenant(
tid, placement_policy='{"Attached":1}', shard_count=shard_count_per_tenant
)
# Sanity check: initial creations should not leave the system in an unstable scheduling state
assert env.storage_controller.reconcile_all() == 0
nodes = env.storage_controller.node_list()
assert len(nodes) == 3
env.storage_controller.configure_failpoints(("sleepy-delete-loop", "return(10000)"))
ps_id_to_delete = env.pageservers[0].id
env.storage_controller.warm_up_all_secondaries()
env.storage_controller.retryable_node_operation(
lambda ps_id: env.storage_controller.node_delete(ps_id),
ps_id_to_delete,
max_attempts=3,
backoff=2,
)
env.storage_controller.poll_node_status(
ps_id_to_delete,
PageserverAvailability.ACTIVE,
PageserverSchedulingPolicy.DELETING,
max_attempts=6,
backoff=2,
)
env.storage_controller.cancel_node_delete(ps_id_to_delete)
env.storage_controller.poll_node_status(
ps_id_to_delete,
PageserverAvailability.ACTIVE,
PageserverSchedulingPolicy.ACTIVE,
max_attempts=6,
backoff=2,
)
@pytest.mark.parametrize("shard_count", [None, 2])
def test_storage_controller_metadata_health(
neon_env_builder: NeonEnvBuilder,
@@ -3112,7 +3265,7 @@ def test_ps_unavailable_after_delete(neon_env_builder: NeonEnvBuilder):
assert_nodes_count(3)
ps = env.pageservers[0]
env.storage_controller.node_delete(ps.id)
env.storage_controller.node_delete_old(ps.id)
# After deletion, the node count must be reduced
assert_nodes_count(2)
@@ -3530,18 +3683,21 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder):
# some small tests for the scheduling policy querying and returning APIs
newest_info = target.get_safekeeper(inserted["id"])
assert newest_info
assert newest_info["scheduling_policy"] == "Pause"
target.safekeeper_scheduling_policy(inserted["id"], "Active")
newest_info = target.get_safekeeper(inserted["id"])
assert newest_info
assert newest_info["scheduling_policy"] == "Active"
# Ensure idempotency
target.safekeeper_scheduling_policy(inserted["id"], "Active")
newest_info = target.get_safekeeper(inserted["id"])
assert newest_info
assert newest_info["scheduling_policy"] == "Active"
# change back to paused again
assert (
newest_info["scheduling_policy"] == "Activating"
or newest_info["scheduling_policy"] == "Active"
)
target.safekeeper_scheduling_policy(inserted["id"], "Pause")
newest_info = target.get_safekeeper(inserted["id"])
assert newest_info
assert newest_info["scheduling_policy"] == "Pause"
# Ensure idempotency
target.safekeeper_scheduling_policy(inserted["id"], "Pause")
newest_info = target.get_safekeeper(inserted["id"])
assert newest_info
assert newest_info["scheduling_policy"] == "Pause"
# change back to active again
target.safekeeper_scheduling_policy(inserted["id"], "Active")
def storcon_heartbeat():
assert env.storage_controller.log_contains(
@@ -3554,6 +3710,57 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder):
target.safekeeper_scheduling_policy(inserted["id"], "Decomissioned")
@run_only_on_default_postgres("this is like a 'unit test' against storcon db")
def test_safekeeper_activating_to_active(neon_env_builder: NeonEnvBuilder):
env = neon_env_builder.init_configs()
env.start()
fake_id = 5
target = env.storage_controller
assert target.get_safekeeper(fake_id) is None
start_sks = target.get_safekeepers()
sk_0 = env.safekeepers[0]
body = {
"active": True,
"id": fake_id,
"created_at": "2023-10-25T09:11:25Z",
"updated_at": "2024-08-28T11:32:43Z",
"region_id": "aws-eu-central-1",
"host": "localhost",
"port": sk_0.port.pg,
"http_port": sk_0.port.http,
"https_port": None,
"version": 5957,
"availability_zone_id": "eu-central-1a",
}
target.on_safekeeper_deploy(fake_id, body)
inserted = target.get_safekeeper(fake_id)
assert inserted is not None
assert target.get_safekeepers() == start_sks + [inserted]
assert eq_safekeeper_records(body, inserted)
def safekeeper_is_active():
newest_info = target.get_safekeeper(inserted["id"])
assert newest_info
assert newest_info["scheduling_policy"] == "Active"
wait_until(safekeeper_is_active)
target.safekeeper_scheduling_policy(inserted["id"], "Activating")
wait_until(safekeeper_is_active)
# Now decomission it
target.safekeeper_scheduling_policy(inserted["id"], "Decomissioned")
def eq_safekeeper_records(a: dict[str, Any], b: dict[str, Any]) -> bool:
compared = [dict(a), dict(b)]

View File

@@ -740,6 +740,10 @@ def test_lsn_lease_size(neon_env_builder: NeonEnvBuilder, test_output_dir: Path,
"pitr_interval": "0s" if zero_gc else "3600s",
"gc_period": "0s",
"compaction_period": "0s",
# The test exercises the leases API, so we need a non-zero lease length.
# If this test ever does GC, we need to accommodate the initial lease deadline
# after tenant attach, which is also controlled by this variable.
"lsn_lease_length": "600s",
}
env = neon_env_builder.init_start(initial_tenant_conf=conf)
@@ -824,9 +828,7 @@ def insert_with_action(
log.info(f"initial size: {initial_size}")
with ep.cursor() as cur:
cur.execute(
"CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)"
)
cur.execute("CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, 10000) s(i)")
last_flush_lsn = wait_for_last_flush_lsn(env, ep, tenant, timeline)
if action == "lease":
@@ -841,15 +843,9 @@ def insert_with_action(
raise AssertionError("Invalid action type, only `lease` and `branch`are accepted")
with ep.cursor() as cur:
cur.execute(
"CREATE TABLE t1 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)"
)
cur.execute(
"CREATE TABLE t2 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)"
)
cur.execute(
"CREATE TABLE t3 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)"
)
cur.execute("CREATE TABLE t1 AS SELECT i::bigint n FROM generate_series(0, 10000) s(i)")
cur.execute("CREATE TABLE t2 AS SELECT i::bigint n FROM generate_series(0, 10000) s(i)")
cur.execute("CREATE TABLE t3 AS SELECT i::bigint n FROM generate_series(0, 10000) s(i)")
last_flush_lsn = wait_for_last_flush_lsn(env, ep, tenant, timeline)

View File

@@ -324,7 +324,7 @@ def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder):
# it is to be in line with the deletion timestamp.. well, almost.
when = original_ancestor[2][:26]
when_ts = datetime.datetime.fromisoformat(when).replace(tzinfo=datetime.UTC)
now = datetime.datetime.utcnow().replace(tzinfo=datetime.UTC)
now = datetime.datetime.now(datetime.UTC)
assert when_ts < now
assert len(lineage.get("reparenting_history", [])) == 0
elif expected_ancestor == timeline_id:
@@ -458,19 +458,20 @@ def test_ancestor_detach_behavior_v2(neon_env_builder: NeonEnvBuilder, snapshots
env.pageserver.quiesce_tenants()
# checking the ancestor after is much faster than waiting for the endpoint not start
# checking the ancestor after is much faster than waiting for the endpoint to start
expected_result = [
("main", env.initial_timeline, None, 24576, 1),
("after", after, env.initial_timeline, 24576, 1),
("snapshot_branchpoint_old", snapshot_branchpoint_old, env.initial_timeline, 8192, 1),
("snapshot_branchpoint", snapshot_branchpoint, env.initial_timeline, 16384, 1),
("branch_to_detach", branch_to_detach, None, 16384, 1),
("earlier", earlier, env.initial_timeline, 0, 1),
# (branch_name, queried_timeline, expected_ancestor, rows, starts, read_only)
("main", env.initial_timeline, None, 24576, 1, False),
("after", after, env.initial_timeline, 24576, 1, False),
("snapshot_branchpoint_old", snapshot_branchpoint_old, env.initial_timeline, 8192, 1, True),
("snapshot_branchpoint", snapshot_branchpoint, env.initial_timeline, 16384, 1, False),
("branch_to_detach", branch_to_detach, None, 16384, 1, False),
("earlier", earlier, env.initial_timeline, 0, 1, False),
]
assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
for branch_name, queried_timeline, expected_ancestor, _, _ in expected_result:
for branch_name, queried_timeline, expected_ancestor, _, _, _ in expected_result:
details = client.timeline_detail(env.initial_tenant, queried_timeline)
ancestor_timeline_id = details["ancestor_timeline_id"]
if expected_ancestor is None:
@@ -508,13 +509,17 @@ def test_ancestor_detach_behavior_v2(neon_env_builder: NeonEnvBuilder, snapshots
assert len(lineage.get("original_ancestor", [])) == 0
assert len(lineage.get("reparenting_history", [])) == 0
for branch_name, queried_timeline, _, rows, starts in expected_result:
details = client.timeline_detail(env.initial_tenant, queried_timeline)
log.info(f"reading data from branch {branch_name}")
# specifying the lsn makes the endpoint read-only and not connect to safekeepers
for branch_name, queried_timeline, _, rows, starts, read_only in expected_result:
last_record_lsn = None
if read_only:
# specifying the lsn makes the endpoint read-only and not connect to safekeepers
details = client.timeline_detail(env.initial_tenant, queried_timeline)
last_record_lsn = Lsn(details["last_record_lsn"])
log.info(f"reading data from branch {branch_name} at {last_record_lsn}")
with env.endpoints.create(
branch_name,
lsn=Lsn(details["last_record_lsn"]),
lsn=last_record_lsn,
) as ep:
ep.start(safekeeper_generation=1)
assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows
@@ -1884,6 +1889,31 @@ def test_timeline_detach_with_aux_files_with_detach_v1(
assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn1).keys()) == set([])
def test_detach_ancestors_with_no_writes(
neon_env_builder: NeonEnvBuilder,
):
env = neon_env_builder.init_start()
endpoint = env.endpoints.create_start("main", tenant_id=env.initial_tenant)
wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline)
endpoint.safe_psql(
"SELECT pg_create_logical_replication_slot('test_slot_parent_1', 'pgoutput')"
)
wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline)
endpoint.stop()
for i in range(0, 5):
if i == 0:
ancestor_name = "main"
else:
ancestor_name = f"b{i}"
tlid = env.create_branch(f"b{i + 1}", ancestor_branch_name=ancestor_name)
client = env.pageserver.http_client()
client.detach_ancestor(tenant_id=env.initial_tenant, timeline_id=tlid)
# TODO:
# - branch near existing L1 boundary, image layers?
# - investigate: why are layers started at uneven lsn? not just after branching, but in general.

View File

@@ -2740,3 +2740,85 @@ def test_pull_timeline_partial_segment_integrity(neon_env_builder: NeonEnvBuilde
raise Exception("Uneviction did not happen on source safekeeper yet")
wait_until(unevicted)
def test_timeline_disk_usage_limit(neon_env_builder: NeonEnvBuilder):
"""
Test that the timeline disk usage circuit breaker works as expected. We test that:
1. The circuit breaker kicks in when the timeline's disk usage exceeds the configured limit,
and it causes writes to hang.
2. The hanging writes unblock when the issue resolves (by restarting the safekeeper in the
test to simulate a more realistic production troubleshooting scenario).
3. We can continue to write as normal after the issue resolves.
4. There is no data corruption throughout the test.
"""
# Set up environment with a very small disk usage limit (1KB)
neon_env_builder.num_safekeepers = 1
remote_storage_kind = s3_storage()
neon_env_builder.enable_safekeeper_remote_storage(remote_storage_kind)
# Set a very small disk usage limit (1KB)
neon_env_builder.safekeeper_extra_opts = ["--max-timeline-disk-usage-bytes=1024"]
env = neon_env_builder.init_start()
# Create a timeline and endpoint
env.create_branch("test_timeline_disk_usage_limit")
endpoint = env.endpoints.create_start("test_timeline_disk_usage_limit")
# Get the safekeeper
sk = env.safekeepers[0]
# Inject a failpoint to stop WAL backup
with sk.http_client() as http_cli:
http_cli.configure_failpoints([("backup-lsn-range-pausable", "pause")])
# Write some data that will exceed the 1KB limit. While the failpoint is active, this operation
# will hang as Postgres encounters safekeeper-returned errors and retries.
def run_hanging_insert():
with closing(endpoint.connect()) as bg_conn:
with bg_conn.cursor() as bg_cur:
# This should generate more than 1KB of WAL
bg_cur.execute("create table t(key int, value text)")
bg_cur.execute("insert into t select generate_series(1,2000), 'payload'")
# Start the inserts in a background thread
bg_thread = threading.Thread(target=run_hanging_insert)
bg_thread.start()
# Wait for the error message to appear in the compute log
def error_logged():
return endpoint.log_contains("WAL storage utilization exceeds configured limit") is not None
wait_until(error_logged)
log.info("Found expected error message in compute log, resuming.")
# Sanity check that the hanging insert is indeed still hanging. Otherwise it means the circuit breaker we
# implemented didn't work as expected.
time.sleep(2)
assert bg_thread.is_alive(), (
"The hanging insert somehow unblocked without resolving the disk usage issue!"
)
log.info("Restarting the safekeeper to resume WAL backup.")
# Restart the safekeeper with default options to both clear the failpoint and restore the default (larger) disk usage limit.
for sk in env.safekeepers:
sk.stop().start(extra_opts=[])
# The hanging insert will now complete. Join the background thread so that we can
# verify that the insert completed successfully.
bg_thread.join(timeout=120)
assert not bg_thread.is_alive(), "Hanging insert did not complete after safekeeper restart"
log.info("Hanging insert unblocked.")
# Verify we can continue to write as normal
with closing(endpoint.connect()) as conn:
with conn.cursor() as cur:
cur.execute("insert into t select generate_series(2001,3000), 'payload'")
# Sanity check data correctness
with closing(endpoint.connect()) as conn:
with conn.cursor() as cur:
cur.execute("select count(*) from t")
# 2000 rows from first insert + 1000 from last insert
assert cur.fetchone() == (3000,)

View File

@@ -13,50 +13,6 @@ if TYPE_CHECKING:
from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder
# Checks that pageserver's walreceiver state is printed in the logs during WAL wait timeout.
# Ensures that walreceiver does not run without any data inserted and only starts after the insertion.
def test_pageserver_lsn_wait_error_start(neon_env_builder: NeonEnvBuilder):
# we assert below that the walreceiver is not active before data writes.
# with manually created timelines, it is active.
# FIXME: remove this test once we remove timelines_onto_safekeepers
neon_env_builder.storage_controller_config = {
"timelines_onto_safekeepers": False,
}
# Trigger WAL wait timeout faster
neon_env_builder.pageserver_config_override = "wait_lsn_timeout = '1s'"
env = neon_env_builder.init_start()
env.pageserver.http_client()
# In this test we force 'Timed out while waiting for WAL record error' while
# fetching basebackup and don't want any retries.
os.environ["NEON_COMPUTE_TESTING_BASEBACKUP_RETRIES"] = "1"
tenant_id, timeline_id = env.create_tenant()
expected_timeout_error = f"Timed out while waiting for WAL record at LSN {future_lsn} to arrive"
env.pageserver.allowed_errors.append(f".*{expected_timeout_error}.*")
try:
trigger_wait_lsn_timeout(env, tenant_id)
except Exception as e:
exception_string = str(e)
assert expected_timeout_error in exception_string, "Should time out during waiting for WAL"
assert "WalReceiver status: Not active" in exception_string, (
"Walreceiver should not be active before any data writes"
)
insert_test_elements(env, tenant_id, start=0, count=1_000)
try:
trigger_wait_lsn_timeout(env, tenant_id)
except Exception as e:
exception_string = str(e)
assert expected_timeout_error in exception_string, "Should time out during waiting for WAL"
assert "WalReceiver status: Not active" not in exception_string, (
"Should not be inactive anymore after INSERTs are made"
)
assert "WalReceiver status" in exception_string, "But still should have some other status"
# Checks that all active safekeepers are shown in pageserver's walreceiver state printed on WAL wait timeout.
# Kills one of the safekeepers and ensures that only the active ones are printed in the state.
def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuilder):

View File

@@ -0,0 +1,21 @@
DO $$
DECLARE
i numeric;
BEGIN
create role somebody;
FOR i IN 1..1000000 LOOP
BEGIN
IF i % 1000 = 0 THEN
alter role somebody password 'welcome';
ELSE
PERFORM 1;
END IF;
EXCEPTION WHEN OTHERS THEN
RAISE WARNING 'error';
END;
IF I = 1000000 THEN
PERFORM pg_log_backend_memory_contexts(pg_backend_pid());
END IF;
END LOOP;
END;
$$;

View File

@@ -10,3 +10,4 @@ test: neon-clog
test: neon-test-utils
test: neon-vacuum-full
test: neon-event-triggers
test: neon-subxacts

View File

@@ -0,0 +1,21 @@
DO $$
DECLARE
i numeric;
BEGIN
create role somebody;
FOR i IN 1..1000000 LOOP
BEGIN
IF i % 1000 = 0 THEN
alter role somebody password 'welcome';
ELSE
PERFORM 1;
END IF;
EXCEPTION WHEN OTHERS THEN
RAISE WARNING 'error';
END;
IF I = 1000000 THEN
PERFORM pg_log_backend_memory_contexts(pg_backend_pid());
END IF;
END LOOP;
END;
$$;