Mirror of https://github.com/neondatabase/neon.git (synced 2026-05-17 13:10:38 +00:00)

Merge remote-tracking branch 'origin/main' into problame/standby-horizon-removal-poc-rip-out
@@ -57,6 +57,8 @@ class EndpointHttpClient(requests.Session):
self.auth = BearerAuth(jwt)

self.mount("http://", HTTPAdapter())
self.prewarm_url = f"http://localhost:{external_port}/lfc/prewarm"
self.offload_url = f"http://localhost:{external_port}/lfc/offload"

def dbs_and_roles(self):
res = self.get(f"http://localhost:{self.external_port}/dbs_and_roles", auth=self.auth)
@@ -64,33 +66,39 @@ class EndpointHttpClient(requests.Session):
return res.json()

def prewarm_lfc_status(self) -> dict[str, str]:
res = self.get(f"http://localhost:{self.external_port}/lfc/prewarm")
res = self.get(self.prewarm_url)
res.raise_for_status()
json: dict[str, str] = res.json()
return json

def prewarm_lfc(self, from_endpoint_id: str | None = None):
url: str = f"http://localhost:{self.external_port}/lfc/prewarm"
params = {"from_endpoint": from_endpoint_id} if from_endpoint_id else dict()
self.post(url, params=params).raise_for_status()
self.post(self.prewarm_url, params=params).raise_for_status()
self.prewarm_lfc_wait()

def prewarm_lfc_wait(self):
def prewarmed():
json = self.prewarm_lfc_status()
status, err = json["status"], json.get("error")
assert status == "completed", f"{status}, error {err}"
assert status == "completed", f"{status}, {err=}"

wait_until(prewarmed, timeout=60)

def offload_lfc(self):
url = f"http://localhost:{self.external_port}/lfc/offload"
self.post(url).raise_for_status()
def offload_lfc_status(self) -> dict[str, str]:
res = self.get(self.offload_url)
res.raise_for_status()
json: dict[str, str] = res.json()
return json

def offload_lfc(self):
self.post(self.offload_url).raise_for_status()
self.offload_lfc_wait()

def offload_lfc_wait(self):
def offloaded():
res = self.get(url)
res.raise_for_status()
json = res.json()
json = self.offload_lfc_status()
status, err = json["status"], json.get("error")
assert status == "completed", f"{status}, error {err}"
assert status == "completed", f"{status}, {err=}"

wait_until(offloaded)
@@ -159,6 +159,9 @@ PAGESERVER_GLOBAL_METRICS: tuple[str, ...] = (
)

PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = (
# BEGIN_HADRON
"pageserver_active_storage_operations_count",
# END_HADRON
"pageserver_current_logical_size",
"pageserver_resident_physical_size",
"pageserver_io_operations_bytes_total",
@@ -568,6 +568,8 @@ class NeonLocalCli(AbstractNeonCli):
timeout: str | None = None,
env: dict[str, str] | None = None,
dev: bool = False,
autoprewarm: bool = False,
offload_lfc_interval_seconds: int | None = None,
) -> subprocess.CompletedProcess[str]:
args = [
"endpoint",
@@ -593,6 +595,10 @@ class NeonLocalCli(AbstractNeonCli):
args.extend(["--create-test-user"])
if timeout is not None:
args.extend(["--start-timeout", str(timeout)])
if autoprewarm:
args.extend(["--autoprewarm"])
if offload_lfc_interval_seconds is not None:
args.extend(["--offload-lfc-interval-seconds", str(offload_lfc_interval_seconds)])
if dev:
args.extend(["--dev"])
@@ -1875,6 +1875,7 @@ class PageserverSchedulingPolicy(StrEnum):
FILLING = "Filling"
PAUSE = "Pause"
PAUSE_FOR_RESTART = "PauseForRestart"
DELETING = "Deleting"

class StorageControllerLeadershipStatus(StrEnum):
@@ -2083,14 +2084,30 @@ class NeonStorageController(MetricsGetter, LogUtils):
headers=self.headers(TokenScope.ADMIN),
)

def node_delete(self, node_id):
log.info(f"node_delete({node_id})")
def node_delete_old(self, node_id):
log.info(f"node_delete_old({node_id})")
self.request(
"DELETE",
f"{self.api}/control/v1/node/{node_id}",
headers=self.headers(TokenScope.ADMIN),
)

def node_delete(self, node_id):
log.info(f"node_delete({node_id})")
self.request(
"PUT",
f"{self.api}/control/v1/node/{node_id}/delete",
headers=self.headers(TokenScope.ADMIN),
)

def cancel_node_delete(self, node_id):
log.info(f"cancel_node_delete({node_id})")
self.request(
"DELETE",
f"{self.api}/control/v1/node/{node_id}/delete",
headers=self.headers(TokenScope.ADMIN),
)

def tombstone_delete(self, node_id):
log.info(f"tombstone_delete({node_id})")
self.request(
@@ -4353,6 +4370,8 @@ class Endpoint(PgProtocol, LogUtils):
basebackup_request_tries: int | None = None,
timeout: str | None = None,
env: dict[str, str] | None = None,
autoprewarm: bool = False,
offload_lfc_interval_seconds: int | None = None,
) -> Self:
"""
Start the Postgres instance.
@@ -4377,6 +4396,8 @@ class Endpoint(PgProtocol, LogUtils):
basebackup_request_tries=basebackup_request_tries,
timeout=timeout,
env=env,
autoprewarm=autoprewarm,
offload_lfc_interval_seconds=offload_lfc_interval_seconds,
)
self._running.release(1)
self.log_config_value("shared_buffers")
@@ -4592,6 +4613,8 @@ class Endpoint(PgProtocol, LogUtils):
pageserver_id: int | None = None,
allow_multiple: bool = False,
basebackup_request_tries: int | None = None,
autoprewarm: bool = False,
offload_lfc_interval_seconds: int | None = None,
) -> Self:
"""
Create an endpoint, apply config, and start Postgres.
@@ -4612,6 +4635,8 @@ class Endpoint(PgProtocol, LogUtils):
pageserver_id=pageserver_id,
allow_multiple=allow_multiple,
basebackup_request_tries=basebackup_request_tries,
autoprewarm=autoprewarm,
offload_lfc_interval_seconds=offload_lfc_interval_seconds,
)

return self
@@ -4696,6 +4721,8 @@ class EndpointFactory:
remote_ext_base_url: str | None = None,
pageserver_id: int | None = None,
basebackup_request_tries: int | None = None,
autoprewarm: bool = False,
offload_lfc_interval_seconds: int | None = None,
) -> Endpoint:
ep = Endpoint(
self.env,
@@ -4717,6 +4744,8 @@ class EndpointFactory:
remote_ext_base_url=remote_ext_base_url,
pageserver_id=pageserver_id,
basebackup_request_tries=basebackup_request_tries,
autoprewarm=autoprewarm,
offload_lfc_interval_seconds=offload_lfc_interval_seconds,
)

def create(
@@ -111,6 +111,15 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
".*stalling layer flushes for compaction backpressure.*",
".*layer roll waiting for flush due to compaction backpressure.*",
".*BatchSpanProcessor.*",
# Can happen in tests that purposely wipe pageserver "local disk" data.
".*Local data loss suspected.*",
# Too many frozen layers error is normal during intensive benchmarks
".*too many frozen layers.*",
# Transient errors when resolving tenant shards by page service
".*Fail to resolve tenant shard in attempt.*",
# Expected warnings when pageserver has not refreshed GC info yet
".*pitr LSN/interval not found, skipping force image creation LSN calculation.*",
".*No broker updates received for a while.*",
*(
[
r".*your platform is not a supported production platform, ignoing request for O_DIRECT; this could hide alignment bugs.*"
@@ -112,12 +112,18 @@ class TimelineCreateRequest:
class TimelineMembershipSwitchResponse:
previous_conf: MembershipConfiguration
current_conf: MembershipConfiguration
last_log_term: int
flush_lsn: Lsn

@classmethod
def from_json(cls, d: dict[str, Any]) -> TimelineMembershipSwitchResponse:
previous_conf = MembershipConfiguration.from_json(d["previous_conf"])
current_conf = MembershipConfiguration.from_json(d["current_conf"])
return TimelineMembershipSwitchResponse(previous_conf, current_conf)
last_log_term = d["last_log_term"]
flush_lsn = Lsn(d["flush_lsn"])
return TimelineMembershipSwitchResponse(
previous_conf, current_conf, last_log_term, flush_lsn
)

class SafekeeperHttpClient(requests.Session, MetricsGetter):
@@ -55,9 +55,10 @@ def test_pageserver_characterize_throughput_with_n_tenants(
@pytest.mark.parametrize("duration", [20 * 60])
@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(2048)])
# we use 1 client to characterize latencies, and 64 clients to characterize throughput/scalability
# we use 64 clients because typically for a high number of connections we recommend the connection pooler
# which by default uses 64 connections
@pytest.mark.parametrize("n_clients", [1, 64])
# we use 8 clients because we see a latency knee around 6-8 clients on im4gn.2xlarge instance type,
# which we use for this periodic test - at a cpu utilization of around 70 % - which is considered
# a good utilization for pageserver.
@pytest.mark.parametrize("n_clients", [1, 8])
@pytest.mark.parametrize("n_tenants", [1])
@pytest.mark.timeout(2400)
def test_pageserver_characterize_latencies_with_1_client_and_throughput_with_many_clients_one_tenant(
@@ -70,7 +71,13 @@ def test_pageserver_characterize_latencies_with_1_client_and_throughput_with_man
n_clients: int,
):
setup_and_run_pagebench_benchmark(
neon_env_builder, zenbenchmark, pg_bin, n_tenants, pgbench_scale, duration, n_clients
neon_env_builder,
zenbenchmark,
pg_bin,
n_tenants,
pgbench_scale,
duration,
n_clients,
)
@@ -85,7 +92,8 @@ def setup_and_run_pagebench_benchmark(
):
def record(metric, **kwargs):
zenbenchmark.record(
metric_name=f"pageserver_max_throughput_getpage_at_latest_lsn.{metric}", **kwargs
metric_name=f"pageserver_max_throughput_getpage_at_latest_lsn.{metric}",
**kwargs,
)

params: dict[str, tuple[Any, dict[str, Any]]] = {}
@@ -103,9 +111,7 @@ def setup_and_run_pagebench_benchmark(
# configure cache sizes like in prod
page_cache_size = 16384
max_file_descriptors = 500000
neon_env_builder.pageserver_config_override = (
f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}"
)
neon_env_builder.pageserver_config_override = f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}; disk_usage_based_eviction={{enabled = false}}"

tracing_config = PageserverTracingConfig(
sampling_ratio=(0, 1000),
@@ -121,7 +127,10 @@ def setup_and_run_pagebench_benchmark(
page_cache_size * 8192,
{"unit": "byte"},
),
"pageserver_config_override.max_file_descriptors": (max_file_descriptors, {"unit": ""}),
"pageserver_config_override.max_file_descriptors": (
max_file_descriptors,
{"unit": ""},
),
"pageserver_config_override.sampling_ratio": (ratio, {"unit": ""}),
}
)
@@ -416,6 +416,8 @@ def test_duplicate_creation(neon_env_builder: NeonEnvBuilder):
# timeline creation (uploads). mask it out here to avoid flakiness.
del success_result["remote_consistent_lsn_visible"]
del repeat_result["remote_consistent_lsn_visible"]
del success_result["walreceiver_status"]
del repeat_result["walreceiver_status"]
assert repeat_result == success_result
finally:
env.pageserver.stop(immediate=True)
@@ -1,61 +1,122 @@
import random
import threading
import time
from enum import Enum
from enum import StrEnum
from typing import Any

import pytest
from fixtures.endpoint.http import EndpointHttpClient
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv
from fixtures.utils import USE_LFC
from fixtures.utils import USE_LFC, wait_until
from prometheus_client.parser import text_string_to_metric_families as prom_parse_impl
from psycopg2.extensions import cursor as Cursor

class LfcQueryMethod(Enum):
COMPUTE_CTL = False
POSTGRES = True
class PrewarmMethod(StrEnum):
POSTGRES = "postgres"
COMPUTE_CTL = "compute-ctl"
AUTOPREWARM = "autoprewarm"

PREWARM_LABEL = "compute_ctl_lfc_prewarm_requests_total"
OFFLOAD_LABEL = "compute_ctl_lfc_offload_requests_total"
QUERY_OPTIONS = LfcQueryMethod.POSTGRES, LfcQueryMethod.COMPUTE_CTL
PREWARM_LABEL = "compute_ctl_lfc_prewarms_total"
PREWARM_ERR_LABEL = "compute_ctl_lfc_prewarm_errors_total"
OFFLOAD_LABEL = "compute_ctl_lfc_offloads_total"
OFFLOAD_ERR_LABEL = "compute_ctl_lfc_offload_errors_total"
METHOD_VALUES = [e for e in PrewarmMethod]
METHOD_IDS = [e.value for e in PrewarmMethod]

def check_pinned_entries(cur):
# some LFC buffer can be temporary locked by autovacuum or background writer
for _ in range(10):
def check_pinned_entries(cur: Cursor):
"""
Wait till none of LFC buffers are pinned
"""

def none_pinned():
cur.execute("select lfc_value from neon_lfc_stats where lfc_key='file_cache_chunks_pinned'")
n_pinned = cur.fetchall()[0][0]
if n_pinned == 0:
break
time.sleep(1)
assert n_pinned == 0
assert cur.fetchall()[0][0] == 0

wait_until(none_pinned)

def prom_parse(client: EndpointHttpClient) -> dict[str, float]:
labels = PREWARM_LABEL, OFFLOAD_LABEL, PREWARM_ERR_LABEL, OFFLOAD_ERR_LABEL
return {
sample.name: sample.value
sample.name: int(sample.value)
for family in prom_parse_impl(client.metrics())
for sample in family.samples
if sample.name in (PREWARM_LABEL, OFFLOAD_LABEL)
if sample.name in labels
}

def offload_lfc(method: PrewarmMethod, client: EndpointHttpClient, cur: Cursor) -> Any:
if method == PrewarmMethod.AUTOPREWARM:
client.offload_lfc_wait()
elif method == PrewarmMethod.COMPUTE_CTL:
status = client.prewarm_lfc_status()
assert status["status"] == "not_prewarmed"
assert "error" not in status
client.offload_lfc()
assert client.prewarm_lfc_status()["status"] == "not_prewarmed"
parsed = prom_parse(client)
desired = {OFFLOAD_LABEL: 1, PREWARM_LABEL: 0, OFFLOAD_ERR_LABEL: 0, PREWARM_ERR_LABEL: 0}
assert parsed == desired, f"{parsed=} != {desired=}"
elif method == PrewarmMethod.POSTGRES:
cur.execute("select get_local_cache_state()")
return cur.fetchall()[0][0]
else:
raise AssertionError(f"{method} not in PrewarmMethod")

def prewarm_endpoint(
method: PrewarmMethod, client: EndpointHttpClient, cur: Cursor, lfc_state: str | None
):
if method == PrewarmMethod.AUTOPREWARM:
client.prewarm_lfc_wait()
elif method == PrewarmMethod.COMPUTE_CTL:
client.prewarm_lfc()
elif method == PrewarmMethod.POSTGRES:
cur.execute("select prewarm_local_cache(%s)", (lfc_state,))

def check_prewarmed(
method: PrewarmMethod, client: EndpointHttpClient, desired_status: dict[str, str | int]
):
if method == PrewarmMethod.AUTOPREWARM:
assert client.prewarm_lfc_status() == desired_status
assert prom_parse(client)[PREWARM_LABEL] == 1
elif method == PrewarmMethod.COMPUTE_CTL:
assert client.prewarm_lfc_status() == desired_status
desired = {OFFLOAD_LABEL: 0, PREWARM_LABEL: 1, PREWARM_ERR_LABEL: 0, OFFLOAD_ERR_LABEL: 0}
assert prom_parse(client) == desired

@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
@pytest.mark.parametrize("query", QUERY_OPTIONS, ids=["postgres", "compute-ctl"])
def test_lfc_prewarm(neon_simple_env: NeonEnv, query: LfcQueryMethod):
@pytest.mark.parametrize("method", METHOD_VALUES, ids=METHOD_IDS)
def test_lfc_prewarm(neon_simple_env: NeonEnv, method: PrewarmMethod):
"""
Test we can offload endpoint's LFC cache to endpoint storage.
Test we can prewarm endpoint with LFC cache loaded from endpoint storage.
"""
env = neon_simple_env
n_records = 1000000
endpoint = env.endpoints.create_start(
branch_name="main",
config_lines=[
"autovacuum = off",
"shared_buffers=1MB",
"neon.max_file_cache_size=1GB",
"neon.file_cache_size_limit=1GB",
"neon.file_cache_prewarm_limit=1000",
],
)
cfg = [
"autovacuum = off",
"shared_buffers=1MB",
"neon.max_file_cache_size=1GB",
"neon.file_cache_size_limit=1GB",
"neon.file_cache_prewarm_limit=1000",
]
offload_secs = 2

if method == PrewarmMethod.AUTOPREWARM:
endpoint = env.endpoints.create_start(
branch_name="main",
config_lines=cfg,
autoprewarm=True,
offload_lfc_interval_seconds=offload_secs,
)
else:
endpoint = env.endpoints.create_start(branch_name="main", config_lines=cfg)

pg_conn = endpoint.connect()
pg_cur = pg_conn.cursor()
@@ -69,75 +130,64 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, query: LfcQueryMethod):
lfc_cur.execute(f"insert into t (pk) values (generate_series(1,{n_records}))")
log.info(f"Inserted {n_records} rows")

http_client = endpoint.http_client()
if query is LfcQueryMethod.COMPUTE_CTL:
status = http_client.prewarm_lfc_status()
assert status["status"] == "not_prewarmed"
assert "error" not in status
http_client.offload_lfc()
assert http_client.prewarm_lfc_status()["status"] == "not_prewarmed"
assert prom_parse(http_client) == {OFFLOAD_LABEL: 1, PREWARM_LABEL: 0}
else:
pg_cur.execute("select get_local_cache_state()")
lfc_state = pg_cur.fetchall()[0][0]
client = endpoint.http_client()
lfc_state = offload_lfc(method, client, pg_cur)

endpoint.stop()
endpoint.start()
if method == PrewarmMethod.AUTOPREWARM:
endpoint.start(autoprewarm=True, offload_lfc_interval_seconds=offload_secs)
else:
endpoint.start()

pg_conn = endpoint.connect()
pg_cur = pg_conn.cursor()

lfc_conn = endpoint.connect(dbname="lfc")
lfc_cur = lfc_conn.cursor()

if query is LfcQueryMethod.COMPUTE_CTL:
http_client.prewarm_lfc()
else:
pg_cur.execute("select prewarm_local_cache(%s)", (lfc_state,))
prewarm_endpoint(method, client, pg_cur, lfc_state)

pg_cur.execute("select lfc_value from neon_lfc_stats where lfc_key='file_cache_used_pages'")
lfc_used_pages = pg_cur.fetchall()[0][0]
log.info(f"Used LFC size: {lfc_used_pages}")
pg_cur.execute("select * from get_prewarm_info()")
prewarm_info = pg_cur.fetchall()[0]
log.info(f"Prewarm info: {prewarm_info}")
total, prewarmed, skipped, _ = prewarm_info
total, prewarmed, skipped, _ = pg_cur.fetchall()[0]
log.info(f"Prewarm info: {total=} {prewarmed=} {skipped=}")
progress = (prewarmed + skipped) * 100 // total
log.info(f"Prewarm progress: {progress}%")

assert lfc_used_pages > 10000
assert (
prewarm_info[0] > 0
and prewarm_info[1] > 0
and prewarm_info[0] == prewarm_info[1] + prewarm_info[2]
)
assert total > 0
assert prewarmed > 0
assert total == prewarmed + skipped

lfc_cur.execute("select sum(pk) from t")
assert lfc_cur.fetchall()[0][0] == n_records * (n_records + 1) / 2

check_pinned_entries(pg_cur)

desired = {"status": "completed", "total": total, "prewarmed": prewarmed, "skipped": skipped}
if query is LfcQueryMethod.COMPUTE_CTL:
assert http_client.prewarm_lfc_status() == desired
assert prom_parse(http_client) == {OFFLOAD_LABEL: 0, PREWARM_LABEL: 1}
check_prewarmed(method, client, desired)

# autoprewarm isn't needed as we prewarm manually
WORKLOAD_VALUES = METHOD_VALUES[:-1]
WORKLOAD_IDS = METHOD_IDS[:-1]

@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
@pytest.mark.parametrize("query", QUERY_OPTIONS, ids=["postgres", "compute-ctl"])
def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, query: LfcQueryMethod):
@pytest.mark.parametrize("method", WORKLOAD_VALUES, ids=WORKLOAD_IDS)
def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, method: PrewarmMethod):
"""
Test continuously prewarming endpoint when there is a write-heavy workload going in parallel
"""
env = neon_simple_env
n_records = 10000
n_threads = 4
endpoint = env.endpoints.create_start(
branch_name="main",
config_lines=[
"shared_buffers=1MB",
"neon.max_file_cache_size=1GB",
"neon.file_cache_size_limit=1GB",
"neon.file_cache_prewarm_limit=1000000",
],
)
cfg = [
"shared_buffers=1MB",
"neon.max_file_cache_size=1GB",
"neon.file_cache_size_limit=1GB",
"neon.file_cache_prewarm_limit=1000000",
]
endpoint = env.endpoints.create_start(branch_name="main", config_lines=cfg)

pg_conn = endpoint.connect()
pg_cur = pg_conn.cursor()
@@ -154,12 +204,7 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, query: LfcQueryMet
log.info(f"Inserted {n_records} rows")

http_client = endpoint.http_client()
if query is LfcQueryMethod.COMPUTE_CTL:
http_client.offload_lfc()
else:
pg_cur.execute("select get_local_cache_state()")
lfc_state = pg_cur.fetchall()[0][0]

lfc_state = offload_lfc(method, http_client, pg_cur)
running = True
n_prewarms = 0

@@ -170,8 +215,8 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, query: LfcQueryMet
while running:
src = random.randint(1, n_records)
dst = random.randint(1, n_records)
lfc_cur.execute("update accounts set balance=balance-100 where id=%s", (src,))
lfc_cur.execute("update accounts set balance=balance+100 where id=%s", (dst,))
lfc_cur.execute(f"update accounts set balance=balance-100 where id={src}")
lfc_cur.execute(f"update accounts set balance=balance+100 where id={dst}")
n_transfers += 1
log.info(f"Number of transfers: {n_transfers}")

@@ -183,13 +228,7 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, query: LfcQueryMet
pg_cur.execute("select pg_reload_conf()")
pg_cur.execute("alter system set neon.file_cache_size_limit='1GB'")
pg_cur.execute("select pg_reload_conf()")

if query is LfcQueryMethod.COMPUTE_CTL:
# Same thing as prewarm_lfc(), testing other method
http_client.prewarm_lfc(endpoint.endpoint_id)
else:
pg_cur.execute("select prewarm_local_cache(%s)", (lfc_state,))

prewarm_endpoint(method, http_client, pg_cur, lfc_state)
nonlocal n_prewarms
n_prewarms += 1
log.info(f"Number of prewarms: {n_prewarms}")
@@ -203,7 +242,10 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, query: LfcQueryMet
prewarm_thread = threading.Thread(target=prewarm)
prewarm_thread.start()

time.sleep(20)
def prewarmed():
assert n_prewarms > 5

wait_until(prewarmed)

running = False
for t in workload_threads:
@@ -215,5 +257,12 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, query: LfcQueryMet
assert total_balance == 0

check_pinned_entries(pg_cur)
if query is LfcQueryMethod.COMPUTE_CTL:
assert prom_parse(http_client) == {OFFLOAD_LABEL: 1, PREWARM_LABEL: n_prewarms}
if method == PrewarmMethod.POSTGRES:
return
desired = {
OFFLOAD_LABEL: 1,
PREWARM_LABEL: n_prewarms,
OFFLOAD_ERR_LABEL: 0,
PREWARM_ERR_LABEL: 0,
}
assert prom_parse(http_client) == desired
@@ -180,7 +180,7 @@ def test_metric_collection(
httpserver.check()

# Check that at least one bucket output object is present, and that all
# can be decompressed and decoded.
# can be decompressed and decoded as NDJSON.
bucket_dumps = {}
assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
for dirpath, _dirs, files in os.walk(env.pageserver_remote_storage.root):
@@ -188,7 +188,13 @@ def test_metric_collection(
file_path = os.path.join(dirpath, file)
log.info(file_path)
if file.endswith(".gz"):
bucket_dumps[file_path] = json.load(gzip.open(file_path))
events = []
with gzip.open(file_path, "rt") as f:
for line in f:
line = line.strip()
if line:
events.append(json.loads(line))
bucket_dumps[file_path] = {"events": events}

assert len(bucket_dumps) >= 1
assert all("events" in data for data in bucket_dumps.values())
@@ -27,6 +27,7 @@ def test_safekeeper_migration_simple(neon_env_builder: NeonEnvBuilder):
[
".*Timeline .* was cancelled and cannot be used anymore.*",
".*Timeline .* has been deleted.*",
".*Timeline .* was not found in global map.*",
".*wal receiver task finished with an error.*",
]
)
@@ -1,8 +1,11 @@
from __future__ import annotations

import os
import random
import threading
import time
from collections import defaultdict
from threading import Event
from typing import TYPE_CHECKING, Any

import pytest
@@ -1505,6 +1508,171 @@ def test_sharding_split_failures(
env.storage_controller.consistency_check()

@pytest.mark.skip(reason="The backpressure change has not been merged yet.")
def test_back_pressure_during_split(neon_env_builder: NeonEnvBuilder):
"""
Test backpressure can ignore new shards during tenant split so that if we abort the split,
PG can continue without being blocked.
"""
DBNAME = "regression"

init_shard_count = 4
neon_env_builder.num_pageservers = init_shard_count
stripe_size = 32

env = neon_env_builder.init_start(
initial_tenant_shard_count=init_shard_count, initial_tenant_shard_stripe_size=stripe_size
)

env.storage_controller.allowed_errors.extend(
[
# All split failures log a warning when they enqueue the abort operation
".*Enqueuing background abort.*",
# Tolerate any error logs that mention a failpoint
".*failpoint.*",
]
)

endpoint = env.endpoints.create(
"main",
config_lines=[
"max_replication_write_lag = 1MB",
"databricks.max_wal_mb_per_second = 1",
"neon.max_cluster_size = 10GB",
],
)
endpoint.respec(skip_pg_catalog_updates=False) # Needed for databricks_system to get created.
endpoint.start()

endpoint.safe_psql(f"CREATE DATABASE {DBNAME}")

endpoint.safe_psql("CREATE TABLE usertable ( YCSB_KEY INT, FIELD0 TEXT);")
write_done = Event()

def write_data(write_done):
while not write_done.is_set():
endpoint.safe_psql(
"INSERT INTO usertable SELECT random(), repeat('a', 1000);", log_query=False
)
log.info("write_data thread exiting")

writer_thread = threading.Thread(target=write_data, args=(write_done,))
writer_thread.start()

env.storage_controller.configure_failpoints(("shard-split-pre-complete", "return(1)"))
# split the tenant
with pytest.raises(StorageControllerApiException):
env.storage_controller.tenant_shard_split(env.initial_tenant, shard_count=16)

write_done.set()
writer_thread.join()

# writing more data to page servers after split is aborted
for _i in range(5000):
endpoint.safe_psql(
"INSERT INTO usertable SELECT random(), repeat('a', 1000);", log_query=False
)

# wait until write lag becomes 0
def check_write_lag_is_zero():
res = endpoint.safe_psql(
"""
SELECT
pg_wal_lsn_diff(pg_current_wal_flush_lsn(), received_lsn) as received_lsn_lag
FROM neon.backpressure_lsns();
""",
dbname="databricks_system",
log_query=False,
)
log.info(f"received_lsn_lag = {res[0][0]}")
assert res[0][0] == 0

wait_until(check_write_lag_is_zero)
endpoint.stop_and_destroy()

# BEGIN_HADRON
def test_shard_resolve_during_split_abort(neon_env_builder: NeonEnvBuilder):
"""
Tests that page service is able to resolve the correct shard during tenant split without causing query errors
"""
DBNAME = "regression"
WORKER_THREADS = 16
ROW_COUNT = 10000

init_shard_count = 4
neon_env_builder.num_pageservers = 1
stripe_size = 16

env = neon_env_builder.init_start(
initial_tenant_shard_count=init_shard_count, initial_tenant_shard_stripe_size=stripe_size
)

env.storage_controller.allowed_errors.extend(
[
# All split failures log a warning when they enqueue the abort operation
".*Enqueuing background abort.*",
# Tolerate any error logs that mention a failpoint
".*failpoint.*",
]
)

endpoint = env.endpoints.create("main")
endpoint.respec(skip_pg_catalog_updates=False) # Needed for databricks_system to get created.
endpoint.start()

endpoint.safe_psql(f"CREATE DATABASE {DBNAME}")

# generate 10MB of data
endpoint.safe_psql(
f"CREATE TABLE usertable AS SELECT s AS KEY, repeat('a', 1000) as VALUE from generate_series(1, {ROW_COUNT}) s;"
)
read_done = Event()

def read_data(read_done):
i = 0
while not read_done.is_set() or i < 10:
endpoint.safe_psql(
f"SELECT * FROM usertable where KEY = {random.randint(1, ROW_COUNT)}",
log_query=False,
)
i += 1
log.info(f"read_data thread exiting. Executed {i} queries.")

reader_threads = []
for _i in range(WORKER_THREADS):
reader_thread = threading.Thread(target=read_data, args=(read_done,))
reader_thread.start()
reader_threads.append(reader_thread)

env.storage_controller.configure_failpoints(("shard-split-pre-complete", "return(1)"))
# split the tenant
with pytest.raises(StorageControllerApiException):
env.storage_controller.tenant_shard_split(env.initial_tenant, shard_count=16)

# wait until abort is done
def check_tenant_status():
active_count = 0
for i in range(init_shard_count):
status = env.pageserver.http_client().tenant_status(
TenantShardId(env.initial_tenant, i, init_shard_count)
)
if status["state"]["slug"] == "Active":
active_count += 1
assert active_count == 4

wait_until(check_tenant_status)

read_done.set()
for thread in reader_threads:
thread.join()

endpoint.stop()

# END_HADRON

def test_sharding_backpressure(neon_env_builder: NeonEnvBuilder):
"""
Check a scenario when one of the shards is much slower than others.
@@ -989,6 +989,105 @@ def test_storage_controller_compute_hook_retry(
)

@run_only_on_default_postgres("postgres behavior is not relevant")
def test_storage_controller_compute_hook_keep_failing(
httpserver: HTTPServer,
neon_env_builder: NeonEnvBuilder,
httpserver_listen_address: ListenAddress,
):
neon_env_builder.num_pageservers = 4
neon_env_builder.storage_controller_config = {"use_local_compute_notifications": False}
(host, port) = httpserver_listen_address
neon_env_builder.control_plane_hooks_api = f"http://{host}:{port}"

# Set up CP handler for compute notifications
status_by_tenant: dict[TenantId, int] = {}

def handler(request: Request):
notify_request = request.json
assert notify_request is not None
status = status_by_tenant[TenantId(notify_request["tenant_id"])]
log.info(f"Notify request[{status}]: {notify_request}")
return Response(status=status)

httpserver.expect_request("/notify-attach", method="PUT").respond_with_handler(handler)

# Run neon environment
env = neon_env_builder.init_configs()
env.start()

# Create two tenants:
# - The first tenant is banned by CP and contains only one shard
# - The second tenant is allowed by CP and contains four shards
banned_tenant = TenantId.generate()
status_by_tenant[banned_tenant] = 200 # we will ban this tenant later
env.create_tenant(banned_tenant, placement_policy='{"Attached": 1}')

shard_count = 4
allowed_tenant = TenantId.generate()
status_by_tenant[allowed_tenant] = 200
env.create_tenant(allowed_tenant, shard_count=shard_count, placement_policy='{"Attached": 1}')

# Find the pageserver of the banned tenant
banned_tenant_ps = env.get_tenant_pageserver(banned_tenant)
assert banned_tenant_ps is not None
alive_pageservers = [p for p in env.pageservers if p.id != banned_tenant_ps.id]

# Stop pageserver and ban tenant to trigger failed reconciliation
log.info(f"Banning tenant {banned_tenant} and stopping pageserver {banned_tenant_ps.id}")
status_by_tenant[banned_tenant] = 423
banned_tenant_ps.stop()
env.storage_controller.allowed_errors.append(NOTIFY_BLOCKED_LOG)
env.storage_controller.allowed_errors.extend(NOTIFY_FAILURE_LOGS)
env.storage_controller.allowed_errors.append(".*Keeping extra secondaries.*")
env.storage_controller.allowed_errors.append(".*Shard reconciliation is keep-failing.*")
env.storage_controller.node_configure(banned_tenant_ps.id, {"availability": "Offline"})

# Migrate all allowed tenant shards to the first alive pageserver
# to trigger storage controller optimizations due to affinity rules
for shard_number in range(shard_count):
log.info(f"Migrating shard {shard_number} of {allowed_tenant} to {alive_pageservers[0].id}")
env.storage_controller.tenant_shard_migrate(
TenantShardId(allowed_tenant, shard_number, shard_count),
alive_pageservers[0].id,
config=StorageControllerMigrationConfig(prewarm=False, override_scheduler=True),
)

# Make some reconcile_all calls to trigger optimizations
# RECONCILE_COUNT must be greater than storcon's MAX_CONSECUTIVE_RECONCILIATION_ERRORS
RECONCILE_COUNT = 12
for i in range(RECONCILE_COUNT):
try:
n = env.storage_controller.reconcile_all()
log.info(f"Reconciliation attempt {i} finished with success: {n}")
except StorageControllerApiException as e:
assert "Control plane tenant busy" in str(e)
log.info(f"Reconciliation attempt {i} finished with failure")

banned_descr = env.storage_controller.tenant_describe(banned_tenant)
assert banned_descr["shards"][0]["is_pending_compute_notification"] is True
time.sleep(2)

# Check that the allowed tenant shards are optimized due to affinity rules
locations = alive_pageservers[0].http_client().tenant_list_locations()["tenant_shards"]
not_optimized_shard_count = 0
for loc in locations:
tsi = TenantShardId.parse(loc[0])
if tsi.tenant_id != allowed_tenant:
continue
if loc[1]["mode"] == "AttachedSingle":
not_optimized_shard_count += 1
log.info(f"Shard {tsi} seen in mode {loc[1]['mode']}")

assert not_optimized_shard_count < shard_count, "At least one shard should be optimized"

# Unban the tenant and run reconciliations
status_by_tenant[banned_tenant] = 200
env.storage_controller.reconcile_all()
banned_descr = env.storage_controller.tenant_describe(banned_tenant)
assert banned_descr["shards"][0]["is_pending_compute_notification"] is False

@run_only_on_default_postgres("this test doesn't start an endpoint")
def test_storage_controller_compute_hook_revert(
httpserver: HTTPServer,
@@ -2522,7 +2621,7 @@ def test_storage_controller_node_deletion(
wait_until(assert_shards_migrated)

log.info(f"Deleting pageserver {victim.id}")
env.storage_controller.node_delete(victim.id)
env.storage_controller.node_delete_old(victim.id)

if not while_offline:
@@ -2557,6 +2656,60 @@ def test_storage_controller_node_deletion(
env.storage_controller.consistency_check()

def test_storage_controller_node_delete_cancellation(neon_env_builder: NeonEnvBuilder):
neon_env_builder.num_pageservers = 3
neon_env_builder.num_azs = 3
env = neon_env_builder.init_configs()
env.start()

tenant_count = 12
shard_count_per_tenant = 16
tenant_ids = []

for _ in range(0, tenant_count):
tid = TenantId.generate()
tenant_ids.append(tid)
env.create_tenant(
tid, placement_policy='{"Attached":1}', shard_count=shard_count_per_tenant
)

# Sanity check: initial creations should not leave the system in an unstable scheduling state
assert env.storage_controller.reconcile_all() == 0

nodes = env.storage_controller.node_list()
assert len(nodes) == 3

env.storage_controller.configure_failpoints(("sleepy-delete-loop", "return(10000)"))

ps_id_to_delete = env.pageservers[0].id

env.storage_controller.warm_up_all_secondaries()
env.storage_controller.retryable_node_operation(
lambda ps_id: env.storage_controller.node_delete(ps_id),
ps_id_to_delete,
max_attempts=3,
backoff=2,
)

env.storage_controller.poll_node_status(
ps_id_to_delete,
PageserverAvailability.ACTIVE,
PageserverSchedulingPolicy.DELETING,
max_attempts=6,
backoff=2,
)

env.storage_controller.cancel_node_delete(ps_id_to_delete)

env.storage_controller.poll_node_status(
ps_id_to_delete,
PageserverAvailability.ACTIVE,
PageserverSchedulingPolicy.ACTIVE,
max_attempts=6,
backoff=2,
)

@pytest.mark.parametrize("shard_count", [None, 2])
def test_storage_controller_metadata_health(
neon_env_builder: NeonEnvBuilder,
@@ -3112,7 +3265,7 @@ def test_ps_unavailable_after_delete(neon_env_builder: NeonEnvBuilder):
assert_nodes_count(3)

ps = env.pageservers[0]
env.storage_controller.node_delete(ps.id)
env.storage_controller.node_delete_old(ps.id)

# After deletion, the node count must be reduced
assert_nodes_count(2)
@@ -3530,18 +3683,21 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder):
# some small tests for the scheduling policy querying and returning APIs
newest_info = target.get_safekeeper(inserted["id"])
assert newest_info
assert newest_info["scheduling_policy"] == "Pause"
target.safekeeper_scheduling_policy(inserted["id"], "Active")
newest_info = target.get_safekeeper(inserted["id"])
assert newest_info
assert newest_info["scheduling_policy"] == "Active"
# Ensure idempotency
target.safekeeper_scheduling_policy(inserted["id"], "Active")
newest_info = target.get_safekeeper(inserted["id"])
assert newest_info
assert newest_info["scheduling_policy"] == "Active"
# change back to paused again
assert (
newest_info["scheduling_policy"] == "Activating"
or newest_info["scheduling_policy"] == "Active"
)
target.safekeeper_scheduling_policy(inserted["id"], "Pause")
newest_info = target.get_safekeeper(inserted["id"])
assert newest_info
assert newest_info["scheduling_policy"] == "Pause"
# Ensure idempotency
target.safekeeper_scheduling_policy(inserted["id"], "Pause")
newest_info = target.get_safekeeper(inserted["id"])
assert newest_info
assert newest_info["scheduling_policy"] == "Pause"
# change back to active again
target.safekeeper_scheduling_policy(inserted["id"], "Active")

def storcon_heartbeat():
assert env.storage_controller.log_contains(
@@ -3554,6 +3710,57 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder):
target.safekeeper_scheduling_policy(inserted["id"], "Decomissioned")

@run_only_on_default_postgres("this is like a 'unit test' against storcon db")
def test_safekeeper_activating_to_active(neon_env_builder: NeonEnvBuilder):
env = neon_env_builder.init_configs()
env.start()

fake_id = 5

target = env.storage_controller

assert target.get_safekeeper(fake_id) is None

start_sks = target.get_safekeepers()

sk_0 = env.safekeepers[0]

body = {
"active": True,
"id": fake_id,
"created_at": "2023-10-25T09:11:25Z",
"updated_at": "2024-08-28T11:32:43Z",
"region_id": "aws-eu-central-1",
"host": "localhost",
"port": sk_0.port.pg,
"http_port": sk_0.port.http,
"https_port": None,
"version": 5957,
"availability_zone_id": "eu-central-1a",
}

target.on_safekeeper_deploy(fake_id, body)

inserted = target.get_safekeeper(fake_id)
assert inserted is not None
assert target.get_safekeepers() == start_sks + [inserted]
assert eq_safekeeper_records(body, inserted)

def safekeeper_is_active():
newest_info = target.get_safekeeper(inserted["id"])
assert newest_info
assert newest_info["scheduling_policy"] == "Active"

wait_until(safekeeper_is_active)

target.safekeeper_scheduling_policy(inserted["id"], "Activating")

wait_until(safekeeper_is_active)

# Now decommission it
target.safekeeper_scheduling_policy(inserted["id"], "Decomissioned")

def eq_safekeeper_records(a: dict[str, Any], b: dict[str, Any]) -> bool:
compared = [dict(a), dict(b)]
@@ -740,6 +740,10 @@ def test_lsn_lease_size(neon_env_builder: NeonEnvBuilder, test_output_dir: Path,
"pitr_interval": "0s" if zero_gc else "3600s",
"gc_period": "0s",
"compaction_period": "0s",
# The test exercises leases API, so we need non-zero lease length.
# If this test ever does GC, we need to accommodate for the initial lease deadline
# after tenant attach, which is also controlled by this variable.
"lsn_lease_length": "600s",
}

env = neon_env_builder.init_start(initial_tenant_conf=conf)
@@ -824,9 +828,7 @@ def insert_with_action(
log.info(f"initial size: {initial_size}")

with ep.cursor() as cur:
cur.execute(
"CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)"
)
cur.execute("CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, 10000) s(i)")
last_flush_lsn = wait_for_last_flush_lsn(env, ep, tenant, timeline)

if action == "lease":
@@ -841,15 +843,9 @@ def insert_with_action(
raise AssertionError("Invalid action type, only `lease` and `branch` are accepted")

with ep.cursor() as cur:
cur.execute(
"CREATE TABLE t1 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)"
)
cur.execute(
"CREATE TABLE t2 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)"
)
cur.execute(
"CREATE TABLE t3 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)"
)
cur.execute("CREATE TABLE t1 AS SELECT i::bigint n FROM generate_series(0, 10000) s(i)")
cur.execute("CREATE TABLE t2 AS SELECT i::bigint n FROM generate_series(0, 10000) s(i)")
cur.execute("CREATE TABLE t3 AS SELECT i::bigint n FROM generate_series(0, 10000) s(i)")

last_flush_lsn = wait_for_last_flush_lsn(env, ep, tenant, timeline)
@@ -324,7 +324,7 @@ def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder):
# it is to be in line with the deletion timestamp.. well, almost.
when = original_ancestor[2][:26]
when_ts = datetime.datetime.fromisoformat(when).replace(tzinfo=datetime.UTC)
now = datetime.datetime.utcnow().replace(tzinfo=datetime.UTC)
now = datetime.datetime.now(datetime.UTC)
assert when_ts < now
assert len(lineage.get("reparenting_history", [])) == 0
elif expected_ancestor == timeline_id:
@@ -458,19 +458,20 @@ def test_ancestor_detach_behavior_v2(neon_env_builder: NeonEnvBuilder, snapshots
env.pageserver.quiesce_tenants()

# checking the ancestor after is much faster than waiting for the endpoint not start
# checking the ancestor after is much faster than waiting for the endpoint to start
expected_result = [
("main", env.initial_timeline, None, 24576, 1),
("after", after, env.initial_timeline, 24576, 1),
("snapshot_branchpoint_old", snapshot_branchpoint_old, env.initial_timeline, 8192, 1),
("snapshot_branchpoint", snapshot_branchpoint, env.initial_timeline, 16384, 1),
("branch_to_detach", branch_to_detach, None, 16384, 1),
("earlier", earlier, env.initial_timeline, 0, 1),
# (branch_name, queried_timeline, expected_ancestor, rows, starts, read_only)
("main", env.initial_timeline, None, 24576, 1, False),
("after", after, env.initial_timeline, 24576, 1, False),
("snapshot_branchpoint_old", snapshot_branchpoint_old, env.initial_timeline, 8192, 1, True),
("snapshot_branchpoint", snapshot_branchpoint, env.initial_timeline, 16384, 1, False),
("branch_to_detach", branch_to_detach, None, 16384, 1, False),
("earlier", earlier, env.initial_timeline, 0, 1, False),
]

assert isinstance(env.pageserver_remote_storage, LocalFsStorage)

for branch_name, queried_timeline, expected_ancestor, _, _ in expected_result:
for branch_name, queried_timeline, expected_ancestor, _, _, _ in expected_result:
details = client.timeline_detail(env.initial_tenant, queried_timeline)
ancestor_timeline_id = details["ancestor_timeline_id"]
if expected_ancestor is None:
@@ -508,13 +509,17 @@ def test_ancestor_detach_behavior_v2(neon_env_builder: NeonEnvBuilder, snapshots
assert len(lineage.get("original_ancestor", [])) == 0
assert len(lineage.get("reparenting_history", [])) == 0

for branch_name, queried_timeline, _, rows, starts in expected_result:
details = client.timeline_detail(env.initial_tenant, queried_timeline)
log.info(f"reading data from branch {branch_name}")
# specifying the lsn makes the endpoint read-only and not connect to safekeepers
for branch_name, queried_timeline, _, rows, starts, read_only in expected_result:
last_record_lsn = None
if read_only:
# specifying the lsn makes the endpoint read-only and not connect to safekeepers
details = client.timeline_detail(env.initial_tenant, queried_timeline)
last_record_lsn = Lsn(details["last_record_lsn"])

log.info(f"reading data from branch {branch_name} at {last_record_lsn}")
with env.endpoints.create(
branch_name,
lsn=Lsn(details["last_record_lsn"]),
lsn=last_record_lsn,
) as ep:
ep.start(safekeeper_generation=1)
assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows
@@ -1884,6 +1889,31 @@ def test_timeline_detach_with_aux_files_with_detach_v1(
assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn1).keys()) == set([])

def test_detach_ancestors_with_no_writes(
neon_env_builder: NeonEnvBuilder,
):
env = neon_env_builder.init_start()

endpoint = env.endpoints.create_start("main", tenant_id=env.initial_tenant)
wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline)
endpoint.safe_psql(
"SELECT pg_create_logical_replication_slot('test_slot_parent_1', 'pgoutput')"
)
wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline)
endpoint.stop()

for i in range(0, 5):
if i == 0:
ancestor_name = "main"
else:
ancestor_name = f"b{i}"

tlid = env.create_branch(f"b{i + 1}", ancestor_branch_name=ancestor_name)

client = env.pageserver.http_client()
client.detach_ancestor(tenant_id=env.initial_tenant, timeline_id=tlid)

# TODO:
# - branch near existing L1 boundary, image layers?
# - investigate: why are layers started at uneven lsn? not just after branching, but in general.
@@ -2740,3 +2740,85 @@ def test_pull_timeline_partial_segment_integrity(neon_env_builder: NeonEnvBuilde
raise Exception("Uneviction did not happen on source safekeeper yet")

wait_until(unevicted)

def test_timeline_disk_usage_limit(neon_env_builder: NeonEnvBuilder):
"""
Test that the timeline disk usage circuit breaker works as expected. We test that:
1. The circuit breaker kicks in when the timeline's disk usage exceeds the configured limit,
and it causes writes to hang.
2. The hanging writes unblock when the issue resolves (by restarting the safekeeper in the
test to simulate a more realistic production troubleshooting scenario).
3. We can continue to write as normal after the issue resolves.
4. There is no data corruption throughout the test.
"""
# Set up environment with a very small disk usage limit (1KB)
neon_env_builder.num_safekeepers = 1
remote_storage_kind = s3_storage()
neon_env_builder.enable_safekeeper_remote_storage(remote_storage_kind)

# Set a very small disk usage limit (1KB)
neon_env_builder.safekeeper_extra_opts = ["--max-timeline-disk-usage-bytes=1024"]

env = neon_env_builder.init_start()

# Create a timeline and endpoint
env.create_branch("test_timeline_disk_usage_limit")
endpoint = env.endpoints.create_start("test_timeline_disk_usage_limit")

# Get the safekeeper
sk = env.safekeepers[0]

# Inject a failpoint to stop WAL backup
with sk.http_client() as http_cli:
http_cli.configure_failpoints([("backup-lsn-range-pausable", "pause")])

# Write some data that will exceed the 1KB limit. While the failpoint is active, this operation
# will hang as Postgres encounters safekeeper-returned errors and retries.
def run_hanging_insert():
with closing(endpoint.connect()) as bg_conn:
with bg_conn.cursor() as bg_cur:
# This should generate more than 1KB of WAL
bg_cur.execute("create table t(key int, value text)")
bg_cur.execute("insert into t select generate_series(1,2000), 'payload'")

# Start the inserts in a background thread
bg_thread = threading.Thread(target=run_hanging_insert)
bg_thread.start()

# Wait for the error message to appear in the compute log
def error_logged():
return endpoint.log_contains("WAL storage utilization exceeds configured limit") is not None

wait_until(error_logged)
log.info("Found expected error message in compute log, resuming.")

# Sanity check that the hanging insert is indeed still hanging. Otherwise it means the circuit breaker we
# implemented didn't work as expected.
time.sleep(2)
assert bg_thread.is_alive(), (
"The hanging insert somehow unblocked without resolving the disk usage issue!"
)

log.info("Restarting the safekeeper to resume WAL backup.")
# Restart the safekeeper with defaults to both clear the failpoint and resume the larger disk usage limit.
for sk in env.safekeepers:
sk.stop().start(extra_opts=[])

# The hanging insert will now complete. Join the background thread so that we can
# verify that the insert completed successfully.
bg_thread.join(timeout=120)
assert not bg_thread.is_alive(), "Hanging insert did not complete after safekeeper restart"
log.info("Hanging insert unblocked.")

# Verify we can continue to write as normal
with closing(endpoint.connect()) as conn:
with conn.cursor() as cur:
cur.execute("insert into t select generate_series(2001,3000), 'payload'")

# Sanity check data correctness
with closing(endpoint.connect()) as conn:
with conn.cursor() as cur:
cur.execute("select count(*) from t")
# 2000 rows from first insert + 1000 from last insert
assert cur.fetchone() == (3000,)
@@ -13,50 +13,6 @@ if TYPE_CHECKING:
from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder

# Checks that pageserver's walreceiver state is printed in the logs during WAL wait timeout.
# Ensures that walreceiver does not run without any data inserted and only starts after the insertion.
def test_pageserver_lsn_wait_error_start(neon_env_builder: NeonEnvBuilder):
# we assert below that the walreceiver is not active before data writes.
# with manually created timelines, it is active.
# FIXME: remove this test once we remove timelines_onto_safekeepers
neon_env_builder.storage_controller_config = {
"timelines_onto_safekeepers": False,
}

# Trigger WAL wait timeout faster
neon_env_builder.pageserver_config_override = "wait_lsn_timeout = '1s'"
env = neon_env_builder.init_start()
env.pageserver.http_client()

# In this test we force 'Timed out while waiting for WAL record error' while
# fetching basebackup and don't want any retries.
os.environ["NEON_COMPUTE_TESTING_BASEBACKUP_RETRIES"] = "1"

tenant_id, timeline_id = env.create_tenant()
expected_timeout_error = f"Timed out while waiting for WAL record at LSN {future_lsn} to arrive"
env.pageserver.allowed_errors.append(f".*{expected_timeout_error}.*")

try:
trigger_wait_lsn_timeout(env, tenant_id)
except Exception as e:
exception_string = str(e)
assert expected_timeout_error in exception_string, "Should time out during waiting for WAL"
assert "WalReceiver status: Not active" in exception_string, (
"Walreceiver should not be active before any data writes"
)

insert_test_elements(env, tenant_id, start=0, count=1_000)
try:
trigger_wait_lsn_timeout(env, tenant_id)
except Exception as e:
exception_string = str(e)
assert expected_timeout_error in exception_string, "Should time out during waiting for WAL"
assert "WalReceiver status: Not active" not in exception_string, (
"Should not be inactive anymore after INSERTs are made"
)
assert "WalReceiver status" in exception_string, "But still should have some other status"

# Checks that all active safekeepers are shown in pageserver's walreceiver state printed on WAL wait timeout.
# Kills one of the safekeepers and ensures that only the active ones are printed in the state.
def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuilder):
test_runner/sql_regress/expected/neon-subxacts.out (new file, 21 lines)
@@ -0,0 +1,21 @@
DO $$
DECLARE
i numeric;
BEGIN
create role somebody;
FOR i IN 1..1000000 LOOP
BEGIN
IF i % 1000 = 0 THEN
alter role somebody password 'welcome';
ELSE
PERFORM 1;
END IF;
EXCEPTION WHEN OTHERS THEN
RAISE WARNING 'error';
END;
IF I = 1000000 THEN
PERFORM pg_log_backend_memory_contexts(pg_backend_pid());
END IF;
END LOOP;
END;
$$;
@@ -10,3 +10,4 @@ test: neon-clog
test: neon-test-utils
test: neon-vacuum-full
test: neon-event-triggers
test: neon-subxacts
test_runner/sql_regress/sql/neon-subxacts.sql (new file, 21 lines)
@@ -0,0 +1,21 @@
DO $$
DECLARE
i numeric;
BEGIN
create role somebody;
FOR i IN 1..1000000 LOOP
BEGIN
IF i % 1000 = 0 THEN
alter role somebody password 'welcome';
ELSE
PERFORM 1;
END IF;
EXCEPTION WHEN OTHERS THEN
RAISE WARNING 'error';
END;
IF I = 1000000 THEN
PERFORM pg_log_backend_memory_contexts(pg_backend_pid());
END IF;
END LOOP;
END;
$$;