Merge remote-tracking branch 'origin/main' into problame/standby-horizon-removal-poc-rip-out

This commit is contained in:
Christian Schwarz
2025-07-09 11:10:53 +00:00
193 changed files with 8745 additions and 3023 deletions

View File

@@ -57,6 +57,8 @@ class EndpointHttpClient(requests.Session):
self.auth = BearerAuth(jwt)
self.mount("http://", HTTPAdapter())
self.prewarm_url = f"http://localhost:{external_port}/lfc/prewarm"
self.offload_url = f"http://localhost:{external_port}/lfc/offload"
def dbs_and_roles(self):
res = self.get(f"http://localhost:{self.external_port}/dbs_and_roles", auth=self.auth)
@@ -64,33 +66,39 @@ class EndpointHttpClient(requests.Session):
return res.json()
def prewarm_lfc_status(self) -> dict[str, str]:
res = self.get(f"http://localhost:{self.external_port}/lfc/prewarm")
res = self.get(self.prewarm_url)
res.raise_for_status()
json: dict[str, str] = res.json()
return json
def prewarm_lfc(self, from_endpoint_id: str | None = None):
url: str = f"http://localhost:{self.external_port}/lfc/prewarm"
params = {"from_endpoint": from_endpoint_id} if from_endpoint_id else dict()
self.post(url, params=params).raise_for_status()
self.post(self.prewarm_url, params=params).raise_for_status()
self.prewarm_lfc_wait()
def prewarm_lfc_wait(self):
def prewarmed():
json = self.prewarm_lfc_status()
status, err = json["status"], json.get("error")
assert status == "completed", f"{status}, error {err}"
assert status == "completed", f"{status}, {err=}"
wait_until(prewarmed, timeout=60)
def offload_lfc(self):
url = f"http://localhost:{self.external_port}/lfc/offload"
self.post(url).raise_for_status()
def offload_lfc_status(self) -> dict[str, str]:
res = self.get(self.offload_url)
res.raise_for_status()
json: dict[str, str] = res.json()
return json
def offload_lfc(self):
self.post(self.offload_url).raise_for_status()
self.offload_lfc_wait()
def offload_lfc_wait(self):
def offloaded():
res = self.get(url)
res.raise_for_status()
json = res.json()
json = self.offload_lfc_status()
status, err = json["status"], json.get("error")
assert status == "completed", f"{status}, error {err}"
assert status == "completed", f"{status}, {err=}"
wait_until(offloaded)

View File

@@ -159,6 +159,9 @@ PAGESERVER_GLOBAL_METRICS: tuple[str, ...] = (
)
PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = (
# BEGIN_HADRON
"pageserver_active_storage_operations_count",
# END_HADRON
"pageserver_current_logical_size",
"pageserver_resident_physical_size",
"pageserver_io_operations_bytes_total",

View File

@@ -568,6 +568,8 @@ class NeonLocalCli(AbstractNeonCli):
timeout: str | None = None,
env: dict[str, str] | None = None,
dev: bool = False,
autoprewarm: bool = False,
offload_lfc_interval_seconds: int | None = None,
) -> subprocess.CompletedProcess[str]:
args = [
"endpoint",
@@ -593,6 +595,10 @@ class NeonLocalCli(AbstractNeonCli):
args.extend(["--create-test-user"])
if timeout is not None:
args.extend(["--start-timeout", str(timeout)])
if autoprewarm:
args.extend(["--autoprewarm"])
if offload_lfc_interval_seconds is not None:
args.extend(["--offload-lfc-interval-seconds", str(offload_lfc_interval_seconds)])
if dev:
args.extend(["--dev"])

View File

@@ -1875,6 +1875,7 @@ class PageserverSchedulingPolicy(StrEnum):
FILLING = "Filling"
PAUSE = "Pause"
PAUSE_FOR_RESTART = "PauseForRestart"
DELETING = "Deleting"
class StorageControllerLeadershipStatus(StrEnum):
@@ -2083,14 +2084,30 @@ class NeonStorageController(MetricsGetter, LogUtils):
headers=self.headers(TokenScope.ADMIN),
)
def node_delete(self, node_id):
log.info(f"node_delete({node_id})")
def node_delete_old(self, node_id):
log.info(f"node_delete_old({node_id})")
self.request(
"DELETE",
f"{self.api}/control/v1/node/{node_id}",
headers=self.headers(TokenScope.ADMIN),
)
def node_delete(self, node_id):
log.info(f"node_delete({node_id})")
self.request(
"PUT",
f"{self.api}/control/v1/node/{node_id}/delete",
headers=self.headers(TokenScope.ADMIN),
)
def cancel_node_delete(self, node_id):
log.info(f"cancel_node_delete({node_id})")
self.request(
"DELETE",
f"{self.api}/control/v1/node/{node_id}/delete",
headers=self.headers(TokenScope.ADMIN),
)
def tombstone_delete(self, node_id):
log.info(f"tombstone_delete({node_id})")
self.request(
@@ -4353,6 +4370,8 @@ class Endpoint(PgProtocol, LogUtils):
basebackup_request_tries: int | None = None,
timeout: str | None = None,
env: dict[str, str] | None = None,
autoprewarm: bool = False,
offload_lfc_interval_seconds: int | None = None,
) -> Self:
"""
Start the Postgres instance.
@@ -4377,6 +4396,8 @@ class Endpoint(PgProtocol, LogUtils):
basebackup_request_tries=basebackup_request_tries,
timeout=timeout,
env=env,
autoprewarm=autoprewarm,
offload_lfc_interval_seconds=offload_lfc_interval_seconds,
)
self._running.release(1)
self.log_config_value("shared_buffers")
@@ -4592,6 +4613,8 @@ class Endpoint(PgProtocol, LogUtils):
pageserver_id: int | None = None,
allow_multiple: bool = False,
basebackup_request_tries: int | None = None,
autoprewarm: bool = False,
offload_lfc_interval_seconds: int | None = None,
) -> Self:
"""
Create an endpoint, apply config, and start Postgres.
@@ -4612,6 +4635,8 @@ class Endpoint(PgProtocol, LogUtils):
pageserver_id=pageserver_id,
allow_multiple=allow_multiple,
basebackup_request_tries=basebackup_request_tries,
autoprewarm=autoprewarm,
offload_lfc_interval_seconds=offload_lfc_interval_seconds,
)
return self
@@ -4696,6 +4721,8 @@ class EndpointFactory:
remote_ext_base_url: str | None = None,
pageserver_id: int | None = None,
basebackup_request_tries: int | None = None,
autoprewarm: bool = False,
offload_lfc_interval_seconds: int | None = None,
) -> Endpoint:
ep = Endpoint(
self.env,
@@ -4717,6 +4744,8 @@ class EndpointFactory:
remote_ext_base_url=remote_ext_base_url,
pageserver_id=pageserver_id,
basebackup_request_tries=basebackup_request_tries,
autoprewarm=autoprewarm,
offload_lfc_interval_seconds=offload_lfc_interval_seconds,
)
def create(

View File

@@ -111,6 +111,15 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
".*stalling layer flushes for compaction backpressure.*",
".*layer roll waiting for flush due to compaction backpressure.*",
".*BatchSpanProcessor.*",
# Can happen in tests that purposely wipe pageserver "local disk" data.
".*Local data loss suspected.*",
# Too many frozen layers error is normal during intensive benchmarks
".*too many frozen layers.*",
# Transient errors when resolving tenant shards by page service
".*Fail to resolve tenant shard in attempt.*",
# Expected warnings when pageserver has not refreshed GC info yet
".*pitr LSN/interval not found, skipping force image creation LSN calculation.*",
".*No broker updates received for a while.*",
*(
[
r".*your platform is not a supported production platform, ignoing request for O_DIRECT; this could hide alignment bugs.*"

View File

@@ -112,12 +112,18 @@ class TimelineCreateRequest:
class TimelineMembershipSwitchResponse:
previous_conf: MembershipConfiguration
current_conf: MembershipConfiguration
last_log_term: int
flush_lsn: Lsn
@classmethod
def from_json(cls, d: dict[str, Any]) -> TimelineMembershipSwitchResponse:
previous_conf = MembershipConfiguration.from_json(d["previous_conf"])
current_conf = MembershipConfiguration.from_json(d["current_conf"])
return TimelineMembershipSwitchResponse(previous_conf, current_conf)
last_log_term = d["last_log_term"]
flush_lsn = Lsn(d["flush_lsn"])
return TimelineMembershipSwitchResponse(
previous_conf, current_conf, last_log_term, flush_lsn
)
class SafekeeperHttpClient(requests.Session, MetricsGetter):