mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-15 17:32:56 +00:00
Reason and backtrace are added to the Broken state. Backtrace is automatically collected when tenant entered the broken state. The format for API, CLI and metrics is changed and unified to return tenant state name in camel case. Previously snake case was used for metrics and camel case was used for everything else. Now tenant state field in TenantInfo swagger spec is changed to contain state name in "slug" field and other fields (currently only reason and backtrace for Broken variant in "data" field). To allow for this breaking change state was removed from TenantInfo swagger spec because it was not used anywhere. Please note that the tenant's broken reason is not persisted on disk so the reason is lost when pageserver is restarted. Requires changes to grafana dashboard that monitors tenant states. Closes #3001 --------- Co-authored-by: theirix <theirix@gmail.com>
158 lines
4.7 KiB
Python
158 lines
4.7 KiB
Python
import time
|
|
from typing import Optional
|
|
|
|
from fixtures.log_helper import log
|
|
from fixtures.pageserver.http import PageserverHttpClient
|
|
from fixtures.types import Lsn, TenantId, TimelineId
|
|
|
|
|
|
def assert_tenant_state(
|
|
pageserver_http: PageserverHttpClient,
|
|
tenant: TenantId,
|
|
expected_state: str,
|
|
message: Optional[str] = None,
|
|
):
|
|
tenant_status = pageserver_http.tenant_status(tenant)
|
|
log.info(f"tenant_status: {tenant_status}")
|
|
assert tenant_status["state"]["slug"] == expected_state, message or tenant_status
|
|
|
|
|
|
def tenant_exists(pageserver_http: PageserverHttpClient, tenant_id: TenantId):
|
|
tenants = pageserver_http.tenant_list()
|
|
matching = [t for t in tenants if TenantId(t["id"]) == tenant_id]
|
|
assert len(matching) < 2
|
|
if len(matching) == 0:
|
|
return None
|
|
return matching[0]
|
|
|
|
|
|
def remote_consistent_lsn(
|
|
pageserver_http: PageserverHttpClient, tenant: TenantId, timeline: TimelineId
|
|
) -> Lsn:
|
|
detail = pageserver_http.timeline_detail(tenant, timeline)
|
|
|
|
if detail["remote_consistent_lsn"] is None:
|
|
# No remote information at all. This happens right after creating
|
|
# a timeline, before any part of it has been uploaded to remote
|
|
# storage yet.
|
|
return Lsn(0)
|
|
else:
|
|
lsn_str = detail["remote_consistent_lsn"]
|
|
assert isinstance(lsn_str, str)
|
|
return Lsn(lsn_str)
|
|
|
|
|
|
def wait_for_upload(
|
|
pageserver_http: PageserverHttpClient,
|
|
tenant: TenantId,
|
|
timeline: TimelineId,
|
|
lsn: Lsn,
|
|
):
|
|
"""waits for local timeline upload up to specified lsn"""
|
|
for i in range(20):
|
|
current_lsn = remote_consistent_lsn(pageserver_http, tenant, timeline)
|
|
if current_lsn >= lsn:
|
|
log.info("wait finished")
|
|
return
|
|
log.info(
|
|
"waiting for remote_consistent_lsn to reach {}, now {}, iteration {}".format(
|
|
lsn, current_lsn, i + 1
|
|
)
|
|
)
|
|
time.sleep(1)
|
|
raise Exception(
|
|
"timed out while waiting for remote_consistent_lsn to reach {}, was {}".format(
|
|
lsn, current_lsn
|
|
)
|
|
)
|
|
|
|
|
|
def wait_until_tenant_state(
|
|
pageserver_http: PageserverHttpClient,
|
|
tenant_id: TenantId,
|
|
expected_state: str,
|
|
iterations: int,
|
|
period: float = 1.0,
|
|
) -> bool:
|
|
"""
|
|
Does not use `wait_until` for debugging purposes
|
|
"""
|
|
for _ in range(iterations):
|
|
try:
|
|
tenant = pageserver_http.tenant_status(tenant_id=tenant_id)
|
|
log.debug(f"Tenant {tenant_id} data: {tenant}")
|
|
if tenant["state"]["slug"] == expected_state:
|
|
return True
|
|
except Exception as e:
|
|
log.debug(f"Tenant {tenant_id} state retrieval failure: {e}")
|
|
|
|
time.sleep(period)
|
|
|
|
raise Exception(f"Tenant {tenant_id} did not become {expected_state} in {iterations} seconds")
|
|
|
|
|
|
def wait_until_tenant_active(
|
|
pageserver_http: PageserverHttpClient,
|
|
tenant_id: TenantId,
|
|
iterations: int = 30,
|
|
period: float = 1.0,
|
|
):
|
|
wait_until_tenant_state(
|
|
pageserver_http,
|
|
tenant_id,
|
|
expected_state="Active",
|
|
iterations=iterations,
|
|
period=period,
|
|
)
|
|
|
|
|
|
def last_record_lsn(
|
|
pageserver_http_client: PageserverHttpClient, tenant: TenantId, timeline: TimelineId
|
|
) -> Lsn:
|
|
detail = pageserver_http_client.timeline_detail(tenant, timeline)
|
|
|
|
lsn_str = detail["last_record_lsn"]
|
|
assert isinstance(lsn_str, str)
|
|
return Lsn(lsn_str)
|
|
|
|
|
|
def wait_for_last_record_lsn(
|
|
pageserver_http: PageserverHttpClient,
|
|
tenant: TenantId,
|
|
timeline: TimelineId,
|
|
lsn: Lsn,
|
|
) -> Lsn:
|
|
"""waits for pageserver to catch up to a certain lsn, returns the last observed lsn."""
|
|
for i in range(10):
|
|
current_lsn = last_record_lsn(pageserver_http, tenant, timeline)
|
|
if current_lsn >= lsn:
|
|
return current_lsn
|
|
log.info(
|
|
"waiting for last_record_lsn to reach {}, now {}, iteration {}".format(
|
|
lsn, current_lsn, i + 1
|
|
)
|
|
)
|
|
time.sleep(1)
|
|
raise Exception(
|
|
"timed out while waiting for last_record_lsn to reach {}, was {}".format(lsn, current_lsn)
|
|
)
|
|
|
|
|
|
def wait_for_upload_queue_empty(
|
|
pageserver_http: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId
|
|
):
|
|
while True:
|
|
all_metrics = pageserver_http.get_metrics()
|
|
tl = all_metrics.query_all(
|
|
"pageserver_remote_timeline_client_calls_unfinished",
|
|
{
|
|
"tenant_id": str(tenant_id),
|
|
"timeline_id": str(timeline_id),
|
|
},
|
|
)
|
|
assert len(tl) > 0
|
|
log.info(f"upload queue for {tenant_id}/{timeline_id}: {tl}")
|
|
if all(m.value == 0 for m in tl):
|
|
return
|
|
time.sleep(0.2)
|