mirror of
https://github.com/neondatabase/neon.git
synced 2025-12-23 06:09:59 +00:00
# TLDR All changes are no-op except some metrics. ## Summary of changes I ### Pageserver Added a new global counter metric `pageserver_pagestream_handler_results_total` that categorizes pagestream request results according to their outcomes: 1. Success 2. Internal errors 3. Other errors Internal errors include: 1. Page reconstruction error: This probably indicates a pageserver bug/corruption 2. LSN timeout error: Could indicate overload or bugs with PS's ability to reach other components 3. Misrouted request error: Indicates bugs in the Storage Controller/HCC Other errors include transient errors that are expected during normal operation or errors indicating bugs with other parts of the system (e.g., malformed requests, errors due to cancelled operations during PS shutdown, etc.) ## Summary of changes II This PR adds a pageserver endpoint and its counterpart in the storage controller to list the visible size of all tenant shards. This will be a prerequisite of the tenant rebalance command. ## Problem III We need a way to download WAL segments/layerfiles from S3 and replay WAL records. We cannot access production S3 from our laptops directly, and we also can't transfer any user data out of production systems for GDPR compliance, so we need solutions. ## Summary of changes III This PR adds a couple of tools to support the debugging workflow in production: 1. A new `pagectl download-remote-object` command that can be used to download remote storage objects assuming the correct access is set up. ## Summary of changes IV This PR adds a command to list all visible delta and image layers from index_part. This is useful to debug compaction issues as index_part often contains a lot of covered layers due to PITR. --------- Co-authored-by: William Huang <william.huang@databricks.com> Co-authored-by: Chen Luo <chen.luo@databricks.com> Co-authored-by: Vlad Lazar <vlad@neon.tech>
180 lines
6.7 KiB
Python
180 lines
6.7 KiB
Python
from __future__ import annotations
|
|
|
|
from typing import TYPE_CHECKING
|
|
|
|
from fixtures.common_types import Lsn, TenantId, TimelineId
|
|
from fixtures.log_helper import log
|
|
from fixtures.neon_fixtures import (
|
|
DEFAULT_BRANCH_NAME,
|
|
NeonEnv,
|
|
NeonEnvBuilder,
|
|
)
|
|
from fixtures.utils import run_only_on_default_postgres, wait_until
|
|
|
|
if TYPE_CHECKING:
|
|
from fixtures.pageserver.http import PageserverHttpClient
|
|
|
|
|
|
def check_client(env: NeonEnv, client: PageserverHttpClient):
    """Exercise the basic tenant/timeline HTTP API through ``client``.

    The sequence is: status check, confirm the environment's initial tenant is
    listed, create a fresh tenant, confirm it is listed and has no timelines,
    create a timeline in it, then fetch details for every listed timeline and
    check that the returned ids match what was requested.

    Args:
        env: the running test environment; supplies the pg version, the
            initial tenant id, the pageserver and the storage controller.
        client: pageserver HTTP client to test. Its ``auth_token`` is forwarded
            to ``tenant_create``.
    """
    pg_version = env.pg_version
    initial_tenant = env.initial_tenant

    client.check_status()

    # check initial tenant is there
    assert initial_tenant in {TenantId(t["id"]) for t in client.tenant_list()}

    # create new tenant and check it is also there
    tenant_id = TenantId.generate()
    env.pageserver.tenant_create(
        tenant_id,
        generation=env.storage_controller.attach_hook_issue(tenant_id, env.pageserver.id),
        auth_token=client.auth_token,
    )
    assert tenant_id in {TenantId(t["id"]) for t in client.tenant_list()}

    # The tenant was created just above, so nothing can have populated it yet.
    timelines = client.timeline_list(tenant_id)
    assert len(timelines) == 0, "newly created tenant should not have any timelines"

    # create timeline
    timeline_id = TimelineId.generate()
    client.timeline_create(
        pg_version=pg_version,
        tenant_id=tenant_id,
        new_timeline_id=timeline_id,
    )

    timelines = client.timeline_list(tenant_id)
    assert len(timelines) > 0

    # check it is there (reuse the listing fetched just above)
    assert timeline_id in {TimelineId(t["timeline_id"]) for t in timelines}

    for timeline in timelines:
        # Use a separate name so the id of the timeline created above is not
        # clobbered by the loop.
        listed_timeline_id = TimelineId(timeline["timeline_id"])
        timeline_details = client.timeline_detail(
            tenant_id=tenant_id,
            timeline_id=listed_timeline_id,
            include_non_incremental_logical_size=True,
        )

        # The detail response must echo back the ids it was asked about.
        assert TenantId(timeline_details["tenant_id"]) == tenant_id
        assert TimelineId(timeline_details["timeline_id"]) == listed_timeline_id
|
|
|
|
|
|
def test_pageserver_http_get_wal_receiver_not_found(neon_simple_env: NeonEnv):
    """WAL-receiver fields of `timeline_detail` are unset when no compute is running."""
    no_compute_msg = (
        "Should not be able to connect to WAL streaming without PG compute node running"
    )
    wal_receiver_fields = (
        "wal_source_connstr",
        "last_received_msg_lsn",
        "last_received_msg_ts",
    )

    with neon_simple_env.pageserver.http_client() as client:
        # A tenant is created, but no endpoint is started for it.
        tenant_id, timeline_id = neon_simple_env.create_tenant()

        details = client.timeline_detail(
            tenant_id=tenant_id,
            timeline_id=timeline_id,
            include_non_incremental_logical_size=True,
        )

        for field in wal_receiver_fields:
            assert details.get(field) is None, no_compute_msg
|
|
|
|
|
|
def expect_updated_msg_lsn(
    client: PageserverHttpClient,
    tenant_id: TenantId,
    timeline_id: TimelineId,
    prev_msg_lsn: Lsn | None,
) -> Lsn:
    """Assert that the timeline's last received message LSN has moved past ``prev_msg_lsn``.

    Fetches `timeline_detail`, checks the WAL-receiver fields are present and
    that `last_received_msg_lsn` is set, then requires it to be strictly
    greater than ``prev_msg_lsn`` (skipped when ``prev_msg_lsn`` is None).

    Returns the last received message LSN. Raises AssertionError if any check
    fails.
    """
    details = client.timeline_detail(tenant_id, timeline_id=timeline_id)

    # a successful `timeline_details` response must contain the below fields
    for required_field in (
        "wal_source_connstr",
        "last_received_msg_lsn",
        "last_received_msg_ts",
    ):
        assert required_field in details.keys()

    raw_lsn = details["last_received_msg_lsn"]
    assert raw_lsn is not None, "the last received message's LSN is empty"

    last_msg_lsn = Lsn(raw_lsn)
    if prev_msg_lsn is not None:
        assert prev_msg_lsn < last_msg_lsn, (
            f"the last received message's LSN {last_msg_lsn} hasn't been updated compared to the previous message's LSN {prev_msg_lsn}"
        )

    return last_msg_lsn
|
|
|
|
|
|
# Checks the WAL-receiver fields carried in the `timeline_details` response.
#
# A dedicated API call used to return these fields; they are now served as
# part of `timeline_details`.
def test_pageserver_http_get_wal_receiver_success(neon_simple_env: NeonEnv):
    with neon_simple_env.pageserver.http_client() as client:
        tenant_id, timeline_id = neon_simple_env.create_tenant()
        endpoint = neon_simple_env.endpoints.create_start(
            DEFAULT_BRANCH_NAME, tenant_id=tenant_id
        )

        def await_newer_lsn(previous: Lsn | None) -> Lsn:
            # Poll until the pageserver reports an LSN newer than `previous`.
            return wait_until(
                lambda: expect_updated_msg_lsn(client, tenant_id, timeline_id, previous)
            )

        # insert something to force sk -> ps message
        endpoint.safe_psql("CREATE TABLE t(key int primary key, value text)")
        # The first `timeline_detail` call may not see the latest WAL yet, so
        # poll until WAL receiver data shows up.
        # See: https://github.com/neondatabase/neon/issues/1768.
        first_lsn = await_newer_lsn(None)

        # Another DB modification must advance the reported LSN.
        endpoint.safe_psql("INSERT INTO t VALUES (1, 'hey')")
        await_newer_lsn(first_lsn)
|
|
|
|
|
|
def test_pageserver_http_api_client(neon_simple_env: NeonEnv):
    """Run the shared `check_client` checks with a default HTTP client."""
    pageserver = neon_simple_env.pageserver
    with pageserver.http_client() as http:
        check_client(neon_simple_env, http)
|
|
|
|
|
|
def test_pageserver_http_api_client_auth_enabled(neon_env_builder: NeonEnvBuilder):
    """Run the shared `check_client` checks with auth enabled and a pageserver token."""
    # Auth must be switched on before the environment is started.
    neon_env_builder.auth_enabled = True
    started_env = neon_env_builder.init_start()

    token = started_env.auth_keys.generate_pageserver_token()
    with started_env.pageserver.http_client(auth_token=token) as authed_client:
        check_client(started_env, authed_client)
|
|
|
|
|
|
@run_only_on_default_postgres("it does not use any postgres functionality")
def test_pageserver_http_index_part_force_patch(neon_env_builder: NeonEnvBuilder):
    """Patch `rel_size_migration` in index_part and read each value back via `timeline_detail`."""
    env = neon_env_builder.init_start()
    tenant, timeline = env.initial_tenant, env.initial_timeline

    # Rolling back from "migrating" to "legacy" is invalid in practice and
    # should never happen; it is done here only to test the API.
    states_to_apply = ("migrating", "legacy")

    with env.pageserver.http_client() as client:
        for state in states_to_apply:
            client.timeline_patch_index_part(
                tenant,
                timeline,
                {"rel_size_migration": state},
            )
            detail = client.timeline_detail(tenant, timeline)
            assert detail["rel_size_migration"] == state
|
|
|
|
|
|
def test_pageserver_get_tenant_visible_size(neon_env_builder: NeonEnvBuilder):
    """Check the number of entries the pageserver's tenant-visible-size listing returns.

    With a single pageserver, two extra tenants are created with 4 and 2
    shards; the listing is expected to contain 7 entries in total.
    """
    neon_env_builder.num_pageservers = 1
    env = neon_env_builder.init_start()
    env.create_tenant(shard_count=4)
    env.create_tenant(shard_count=2)

    # Use the client as a context manager, like the other tests in this file,
    # so it is closed when the test finishes.
    with env.pageserver.http_client() as client:
        visible_sizes = client.list_tenant_visible_size()
    log.info(f"{visible_sizes}")

    # initial tenant + 2 newly created tenants
    # (presumably one entry per tenant shard: 1 + 4 + 2 = 7)
    assert len(visible_sizes) == 7
|