Files
neon/test_runner/regress/test_pageserver_api.py
HaoyuHuang 2c6b327be6 A few PS changes (#12540)
# TLDR
All changes are no-ops, except for the addition of some metrics.

## Summary of changes I
### Pageserver
Added a new global counter metric
`pageserver_pagestream_handler_results_total` that categorizes
pagestream request results according to their outcomes:
1. Success
2. Internal errors
3. Other errors

Internal errors include:
1. Page reconstruction error: This probably indicates a pageserver
bug/corruption
2. LSN timeout error: Could indicate overload or bugs with PS's ability
to reach other components
3. Misrouted request error: Indicates bugs in the Storage Controller/HCC

Other errors include transient errors that are expected during normal
operation or errors indicating bugs with other parts of the system
(e.g., malformed requests, errors due to cancelled operations during PS
shutdown, etc.)    


## Summary of changes II
This PR adds a pageserver endpoint and its counterpart in storage
controller to list visible size of all tenant shards. This will be a
prerequisite of the tenant rebalance command.


## Problem III
We need a way to download WAL
segments/layerfiles from S3 and replay WAL records. We cannot access
production S3 from our laptops directly, and we also can't transfer any
user data out of production systems for GDPR compliance, so we need
tooling that works within those constraints.

## Summary of changes III

This PR adds a couple of tools to support the debugging
workflow in production:
1. A new `pagectl download-remote-object` command that can be used to
download remote storage objects assuming the correct access is set up.

## Summary of changes IV
This PR adds a command to list all visible delta and image layers from
index_part. This is useful for debugging compaction issues, as index_part
often contains a lot of covered layers due to PITR.

---------

Co-authored-by: William Huang <william.huang@databricks.com>
Co-authored-by: Chen Luo <chen.luo@databricks.com>
Co-authored-by: Vlad Lazar <vlad@neon.tech>
2025-07-10 14:39:38 +00:00

180 lines
6.7 KiB
Python

from __future__ import annotations
from typing import TYPE_CHECKING
from fixtures.common_types import Lsn, TenantId, TimelineId
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
DEFAULT_BRANCH_NAME,
NeonEnv,
NeonEnvBuilder,
)
from fixtures.utils import run_only_on_default_postgres, wait_until
if TYPE_CHECKING:
from fixtures.pageserver.http import PageserverHttpClient
def check_client(env: NeonEnv, client: PageserverHttpClient):
    """Smoke-test the pageserver HTTP API: status, tenant CRUD, timeline CRUD.

    Creates a fresh tenant and timeline through *client* and verifies that
    both show up in the corresponding list/detail endpoints.
    """
    client.check_status()

    # The environment's initial tenant must already be visible in the listing.
    listed_tenants = {TenantId(t["id"]) for t in client.tenant_list()}
    assert env.initial_tenant in listed_tenants

    # Create a brand-new tenant and confirm it appears as well.
    new_tenant = TenantId.generate()
    env.pageserver.tenant_create(
        new_tenant,
        generation=env.storage_controller.attach_hook_issue(new_tenant, env.pageserver.id),
        auth_token=client.auth_token,
    )
    assert new_tenant in {TenantId(t["id"]) for t in client.tenant_list()}

    assert len(client.timeline_list(new_tenant)) == 0, (
        "initial tenant should not have any timelines"
    )

    # Create a timeline on the new tenant and check it is listed.
    new_timeline = TimelineId.generate()
    client.timeline_create(
        pg_version=env.pg_version,
        tenant_id=new_tenant,
        new_timeline_id=new_timeline,
    )
    timelines = client.timeline_list(new_tenant)
    assert len(timelines) > 0
    assert new_timeline in {TimelineId(b["timeline_id"]) for b in client.timeline_list(new_tenant)}

    # Every listed timeline's detail response must echo consistent identifiers.
    for entry in timelines:
        listed_id = TimelineId(entry["timeline_id"])
        detail = client.timeline_detail(
            tenant_id=new_tenant,
            timeline_id=listed_id,
            include_non_incremental_logical_size=True,
        )
        assert TenantId(detail["tenant_id"]) == new_tenant
        assert TimelineId(detail["timeline_id"]) == listed_id
def test_pageserver_http_get_wal_receiver_not_found(neon_simple_env: NeonEnv):
    """With no compute running, timeline detail must report no WAL receiver state."""
    env = neon_simple_env
    with env.pageserver.http_client() as client:
        tenant_id, timeline_id = env.create_tenant()
        details = client.timeline_detail(
            tenant_id=tenant_id, timeline_id=timeline_id, include_non_incremental_logical_size=True
        )
        # All WAL-receiver fields must be absent/None since no compute ever connected.
        no_compute = "Should not be able to connect to WAL streaming without PG compute node running"
        for field in ("wal_source_connstr", "last_received_msg_lsn", "last_received_msg_ts"):
            assert details.get(field) is None, no_compute
def expect_updated_msg_lsn(
    client: PageserverHttpClient,
    tenant_id: TenantId,
    timeline_id: TimelineId,
    prev_msg_lsn: Lsn | None,
) -> Lsn:
    """Assert the WAL receiver's last message LSN is present and has advanced.

    Returns the latest observed LSN so callers can pass it back in on the
    next poll.
    """
    details = client.timeline_detail(tenant_id, timeline_id=timeline_id)

    # a successful `timeline_details` response must contain the below fields
    for key in ("wal_source_connstr", "last_received_msg_lsn", "last_received_msg_ts"):
        assert key in details.keys()

    assert details["last_received_msg_lsn"] is not None, (
        "the last received message's LSN is empty"
    )
    current_lsn = Lsn(details["last_received_msg_lsn"])
    assert prev_msg_lsn is None or prev_msg_lsn < current_lsn, (
        f"the last received message's LSN {current_lsn} hasn't been updated compared to the previous message's LSN {prev_msg_lsn}"
    )
    return current_lsn
# Test the WAL-receiver related fields in the response to `timeline_details` API call
#
# These fields used to be returned by a separate API call, but they're part of
# `timeline_details` now.
def test_pageserver_http_get_wal_receiver_success(neon_simple_env: NeonEnv):
    env = neon_simple_env
    with env.pageserver.http_client() as client:
        tenant_id, timeline_id = env.create_tenant()
        endpoint = env.endpoints.create_start(DEFAULT_BRANCH_NAME, tenant_id=tenant_id)

        # insert something to force sk -> ps message
        endpoint.safe_psql("CREATE TABLE t(key int primary key, value text)")

        # Wait to make sure that we get a latest WAL receiver data.
        # We need to wait here because it's possible that we don't have access to
        # the latest WAL yet, when the `timeline_detail` API is first called.
        # See: https://github.com/neondatabase/neon/issues/1768.
        first_lsn = wait_until(lambda: expect_updated_msg_lsn(client, tenant_id, timeline_id, None))

        # Make a DB modification then expect getting a new WAL receiver's data.
        endpoint.safe_psql("INSERT INTO t VALUES (1, 'hey')")
        wait_until(lambda: expect_updated_msg_lsn(client, tenant_id, timeline_id, first_lsn))
def test_pageserver_http_api_client(neon_simple_env: NeonEnv):
    """Run the generic API smoke checks without authentication."""
    with neon_simple_env.pageserver.http_client() as client:
        check_client(neon_simple_env, client)
def test_pageserver_http_api_client_auth_enabled(neon_env_builder: NeonEnvBuilder):
    """Run the generic API smoke checks with JWT auth enabled on the pageserver."""
    neon_env_builder.auth_enabled = True
    env = neon_env_builder.init_start()

    # A pageserver-scoped token is required for all API calls once auth is on.
    token = env.auth_keys.generate_pageserver_token()
    with env.pageserver.http_client(auth_token=token) as client:
        check_client(env, client)
@run_only_on_default_postgres("it does not use any postgres functionality")
def test_pageserver_http_index_part_force_patch(neon_env_builder: NeonEnvBuilder):
env = neon_env_builder.init_start()
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
with env.pageserver.http_client() as client:
client.timeline_patch_index_part(
tenant_id,
timeline_id,
{"rel_size_migration": "migrating"},
)
assert client.timeline_detail(tenant_id, timeline_id)["rel_size_migration"] == "migrating"
# This is invalid in practice: we should never rollback the migrating state to legacy.
# But we do it here to test the API.
client.timeline_patch_index_part(
tenant_id,
timeline_id,
{"rel_size_migration": "legacy"},
)
assert client.timeline_detail(tenant_id, timeline_id)["rel_size_migration"] == "legacy"
def test_pageserver_get_tenant_visible_size(neon_env_builder: NeonEnvBuilder):
    """List per-shard visible sizes via the pageserver endpoint.

    With a single pageserver, every shard of every tenant is attached to it,
    so the listing must contain exactly one entry per tenant shard.
    """
    neon_env_builder.num_pageservers = 1
    env = neon_env_builder.init_start()
    env.create_tenant(shard_count=4)
    env.create_tenant(shard_count=2)

    # Renamed from `json` to avoid shadowing the stdlib module name.
    shard_sizes = env.pageserver.http_client().list_tenant_visible_size()
    log.info(f"{shard_sizes}")

    # initial tenant (1 shard) + two new tenants (4 + 2 shards) = 7 entries
    assert len(shard_sizes) == 7