mirror of
https://github.com/neondatabase/neon.git
synced 2025-12-25 23:29:59 +00:00
## Problem While configuring or reconfiguring PG due to PageServer movements, PG may get stuck if the PageServer is moved around after the spec has been fetched from the StorageController. ## Summary of changes To fix this issue, this PR introduces two changes: 1. Fail the PG query directly if its requests for a new configuration have failed a certain number of times. 2. Introduce a new state `RefreshConfiguration` in compute tools to differentiate it from `RefreshConfigurationPending`. If the compute tool is already in the `RefreshConfiguration` state, it will not accept new configuration requests. ## How is this tested? Chaos testing. Co-authored-by: Chen Luo <chen.luo@databricks.com>
138 lines
5.8 KiB
Python
138 lines
5.8 KiB
Python
import json
|
|
import shutil
|
|
|
|
from fixtures.common_types import TenantShardId
|
|
from fixtures.log_helper import log
|
|
from fixtures.metrics import parse_metrics
|
|
from fixtures.neon_fixtures import Endpoint, NeonEnvBuilder, NeonPageserver
|
|
from requests.exceptions import ConnectionError
|
|
|
|
|
|
def _attempt_reconfiguration(endpoint: Endpoint, new_pageserver_id: int, timeout_sec: float):
    """
    Best-effort attempt to repoint the compute at a different pageserver.

    Calls `endpoint.reconfigure` with `pageserver_id=new_pageserver_id` and the given `timeout_sec`.
    The tests in this file aim the compute at a "wrong" pageserver on purpose, so the attempt is not
    expected to go through: any `Exception` raised by the call is logged at info level and then
    dropped, letting the caller carry on and inspect metrics afterwards.
    """
    try:
        endpoint.reconfigure(pageserver_id=new_pageserver_id, timeout_sec=timeout_sec)
    except Exception as e:
        # Failure is the anticipated outcome; record it instead of failing the test.
        log.info(f"reconfiguration failed with exception {e}")
|
|
|
|
|
|
def read_misrouted_metric_value(pageserver: NeonPageserver) -> float:
    """
    Return the current value of `pageserver_misrouted_pagestream_requests_total` on `pageserver`.

    The metrics are fetched through the pageserver's HTTP client and the single matching sample is
    looked up with `query_one`; its `.value` is returned.
    """
    metrics = pageserver.http_client().get_metrics()
    sample = metrics.query_one("pageserver_misrouted_pagestream_requests_total")
    return sample.value
|
|
|
|
|
|
def read_request_error_metric_value(endpoint: Endpoint) -> float:
    """
    Return the current value of `pg_cctl_pagestream_request_errors_total` reported by `endpoint`.

    The raw metrics text is fetched through the endpoint's HTTP client, parsed with `parse_metrics`,
    and the single matching sample is looked up with `query_one`; its `.value` is returned.
    """
    raw_metrics = endpoint.http_client().metrics()
    sample = parse_metrics(raw_metrics).query_one("pg_cctl_pagestream_request_errors_total")
    return sample.value
|
|
|
|
|
|
def test_misrouted_to_secondary(
    neon_env_builder: NeonEnvBuilder,
):
    """
    Checks that pointing the compute at a secondary pageserver bumps the misrouting metrics:
    - on the pageserver that receives the requests: pageserver_misrouted_pagestream_requests_total
    - on the compute: pg_cctl_pagestream_request_errors_total
    """
    # Bring up a two-pageserver environment, starting every service explicitly.
    neon_env_builder.num_pageservers = 2
    env = neon_env_builder.init_configs()
    env.broker.start()
    env.storage_controller.start()
    for pageserver in env.pageservers:
        pageserver.start()
    for safekeeper in env.safekeepers:
        safekeeper.start()

    # A tenant with one primary and one secondary location. Placement constraints put the primary
    # and the secondary on different pageservers.
    placement_policy = json.dumps({"Attached": 1})
    tenant_id, _ = env.create_tenant(shard_count=1, placement_policy=placement_policy)
    endpoint = env.endpoints.create(
        "main", tenant_id=tenant_id, config_lines=["neon.lakebase_mode = true"]
    )
    endpoint.respec(skip_pg_catalog_updates=False)
    endpoint.start()

    # Look up the pageserver serving the tenant's zero shard; with only two pageservers, the other
    # one is the secondary.
    tenant_zero_shard = TenantShardId(tenant_id, shard_number=0, shard_count=1)
    primary_ps = env.get_tenant_pageserver(tenant_zero_shard)
    if primary_ps.id == env.pageservers[0].id:
        secondary_ps = env.pageservers[1]
    else:
        secondary_ps = env.pageservers[0]

    # Both counters start at zero. After trying to point the compute at the secondary, the
    # pageserver-side counter must have moved.
    assert read_misrouted_metric_value(secondary_ps) == 0
    assert read_request_error_metric_value(endpoint) == 0
    _attempt_reconfiguration(endpoint, new_pageserver_id=secondary_ps.id, timeout_sec=2.0)
    assert read_misrouted_metric_value(secondary_ps) > 0

    try:
        request_errors = read_request_error_metric_value(endpoint)
    except ConnectionError:
        # When PG is configured to use a misconfigured pageserver, it cancels the query after a
        # certain number of failed reconfigure attempts, which makes compute_ctl exit. In that case
        # the compute-side metric cannot be read, and that is acceptable.
        log.info("Cannot connect to PG, ignoring")
    else:
        assert request_errors > 0
|
|
|
|
|
|
def test_misrouted_to_ps_not_hosting_tenant(
    neon_env_builder: NeonEnvBuilder,
):
    """
    Checks that pointing the compute at a pageserver that does not host the tenant bumps the
    misrouting metrics:
    - on the pageserver that receives the requests: pageserver_misrouted_pagestream_requests_total
    - on the compute: pg_cctl_pagestream_request_errors_total
    """
    # Bring up a two-pageserver environment, starting every service explicitly.
    neon_env_builder.num_pageservers = 2
    env = neon_env_builder.init_configs()
    env.broker.start()
    # NOTE(review): presumably disabled so the storage controller does not react to the disk wipe
    # performed further down — confirm.
    env.storage_controller.start(handle_ps_local_disk_loss=False)
    for pageserver in env.pageservers:
        pageserver.start()
    for safekeeper in env.safekeepers:
        safekeeper.start()

    tenant_id, _ = env.create_tenant(shard_count=1)
    endpoint = env.endpoints.create(
        "main", tenant_id=tenant_id, config_lines=["neon.lakebase_mode = true"]
    )
    endpoint.respec(skip_pg_catalog_updates=False)
    endpoint.start()

    # Look up the pageserver serving the tenant's zero shard; with only two pageservers, the other
    # one is the non-hosting pageserver.
    tenant_zero_shard = TenantShardId(tenant_id, shard_number=0, shard_count=1)
    tenant_ps_id = env.get_tenant_pageserver(tenant_zero_shard).id
    if tenant_ps_id == env.pageservers[0].id:
        non_hosting_ps = env.pageservers[1]
    else:
        non_hosting_ps = env.pageservers[0]

    # Wipe the non-hosting pageserver's tenant directory while it is stopped, so that it has no
    # information about the tenant at all when it comes back.
    non_hosting_ps.stop(immediate=True)
    shutil.rmtree(non_hosting_ps.tenant_dir())
    non_hosting_ps.start()

    # Both counters start at zero. After trying to point the compute at the non-hosting pageserver,
    # the pageserver-side counter must have moved.
    assert read_misrouted_metric_value(non_hosting_ps) == 0
    assert read_request_error_metric_value(endpoint) == 0
    _attempt_reconfiguration(endpoint, new_pageserver_id=non_hosting_ps.id, timeout_sec=2.0)
    assert read_misrouted_metric_value(non_hosting_ps) > 0

    try:
        request_errors = read_request_error_metric_value(endpoint)
    except ConnectionError:
        # When PG is configured to use a misconfigured pageserver, it cancels the query after a
        # certain number of failed reconfigure attempts, which makes compute_ctl exit. In that case
        # the compute-side metric cannot be read, and that is acceptable.
        log.info("Cannot connect to PG, ignoring")
    else:
        assert request_errors > 0
|