## Problem

This is a follow-up to TODO, as part of the effort to rewire the compute reconfiguration/notification mechanism to make it more robust. Please refer to that commit or ticket BRC-1778 for the full context of the problem.

## Summary of changes

The previous change added a mechanism in `compute_ctl` that makes it possible to refresh the configuration of PG on demand by having `compute_ctl` download a new config from the control plane/HCC. This change wires that mechanism up with PG, so that PG signals `compute_ctl` to refresh its configuration when it suspects that it could be talking to incorrect pageservers due to a stale configuration.

PG becomes suspicious that it is talking to the wrong pageservers in the following situations:

1. It cannot connect to a pageserver (e.g., it gets a network-level connection-refused error).
2. It can connect to a pageserver, but the pageserver does not return any data for the GetPage request.
3. It can connect to a pageserver, but the pageserver returns a malformed response.
4. It can connect to a pageserver, but receiving the GetPage response fails for any other reason.

This change also includes a minor tweak to `compute_ctl`'s config refresh behavior. Upon receiving a request to refresh the PG configuration, `compute_ctl` reaches out to download a config, but it does not attempt to apply the configuration if it is identical to the config it is replacing. This optimization is added because the act of reconfiguring itself requires working pageserver connections. In many failure situations it is likely that PG detects an issue with a pageserver before the control plane can detect the issue, migrate tenants, and update the compute config. In that case even the latest compute config won't point PG to working pageservers, so the configuration attempt would hang and hurt PG's time-to-recovery. With this change, `compute_ctl` only attempts reconfiguration if the refreshed config points PG to different pageservers.

## How is this tested?

The new code paths are exercised in all existing tests because this mechanism is on by default. It is also explicitly tested in `test_runner/regress/test_change_pageserver.py`.

Co-authored-by: William Huang <william.huang@databricks.com>
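To make the skip-if-unchanged optimization concrete, here is a minimal Python sketch of the decision `compute_ctl` makes when PG asks for a refresh. It is an illustration only, not `compute_ctl`'s actual implementation: the function name, the `fetch_config`/`apply_config` callables, and the `pageserver_connstring` key are assumptions made for the example.

```python
# Illustrative sketch only: the helpers and config layout below are hypothetical,
# not compute_ctl's real API.
from typing import Callable


def handle_refresh_request(
    current_config: dict,
    fetch_config: Callable[[], dict],  # e.g. downloads the spec from the control plane / HCC
    apply_config: Callable[[dict], None],  # reconfigures PG; needs working pageserver connections
) -> dict:
    """Handle a PG-initiated configuration refresh."""
    new_config = fetch_config()

    # If the refreshed config still points at the same pageservers, applying it
    # cannot help PG recover, and the reconfiguration itself could hang on the
    # same broken pageserver connections. Skip it.
    if new_config.get("pageserver_connstring") == current_config.get("pageserver_connstring"):
        return current_config

    apply_config(new_config)
    return new_config
```

The early return captures the situation described above: when PG notices a broken pageserver before the control plane has migrated the tenant, the freshly downloaded config is identical to the running one, and attempting to apply it would only delay recovery.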
from __future__ import annotations

import asyncio
from typing import TYPE_CHECKING

import pytest
from fixtures.log_helper import log
from fixtures.remote_storage import RemoteStorageKind

if TYPE_CHECKING:
    from fixtures.neon_fixtures import Endpoint, NeonEnvBuilder

def reconfigure_endpoint(endpoint: Endpoint, pageserver_id: int, use_explicit_reconfigure: bool):
    # It's important that we always update config.json before issuing any reconfigure requests
    # to make sure that PG-initiated config refresh doesn't mess things up by reverting to the old config.
    endpoint.update_pageservers_in_config(pageserver_id=pageserver_id)

    # PG will automatically refresh its configuration if it detects connectivity issues with pageservers.
    # We also allow the test to explicitly request a reconfigure so that the test can be sure that the
    # endpoint is running with the latest configuration.
    #
    # Note that explicit reconfiguration is not required for the system to function or for this test to pass.
    # It is kept for reference, as this is how this test used to work before the capability of initiating
    # configuration refreshes was added to compute nodes.
    if use_explicit_reconfigure:
        endpoint.reconfigure(pageserver_id=pageserver_id)

@pytest.mark.parametrize("use_explicit_reconfigure_for_failover", [False, True])
def test_change_pageserver(
    neon_env_builder: NeonEnvBuilder, use_explicit_reconfigure_for_failover: bool
):
    """
    A relatively low-level test of reconfiguring a compute's pageserver at runtime. Usually this
    is all done via the storage controller, but this test will disable the storage controller's compute
    notifications and instead update endpoints directly.
    """
    num_connections = 3

    neon_env_builder.num_pageservers = 2
    neon_env_builder.enable_pageserver_remote_storage(
        remote_storage_kind=RemoteStorageKind.MOCK_S3,
    )
    env = neon_env_builder.init_start()

    env.create_branch("test_change_pageserver")
    endpoint = env.endpoints.create_start("test_change_pageserver")

    # Put this tenant into a dual-attached state
    assert env.get_tenant_pageserver(env.initial_tenant) == env.pageservers[0]
    alt_pageserver_id = env.pageservers[1].id
    env.pageservers[1].tenant_attach(env.initial_tenant)

    pg_conns = [endpoint.connect() for _ in range(num_connections)]
    curs = [pg_conn.cursor() for pg_conn in pg_conns]
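    # Helpers: run each statement on every connection; fetchone() additionally
    # checks that all connections agree on the result.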
    def execute(statement: str):
        for cur in curs:
            cur.execute(statement)

    def fetchone():
        results = [cur.fetchone() for cur in curs]
        assert all(result == results[0] for result in results)
        return results[0]
    # Create table, and insert some rows. Make it big enough that it doesn't fit in
    # shared_buffers, otherwise the SELECT after restart will just return the answer
    # from shared_buffers without hitting the page server, which defeats the point
    # of this test.
    curs[0].execute("CREATE TABLE foo (t text)")
    curs[0].execute(
        """
        INSERT INTO foo
        SELECT 'long string to consume some space' || g
        FROM generate_series(1, 100000) g
        """
    )

    # Verify that the table is larger than shared_buffers
    curs[0].execute(
        """
        select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size
        from pg_settings where name = 'shared_buffers'
        """
    )
    row = curs[0].fetchone()
    assert row is not None
    log.info(f"shared_buffers is {row[0]}, table size {row[1]}")
    assert int(row[0]) < int(row[1])

    execute("SELECT count(*) FROM foo")
    assert fetchone() == (100000,)
    # Reconfigure the endpoint to use the alt pageserver. We issue an explicit reconfigure request here
    # regardless of test mode, as this is testing the externally driven reconfiguration scenario, not the
    # compute-initiated reconfiguration scenario upon detecting failures.
    reconfigure_endpoint(endpoint, pageserver_id=alt_pageserver_id, use_explicit_reconfigure=True)

    # Verify that the neon.pageserver_connstring GUC is set to the correct thing
    execute("SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'")
    connstring = fetchone()
    assert connstring is not None
    expected_connstring = f"postgresql://no_user:@localhost:{env.pageservers[1].service_port.pg}"
    assert connstring[0] == expected_connstring
    # Stop the old pageserver just to make sure we're reading from the new one
    env.pageservers[0].stop()
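    # Let the storage controller know that this pageserver is no longer available.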
    env.storage_controller.node_configure(env.pageservers[0].id, {"availability": "Offline"})

    execute("SELECT count(*) FROM foo")
    assert fetchone() == (100000,)
    # Try failing back, and this time we will stop the current pageserver before reconfiguring
    # the endpoint. Whereas the previous reconfiguration was like a healthy migration, this
    # is more like what happens in an unexpected pageserver failure.
    #
    # Since we're dual-attached, we need to tip off the storage controller to treat the one we're
    # about to start as the attached pageserver.
    env.pageservers[0].start()
    env.pageservers[1].stop()
    env.storage_controller.node_configure(env.pageservers[1].id, {"availability": "Offline"})
    env.storage_controller.reconcile_until_idle()

    reconfigure_endpoint(
        endpoint,
        pageserver_id=env.pageservers[0].id,
        use_explicit_reconfigure=use_explicit_reconfigure_for_failover,
    )
    execute("SELECT count(*) FROM foo")
    assert fetchone() == (100000,)
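    # Fail over once more: stop the pageserver the endpoint is currently using and
    # bring the other one back online.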
    env.pageservers[0].stop()
    env.pageservers[1].start()
    env.storage_controller.node_configure(env.pageservers[0].id, {"availability": "Offline"})
    env.storage_controller.reconcile_until_idle()
    # Test a (former) bug where a child process spins without updating its connection string
    # by executing a query separately. This query will hang until we issue the reconfigure.
    async def reconfigure_async():
        # Sleep for 1 second just to make sure we actually started our count(*) query
        await asyncio.sleep(1)
        reconfigure_endpoint(
            endpoint,
            pageserver_id=env.pageservers[1].id,
            use_explicit_reconfigure=use_explicit_reconfigure_for_failover,
        )

    def execute_count():
        execute("SELECT count(*) FROM foo")

    async def execute_and_reconfigure():
        task_exec = asyncio.to_thread(execute_count)
        task_reconfig = asyncio.create_task(reconfigure_async())
        await asyncio.gather(
            task_exec,
            task_reconfig,
        )

    asyncio.run(execute_and_reconfigure())
    assert fetchone() == (100000,)

    # One final check that nothing hangs
    execute("SELECT count(*) FROM foo")
    assert fetchone() == (100000,)