mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-08 14:02:55 +00:00
storage controller: enable timeline CRUD operations to run concurrently with reconciliation & make them safer (#8783)
## Problem - If a reconciler was waiting to be able to notify computes about a change, but the control plane was waiting for the controller to finish a timeline creation/deletion, the overall system can deadlock. - If a tenant shard was migrated concurrently with a timeline creation/deletion, there was a risk that the timeline operation could be applied to a non-latest-generation location, and thereby not really be persistent. This has never happened in practice, but would eventually happen at scale. Closes: #8743 ## Summary of changes - Introduce `Service::tenant_remote_mutation` helper, which looks up shards & generations and passes them into an inner function that may do remote I/O to pageservers. Before returning success, this helper checks that generations haven't incremented, to guarantee that changes are persistent. - Convert tenant_timeline_create, tenant_timeline_delete, and tenant_timeline_detach_ancestor to use this helper. - These functions no longer block on ensure_attached unless the tenant was never attached at all, so they should make progress even if we can't complete compute notifications. This increases the database load from timeline/create operations, but only with cheap read transactions.
This commit is contained in:
@@ -2284,7 +2284,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
|
||||
self.allowed_errors,
|
||||
)
|
||||
|
||||
def pageserver_api(self) -> PageserverHttpClient:
|
||||
def pageserver_api(self, *args, **kwargs) -> PageserverHttpClient:
|
||||
"""
|
||||
The storage controller implements a subset of the pageserver REST API, for mapping
|
||||
per-tenant actions into per-shard actions (e.g. timeline creation). Tests should invoke those
|
||||
@@ -2293,7 +2293,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
|
||||
auth_token = None
|
||||
if self.auth_enabled:
|
||||
auth_token = self.env.auth_keys.generate_token(scope=TokenScope.PAGE_SERVER_API)
|
||||
return PageserverHttpClient(self.port, lambda: True, auth_token)
|
||||
return PageserverHttpClient(self.port, lambda: True, auth_token, *args, **kwargs)
|
||||
|
||||
def request(self, method, *args, **kwargs) -> requests.Response:
|
||||
resp = requests.request(method, *args, **kwargs)
|
||||
|
||||
@@ -21,7 +21,7 @@ from fixtures.neon_fixtures import (
|
||||
TokenScope,
|
||||
last_flush_lsn_upload,
|
||||
)
|
||||
from fixtures.pageserver.http import PageserverHttpClient
|
||||
from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
|
||||
from fixtures.pageserver.utils import (
|
||||
assert_prefix_empty,
|
||||
assert_prefix_not_empty,
|
||||
@@ -41,6 +41,7 @@ from mypy_boto3_s3.type_defs import (
|
||||
ObjectTypeDef,
|
||||
)
|
||||
from pytest_httpserver import HTTPServer
|
||||
from urllib3 import Retry
|
||||
from werkzeug.wrappers.request import Request
|
||||
from werkzeug.wrappers.response import Response
|
||||
|
||||
@@ -2266,3 +2267,66 @@ def test_storage_controller_ps_restarted_during_drain(neon_env_builder: NeonEnvB
|
||||
|
||||
# allow for a small delay between actually having cancelled and being able to reconfigure again
|
||||
wait_until(4, 0.5, reconfigure_node_again)
|
||||
|
||||
|
||||
def test_storage_controller_timeline_crud_race(neon_env_builder: NeonEnvBuilder):
    """
    The storage controller is meant to handle the case where a timeline CRUD operation races
    with a generation-incrementing change to the tenant: this should trigger a retry so that
    the operation lands on the highest-generation'd tenant location.

    Scenario exercised here:
      1. Stall timeline creation on the pageserver via a failpoint.
      2. While the creation is stalled, migrate the tenant shard to another pageserver
         (this increments the generation).
      3. Expect the in-flight creation to fail with a retryable "attachment changed" error.
      4. Expect a subsequent creation (no concurrent migration) to succeed even though
         the failpoint still makes it slow.
    """
    # Two pageservers so the shard has somewhere to migrate to.
    neon_env_builder.num_pageservers = 2
    env = neon_env_builder.init_configs()
    env.start()
    tenant_id = TenantId.generate()
    env.storage_controller.tenant_create(tenant_id)

    # Set up a failpoint so that a timeline creation will be very slow
    # (sleep is in milliseconds, i.e. 10 seconds per hit).
    failpoint = "timeline-creation-after-uninit"
    for ps in env.pageservers:
        ps.http_client().configure_failpoints((failpoint, "sleep(10000)"))

    # Start a timeline creation in the background
    create_timeline_id = TimelineId.generate()
    futs = []
    with concurrent.futures.ThreadPoolExecutor(
        max_workers=2 + len(env.pageservers) + len(env.safekeepers)
    ) as executor:
        # Submit the slow creation through the storage controller's pageserver-compatible
        # API; retries are disabled so the race surfaces as an error instead of being
        # masked by the HTTP client retrying.
        futs.append(
            executor.submit(
                env.storage_controller.pageserver_api(
                    retries=Retry(
                        status=0,
                        connect=0,  # Disable retries: we want to see the 503
                    )
                ).timeline_create,
                PgVersion.NOT_SET,
                tenant_id,
                create_timeline_id,
            )
        )

        def has_hit_failpoint():
            # Raises (for wait_until to retry) until some pageserver logs the failpoint hit.
            assert any(
                ps.log_contains(f"at failpoint {failpoint}") is not None for ps in env.pageservers
            )

        # Wait until the background creation is actually parked on the failpoint
        # before racing the migration against it.
        wait_until(10, 1, has_hit_failpoint)

        # Migrate the tenant while the timeline creation is in progress: this migration will complete once it
        # can detach from the old pageserver, which will happen once the failpoint completes.
        env.storage_controller.tenant_shard_migrate(
            TenantShardId(tenant_id, 0, 0), env.pageservers[1].id
        )

        # The stalled creation observed a generation increment and must fail with
        # the retryable error rather than succeed against a stale location.
        with pytest.raises(PageserverApiException, match="Tenant attachment changed, please retry"):
            futs[0].result(timeout=20)

        # Timeline creation should work when there isn't a concurrent migration, even though it's
        # slow (our failpoint is still enabled)
        env.storage_controller.pageserver_api(
            retries=Retry(
                status=0,
                connect=0,  # Disable retries: we want to see the 503
            )
        ).timeline_create(PgVersion.NOT_SET, tenant_id, create_timeline_id)
|
||||
|
||||
Reference in New Issue
Block a user