storage controller: enable timeline CRUD operations to run concurrently with reconciliation & make them safer (#8783)

## Problem

- If a reconciler was waiting to notify computes about a change while the
control plane was waiting for the controller to finish a timeline
creation/deletion, the overall system could deadlock.
- If a tenant shard was migrated concurrently with a timeline
creation/deletion, the timeline operation could be applied to a
non-latest-generation location and thereby not be durably persistent.
This has never happened in practice, but would eventually happen at
scale.

Closes: #8743 

## Summary of changes

- Introduce a `Service::tenant_remote_mutation` helper, which looks up
shards & generations and passes them into an inner function that may do
remote I/O to pageservers. Before returning success, the helper checks
that generations haven't incremented, to guarantee that changes are
persistent (a sketch of this pattern follows below).
- Convert `tenant_timeline_create`, `tenant_timeline_delete`, and
`tenant_timeline_detach_ancestor` to use this helper.
- These functions no longer block on `ensure_attached` unless the tenant
was never attached at all, so they should make progress even if compute
notifications cannot be completed.

This increases the database load from timeline CRUD operations, but
only with cheap read transactions.
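
The helper itself is Rust inside the storage controller; the following is only a rough, hypothetical sketch of the check-generations-after-mutating pattern, written in Python for consistency with the tests below. The names `load_shard_generations` and `GenerationChanged` are invented for illustration.

```python
class GenerationChanged(Exception):
    """Raised when shard generations moved while a remote mutation ran."""


def tenant_remote_mutation(service, tenant_id, inner):
    # Cheap read transaction: snapshot each shard's generation up front.
    before = service.load_shard_generations(tenant_id)

    # The inner function may do remote I/O to pageservers, e.g. fan a
    # timeline creation out to every shard of the tenant.
    result = inner(before)

    # Re-read generations before reporting success: if any shard migrated
    # (its generation incremented) meanwhile, the mutation may have landed
    # on a stale location and cannot be considered persistent.
    if service.load_shard_generations(tenant_id) != before:
        # Surfaced to clients as a 503, "Tenant attachment changed, please retry".
        raise GenerationChanged("Tenant attachment changed, please retry")

    return result
```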
Authored by John Spray on 2024-08-23 18:56:05 +01:00, committed by GitHub.
Parent: b65a95f12e · Commit: 0aa1450936
6 changed files with 360 additions and 232 deletions.


```diff
@@ -2284,7 +2284,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
             self.allowed_errors,
         )
 
-    def pageserver_api(self) -> PageserverHttpClient:
+    def pageserver_api(self, *args, **kwargs) -> PageserverHttpClient:
         """
         The storage controller implements a subset of the pageserver REST API, for mapping
         per-tenant actions into per-shard actions (e.g. timeline creation). Tests should invoke those
@@ -2293,7 +2293,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
         auth_token = None
         if self.auth_enabled:
             auth_token = self.env.auth_keys.generate_token(scope=TokenScope.PAGE_SERVER_API)
-        return PageserverHttpClient(self.port, lambda: True, auth_token)
+        return PageserverHttpClient(self.port, lambda: True, auth_token, *args, **kwargs)
 
     def request(self, method, *args, **kwargs) -> requests.Response:
         resp = requests.request(method, *args, **kwargs)
```
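
This passthrough lets tests tune the underlying HTTP client per call. For example, a usage sketch assuming the standard test fixtures (`env`, `tenant_id` already created by the harness):

```python
from urllib3 import Retry

# Build a storage controller client that surfaces 503s immediately instead of
# retrying them internally, so the test can assert on the error response.
http = env.storage_controller.pageserver_api(retries=Retry(status=0, connect=0))
http.timeline_create(PgVersion.NOT_SET, tenant_id, TimelineId.generate())
```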


```diff
@@ -21,7 +21,7 @@ from fixtures.neon_fixtures import (
     TokenScope,
     last_flush_lsn_upload,
 )
-from fixtures.pageserver.http import PageserverHttpClient
+from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
 from fixtures.pageserver.utils import (
     assert_prefix_empty,
     assert_prefix_not_empty,
@@ -41,6 +41,7 @@ from mypy_boto3_s3.type_defs import (
     ObjectTypeDef,
 )
 from pytest_httpserver import HTTPServer
+from urllib3 import Retry
 from werkzeug.wrappers.request import Request
 from werkzeug.wrappers.response import Response
@@ -2266,3 +2267,66 @@ def test_storage_controller_ps_restarted_during_drain(neon_env_builder: NeonEnvBuilder):
     # allow for small delay between actually having cancelled and being able reconfigure again
     wait_until(4, 0.5, reconfigure_node_again)
+
+
+def test_storage_controller_timeline_crud_race(neon_env_builder: NeonEnvBuilder):
+    """
+    The storage controller is meant to handle the case where a timeline CRUD operation races
+    with a generation-incrementing change to the tenant: this should trigger a retry so that
+    the operation lands on the highest-generation'd tenant location.
+    """
+    neon_env_builder.num_pageservers = 2
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    tenant_id = TenantId.generate()
+    env.storage_controller.tenant_create(tenant_id)
+
+    # Set up a failpoint so that a timeline creation will be very slow
+    failpoint = "timeline-creation-after-uninit"
+    for ps in env.pageservers:
+        ps.http_client().configure_failpoints((failpoint, "sleep(10000)"))
+
+    # Start a timeline creation in the background
+    create_timeline_id = TimelineId.generate()
+    futs = []
+    with concurrent.futures.ThreadPoolExecutor(
+        max_workers=2 + len(env.pageservers) + len(env.safekeepers)
+    ) as executor:
+        futs.append(
+            executor.submit(
+                env.storage_controller.pageserver_api(
+                    retries=Retry(
+                        status=0,
+                        connect=0,  # Disable retries: we want to see the 503
+                    )
+                ).timeline_create,
+                PgVersion.NOT_SET,
+                tenant_id,
+                create_timeline_id,
+            )
+        )
+
+        def has_hit_failpoint():
+            assert any(
+                ps.log_contains(f"at failpoint {failpoint}") is not None for ps in env.pageservers
+            )
+
+        wait_until(10, 1, has_hit_failpoint)
+
+        # Migrate the tenant while the timeline creation is in progress: this migration will complete once it
+        # can detach from the old pageserver, which will happen once the failpoint completes.
+        env.storage_controller.tenant_shard_migrate(
+            TenantShardId(tenant_id, 0, 0), env.pageservers[1].id
+        )
+
+        with pytest.raises(PageserverApiException, match="Tenant attachment changed, please retry"):
+            futs[0].result(timeout=20)
+
+    # Timeline creation should work when there isn't a concurrent migration, even though it's
+    # slow (our failpoint is still enabled)
+    env.storage_controller.pageserver_api(
+        retries=Retry(
+            status=0,
+            connect=0,  # Disable retries: we want to see the 503
+        )
+    ).timeline_create(PgVersion.NOT_SET, tenant_id, create_timeline_id)
```
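
Outside of tests, the "Tenant attachment changed, please retry" 503 is a signal that the caller should retry the operation against the new attachment. A minimal caller-side sketch follows; the function name, retry bound, and backoff are invented for illustration, not part of this change:

```python
import time

MAX_ATTEMPTS = 5  # hypothetical bound; a real control plane may retry longer


def create_with_retry(http, tenant_id, timeline_id, pg_version):
    for attempt in range(MAX_ATTEMPTS):
        try:
            return http.timeline_create(pg_version, tenant_id, timeline_id)
        except PageserverApiException as e:
            if "Tenant attachment changed, please retry" not in str(e):
                raise
            time.sleep(0.1 * 2**attempt)  # back off, then retry on the new location
    raise RuntimeError("timeline creation kept racing with tenant migrations")
```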