storage controller: enable timeline CRUD operations to run concurrently with reconciliation & make them safer (#8783)

## Problem

- If a reconciler was waiting to notify computes about a change while the
control plane was waiting for the controller to finish a timeline
creation/deletion, the overall system could deadlock.
- If a tenant shard was migrated concurrently with a timeline
creation/deletion, the timeline operation could be applied to a
non-latest-generation location and thereby not be durably persistent.
This has never happened in practice, but would eventually happen at
scale.

Closes: #8743 

## Summary of changes

- Introduce a `Service::tenant_remote_mutation` helper, which looks up
shards & generations and passes them into an inner function that may do
remote I/O to pageservers. Before returning success, the helper checks
that generations haven't incremented, to guarantee that changes are
persistent (a sketch of this pattern follows below).
- Convert `tenant_timeline_create`, `tenant_timeline_delete`, and
`tenant_timeline_detach_ancestor` to use this helper.
- These functions no longer block on `ensure_attached` unless the tenant
was never attached at all, so they should make progress even if compute
notifications cannot be completed.

This increases the database load from timeline CRUD operations, but
only with cheap read transactions.
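
The helper itself is Rust inside the storage controller; the following is only a rough, hypothetical sketch of the check-generations-after-mutating pattern, written in Python for consistency with the tests below. The names `load_shard_generations` and `GenerationChanged` are invented for illustration.

```python
class GenerationChanged(Exception):
    """Raised when shard generations moved while a remote mutation ran."""


def tenant_remote_mutation(service, tenant_id, inner):
    # Cheap read transaction: snapshot each shard's generation up front.
    before = service.load_shard_generations(tenant_id)

    # The inner function may do remote I/O to pageservers, e.g. fan a
    # timeline creation out to every shard of the tenant.
    result = inner(before)

    # Re-read generations before reporting success: if any shard migrated
    # (its generation incremented) meanwhile, the mutation may have landed
    # on a stale location and cannot be considered persistent.
    if service.load_shard_generations(tenant_id) != before:
        # Surfaced to clients as a 503, "Tenant attachment changed, please retry".
        raise GenerationChanged("Tenant attachment changed, please retry")

    return result
```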
Authored by John Spray on 2024-08-23 18:56:05 +01:00, committed by GitHub.
Parent: b65a95f12e · Commit: 0aa1450936
6 changed files with 360 additions and 232 deletions.


```diff
@@ -2284,7 +2284,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
             self.allowed_errors,
         )
 
-    def pageserver_api(self) -> PageserverHttpClient:
+    def pageserver_api(self, *args, **kwargs) -> PageserverHttpClient:
         """
         The storage controller implements a subset of the pageserver REST API, for mapping
         per-tenant actions into per-shard actions (e.g. timeline creation). Tests should invoke those
@@ -2293,7 +2293,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
         auth_token = None
         if self.auth_enabled:
             auth_token = self.env.auth_keys.generate_token(scope=TokenScope.PAGE_SERVER_API)
-        return PageserverHttpClient(self.port, lambda: True, auth_token)
+        return PageserverHttpClient(self.port, lambda: True, auth_token, *args, **kwargs)
 
     def request(self, method, *args, **kwargs) -> requests.Response:
         resp = requests.request(method, *args, **kwargs)
```
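
This passthrough lets tests tune the underlying HTTP client per call. For example, a usage sketch assuming the standard test fixtures (`env`, `tenant_id` already created by the harness):

```python
from urllib3 import Retry

# Build a storage controller client that surfaces 503s immediately instead of
# retrying them internally, so the test can assert on the error response.
http = env.storage_controller.pageserver_api(retries=Retry(status=0, connect=0))
http.timeline_create(PgVersion.NOT_SET, tenant_id, TimelineId.generate())
```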


```diff
@@ -21,7 +21,7 @@ from fixtures.neon_fixtures import (
     TokenScope,
     last_flush_lsn_upload,
 )
-from fixtures.pageserver.http import PageserverHttpClient
+from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
 from fixtures.pageserver.utils import (
     assert_prefix_empty,
     assert_prefix_not_empty,
@@ -41,6 +41,7 @@ from mypy_boto3_s3.type_defs import (
     ObjectTypeDef,
 )
 from pytest_httpserver import HTTPServer
+from urllib3 import Retry
 from werkzeug.wrappers.request import Request
 from werkzeug.wrappers.response import Response
@@ -2266,3 +2267,66 @@ def test_storage_controller_ps_restarted_during_drain(neon_env_builder: NeonEnvBuilder):
     # allow for small delay between actually having cancelled and being able reconfigure again
     wait_until(4, 0.5, reconfigure_node_again)
+
+
+def test_storage_controller_timeline_crud_race(neon_env_builder: NeonEnvBuilder):
+    """
+    The storage controller is meant to handle the case where a timeline CRUD operation races
+    with a generation-incrementing change to the tenant: this should trigger a retry so that
+    the operation lands on the highest-generation'd tenant location.
+    """
+    neon_env_builder.num_pageservers = 2
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    tenant_id = TenantId.generate()
+    env.storage_controller.tenant_create(tenant_id)
+
+    # Set up a failpoint so that a timeline creation will be very slow
+    failpoint = "timeline-creation-after-uninit"
+    for ps in env.pageservers:
+        ps.http_client().configure_failpoints((failpoint, "sleep(10000)"))
+
+    # Start a timeline creation in the background
+    create_timeline_id = TimelineId.generate()
+    futs = []
+    with concurrent.futures.ThreadPoolExecutor(
+        max_workers=2 + len(env.pageservers) + len(env.safekeepers)
+    ) as executor:
+        futs.append(
+            executor.submit(
+                env.storage_controller.pageserver_api(
+                    retries=Retry(
+                        status=0,
+                        connect=0,  # Disable retries: we want to see the 503
+                    )
+                ).timeline_create,
+                PgVersion.NOT_SET,
+                tenant_id,
+                create_timeline_id,
+            )
+        )
+
+        def has_hit_failpoint():
+            assert any(
+                ps.log_contains(f"at failpoint {failpoint}") is not None for ps in env.pageservers
+            )
+
+        wait_until(10, 1, has_hit_failpoint)
+
+        # Migrate the tenant while the timeline creation is in progress: this migration will complete once it
+        # can detach from the old pageserver, which will happen once the failpoint completes.
+        env.storage_controller.tenant_shard_migrate(
+            TenantShardId(tenant_id, 0, 0), env.pageservers[1].id
+        )
+
+        with pytest.raises(PageserverApiException, match="Tenant attachment changed, please retry"):
+            futs[0].result(timeout=20)
+
+    # Timeline creation should work when there isn't a concurrent migration, even though it's
+    # slow (our failpoint is still enabled)
+    env.storage_controller.pageserver_api(
+        retries=Retry(
+            status=0,
+            connect=0,  # Disable retries: we want to see the 503
+        )
+    ).timeline_create(PgVersion.NOT_SET, tenant_id, create_timeline_id)
```
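
Outside of tests, the "Tenant attachment changed, please retry" 503 is a signal that the caller should retry the operation against the new attachment. A minimal caller-side sketch follows; the function name, retry bound, and backoff are invented for illustration, not part of this change:

```python
import time

MAX_ATTEMPTS = 5  # hypothetical bound; a real control plane may retry longer


def create_with_retry(http, tenant_id, timeline_id, pg_version):
    for attempt in range(MAX_ATTEMPTS):
        try:
            return http.timeline_create(pg_version, tenant_id, timeline_id)
        except PageserverApiException as e:
            if "Tenant attachment changed, please retry" not in str(e):
                raise
            time.sleep(0.1 * 2**attempt)  # back off, then retry on the new location
    raise RuntimeError("timeline creation kept racing with tenant migrations")
```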