mirror of
https://github.com/neondatabase/neon.git
synced 2026-06-02 13:00:37 +00:00
pageserver/controller: enable tenant deletion without attachment (#7957)
## Problem As described in #7952, the controller's attempt to reconcile a tenant before finally deleting it can get hung up waiting for the compute notification hook to accept updates. The fact that we try and reconcile a tenant at all during deletion is part of a more general design issue (#5080), where deletion was implemented as an operation on attached tenant, requiring the tenant to be attached in order to delete it, which is not in principle necessary. Closes: #7952 ## Summary of changes - In the pageserver deletion API, only do the traditional deletion path if the tenant is attached. If it's secondary, then tear down the secondary location, and then do a remote delete. If it's not attached at all, just do the remote delete. - In the storage controller, instead of ensuring a tenant is attached before deletion, do a best-effort detach of the tenant, and then call into some arbitrary pageserver to issue a deletion of remote content. The pageserver retains its existing delete behavior when invoked on attached locations. We can remove this later when all users of the API are updated to either do a detach-before-delete. This will enable removing the "weird" code paths during startup that sometimes load a tenant and then immediately delete it, and removing the deletion markers on tenants.
This commit is contained in:
@@ -7,6 +7,7 @@ from typing import Any, Dict, List, Union
|
||||
|
||||
import pytest
|
||||
from fixtures.common_types import TenantId, TenantShardId, TimelineId
|
||||
from fixtures.compute_reconfigure import ComputeReconfigure
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import (
|
||||
NeonEnv,
|
||||
@@ -18,6 +19,8 @@ from fixtures.neon_fixtures import (
|
||||
from fixtures.pageserver.http import PageserverHttpClient
|
||||
from fixtures.pageserver.utils import (
|
||||
MANY_SMALL_LAYERS_TENANT_CONFIG,
|
||||
assert_prefix_empty,
|
||||
assert_prefix_not_empty,
|
||||
enable_remote_storage_versioning,
|
||||
list_prefix,
|
||||
remote_storage_delete_key,
|
||||
@@ -839,6 +842,86 @@ def test_storage_controller_tenant_conf(neon_env_builder: NeonEnvBuilder):
|
||||
env.storage_controller.consistency_check()
|
||||
|
||||
|
||||
def test_storage_controller_tenant_deletion(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
compute_reconfigure_listener: ComputeReconfigure,
|
||||
):
|
||||
"""
|
||||
Validate that:
|
||||
- Deleting a tenant deletes all its shards
|
||||
- Deletion does not require the compute notification hook to be responsive
|
||||
- Deleting a tenant also removes all secondary locations
|
||||
"""
|
||||
neon_env_builder.num_pageservers = 4
|
||||
neon_env_builder.enable_pageserver_remote_storage(s3_storage())
|
||||
neon_env_builder.control_plane_compute_hook_api = (
|
||||
compute_reconfigure_listener.control_plane_compute_hook_api
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_configs()
|
||||
env.start()
|
||||
|
||||
tenant_id = TenantId.generate()
|
||||
timeline_id = TimelineId.generate()
|
||||
env.neon_cli.create_tenant(
|
||||
tenant_id, timeline_id, shard_count=2, placement_policy='{"Attached":1}'
|
||||
)
|
||||
|
||||
# Ensure all the locations are configured, including secondaries
|
||||
env.storage_controller.reconcile_until_idle()
|
||||
|
||||
shard_ids = [
|
||||
TenantShardId.parse(shard["shard_id"]) for shard in env.storage_controller.locate(tenant_id)
|
||||
]
|
||||
|
||||
# Assert attachments all have local content
|
||||
for shard_id in shard_ids:
|
||||
pageserver = env.get_tenant_pageserver(shard_id)
|
||||
assert pageserver.tenant_dir(shard_id).exists()
|
||||
|
||||
# Assert all shards have some content in remote storage
|
||||
for shard_id in shard_ids:
|
||||
assert_prefix_not_empty(
|
||||
neon_env_builder.pageserver_remote_storage,
|
||||
prefix="/".join(
|
||||
(
|
||||
"tenants",
|
||||
str(shard_id),
|
||||
)
|
||||
),
|
||||
)
|
||||
|
||||
# Break the compute hook: we are checking that deletion does not depend on the compute hook being available
|
||||
def break_hook():
|
||||
raise RuntimeError("Unexpected call to compute hook")
|
||||
|
||||
compute_reconfigure_listener.register_on_notify(break_hook)
|
||||
|
||||
# No retry loop: deletion should complete in one shot without polling for 202 responses, because
|
||||
# it cleanly detaches all the shards first, and then deletes them in remote storage
|
||||
env.storage_controller.pageserver_api().tenant_delete(tenant_id)
|
||||
|
||||
# Assert no pageservers have any local content
|
||||
for pageserver in env.pageservers:
|
||||
for shard_id in shard_ids:
|
||||
assert not pageserver.tenant_dir(shard_id).exists()
|
||||
|
||||
for shard_id in shard_ids:
|
||||
assert_prefix_empty(
|
||||
neon_env_builder.pageserver_remote_storage,
|
||||
prefix="/".join(
|
||||
(
|
||||
"tenants",
|
||||
str(shard_id),
|
||||
)
|
||||
),
|
||||
)
|
||||
|
||||
# Assert the tenant is not visible in storage controller API
|
||||
with pytest.raises(StorageControllerApiException):
|
||||
env.storage_controller.tenant_describe(tenant_id)
|
||||
|
||||
|
||||
class Failure:
|
||||
pageserver_id: int
|
||||
|
||||
|
||||
@@ -54,9 +54,26 @@ def test_tenant_delete_smoke(
|
||||
|
||||
# first try to delete non existing tenant
|
||||
tenant_id = TenantId.generate()
|
||||
env.pageserver.allowed_errors.append(f".*NotFound: tenant {tenant_id}.*")
|
||||
with pytest.raises(PageserverApiException, match=f"NotFound: tenant {tenant_id}"):
|
||||
ps_http.tenant_delete(tenant_id=tenant_id)
|
||||
env.pageserver.allowed_errors.append(".*NotFound.*")
|
||||
env.pageserver.allowed_errors.append(".*simulated failure.*")
|
||||
|
||||
# Check that deleting a non-existent tenant gives the expected result: this is a loop because we
|
||||
# may need to retry on some remote storage errors injected by the test harness
|
||||
while True:
|
||||
try:
|
||||
ps_http.tenant_delete(tenant_id=tenant_id)
|
||||
except PageserverApiException as e:
|
||||
if e.status_code == 500:
|
||||
# This test uses failure injection, which can produce 500s as the pageserver expects
|
||||
# the object store to always be available, and the ListObjects during deletion is generally
|
||||
# an infallible operation
|
||||
assert "simulated failure of remote operation" in e.message
|
||||
elif e.status_code == 404:
|
||||
# This is our expected result: trying to erase a non-existent tenant gives us 404
|
||||
assert "NotFound" in e.message
|
||||
break
|
||||
else:
|
||||
raise
|
||||
|
||||
env.neon_cli.create_tenant(
|
||||
tenant_id=tenant_id,
|
||||
|
||||
Reference in New Issue
Block a user