pageserver/controller: enable tenant deletion without attachment (#7957)

## Problem

As described in #7952, the controller's attempt to reconcile a tenant
before finally deleting it can get hung up waiting for the compute
notification hook to accept updates.

The fact that we try to reconcile a tenant at all during deletion is
part of a more general design issue (#5080), where deletion was
implemented as an operation on an attached tenant, requiring the tenant to
be attached in order to delete it, which is not in principle necessary.

Closes: #7952

## Summary of changes

- In the pageserver deletion API, only do the traditional deletion path
if the tenant is attached. If it's secondary, then tear down the
secondary location, and then do a remote delete. If it's not attached at
all, just do the remote delete.
- In the storage controller, instead of ensuring a tenant is attached
before deletion, do a best-effort detach of the tenant, and then call
into some arbitrary pageserver to issue a deletion of remote content.

The pageserver retains its existing delete behavior when invoked on
attached locations. We can remove this later when all users of the API
are updated to do a detach-before-delete. This will enable
removing the "weird" code paths during startup that sometimes load a
tenant and then immediately delete it, and removing the deletion markers
on tenants.
This commit is contained in:
John Spray
2024-06-05 21:22:54 +01:00
committed by GitHub
parent 83ab14e271
commit 91dd99038e
7 changed files with 312 additions and 117 deletions

View File

@@ -7,6 +7,7 @@ from typing import Any, Dict, List, Union
import pytest
from fixtures.common_types import TenantId, TenantShardId, TimelineId
from fixtures.compute_reconfigure import ComputeReconfigure
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnv,
@@ -18,6 +19,8 @@ from fixtures.neon_fixtures import (
from fixtures.pageserver.http import PageserverHttpClient
from fixtures.pageserver.utils import (
MANY_SMALL_LAYERS_TENANT_CONFIG,
assert_prefix_empty,
assert_prefix_not_empty,
enable_remote_storage_versioning,
list_prefix,
remote_storage_delete_key,
@@ -839,6 +842,86 @@ def test_storage_controller_tenant_conf(neon_env_builder: NeonEnvBuilder):
env.storage_controller.consistency_check()
def test_storage_controller_tenant_deletion(
    neon_env_builder: NeonEnvBuilder,
    compute_reconfigure_listener: ComputeReconfigure,
):
    """
    Delete a sharded tenant through the storage controller and check that:
    - every shard of the tenant is removed, both locally and from remote storage
    - the deletion goes through even though the compute notification hook is broken
    - no pageserver (including any that held a secondary location) keeps tenant content
    """
    neon_env_builder.num_pageservers = 4
    neon_env_builder.enable_pageserver_remote_storage(s3_storage())
    hook_api = compute_reconfigure_listener.control_plane_compute_hook_api
    neon_env_builder.control_plane_compute_hook_api = hook_api

    env = neon_env_builder.init_configs()
    env.start()

    tenant_id = TenantId.generate()
    timeline_id = TimelineId.generate()
    env.neon_cli.create_tenant(
        tenant_id, timeline_id, shard_count=2, placement_policy='{"Attached":1}'
    )

    # Let the controller settle so that every location, secondaries included, is configured
    env.storage_controller.reconcile_until_idle()

    located = env.storage_controller.locate(tenant_id)
    shard_ids = [TenantShardId.parse(entry["shard_id"]) for entry in located]

    def shard_prefix(shard_id: TenantShardId) -> str:
        # Remote storage key prefix under which a shard's objects are expected
        return "/".join(["tenants", str(shard_id)])

    # Before deletion: each shard's attached pageserver has a local tenant directory
    for shard_id in shard_ids:
        attached_ps = env.get_tenant_pageserver(shard_id)
        assert attached_ps.tenant_dir(shard_id).exists()

    # Before deletion: each shard has some content in remote storage
    for shard_id in shard_ids:
        assert_prefix_not_empty(
            neon_env_builder.pageserver_remote_storage,
            prefix=shard_prefix(shard_id),
        )

    # From here on, any notification to the compute hook raises: the deletion below
    # must not depend on the hook being available
    def fail_on_notify():
        raise RuntimeError("Unexpected call to compute hook")

    compute_reconfigure_listener.register_on_notify(fail_on_notify)

    # Single call, no retry loop: deletion should complete in one shot without polling for
    # 202 responses, because the shards are cleanly detached first and then deleted in
    # remote storage
    controller_ps_api = env.storage_controller.pageserver_api()
    controller_ps_api.tenant_delete(tenant_id)

    # After deletion: no pageserver retains a local directory for any shard
    for ps in env.pageservers:
        for shard_id in shard_ids:
            assert not ps.tenant_dir(shard_id).exists()

    # After deletion: nothing is left under any shard's remote prefix
    for shard_id in shard_ids:
        assert_prefix_empty(
            neon_env_builder.pageserver_remote_storage,
            prefix=shard_prefix(shard_id),
        )

    # After deletion: the storage controller API no longer knows the tenant
    with pytest.raises(StorageControllerApiException):
        env.storage_controller.tenant_describe(tenant_id)
class Failure:
pageserver_id: int

View File

@@ -54,9 +54,26 @@ def test_tenant_delete_smoke(
# first try to delete non existing tenant
tenant_id = TenantId.generate()
env.pageserver.allowed_errors.append(f".*NotFound: tenant {tenant_id}.*")
with pytest.raises(PageserverApiException, match=f"NotFound: tenant {tenant_id}"):
ps_http.tenant_delete(tenant_id=tenant_id)
env.pageserver.allowed_errors.append(".*NotFound.*")
env.pageserver.allowed_errors.append(".*simulated failure.*")
# Check that deleting a non-existent tenant gives the expected result: this is a loop because we
# may need to retry on some remote storage errors injected by the test harness
while True:
try:
ps_http.tenant_delete(tenant_id=tenant_id)
except PageserverApiException as e:
if e.status_code == 500:
# This test uses failure injection, which can produce 500s as the pageserver expects
# the object store to always be available, and the ListObjects during deletion is generally
# an infallible operation
assert "simulated failure of remote operation" in e.message
elif e.status_code == 404:
# This is our expected result: trying to erase a non-existent tenant gives us 404
assert "NotFound" in e.message
break
else:
raise
env.neon_cli.create_tenant(
tenant_id=tenant_id,