pageserver: cleanup redundant create/attach code, fix detach while attaching (#6277)

## Problem

The code for tenant create and tenant attach was just a special case of
what upsert_location does.

## Summary of changes

- Use `upsert_location` for create and attach APIs
- Clean up error handling in upsert_location so that it can generate
appropriate HTTP response codes
- Update tests that asserted the old non-idempotent behavior of attach
- Rework the `test_ignore_while_attaching` test, and fix tenant shutdown
during activation, which this test was supposed to cover, but it was
actually just waiting for activation to complete.
This commit is contained in:
John Spray
2024-01-09 10:37:54 +00:00
committed by GitHub
parent 8186f6b6f9
commit 4b9b4c2c36
8 changed files with 234 additions and 348 deletions

View File

@@ -1917,18 +1917,24 @@ class NeonPageserver(PgProtocol):
return None
def tenant_attach(
self, tenant_id: TenantId, config: None | Dict[str, Any] = None, config_null: bool = False
self,
tenant_id: TenantId,
config: None | Dict[str, Any] = None,
config_null: bool = False,
generation: Optional[int] = None,
):
"""
Tenant attachment passes through here to acquire a generation number before proceeding
to call into the pageserver HTTP client.
"""
client = self.http_client()
if generation is None:
generation = self.env.attachment_service.attach_hook_issue(tenant_id, self.id)
return client.tenant_attach(
tenant_id,
config,
config_null,
generation=self.env.attachment_service.attach_hook_issue(tenant_id, self.id),
generation=generation,
)
def tenant_detach(self, tenant_id: TenantId):

View File

@@ -144,8 +144,11 @@ def test_remote_storage_backup_and_restore(
# Introduce failpoint in list remote timelines code path to make tenant_attach fail.
# This is before the failures injected by test_remote_failures, so it's a permanent error.
pageserver_http.configure_failpoints(("storage-sync-list-remote-timelines", "return"))
env.pageserver.allowed_errors.append(
".*attach failed.*: storage-sync-list-remote-timelines",
env.pageserver.allowed_errors.extend(
[
".*attach failed.*: storage-sync-list-remote-timelines",
".*Tenant state is Broken: storage-sync-list-remote-timelines.*",
]
)
# Attach it. This HTTP request will succeed and launch a
# background task to load the tenant. In that background task,
@@ -159,9 +162,13 @@ def test_remote_storage_backup_and_restore(
"data": {"reason": "storage-sync-list-remote-timelines"},
}
# Ensure that even though the tenant is broken, we can't attach it again.
with pytest.raises(Exception, match=f"tenant {tenant_id} already exists, state: Broken"):
env.pageserver.tenant_attach(tenant_id)
# Ensure that even though the tenant is broken, retrying the attachment fails
with pytest.raises(Exception, match="Tenant state is Broken"):
# Use same generation as in previous attempt
gen_state = env.attachment_service.inspect(tenant_id)
assert gen_state is not None
generation = gen_state[0]
env.pageserver.tenant_attach(tenant_id, generation=generation)
# Restart again, this implicitly clears the failpoint.
# test_remote_failures=1 remains active, though, as it's in the pageserver config.
@@ -176,10 +183,8 @@ def test_remote_storage_backup_and_restore(
), "we shouldn't have tried any layer downloads yet since list remote timelines has a failpoint"
env.pageserver.start()
# Ensure that the pageserver remembers that the tenant was attaching, by
# trying to attach it again. It should fail.
with pytest.raises(Exception, match=f"tenant {tenant_id} already exists, state:"):
env.pageserver.tenant_attach(tenant_id)
# The attach should have got far enough that it recovers on restart (i.e. tenant's
# config was written to local storage).
log.info("waiting for tenant to become active. this should be quick with on-demand download")
wait_until_tenant_active(

View File

@@ -627,7 +627,7 @@ def test_ignored_tenant_download_missing_layers(neon_env_builder: NeonEnvBuilder
# Tests that attach is never working on a tenant, ignored or not, as long as it's not absent locally
# Similarly, tests that it's not possible to schedule a `load` for tenat that's not ignored.
def test_load_attach_negatives(neon_env_builder: NeonEnvBuilder):
def test_load_negatives(neon_env_builder: NeonEnvBuilder):
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
env = neon_env_builder.init_start()
pageserver_http = env.pageserver.http_client()
@@ -644,25 +644,16 @@ def test_load_attach_negatives(neon_env_builder: NeonEnvBuilder):
):
env.pageserver.tenant_load(tenant_id)
with pytest.raises(
expected_exception=PageserverApiException,
match=f"tenant {tenant_id} already exists, state: Active",
):
env.pageserver.tenant_attach(tenant_id)
pageserver_http.tenant_ignore(tenant_id)
env.pageserver.allowed_errors.append(".*tenant directory already exists.*")
with pytest.raises(
expected_exception=PageserverApiException,
match="tenant directory already exists",
):
env.pageserver.tenant_attach(tenant_id)
def test_ignore_while_attaching(
def test_detach_while_activating(
neon_env_builder: NeonEnvBuilder,
):
"""
Test cancellation behavior for tenants that are stuck somewhere between
being attached and reaching Active state.
"""
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
env = neon_env_builder.init_start()
@@ -684,39 +675,28 @@ def test_ignore_while_attaching(
data_secret = "very secret secret"
insert_test_data(pageserver_http, tenant_id, timeline_id, data_id, data_secret, endpoint)
tenants_before_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()]
tenants_before_detach = [tenant["id"] for tenant in pageserver_http.tenant_list()]
# Detach it
pageserver_http.tenant_detach(tenant_id)
# And re-attach, but stop attach task_mgr task from completing
pageserver_http.configure_failpoints([("attach-before-activate", "return(5000)")])
pageserver_http.configure_failpoints([("attach-before-activate", "return(600000)")])
env.pageserver.tenant_attach(tenant_id)
# Run ignore on the task, thereby cancelling the attach.
# XXX This should take priority over attach, i.e., it should cancel the attach task.
# But neither the failpoint, nor the proper remote_timeline_client download functions,
# are sensitive to task_mgr::shutdown.
# This problem is tracked in https://github.com/neondatabase/neon/issues/2996 .
# So, for now, effectively, this ignore here will block until attach task completes.
pageserver_http.tenant_ignore(tenant_id)
# Cannot attach it due to some local files existing
env.pageserver.allowed_errors.append(".*tenant directory already exists.*")
with pytest.raises(
expected_exception=PageserverApiException,
match="tenant directory already exists",
):
env.pageserver.tenant_attach(tenant_id)
# The tenant is in the Activating state. This should not block us from
# shutting it down and detaching it.
pageserver_http.tenant_detach(tenant_id)
tenants_after_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()]
assert tenant_id not in tenants_after_ignore, "Ignored tenant should be missing"
assert len(tenants_after_ignore) + 1 == len(
tenants_before_ignore
tenants_after_detach = [tenant["id"] for tenant in pageserver_http.tenant_list()]
assert tenant_id not in tenants_after_detach, "Detached tenant should be missing"
assert len(tenants_after_detach) + 1 == len(
tenants_before_detach
), "Only ignored tenant should be missing"
# Calling load will bring the tenant back online
# Subsequently attaching it again should still work
pageserver_http.configure_failpoints([("attach-before-activate", "off")])
env.pageserver.tenant_load(tenant_id)
env.pageserver.tenant_attach(tenant_id)
wait_until_tenant_state(pageserver_http, tenant_id, "Active", 5)
endpoint.stop()

View File

@@ -29,18 +29,13 @@ def test_tenant_creation_fails(neon_simple_env: NeonEnv):
initial_tenants = sorted(
map(lambda t: t.split()[0], neon_simple_env.neon_cli.list_tenants().stdout.splitlines())
)
initial_tenant_dirs = [d for d in tenants_dir.iterdir()]
[d for d in tenants_dir.iterdir()]
neon_simple_env.pageserver.allowed_errors.extend(
[
".*Failed to create directory structure for tenant .*, cleaning tmp data.*",
".*Failed to fsync removed temporary tenant directory .*",
]
)
neon_simple_env.pageserver.allowed_errors.append(".*tenant-config-before-write.*")
pageserver_http = neon_simple_env.pageserver.http_client()
pageserver_http.configure_failpoints(("tenant-creation-before-tmp-rename", "return"))
with pytest.raises(Exception, match="tenant-creation-before-tmp-rename"):
pageserver_http.configure_failpoints(("tenant-config-before-write", "return"))
with pytest.raises(Exception, match="tenant-config-before-write"):
_ = neon_simple_env.neon_cli.create_tenant()
new_tenants = sorted(
@@ -48,10 +43,10 @@ def test_tenant_creation_fails(neon_simple_env: NeonEnv):
)
assert initial_tenants == new_tenants, "should not create new tenants"
new_tenant_dirs = [d for d in tenants_dir.iterdir()]
assert (
new_tenant_dirs == initial_tenant_dirs
), "pageserver should clean its temp tenant dirs on tenant creation failure"
# Any files left behind on disk during failed creation do not prevent
# a retry from succeeding.
pageserver_http.configure_failpoints(("tenant-config-before-write", "off"))
neon_simple_env.neon_cli.create_tenant()
def test_tenants_normal_work(neon_env_builder: NeonEnvBuilder):