From 1a1019990a59aa69675e4329d1e5e1e08bbddb71 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Wed, 7 Jun 2023 18:25:30 +0300 Subject: [PATCH] map TenantState::Broken to TenantAttachmentStatus::Failed (#4371) ## Problem Attach failures are not reported in public part of the api (in `attachment_status` field of TenantInfo). ## Summary of changes Expose TenantState::Broken as TenantAttachmentStatus::Failed In the way its written Failed status will be reported even if no attachment happened. (I e if tenant become broken on startup). This is in line with other members. I e Active will be resolved to Attached even if no actual attach took place. This can be tweaked if needed. At the current stage it would be overengineering without clear motivation resolves #4344 --- libs/pageserver_api/src/models.rs | 22 ++++++++++++-------- pageserver/src/http/openapi_spec.yml | 24 ++++++++++++++++++---- test_runner/fixtures/pageserver/utils.py | 6 +++--- test_runner/regress/test_remote_storage.py | 7 ++++++- test_runner/regress/test_tenant_detach.py | 2 +- 5 files changed, 43 insertions(+), 18 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 162bf6b294..ddce82324c 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -110,12 +110,11 @@ impl TenantState { Self::Active => Attached, // If the (initial or resumed) attach procedure fails, the tenant becomes Broken. // However, it also becomes Broken if the regular load fails. - // We would need a separate TenantState variant to distinguish these cases. - // However, there's no practical difference from Console's perspective. - // It will run a Postgres-level health check as soon as it observes Attached. - // That will fail on Broken tenants. - // Console can then rollback the attach, or, wait for operator to fix the Broken tenant. - Self::Broken { .. } => Attached, + // From Console's perspective there's no practical difference + // because attachment_status is polled by console only during attach operation execution. + Self::Broken { reason, .. } => Failed { + reason: reason.to_owned(), + }, // Why is Stopping a Maybe case? Because, during pageserver shutdown, // we set the Stopping state irrespective of whether the tenant // has finished attaching or not. @@ -312,10 +311,11 @@ impl std::ops::Deref for TenantAttachConfig { /// See [`TenantState::attachment_status`] and the OpenAPI docs for context. #[derive(Serialize, Deserialize, Clone)] -#[serde(rename_all = "snake_case")] +#[serde(tag = "slug", content = "data", rename_all = "snake_case")] pub enum TenantAttachmentStatus { Maybe, Attached, + Failed { reason: String }, } #[serde_as] @@ -809,7 +809,9 @@ mod tests { "slug": "Active", }, "current_physical_size": 42, - "attachment_status": "attached", + "attachment_status": { + "slug":"attached", + } }); let original_broken = TenantInfo { @@ -831,7 +833,9 @@ mod tests { } }, "current_physical_size": 42, - "attachment_status": "attached", + "attachment_status": { + "slug":"attached", + } }); assert_eq!( diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 0d912c95e0..1f8298ca3e 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -928,12 +928,28 @@ components: writing to the tenant's S3 state, so, DO NOT ATTACH the tenant to any other pageserver, or we risk split-brain. - `attached` means that the attach operation has completed, - maybe successfully, maybe not. Perform a health check at - the Postgres level to determine healthiness of the tenant. + successfully + - `failed` means that attach has failed. For reason check corresponding `reason` failed. + `failed` is the terminal state, retrying attach call wont resolve the issue. + For example this can be caused by s3 being unreachable. The retry may be implemented + with call to detach, though it would be better to not automate it and inspec failed state + manually before proceeding with a retry. See the tenant `/attach` endpoint for more information. - type: string - enum: [ "maybe", "attached" ] + type: object + required: + - slug + - data + properties: + slug: + type: string + enum: [ "maybe", "attached", "failed" ] + data: + - type: object + properties: + reason: + type: string + TenantCreateRequest: allOf: - $ref: '#/components/schemas/TenantConfig' diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index c558387413..d7ffa633fd 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -1,5 +1,5 @@ import time -from typing import Optional +from typing import Any, Dict, Optional from fixtures.log_helper import log from fixtures.pageserver.http import PageserverHttpClient @@ -72,7 +72,7 @@ def wait_until_tenant_state( expected_state: str, iterations: int, period: float = 1.0, -) -> bool: +) -> Dict[str, Any]: """ Does not use `wait_until` for debugging purposes """ @@ -81,7 +81,7 @@ def wait_until_tenant_state( tenant = pageserver_http.tenant_status(tenant_id=tenant_id) log.debug(f"Tenant {tenant_id} data: {tenant}") if tenant["state"]["slug"] == expected_state: - return True + return tenant except Exception as e: log.debug(f"Tenant {tenant_id} state retrieval failure: {e}") diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index baef8ecacc..742dbfff95 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -147,7 +147,12 @@ def test_remote_storage_backup_and_restore( # listing the remote timelines will fail because of the failpoint, # and the tenant will be marked as Broken. client.tenant_attach(tenant_id) - wait_until_tenant_state(pageserver_http, tenant_id, "Broken", 15) + + tenant_info = wait_until_tenant_state(pageserver_http, tenant_id, "Broken", 15) + assert tenant_info["attachment_status"] == { + "slug": "failed", + "data": {"reason": "storage-sync-list-remote-timelines"}, + } # Ensure that even though the tenant is broken, we can't attach it again. with pytest.raises(Exception, match=f"tenant {tenant_id} already exists, state: Broken"): diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 9d0fdcfaf8..2a015d5d17 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -532,7 +532,7 @@ def test_ignored_tenant_reattach( ): neon_env_builder.enable_remote_storage( remote_storage_kind=remote_storage_kind, - test_name="test_remote_storage_backup_and_restore", + test_name="test_ignored_tenant_reattach", ) env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client()