diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 162bf6b294..ddce82324c 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -110,12 +110,11 @@ impl TenantState { Self::Active => Attached, // If the (initial or resumed) attach procedure fails, the tenant becomes Broken. // However, it also becomes Broken if the regular load fails. - // We would need a separate TenantState variant to distinguish these cases. - // However, there's no practical difference from Console's perspective. - // It will run a Postgres-level health check as soon as it observes Attached. - // That will fail on Broken tenants. - // Console can then rollback the attach, or, wait for operator to fix the Broken tenant. - Self::Broken { .. } => Attached, + // From Console's perspective there's no practical difference + // because attachment_status is polled by console only during attach operation execution. + Self::Broken { reason, .. } => Failed { + reason: reason.to_owned(), + }, // Why is Stopping a Maybe case? Because, during pageserver shutdown, // we set the Stopping state irrespective of whether the tenant // has finished attaching or not. @@ -312,10 +311,11 @@ impl std::ops::Deref for TenantAttachConfig { /// See [`TenantState::attachment_status`] and the OpenAPI docs for context. #[derive(Serialize, Deserialize, Clone)] -#[serde(rename_all = "snake_case")] +#[serde(tag = "slug", content = "data", rename_all = "snake_case")] pub enum TenantAttachmentStatus { Maybe, Attached, + Failed { reason: String }, } #[serde_as] @@ -809,7 +809,9 @@ mod tests { "slug": "Active", }, "current_physical_size": 42, - "attachment_status": "attached", + "attachment_status": { + "slug":"attached", + } }); let original_broken = TenantInfo { @@ -831,7 +833,9 @@ mod tests { } }, "current_physical_size": 42, - "attachment_status": "attached", + "attachment_status": { + "slug":"attached", + } }); assert_eq!( diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 0d912c95e0..1f8298ca3e 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -928,12 +928,28 @@ components: writing to the tenant's S3 state, so, DO NOT ATTACH the tenant to any other pageserver, or we risk split-brain. - `attached` means that the attach operation has completed, - maybe successfully, maybe not. Perform a health check at - the Postgres level to determine healthiness of the tenant. + successfully + - `failed` means that attach has failed. For reason check corresponding `reason` failed. + `failed` is the terminal state, retrying attach call wont resolve the issue. + For example this can be caused by s3 being unreachable. The retry may be implemented + with call to detach, though it would be better to not automate it and inspec failed state + manually before proceeding with a retry. See the tenant `/attach` endpoint for more information. - type: string - enum: [ "maybe", "attached" ] + type: object + required: + - slug + - data + properties: + slug: + type: string + enum: [ "maybe", "attached", "failed" ] + data: + - type: object + properties: + reason: + type: string + TenantCreateRequest: allOf: - $ref: '#/components/schemas/TenantConfig' diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index c558387413..d7ffa633fd 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -1,5 +1,5 @@ import time -from typing import Optional +from typing import Any, Dict, Optional from fixtures.log_helper import log from fixtures.pageserver.http import PageserverHttpClient @@ -72,7 +72,7 @@ def wait_until_tenant_state( expected_state: str, iterations: int, period: float = 1.0, -) -> bool: +) -> Dict[str, Any]: """ Does not use `wait_until` for debugging purposes """ @@ -81,7 +81,7 @@ def wait_until_tenant_state( tenant = pageserver_http.tenant_status(tenant_id=tenant_id) log.debug(f"Tenant {tenant_id} data: {tenant}") if tenant["state"]["slug"] == expected_state: - return True + return tenant except Exception as e: log.debug(f"Tenant {tenant_id} state retrieval failure: {e}") diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index baef8ecacc..742dbfff95 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -147,7 +147,12 @@ def test_remote_storage_backup_and_restore( # listing the remote timelines will fail because of the failpoint, # and the tenant will be marked as Broken. client.tenant_attach(tenant_id) - wait_until_tenant_state(pageserver_http, tenant_id, "Broken", 15) + + tenant_info = wait_until_tenant_state(pageserver_http, tenant_id, "Broken", 15) + assert tenant_info["attachment_status"] == { + "slug": "failed", + "data": {"reason": "storage-sync-list-remote-timelines"}, + } # Ensure that even though the tenant is broken, we can't attach it again. with pytest.raises(Exception, match=f"tenant {tenant_id} already exists, state: Broken"): diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 9d0fdcfaf8..2a015d5d17 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -532,7 +532,7 @@ def test_ignored_tenant_reattach( ): neon_env_builder.enable_remote_storage( remote_storage_kind=remote_storage_kind, - test_name="test_remote_storage_backup_and_restore", + test_name="test_ignored_tenant_reattach", ) env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client()