mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-07 13:32:57 +00:00
map TenantState::Broken to TenantAttachmentStatus::Failed (#4371)
## Problem Attach failures are not reported in public part of the api (in `attachment_status` field of TenantInfo). ## Summary of changes Expose TenantState::Broken as TenantAttachmentStatus::Failed In the way its written Failed status will be reported even if no attachment happened. (I e if tenant become broken on startup). This is in line with other members. I e Active will be resolved to Attached even if no actual attach took place. This can be tweaked if needed. At the current stage it would be overengineering without clear motivation resolves #4344
This commit is contained in:
@@ -110,12 +110,11 @@ impl TenantState {
|
||||
Self::Active => Attached,
|
||||
// If the (initial or resumed) attach procedure fails, the tenant becomes Broken.
|
||||
// However, it also becomes Broken if the regular load fails.
|
||||
// We would need a separate TenantState variant to distinguish these cases.
|
||||
// However, there's no practical difference from Console's perspective.
|
||||
// It will run a Postgres-level health check as soon as it observes Attached.
|
||||
// That will fail on Broken tenants.
|
||||
// Console can then rollback the attach, or, wait for operator to fix the Broken tenant.
|
||||
Self::Broken { .. } => Attached,
|
||||
// From Console's perspective there's no practical difference
|
||||
// because attachment_status is polled by console only during attach operation execution.
|
||||
Self::Broken { reason, .. } => Failed {
|
||||
reason: reason.to_owned(),
|
||||
},
|
||||
// Why is Stopping a Maybe case? Because, during pageserver shutdown,
|
||||
// we set the Stopping state irrespective of whether the tenant
|
||||
// has finished attaching or not.
|
||||
@@ -312,10 +311,11 @@ impl std::ops::Deref for TenantAttachConfig {
|
||||
|
||||
/// See [`TenantState::attachment_status`] and the OpenAPI docs for context.
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
#[serde(tag = "slug", content = "data", rename_all = "snake_case")]
|
||||
pub enum TenantAttachmentStatus {
|
||||
Maybe,
|
||||
Attached,
|
||||
Failed { reason: String },
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
@@ -809,7 +809,9 @@ mod tests {
|
||||
"slug": "Active",
|
||||
},
|
||||
"current_physical_size": 42,
|
||||
"attachment_status": "attached",
|
||||
"attachment_status": {
|
||||
"slug":"attached",
|
||||
}
|
||||
});
|
||||
|
||||
let original_broken = TenantInfo {
|
||||
@@ -831,7 +833,9 @@ mod tests {
|
||||
}
|
||||
},
|
||||
"current_physical_size": 42,
|
||||
"attachment_status": "attached",
|
||||
"attachment_status": {
|
||||
"slug":"attached",
|
||||
}
|
||||
});
|
||||
|
||||
assert_eq!(
|
||||
|
||||
@@ -928,12 +928,28 @@ components:
|
||||
writing to the tenant's S3 state, so, DO NOT ATTACH the
|
||||
tenant to any other pageserver, or we risk split-brain.
|
||||
- `attached` means that the attach operation has completed,
|
||||
maybe successfully, maybe not. Perform a health check at
|
||||
the Postgres level to determine healthiness of the tenant.
|
||||
successfully
|
||||
- `failed` means that attach has failed. For reason check corresponding `reason` failed.
|
||||
`failed` is the terminal state, retrying attach call wont resolve the issue.
|
||||
For example this can be caused by s3 being unreachable. The retry may be implemented
|
||||
with call to detach, though it would be better to not automate it and inspec failed state
|
||||
manually before proceeding with a retry.
|
||||
|
||||
See the tenant `/attach` endpoint for more information.
|
||||
type: string
|
||||
enum: [ "maybe", "attached" ]
|
||||
type: object
|
||||
required:
|
||||
- slug
|
||||
- data
|
||||
properties:
|
||||
slug:
|
||||
type: string
|
||||
enum: [ "maybe", "attached", "failed" ]
|
||||
data:
|
||||
- type: object
|
||||
properties:
|
||||
reason:
|
||||
type: string
|
||||
|
||||
TenantCreateRequest:
|
||||
allOf:
|
||||
- $ref: '#/components/schemas/TenantConfig'
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
import time
|
||||
from typing import Optional
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.pageserver.http import PageserverHttpClient
|
||||
@@ -72,7 +72,7 @@ def wait_until_tenant_state(
|
||||
expected_state: str,
|
||||
iterations: int,
|
||||
period: float = 1.0,
|
||||
) -> bool:
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Does not use `wait_until` for debugging purposes
|
||||
"""
|
||||
@@ -81,7 +81,7 @@ def wait_until_tenant_state(
|
||||
tenant = pageserver_http.tenant_status(tenant_id=tenant_id)
|
||||
log.debug(f"Tenant {tenant_id} data: {tenant}")
|
||||
if tenant["state"]["slug"] == expected_state:
|
||||
return True
|
||||
return tenant
|
||||
except Exception as e:
|
||||
log.debug(f"Tenant {tenant_id} state retrieval failure: {e}")
|
||||
|
||||
|
||||
@@ -147,7 +147,12 @@ def test_remote_storage_backup_and_restore(
|
||||
# listing the remote timelines will fail because of the failpoint,
|
||||
# and the tenant will be marked as Broken.
|
||||
client.tenant_attach(tenant_id)
|
||||
wait_until_tenant_state(pageserver_http, tenant_id, "Broken", 15)
|
||||
|
||||
tenant_info = wait_until_tenant_state(pageserver_http, tenant_id, "Broken", 15)
|
||||
assert tenant_info["attachment_status"] == {
|
||||
"slug": "failed",
|
||||
"data": {"reason": "storage-sync-list-remote-timelines"},
|
||||
}
|
||||
|
||||
# Ensure that even though the tenant is broken, we can't attach it again.
|
||||
with pytest.raises(Exception, match=f"tenant {tenant_id} already exists, state: Broken"):
|
||||
|
||||
@@ -532,7 +532,7 @@ def test_ignored_tenant_reattach(
|
||||
):
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_remote_storage_backup_and_restore",
|
||||
test_name="test_ignored_tenant_reattach",
|
||||
)
|
||||
env = neon_env_builder.init_start()
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
|
||||
Reference in New Issue
Block a user