Allow initdb preservation for broken tenants (#6790)

Oftentimes the tenants we want to apply WAL-based disaster recovery (DR) to
are exactly the ones the pageserver marks as broken. Therefore, we should
allow initdb preservation also for broken tenants.

Fixes #6781.
This commit is contained in:
Arpad Müller
2024-02-19 17:27:02 +01:00
committed by GitHub
parent 2f8a2681b8
commit e0c12faabd
3 changed files with 63 additions and 8 deletions

View File

@@ -622,7 +622,7 @@ async fn timeline_preserve_initdb_handler(
// location where timeline recreation can find it.
async {
let tenant = mgr::get_tenant(tenant_shard_id, true)?;
let tenant = mgr::get_tenant(tenant_shard_id, false)?;
let timeline = tenant
.get_timeline(timeline_id, false)

View File

@@ -2,6 +2,7 @@ import time
from typing import Any, Dict, List, Optional, Union
from mypy_boto3_s3.type_defs import (
DeleteObjectOutputTypeDef,
EmptyResponseMetadataTypeDef,
ListObjectsV2OutputTypeDef,
ObjectTypeDef,
@@ -331,7 +332,6 @@ def list_prefix(
"""
# For local_fs we need to properly handle empty directories, which we currently dont, so for simplicity stick to s3 api.
assert isinstance(remote, S3Storage), "localfs is currently not supported"
assert remote.client is not None
prefix_in_bucket = remote.prefix_in_bucket or ""
if not prefix:
@@ -350,6 +350,29 @@ def list_prefix(
return response
def remote_storage_delete_key(
    remote: RemoteStorage,
    key: str,
) -> DeleteObjectOutputTypeDef:
    """
    Delete a single object from remote storage.

    Note that this function takes into account prefix_in_bucket.
    """
    # local_fs would need a different implementation; it is not needed here,
    # so only the S3 backend is supported for now.
    assert isinstance(remote, S3Storage), "localfs is currently not supported"
    # Real S3 tests use a unique per-test prefix; mock_s3 tests use a special
    # pageserver prefix for pageserver data.
    full_key = "/".join((remote.prefix_in_bucket or "", key))
    return remote.client.delete_object(
        Bucket=remote.bucket_name,
        Key=full_key,
    )
def enable_remote_storage_versioning(
remote: RemoteStorage,
) -> EmptyResponseMetadataTypeDef:
@@ -358,7 +381,6 @@ def enable_remote_storage_versioning(
"""
# local_fs has no support for versioning
assert isinstance(remote, S3Storage), "localfs is currently not supported"
assert remote.client is not None
# The SDK supports enabling versioning on normal S3 as well but we don't want to change
# these settings from a test in a live bucket (also, our access isn't enough nor should it be)

View File

@@ -2,6 +2,7 @@ import sys
import tarfile
import tempfile
from pathlib import Path
from typing import List
import pytest
import zstandard
@@ -11,10 +12,17 @@ from fixtures.neon_fixtures import (
PgBin,
VanillaPostgres,
)
from fixtures.pageserver.utils import timeline_delete_wait_completed
from fixtures.pageserver.utils import (
list_prefix,
remote_storage_delete_key,
timeline_delete_wait_completed,
)
from fixtures.port_distributor import PortDistributor
from fixtures.remote_storage import LocalFsStorage
from fixtures.remote_storage import LocalFsStorage, S3Storage, s3_storage
from fixtures.types import Lsn, TenantId, TimelineId
from mypy_boto3_s3.type_defs import (
ObjectTypeDef,
)
@pytest.mark.skipif(
@@ -128,7 +136,11 @@ def test_wal_restore_initdb(
assert restored.safe_psql("select count(*) from t", user="cloud_admin") == [(300000,)]
def test_wal_restore_http(neon_env_builder: NeonEnvBuilder):
@pytest.mark.parametrize("broken_tenant", [True, False])
def test_wal_restore_http(neon_env_builder: NeonEnvBuilder, broken_tenant: bool):
remote_storage_kind = s3_storage()
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
env = neon_env_builder.init_start()
endpoint = env.endpoints.create_start("main")
endpoint.safe_psql("create table t as select generate_series(1,300000)")
@@ -137,15 +149,36 @@ def test_wal_restore_http(neon_env_builder: NeonEnvBuilder):
ps_client = env.pageserver.http_client()
if broken_tenant:
env.pageserver.allowed_errors.append(
r".* Changing Active tenant to Broken state, reason: broken from test"
)
ps_client.tenant_break(tenant_id)
# Mark the initdb archive for preservation
ps_client.timeline_preserve_initdb_archive(tenant_id, timeline_id)
# shut down the endpoint and delete the timeline from the pageserver
endpoint.stop()
assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
assert isinstance(env.pageserver_remote_storage, S3Storage)
timeline_delete_wait_completed(ps_client, tenant_id, timeline_id)
if broken_tenant:
ps_client.tenant_detach(tenant_id)
objects: List[ObjectTypeDef] = list_prefix(
env.pageserver_remote_storage, f"tenants/{tenant_id}/timelines/{timeline_id}/"
).get("Contents", [])
for obj in objects:
obj_key = obj["Key"]
if "initdb-preserved.tar.zst" in obj_key:
continue
log.info(f"Deleting key from remote storage: {obj_key}")
remote_storage_delete_key(env.pageserver_remote_storage, obj_key)
pass
ps_client.tenant_attach(tenant_id, generation=10)
else:
timeline_delete_wait_completed(ps_client, tenant_id, timeline_id)
# issue the restoration command
ps_client.timeline_create(