mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-09 06:22:57 +00:00
Allow initdb preservation for broken tenants (#6790)
Oftentimes, the tenants we want to (WAL) DR are the ones that the pageserver marks as broken. Therefore, we should allow initdb preservation for broken tenants as well. Fixes #6781.
This commit is contained in:
@@ -622,7 +622,7 @@ async fn timeline_preserve_initdb_handler(
|
||||
// location where timeline recreation can find it.
|
||||
|
||||
async {
|
||||
let tenant = mgr::get_tenant(tenant_shard_id, true)?;
|
||||
let tenant = mgr::get_tenant(tenant_shard_id, false)?;
|
||||
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, false)
|
||||
|
||||
@@ -2,6 +2,7 @@ import time
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
|
||||
from mypy_boto3_s3.type_defs import (
|
||||
DeleteObjectOutputTypeDef,
|
||||
EmptyResponseMetadataTypeDef,
|
||||
ListObjectsV2OutputTypeDef,
|
||||
ObjectTypeDef,
|
||||
@@ -331,7 +332,6 @@ def list_prefix(
|
||||
"""
|
||||
# For local_fs we need to properly handle empty directories, which we currently don't, so for simplicity stick to s3 api.
|
||||
assert isinstance(remote, S3Storage), "localfs is currently not supported"
|
||||
assert remote.client is not None
|
||||
|
||||
prefix_in_bucket = remote.prefix_in_bucket or ""
|
||||
if not prefix:
|
||||
@@ -350,6 +350,29 @@ def list_prefix(
|
||||
return response
|
||||
|
||||
|
||||
def remote_storage_delete_key(
    remote: RemoteStorage,
    key: str,
) -> DeleteObjectOutputTypeDef:
    """
    Delete a single object from the remote storage bucket.

    Note that this function takes into account prefix_in_bucket: `key` is
    treated as relative to it, and the prefix is prepended before the delete
    request is issued.

    Only S3-backed remote storage is supported. The response of the
    underlying `delete_object` call is returned unchanged.
    """
    # For local_fs we need to use a different implementation. As we don't need local_fs, just don't support it for now.
    assert isinstance(remote, S3Storage), "localfs is currently not supported"
    assert remote.client is not None

    prefix_in_bucket = remote.prefix_in_bucket or ""

    # real s3 tests have unique per test prefix
    # mock_s3 tests use special pageserver prefix for pageserver stuff
    if prefix_in_bucket:
        # Only join when a prefix is configured; joining with an empty prefix
        # would produce a key with a spurious leading "/".
        key = "/".join((prefix_in_bucket, key))

    return remote.client.delete_object(
        Bucket=remote.bucket_name,
        Key=key,
    )
|
||||
|
||||
|
||||
def enable_remote_storage_versioning(
|
||||
remote: RemoteStorage,
|
||||
) -> EmptyResponseMetadataTypeDef:
|
||||
@@ -358,7 +381,6 @@ def enable_remote_storage_versioning(
|
||||
"""
|
||||
# local_fs has no support for versioning
|
||||
assert isinstance(remote, S3Storage), "localfs is currently not supported"
|
||||
assert remote.client is not None
|
||||
|
||||
# The SDK supports enabling versioning on normal S3 as well but we don't want to change
|
||||
# these settings from a test in a live bucket (also, our access isn't enough nor should it be)
|
||||
|
||||
@@ -2,6 +2,7 @@ import sys
|
||||
import tarfile
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
import pytest
|
||||
import zstandard
|
||||
@@ -11,10 +12,17 @@ from fixtures.neon_fixtures import (
|
||||
PgBin,
|
||||
VanillaPostgres,
|
||||
)
|
||||
from fixtures.pageserver.utils import timeline_delete_wait_completed
|
||||
from fixtures.pageserver.utils import (
|
||||
list_prefix,
|
||||
remote_storage_delete_key,
|
||||
timeline_delete_wait_completed,
|
||||
)
|
||||
from fixtures.port_distributor import PortDistributor
|
||||
from fixtures.remote_storage import LocalFsStorage
|
||||
from fixtures.remote_storage import LocalFsStorage, S3Storage, s3_storage
|
||||
from fixtures.types import Lsn, TenantId, TimelineId
|
||||
from mypy_boto3_s3.type_defs import (
|
||||
ObjectTypeDef,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
@@ -128,7 +136,11 @@ def test_wal_restore_initdb(
|
||||
assert restored.safe_psql("select count(*) from t", user="cloud_admin") == [(300000,)]
|
||||
|
||||
|
||||
def test_wal_restore_http(neon_env_builder: NeonEnvBuilder):
|
||||
@pytest.mark.parametrize("broken_tenant", [True, False])
|
||||
def test_wal_restore_http(neon_env_builder: NeonEnvBuilder, broken_tenant: bool):
|
||||
remote_storage_kind = s3_storage()
|
||||
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
endpoint = env.endpoints.create_start("main")
|
||||
endpoint.safe_psql("create table t as select generate_series(1,300000)")
|
||||
@@ -137,15 +149,36 @@ def test_wal_restore_http(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
ps_client = env.pageserver.http_client()
|
||||
|
||||
if broken_tenant:
|
||||
env.pageserver.allowed_errors.append(
|
||||
r".* Changing Active tenant to Broken state, reason: broken from test"
|
||||
)
|
||||
ps_client.tenant_break(tenant_id)
|
||||
|
||||
# Mark the initdb archive for preservation
|
||||
ps_client.timeline_preserve_initdb_archive(tenant_id, timeline_id)
|
||||
|
||||
# shut down the endpoint and delete the timeline from the pageserver
|
||||
endpoint.stop()
|
||||
|
||||
assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
|
||||
assert isinstance(env.pageserver_remote_storage, S3Storage)
|
||||
|
||||
timeline_delete_wait_completed(ps_client, tenant_id, timeline_id)
|
||||
if broken_tenant:
|
||||
ps_client.tenant_detach(tenant_id)
|
||||
objects: List[ObjectTypeDef] = list_prefix(
|
||||
env.pageserver_remote_storage, f"tenants/{tenant_id}/timelines/{timeline_id}/"
|
||||
).get("Contents", [])
|
||||
for obj in objects:
|
||||
obj_key = obj["Key"]
|
||||
if "initdb-preserved.tar.zst" in obj_key:
|
||||
continue
|
||||
log.info(f"Deleting key from remote storage: {obj_key}")
|
||||
remote_storage_delete_key(env.pageserver_remote_storage, obj_key)
|
||||
pass
|
||||
|
||||
ps_client.tenant_attach(tenant_id, generation=10)
|
||||
else:
|
||||
timeline_delete_wait_completed(ps_client, tenant_id, timeline_id)
|
||||
|
||||
# issue the restoration command
|
||||
ps_client.timeline_create(
|
||||
|
||||
Reference in New Issue
Block a user