From e0c12faabda2877171ef80a661a4ba894cf665dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 19 Feb 2024 17:27:02 +0100 Subject: [PATCH] Allow initdb preservation for broken tenants (#6790) Often times the tenants we want to (WAL) DR are the ones which the pageserver marks as broken. Therefore, we should allow initdb preservation also for broken tenants. Fixes #6781. --- pageserver/src/http/routes.rs | 2 +- test_runner/fixtures/pageserver/utils.py | 26 ++++++++++++-- test_runner/regress/test_wal_restore.py | 43 +++++++++++++++++++++--- 3 files changed, 63 insertions(+), 8 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 107eed6801..175353762c 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -622,7 +622,7 @@ async fn timeline_preserve_initdb_handler( // location where timeline recreation cand find it. async { - let tenant = mgr::get_tenant(tenant_shard_id, true)?; + let tenant = mgr::get_tenant(tenant_shard_id, false)?; let timeline = tenant .get_timeline(timeline_id, false) diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index c2281ae25a..201a34f964 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -2,6 +2,7 @@ import time from typing import Any, Dict, List, Optional, Union from mypy_boto3_s3.type_defs import ( + DeleteObjectOutputTypeDef, EmptyResponseMetadataTypeDef, ListObjectsV2OutputTypeDef, ObjectTypeDef, @@ -331,7 +332,6 @@ def list_prefix( """ # For local_fs we need to properly handle empty directories, which we currently dont, so for simplicity stick to s3 api. assert isinstance(remote, S3Storage), "localfs is currently not supported" - assert remote.client is not None prefix_in_bucket = remote.prefix_in_bucket or "" if not prefix: @@ -350,6 +350,29 @@ def list_prefix( return response +def remote_storage_delete_key( + remote: RemoteStorage, + key: str, +) -> DeleteObjectOutputTypeDef: + """ + Note that this function takes into account prefix_in_bucket. + """ + # For local_fs we need to use a different implementation. As we don't need local_fs, just don't support it for now. + assert isinstance(remote, S3Storage), "localfs is currently not supported" + + prefix_in_bucket = remote.prefix_in_bucket or "" + + # real s3 tests have uniqie per test prefix + # mock_s3 tests use special pageserver prefix for pageserver stuff + key = "/".join((prefix_in_bucket, key)) + + response = remote.client.delete_object( + Bucket=remote.bucket_name, + Key=key, + ) + return response + + def enable_remote_storage_versioning( remote: RemoteStorage, ) -> EmptyResponseMetadataTypeDef: @@ -358,7 +381,6 @@ def enable_remote_storage_versioning( """ # local_fs has no support for versioning assert isinstance(remote, S3Storage), "localfs is currently not supported" - assert remote.client is not None # The SDK supports enabling versioning on normal S3 as well but we don't want to change # these settings from a test in a live bucket (also, our access isn't enough nor should it be) diff --git a/test_runner/regress/test_wal_restore.py b/test_runner/regress/test_wal_restore.py index 97db857c74..083a259d85 100644 --- a/test_runner/regress/test_wal_restore.py +++ b/test_runner/regress/test_wal_restore.py @@ -2,6 +2,7 @@ import sys import tarfile import tempfile from pathlib import Path +from typing import List import pytest import zstandard @@ -11,10 +12,17 @@ from fixtures.neon_fixtures import ( PgBin, VanillaPostgres, ) -from fixtures.pageserver.utils import timeline_delete_wait_completed +from fixtures.pageserver.utils import ( + list_prefix, + remote_storage_delete_key, + timeline_delete_wait_completed, +) from fixtures.port_distributor import PortDistributor -from fixtures.remote_storage import LocalFsStorage +from fixtures.remote_storage import LocalFsStorage, S3Storage, s3_storage from fixtures.types import Lsn, TenantId, TimelineId +from mypy_boto3_s3.type_defs import ( + ObjectTypeDef, +) @pytest.mark.skipif( @@ -128,7 +136,11 @@ def test_wal_restore_initdb( assert restored.safe_psql("select count(*) from t", user="cloud_admin") == [(300000,)] -def test_wal_restore_http(neon_env_builder: NeonEnvBuilder): +@pytest.mark.parametrize("broken_tenant", [True, False]) +def test_wal_restore_http(neon_env_builder: NeonEnvBuilder, broken_tenant: bool): + remote_storage_kind = s3_storage() + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) + env = neon_env_builder.init_start() endpoint = env.endpoints.create_start("main") endpoint.safe_psql("create table t as select generate_series(1,300000)") @@ -137,15 +149,36 @@ def test_wal_restore_http(neon_env_builder: NeonEnvBuilder): ps_client = env.pageserver.http_client() + if broken_tenant: + env.pageserver.allowed_errors.append( + r".* Changing Active tenant to Broken state, reason: broken from test" + ) + ps_client.tenant_break(tenant_id) + # Mark the initdb archive for preservation ps_client.timeline_preserve_initdb_archive(tenant_id, timeline_id) # shut down the endpoint and delete the timeline from the pageserver endpoint.stop() - assert isinstance(env.pageserver_remote_storage, LocalFsStorage) + assert isinstance(env.pageserver_remote_storage, S3Storage) - timeline_delete_wait_completed(ps_client, tenant_id, timeline_id) + if broken_tenant: + ps_client.tenant_detach(tenant_id) + objects: List[ObjectTypeDef] = list_prefix( + env.pageserver_remote_storage, f"tenants/{tenant_id}/timelines/{timeline_id}/" + ).get("Contents", []) + for obj in objects: + obj_key = obj["Key"] + if "initdb-preserved.tar.zst" in obj_key: + continue + log.info(f"Deleting key from remote storage: {obj_key}") + remote_storage_delete_key(env.pageserver_remote_storage, obj_key) + pass + + ps_client.tenant_attach(tenant_id, generation=10) + else: + timeline_delete_wait_completed(ps_client, tenant_id, timeline_id) # issue the restoration command ps_client.timeline_create(