neon/test_runner/regress/test_tenant_delete.py
Dmitrii Kovalkov 60dfdf39c7 tests: prepare test_tenant_delete_stale_shards for --timelines-onto-safekeepers (#12198)
## Problem
The test creates an endpoint and deletes its tenant. The compute cannot
stop gracefully because it tries to write a shutdown checkpoint record
into the WAL, but the timeline has already been deleted from the
safekeepers.

- Relates to https://github.com/neondatabase/neon/pull/11712

## Summary of changes
Stop the compute before deleting the tenant.
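
The fix, as a minimal sketch built from this test file's own fixtures (not a verbatim excerpt from the diff):

```python
# Stop the compute first so it can still write its shutdown checkpoint
# record to the WAL, then delete the tenant.
workload = Workload(env, tenant_id, timeline_id, branch_name="main")
workload.init()
workload.write_rows(256)
workload.stop()  # graceful compute shutdown while timelines still exist on safekeepers
env.storage_controller.pageserver_api().tenant_delete(tenant_id=tenant_id)
```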
2025-06-12 08:10:22 +00:00


from __future__ import annotations

import json
from concurrent.futures import ThreadPoolExecutor
from threading import Thread
from typing import TYPE_CHECKING

import pytest
from fixtures.common_types import Lsn, TenantId, TimelineId
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
    NeonEnvBuilder,
    PgBin,
    wait_for_last_flush_lsn,
)
from fixtures.pageserver.http import PageserverApiException
from fixtures.pageserver.utils import (
    assert_prefix_empty,
    assert_prefix_not_empty,
    many_small_layers_tenant_config,
    wait_for_upload,
)
from fixtures.remote_storage import RemoteStorageKind, s3_storage
from fixtures.utils import run_pg_bench_small, wait_until
from fixtures.workload import Workload
from requests.exceptions import ReadTimeout
from werkzeug.wrappers.response import Response

if TYPE_CHECKING:
    from werkzeug.wrappers.request import Request


def error_tolerant_delete(ps_http, tenant_id):
    """
    For tests that inject 500 errors, we must retry repeatedly when issuing deletions
    """
    while True:
        try:
            ps_http.tenant_delete(tenant_id=tenant_id)
        except PageserverApiException as e:
            if e.status_code == 500:
                # This test uses failure injection, which can produce 500s as the pageserver expects
                # the object store to always be available, and the ListObjects during deletion is
                # generally an infallible operation. This can show up as a clear simulated error,
                # or as a general error during delete_objects()
                assert (
                    "simulated failure of remote operation" in e.message
                    or "failed to delete" in e.message
                )
            else:
                raise
        else:
            # Success, drop out
            break


def test_tenant_delete_smoke(
    neon_env_builder: NeonEnvBuilder,
    pg_bin: PgBin,
):
    neon_env_builder.pageserver_config_override = "test_remote_failures=1"

    remote_storage_kind = s3_storage()
    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)

    env = neon_env_builder.init_start()
    env.pageserver.allowed_errors.extend(
        [
            # The deletion queue will complain when it encounters simulated S3 errors
            ".*deletion executor: DeleteObjects request failed.*",
            # A race with shutdown can interrupt a frozen layer flush before any uploads are scheduled
            ".*layer flush task.+: could not flush frozen layer: update_metadata_file",
        ]
    )

    ps_http = env.pageserver.http_client()

    # First, try to delete a non-existent tenant
    tenant_id = TenantId.generate()
    env.pageserver.allowed_errors.extend(
        [".*NotFound.*", ".*simulated failure.*", ".*failed to delete .+ objects.*"]
    )

    # Check that deleting a non-existent tenant gives the expected result: this is a loop because we
    # may need to retry on some remote storage errors injected by the test harness
    error_tolerant_delete(ps_http, tenant_id)

    env.create_tenant(
        tenant_id=tenant_id,
        conf=many_small_layers_tenant_config(),
    )

    # Default tenant and the one we created
    assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 2

    # Create two timelines, one being the parent of the other
    parent = None
    for timeline in ["first", "second"]:
        timeline_id = env.create_branch(timeline, ancestor_branch_name=parent, tenant_id=tenant_id)
        with env.endpoints.create_start(timeline, tenant_id=tenant_id) as endpoint:
            run_pg_bench_small(pg_bin, endpoint.connstr())
            wait_for_last_flush_lsn(env, endpoint, tenant=tenant_id, timeline=timeline_id)

            assert_prefix_not_empty(
                neon_env_builder.pageserver_remote_storage,
                prefix="/".join(
                    (
                        "tenants",
                        str(tenant_id),
                    )
                ),
            )

        parent = timeline

    # Upload a heatmap so that we exercise deletion of that too
    ps_http.tenant_heatmap_upload(tenant_id)

    assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 2
    error_tolerant_delete(ps_http, tenant_id)
    assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1

    tenant_path = env.pageserver.tenant_dir(tenant_id)
    assert not tenant_path.exists()

    assert_prefix_empty(
        neon_env_builder.pageserver_remote_storage,
        prefix="/".join(
            (
                "tenants",
                str(tenant_id),
            )
        ),
    )

    # Deletion updates the tenant count: the one default tenant remains
    assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1
    assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "inprogress"}) == 0

    env.pageserver.stop()


def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonEnvBuilder):
    """Reproduction of 2023-11-23 stuck tenants investigation"""

    # do not use default tenant/timeline creation because it would output the failpoint log message too early
    env = neon_env_builder.init_configs()
    env.start()
    pageserver_http = env.pageserver.http_client()

    env.pageserver.allowed_errors.extend(
        [
            # the error response seen by hit_pausable_failpoint_and_later_fail below
            f".*Error processing HTTP request: InternalServerError\\(new timeline {env.initial_tenant}/{env.initial_timeline} has invalid disk_consistent_lsn",
        ]
    )

    env.pageserver.tenant_create(env.initial_tenant)

    failpoint = "flush-layer-cancel-after-writing-layer-out-pausable"
    pageserver_http.configure_failpoints((failpoint, "pause"))
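    # The "pause" action blocks the pageserver at this failpoint until it is
    # set back to "off", so the timeline creation below parks mid layer flush.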

    def hit_pausable_failpoint_and_later_fail():
        with pytest.raises(PageserverApiException, match="NotFound: tenant"):
            pageserver_http.timeline_create(
                env.pg_version, env.initial_tenant, env.initial_timeline
            )

    def start_deletion():
        pageserver_http.tenant_delete(env.initial_tenant)

    def has_hit_failpoint():
        assert env.pageserver.log_contains(f"at failpoint {failpoint}") is not None

    def deletion_has_started_waiting_for_timelines():
        assert env.pageserver.log_contains("Waiting for timelines...") is not None

    def tenant_is_deleted():
        try:
            pageserver_http.tenant_status(env.initial_tenant)
        except PageserverApiException as e:
            assert e.status_code == 404
        else:
            raise RuntimeError("tenant was still accessible")
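
    # Sequence: park the timeline creation on the failpoint, start the
    # deletion, wait until it blocks on the stuck timeline, then unpause the
    # failpoint so both can finish.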
    creation = Thread(target=hit_pausable_failpoint_and_later_fail)
    creation.start()

    deletion = None
    try:
        wait_until(has_hit_failpoint)

        # it should start ok, sync up with the stuck creation, then hang waiting for the timeline
        # to shut down.
        deletion = Thread(target=start_deletion)
        deletion.start()

        wait_until(deletion_has_started_waiting_for_timelines)

        pageserver_http.configure_failpoints((failpoint, "off"))

        creation.join()
        deletion.join()

        wait_until(tenant_is_deleted)
    finally:
        creation.join()
        if deletion is not None:
            deletion.join()

    env.pageserver.stop()


def test_tenant_delete_races_timeline_creation(neon_env_builder: NeonEnvBuilder):
    """
    Validate that timeline creation executed in parallel with deletion works correctly.

    This is a reproducer for https://github.com/neondatabase/neon/issues/6255
    """
    # The remote storage kind doesn't really matter, but we use it for the iterations calculation below
    # (and there is no way to reconstruct the used remote storage kind)
    remote_storage_kind = RemoteStorageKind.MOCK_S3
    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
    env = neon_env_builder.init_start(initial_tenant_conf=many_small_layers_tenant_config())
    ps_http = env.pageserver.http_client()
    tenant_id = env.initial_tenant

    # When timeline creation is cancelled by tenant deletion, it happens during Tenant::shutdown(),
    # and acting on a shutting-down tenant generates a 503 response (if the caller retried, they
    # would later get a 404 once the tenant is fully deleted).
    CANCELLED_ERROR = (
        ".*POST.*Cancelled request finished successfully status=503 Service Unavailable"
    )

    # This can occur sometimes.
    CONFLICT_MESSAGE = ".*Precondition failed: Invalid state Stopping. Expected Active or Broken.*"

    env.pageserver.allowed_errors.extend(
        [
            # A race with shutdown can interrupt a frozen layer flush before any uploads are scheduled
            ".*layer flush task.+: could not flush frozen layer: update_metadata_file",
            # We need the http connection close for successful reproduction
            ".*POST.*/timeline.* request was dropped before completing",
            # Timeline creation runs into this error
            CANCELLED_ERROR,
            # Tenant deletion can run into this error
            CONFLICT_MESSAGE,
            ".*tenant_delete_handler.*still waiting, taking longer than expected.*",
        ]
    )

    BEFORE_INITDB_UPLOAD_FAILPOINT = "before-initdb-upload"
    DELETE_BEFORE_CLEANUP_FAILPOINT = "tenant-delete-before-cleanup-remaining-fs-traces-pausable"

    # Wait just before the initdb upload
    ps_http.configure_failpoints((BEFORE_INITDB_UPLOAD_FAILPOINT, "pause"))

    def timeline_create():
        ps_http.timeline_create(env.pg_version, tenant_id, TimelineId.generate(), timeout=1)
        raise RuntimeError("creation succeeded even though it shouldn't")
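
    # timeline_create above passes timeout=1 so that the client drops the HTTP
    # connection while the server-side creation keeps running; that dropped
    # connection is required to reproduce the race (see allowed_errors above).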
    def tenant_delete():
        def tenant_delete_inner():
            ps_http.tenant_delete(tenant_id)

        wait_until(tenant_delete_inner)
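
    # tenant_delete is retried via wait_until because deletion can transiently
    # fail (e.g. with CONFLICT_MESSAGE above) while the cancelled timeline
    # creation is still shutting down.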

    # We will spawn background threads for timeline creation and tenant deletion. They will both
    # get blocked on our failpoint.
    with ThreadPoolExecutor(max_workers=1) as executor:
        create_fut = executor.submit(timeline_create)

        def hit_initdb_upload_failpoint():
            env.pageserver.assert_log_contains(f"at failpoint {BEFORE_INITDB_UPLOAD_FAILPOINT}")

        wait_until(hit_initdb_upload_failpoint)

        def creation_connection_timed_out():
            env.pageserver.assert_log_contains(
                "POST.*/timeline.* request was dropped before completing"
            )

        # Wait so that we hit the timeout and the connection is dropped
        # (But timeline creation still continues)
        wait_until(creation_connection_timed_out)

        with pytest.raises(ReadTimeout):
            # Our creation failed from the client's point of view.
            create_fut.result()

        ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "pause"))

        delete_fut = executor.submit(tenant_delete)

        def deletion_arrived():
            env.pageserver.assert_log_contains(
                f"cfg failpoint: {DELETE_BEFORE_CLEANUP_FAILPOINT} pause"
            )

        wait_until(deletion_arrived)

        ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "off"))

        # Disable the failpoint and wait for deletion to finish
        ps_http.configure_failpoints((BEFORE_INITDB_UPLOAD_FAILPOINT, "off"))

        delete_fut.result()

    # Physical deletion should have happened
    assert_prefix_empty(
        neon_env_builder.pageserver_remote_storage,
        prefix="/".join(
            (
                "tenants",
                str(tenant_id),
            )
        ),
    )

    # Ensure that the creation was cancelled and the deletion didn't end up in a broken state
    # or encounter the leftover temp file
    env.pageserver.assert_log_contains(CANCELLED_ERROR)
    assert not env.pageserver.log_contains(
        ".*ERROR.*delete_tenant.*Timelines directory is not empty after all timelines deletion"
    )

    # Zero tenants remain (we deleted the default tenant)
    assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 0

    # We deleted our only tenant, and the scrubber fails if it detects nothing
    neon_env_builder.disable_scrub_on_exit()

    env.pageserver.stop()


def test_tenant_delete_scrubber(pg_bin: PgBin, make_httpserver, neon_env_builder: NeonEnvBuilder):
    """
    Validate that creating and then deleting the tenant both survive the scrubber,
    and that one can run the scrubber without problems.
    """
    remote_storage_kind = RemoteStorageKind.MOCK_S3
    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
    env = neon_env_builder.init_start(initial_tenant_conf=many_small_layers_tenant_config())
    ps_http = env.pageserver.http_client()

    # Create a tenant separate from the main tenant so that we have one remaining
    # after we delete it, as the scrubber treats empty buckets as an error.
    (tenant_id, timeline_id) = env.create_tenant()
    with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
        run_pg_bench_small(pg_bin, endpoint.connstr())
        last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
    ps_http.timeline_checkpoint(tenant_id, timeline_id)
    wait_for_upload(ps_http, tenant_id, timeline_id, last_flush_lsn)
    env.stop()

    healthy, _ = env.storage_scrubber.scan_metadata()
    assert healthy
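
    # Expected per-timeline LSNs that the scrubber's safekeeper scan
    # (scan_metadata_safekeeper below) validates against.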
    timeline_lsns = {
        "tenant_id": f"{tenant_id}",
        "timeline_id": f"{timeline_id}",
        "timeline_start_lsn": f"{last_flush_lsn}",
        "backup_lsn": f"{last_flush_lsn}",
    }

    cloud_admin_url = f"http://{make_httpserver.host}:{make_httpserver.port}/"
    cloud_admin_token = ""

    def get_branches(request: Request):
        # Compare definition with `BranchData` struct
        dummy_data = {
            "id": "test-branch-id",
            "created_at": "",  # TODO
            "updated_at": "",  # TODO
            "name": "testbranchname",
            "project_id": "test-project-id",
            "timeline_id": f"{timeline_id}",
            "default": False,
            "deleted": False,
            "logical_size": 42000,
            "physical_size": 42000,
            "written_size": 42000,
        }
        # This test does all its own compute configuration (by passing explicit pageserver ID to Workload functions),
        # so we send controller notifications to /dev/null to prevent it fighting the test for control of the compute.
        log.info(f"got get_branches request: {request.json}")
        return Response(json.dumps(dummy_data), content_type="application/json", status=200)

    make_httpserver.expect_request("/branches", method="GET").respond_with_handler(get_branches)
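
    # The safekeeper scan consults a cloud-admin-style API to look up branch
    # metadata; the stub above answers /branches with a minimal
    # BranchData-shaped payload.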
    healthy, _ = env.storage_scrubber.scan_metadata_safekeeper(
        timeline_lsns=[timeline_lsns],
        cloud_admin_api_url=cloud_admin_url,
        cloud_admin_api_token=cloud_admin_token,
    )
    assert healthy

    env.start()
    ps_http = env.pageserver.http_client()
    ps_http.tenant_delete(tenant_id)
    env.stop()

    healthy, _ = env.storage_scrubber.scan_metadata()
    assert healthy
    healthy, _ = env.storage_scrubber.scan_metadata_safekeeper(
        timeline_lsns=[timeline_lsns],
        cloud_admin_api_url=cloud_admin_url,
        cloud_admin_api_token=cloud_admin_token,
    )
    assert healthy


def test_tenant_delete_stale_shards(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
    """
    Deleting a tenant should also delete any stale (pre-split) shards from remote storage.
    """
    remote_storage_kind = s3_storage()
    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
    env = neon_env_builder.init_start()

    # Create an unsharded tenant.
    tenant_id, timeline_id = env.create_tenant()

    # Write some data.
    workload = Workload(env, tenant_id, timeline_id, branch_name="main")
    workload.init()
    workload.write_rows(256)
    workload.validate()
    workload.stop()
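    # Stopping the compute before the tenant is deleted matters here (see the
    # commit message): once the timelines are gone from the safekeepers, a live
    # compute could no longer write its shutdown checkpoint record to the WAL.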

    assert_prefix_not_empty(
        neon_env_builder.pageserver_remote_storage,
        prefix="/".join(("tenants", str(tenant_id))),
    )

    # Upload a heatmap as well.
    env.pageserver.http_client().tenant_heatmap_upload(tenant_id)

    # Split off a few shards, in two rounds.
    env.storage_controller.tenant_shard_split(tenant_id, shard_count=4)
    env.storage_controller.tenant_shard_split(tenant_id, shard_count=16)

    # Delete the tenant. This should also delete data for the unsharded and count=4 parents.
    env.storage_controller.pageserver_api().tenant_delete(tenant_id=tenant_id)

    assert_prefix_empty(
        neon_env_builder.pageserver_remote_storage,
        prefix="/".join(("tenants", str(tenant_id))),
        delimiter="",  # match partial prefixes, i.e. all shards
    )

    dirs = list(env.pageserver.tenant_dir(None).glob(f"{tenant_id}*"))
    assert dirs == [], f"found tenant directories: {dirs}"

    # The initial tenant created by the test harness should still be there.
    # Only the tenant we deleted should be removed.
    assert_prefix_not_empty(
        neon_env_builder.pageserver_remote_storage,
        prefix="/".join(("tenants", str(env.initial_tenant))),
    )
    dirs = list(env.pageserver.tenant_dir(None).glob(f"{env.initial_tenant}*"))
    assert dirs != [], "missing initial tenant directory"

    env.stop()