mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-15 01:12:56 +00:00
## Describe your changes

Whenever a tenant is detached or the pageserver is restarted, the `pageserver_last_record_lsn` metric is dropped. This fix resurrects the value from the metadata whenever the tenant is attached again.

## Issue ticket number and link

[3571](https://github.com/neondatabase/cloud/issues/3571)

## Checklist before requesting a review

- [X] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? If so, did you add the relevant metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section.
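The regression test below (`test_tenant_reattach`) covers this directly. As a minimal sketch of the check, assuming a running `NeonEnv` with `pageserver_http`, `tenant_id` and `timeline_id` already set up:

```python
# Sketch only: the metric should survive a detach/attach cycle because it is
# re-derived from the timeline metadata on attach.
metric_filter = {"tenant_id": str(tenant_id), "timeline_id": str(timeline_id)}

before = parse_metrics(pageserver_http.get_metrics(), "pageserver").query_one(
    "pageserver_last_record_lsn", filter=metric_filter
).value

pageserver_http.tenant_detach(tenant_id)
pageserver_http.tenant_attach(tenant_id)
time.sleep(1)  # give the metric a moment to be re-registered

after = parse_metrics(pageserver_http.get_metrics(), "pageserver").query_one(
    "pageserver_last_record_lsn", filter=metric_filter
).value

assert before == after
```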
709 lines
28 KiB
Python
import asyncio
import random
import time
from threading import Thread

import asyncpg
import pytest
from fixtures.log_helper import log
from fixtures.metrics import parse_metrics
from fixtures.neon_fixtures import (
    NeonEnv,
    NeonEnvBuilder,
    PageserverApiException,
    PageserverHttpClient,
    Postgres,
    RemoteStorageKind,
    available_remote_storages,
    wait_for_last_record_lsn,
    wait_for_upload,
    wait_until,
    wait_until_tenant_state,
)
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import query_scalar


def do_gc_target(
    pageserver_http: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId
):
    """Hack to unblock main, see https://github.com/neondatabase/neon/issues/2211"""
    try:
        log.info("sending gc http request")
        pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
        pageserver_http.timeline_gc(tenant_id, timeline_id, 0)
    except Exception as e:
        log.error("do_gc failed: %s", e)
    finally:
        log.info("gc http thread returning")


# Basic detach and re-attach test
@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
def test_tenant_reattach(
    neon_env_builder: NeonEnvBuilder,
    remote_storage_kind: RemoteStorageKind,
):
    neon_env_builder.enable_remote_storage(
        remote_storage_kind=remote_storage_kind,
        test_name="test_tenant_reattach",
    )

    # Exercise retry code path by making all uploads and downloads fail for the
    # first time. The retries print INFO-messages to the log; we will check
    # that they are present after the test.
    neon_env_builder.pageserver_config_override = "test_remote_failures=1"

    env = neon_env_builder.init_start()
    pageserver_http = env.pageserver.http_client()

    # create a new tenant
    tenant_id, timeline_id = env.neon_cli.create_tenant()

    with env.postgres.create_start("main", tenant_id=tenant_id) as pg:
        with pg.cursor() as cur:
            cur.execute("CREATE TABLE t(key int primary key, value text)")
            cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'")
            current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))

        # Wait for all the data to be processed by the pageserver and uploaded to remote storage
        wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn)
        pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
        wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn)

        # Check that we had to retry the uploads
        assert env.pageserver.log_contains(
            ".*failed to perform remote task UploadLayer.*, will retry.*"
        )
        assert env.pageserver.log_contains(
            ".*failed to perform remote task UploadMetadata.*, will retry.*"
        )

    ps_metrics = parse_metrics(pageserver_http.get_metrics(), "pageserver")
    tenant_metric_filter = {
        "tenant_id": str(tenant_id),
        "timeline_id": str(timeline_id),
    }
    pageserver_last_record_lsn_before_detach = int(
        ps_metrics.query_one("pageserver_last_record_lsn", filter=tenant_metric_filter).value
    )

    pageserver_http.tenant_detach(tenant_id)
    pageserver_http.tenant_attach(tenant_id)

    time.sleep(1)  # for metrics propagation

    ps_metrics = parse_metrics(pageserver_http.get_metrics(), "pageserver")
    pageserver_last_record_lsn = int(
        ps_metrics.query_one("pageserver_last_record_lsn", filter=tenant_metric_filter).value
    )

    assert pageserver_last_record_lsn_before_detach == pageserver_last_record_lsn

    with env.postgres.create_start("main", tenant_id=tenant_id) as pg:
        with pg.cursor() as cur:
            assert query_scalar(cur, "SELECT count(*) FROM t") == 100000

    # Check that we had to retry the downloads
    assert env.pageserver.log_contains(".*list prefixes.*failed, will retry.*")
    assert env.pageserver.log_contains(".*download.*failed, will retry.*")


num_connections = 10
num_rows = 100000
updates_to_perform = 0

updates_started = 0
updates_finished = 0


# Run random UPDATEs on test table. On failure, try again.
async def update_table(pg_conn: asyncpg.Connection):
    global updates_started, updates_finished, updates_to_perform

    while updates_started < updates_to_perform or updates_to_perform == 0:
        updates_started += 1
        id = random.randrange(1, num_rows)

        # Loop to retry until the UPDATE succeeds
        while True:
            try:
                await pg_conn.fetchrow(f"UPDATE t SET counter = counter + 1 WHERE id = {id}")
                updates_finished += 1
                if updates_finished % 1000 == 0:
                    log.info(f"update {updates_finished} / {updates_to_perform}")
                break
            except asyncpg.PostgresError as e:
                # Received error from Postgres. Log it, sleep a little, and continue
                log.info(f"UPDATE error: {e}")
                await asyncio.sleep(0.1)


async def sleep_and_reattach(pageserver_http: PageserverHttpClient, tenant_id: TenantId):
    global updates_started, updates_finished, updates_to_perform

    # Wait until we have performed some updates
    wait_until(20, 0.5, lambda: updates_finished > 500)

    log.info("Detaching tenant")
    pageserver_http.tenant_detach(tenant_id)
    await asyncio.sleep(1)
    log.info("Re-attaching tenant")
    pageserver_http.tenant_attach(tenant_id)
    log.info("Re-attach finished")

    # Continue with 5000 more updates
    updates_to_perform = updates_started + 5000


# async guts of test_tenant_reattach_while_busy test
async def reattach_while_busy(
    env: NeonEnv, pg: Postgres, pageserver_http: PageserverHttpClient, tenant_id: TenantId
):
    workers = []
    for worker_id in range(num_connections):
        pg_conn = await pg.connect_async()
        workers.append(asyncio.create_task(update_table(pg_conn)))

    workers.append(asyncio.create_task(sleep_and_reattach(pageserver_http, tenant_id)))
    await asyncio.gather(*workers)

    assert updates_finished == updates_to_perform


# Detach and re-attach tenant, while compute is busy running queries.
#
# Some of the queries may fail, in the window that the tenant has been
# detached but not yet re-attached. But Postgres itself should keep
# running, and when we retry the queries, they should start working
# after the attach has finished.

# FIXME:
#
# This is pretty unstable at the moment. I've seen it fail with a warning like this:
#
# AssertionError: assert not ['2023-01-05T13:09:40.708303Z WARN remote_upload{tenant=c3fc41f6cf29a7626b90316e3518cd4b timeline=7978246f85faa71ab03...1282b/000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001716699-0000000001736681"\n']
#
# (https://neon-github-public-dev.s3.amazonaws.com/reports/pr-3232/debug/3846817847/index.html#suites/f9eba3cfdb71aa6e2b54f6466222829b/470fc62b5db7d7d7/)
# I believe that failure happened because there is a race condition
# between detach and starting remote upload tasks:
#
# 1. detach_timeline calls task_mgr::shutdown_tasks(), sending shutdown
#    signal to all in-progress tasks associated with the tenant.
# 2. Just after shutdown_tasks() has collected the list of tasks,
#    a new remote-upload task is spawned.
#
# See https://github.com/neondatabase/neon/issues/3273
#
#
# I also saw this failure:
#
# test_runner/regress/test_tenant_detach.py:194: in test_tenant_reattach_while_busy
#     asyncio.run(reattach_while_busy(env, pg, pageserver_http, tenant_id))
# /home/nonroot/.pyenv/versions/3.9.2/lib/python3.9/asyncio/runners.py:44: in run
#     return loop.run_until_complete(main)
# /home/nonroot/.pyenv/versions/3.9.2/lib/python3.9/asyncio/base_events.py:642: in run_until_complete
#     return future.result()
# test_runner/regress/test_tenant_detach.py:151: in reattach_while_busy
#     assert updates_finished == updates_to_perform
# E   assert 5010 == 10010
# E     +5010
# E     -10010
#
# I don't know what's causing that...
@pytest.mark.skip(reason="fixme")
@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
def test_tenant_reattach_while_busy(
    neon_env_builder: NeonEnvBuilder,
    remote_storage_kind: RemoteStorageKind,
):
    neon_env_builder.enable_remote_storage(
        remote_storage_kind=remote_storage_kind,
        test_name="test_tenant_reattach_while_busy",
    )
    env = neon_env_builder.init_start()

    # Attempts to connect from compute to pageserver while the tenant is
    # temporarily detached produce these errors in the pageserver log.
    env.pageserver.allowed_errors.append(".*Tenant .* not found in the local state.*")
    env.pageserver.allowed_errors.append(
        ".*Tenant .* will not become active\\. Current state: Stopping.*"
    )

    pageserver_http = env.pageserver.http_client()

    # create a new tenant
    tenant_id, timeline_id = env.neon_cli.create_tenant(
        # Create layers aggressively
        conf={"checkpoint_distance": "100000"}
    )

    pg = env.postgres.create_start("main", tenant_id=tenant_id)

    cur = pg.connect().cursor()

    cur.execute("CREATE TABLE t(id int primary key, counter int)")
    cur.execute(f"INSERT INTO t SELECT generate_series(1,{num_rows}), 0")

    # Run the test
    asyncio.run(reattach_while_busy(env, pg, pageserver_http, tenant_id))

    # Verify table contents
    assert query_scalar(cur, "SELECT count(*) FROM t") == num_rows
    assert query_scalar(cur, "SELECT sum(counter) FROM t") == updates_to_perform


def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
    env = neon_env_builder.init_start()
    pageserver_http = env.pageserver.http_client()

    env.pageserver.allowed_errors.append(".*NotFound: Tenant .* not found")

    # first check for a non-existing tenant
    tenant_id = TenantId.generate()
    with pytest.raises(
        expected_exception=PageserverApiException,
        match=f"Tenant not found for id {tenant_id}",
    ):
        pageserver_http.tenant_detach(tenant_id)

    # the error will be printed to the log too
    env.pageserver.allowed_errors.append(".*Tenant not found for id.*")

    # create a new tenant
    tenant_id, timeline_id = env.neon_cli.create_tenant()

    # assert tenant exists on disk
    assert (env.repo_dir / "tenants" / str(tenant_id)).exists()

    pg = env.postgres.create_start("main", tenant_id=tenant_id)
    # we rely upon autocommit after each statement
    pg.safe_psql_many(
        queries=[
            "CREATE TABLE t(key int primary key, value text)",
            "INSERT INTO t SELECT generate_series(1,100000), 'payload'",
        ]
    )

    # gc should not even try to start on a timeline that doesn't exist
    with pytest.raises(
        expected_exception=PageserverApiException, match="gc target timeline does not exist"
    ):
        bogus_timeline_id = TimelineId.generate()
        pageserver_http.timeline_gc(tenant_id, bogus_timeline_id, 0)

    # the error will be printed to the log too
    env.pageserver.allowed_errors.append(".*gc target timeline does not exist.*")
    # Timelines get stopped during detach; ignore the gc calls that error out witnessing that
    env.pageserver.allowed_errors.append(".*InternalServerError\\(timeline is Stopping.*")

    # Detach while running manual GC.
    # It should wait for manual GC to finish because it runs in a task associated with the tenant.
    pageserver_http.configure_failpoints(
        ("gc_iteration_internal_after_getting_gc_timelines", "return(2000)")
    )
    gc_thread = Thread(target=lambda: do_gc_target(pageserver_http, tenant_id, timeline_id))
    gc_thread.start()
    time.sleep(1)
    # By now the gc task is spawned but sleeping for another second due to the failpoint.

    log.info("detaching tenant")
    pageserver_http.tenant_detach(tenant_id)
    log.info("tenant detached without error")

    log.info("wait for gc thread to return")
    gc_thread.join(timeout=10)
    assert not gc_thread.is_alive()
    log.info("gc thread returned")

    # check that nothing is left on disk for the deleted tenant
    assert not (env.repo_dir / "tenants" / str(tenant_id)).exists()

    with pytest.raises(
        expected_exception=PageserverApiException, match=f"Tenant {tenant_id} not found"
    ):
        pageserver_http.timeline_gc(tenant_id, timeline_id, 0)


#
@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
def test_detach_while_attaching(
    neon_env_builder: NeonEnvBuilder,
    remote_storage_kind: RemoteStorageKind,
):
    neon_env_builder.enable_remote_storage(
        remote_storage_kind=remote_storage_kind,
        test_name="test_detach_while_attaching",
    )

    ##### First start, insert secret data and upload it to the remote storage
    env = neon_env_builder.init_start()
    pageserver_http = env.pageserver.http_client()
    pg = env.postgres.create_start("main")

    client = env.pageserver.http_client()

    tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0])
    timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0])

    # Create table, and insert some rows. Make it big enough that it doesn't fit in
    # shared_buffers, otherwise the SELECT after restart will just return the answer
    # from shared_buffers without hitting the page server, which defeats the point
    # of this test.
    with pg.cursor() as cur:
        cur.execute("CREATE TABLE foo (t text)")
        cur.execute(
            """
            INSERT INTO foo
            SELECT 'long string to consume some space' || g
            FROM generate_series(1, 100000) g
            """
        )
        current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))

    # wait until pageserver receives that data
    wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn)

    # run checkpoint manually to be sure that data landed in remote storage
    pageserver_http.timeline_checkpoint(tenant_id, timeline_id)

    log.info("waiting for upload")

    # wait until pageserver successfully uploaded a checkpoint to remote storage
    wait_for_upload(client, tenant_id, timeline_id, current_lsn)
    log.info("upload is done")

    # Detach it
    pageserver_http.tenant_detach(tenant_id)

    # And re-attach
    pageserver_http.configure_failpoints([("attach-before-activate", "return(5000)")])
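    # Note: "return(5000)" is assumed here to make the "attach-before-activate"
    # failpoint pause the attach task for roughly 5 seconds, so the detach below
    # races with an attach that is still in progress.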

    pageserver_http.tenant_attach(tenant_id)

    # Before it has a chance to finish, detach it again
    pageserver_http.tenant_detach(tenant_id)

    # is there a better way to assert that the failpoint triggered?
    time.sleep(10)
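    # An alternative to the fixed sleep (untested sketch): poll the pageserver
    # instead of sleeping, e.g. wait_until(20, 0.5, lambda: ...) checking tenant
    # state or the pageserver log, so the test does not depend on the failpoint
    # delay staying shorter than 10 seconds.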

    # Attach it again. If the GC and compaction loops from the previous attach/detach
    # cycle are still running, things could get really confusing..
    pageserver_http.tenant_attach(tenant_id)

    with pg.cursor() as cur:
        cur.execute("SELECT COUNT(*) FROM foo")


# Tests that the `ignore` and `load` operations' combination is able to remove and restore the tenant in pageserver's memory.
# * writes some data into tenant's timeline
# * ensures it's synced with the remote storage
# * `ignore` the tenant
# * verify that ignored tenant files are generally unchanged, only an ignored mark had appeared
# * verify the ignored tenant is gone from pageserver's memory
# * restart the pageserver and verify that ignored tenant is still not loaded
# * `load` the same tenant
# * ensure that its status is `Active` and it's present in pageserver's memory with all timelines
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.NOOP, RemoteStorageKind.MOCK_S3])
def test_ignored_tenant_reattach(
    neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
):
    neon_env_builder.enable_remote_storage(
        remote_storage_kind=remote_storage_kind,
        test_name="test_remote_storage_backup_and_restore",
    )
    env = neon_env_builder.init_start()
    pageserver_http = env.pageserver.http_client()

    ignored_tenant_id, _ = env.neon_cli.create_tenant()
    tenant_dir = env.repo_dir / "tenants" / str(ignored_tenant_id)
    tenants_before_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()]
    tenants_before_ignore.sort()
    timelines_before_ignore = [
        timeline["timeline_id"]
        for timeline in pageserver_http.timeline_list(tenant_id=ignored_tenant_id)
    ]
    files_before_ignore = [tenant_path for tenant_path in tenant_dir.glob("**/*")]

    # ignore the tenant and verify it's not present in pageserver replies, with its files still on disk
    pageserver_http.tenant_ignore(ignored_tenant_id)

    files_after_ignore_with_retain = [tenant_path for tenant_path in tenant_dir.glob("**/*")]
    new_files = set(files_after_ignore_with_retain) - set(files_before_ignore)
    disappeared_files = set(files_before_ignore) - set(files_after_ignore_with_retain)
    assert (
        len(disappeared_files) == 0
    ), f"Tenant ignore should not remove files from disk, missing: {disappeared_files}"
    assert (
        len(new_files) == 1
    ), f"Only tenant ignore file should appear on disk but got: {new_files}"

    tenants_after_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()]
    assert ignored_tenant_id not in tenants_after_ignore, "Ignored tenant should be missing"
    assert len(tenants_after_ignore) + 1 == len(
        tenants_before_ignore
    ), "Only ignored tenant should be missing"

    # restart the pageserver to ensure we don't load the ignored tenant
    env.pageserver.stop()
    env.pageserver.start()
    tenants_after_restart = [tenant["id"] for tenant in pageserver_http.tenant_list()]
    tenants_after_restart.sort()
    assert (
        tenants_after_restart == tenants_after_ignore
    ), "Ignored tenant should not be reloaded after pageserver restart"

    # now, load it from the local files and expect it to work
    pageserver_http.tenant_load(tenant_id=ignored_tenant_id)
    wait_until_tenant_state(pageserver_http, ignored_tenant_id, "Active", 5)

    tenants_after_attach = [tenant["id"] for tenant in pageserver_http.tenant_list()]
    tenants_after_attach.sort()
    assert tenants_after_attach == tenants_before_ignore, "Should have all tenants back"

    timelines_after_ignore = [
        timeline["timeline_id"]
        for timeline in pageserver_http.timeline_list(tenant_id=ignored_tenant_id)
    ]
    assert timelines_before_ignore == timelines_after_ignore, "Should have all timelines back"


# Tests that it's possible to `load` tenants with missing layers and get them restored:
# * writes some data into tenant's timeline
# * ensures it's synced with the remote storage
# * `ignore` the tenant
# * removes all timeline's local layers
# * `load` the same tenant
# * ensure that its status is `Active`
# * check that timeline data is restored
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
def test_ignored_tenant_download_missing_layers(
    neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
):
    neon_env_builder.enable_remote_storage(
        remote_storage_kind=remote_storage_kind,
        test_name="test_ignored_tenant_download_and_attach",
    )
    env = neon_env_builder.init_start()
    pageserver_http = env.pageserver.http_client()
    pg = env.postgres.create_start("main")

    tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0])
    timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0])

    data_id = 1
    data_secret = "very secret secret"
    insert_test_data(pageserver_http, tenant_id, timeline_id, data_id, data_secret, pg)

    tenants_before_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()]
    tenants_before_ignore.sort()
    timelines_before_ignore = [
        timeline["timeline_id"] for timeline in pageserver_http.timeline_list(tenant_id=tenant_id)
    ]

    # ignore the tenant and remove its layers
    pageserver_http.tenant_ignore(tenant_id)
    tenant_timeline_dir = env.repo_dir / "tenants" / str(tenant_id) / "timelines" / str(timeline_id)
    layers_removed = False
    for dir_entry in tenant_timeline_dir.iterdir():
        if dir_entry.name.startswith("00000"):
            # Looks like a layer file. Remove it
            dir_entry.unlink()
            layers_removed = True
    assert layers_removed, f"Found no layers for tenant {tenant_timeline_dir}"

    # now, load it from the local files and expect it to work due to remote storage restoration
    pageserver_http.tenant_load(tenant_id=tenant_id)
    wait_until_tenant_state(pageserver_http, tenant_id, "Active", 5)

    tenants_after_attach = [tenant["id"] for tenant in pageserver_http.tenant_list()]
    tenants_after_attach.sort()
    assert tenants_after_attach == tenants_before_ignore, "Should have all tenants back"

    timelines_after_ignore = [
        timeline["timeline_id"] for timeline in pageserver_http.timeline_list(tenant_id=tenant_id)
    ]
    assert timelines_before_ignore == timelines_after_ignore, "Should have all timelines back"

    pg.stop()
    pg.start()
    ensure_test_data(data_id, data_secret, pg)


# Tests that it's possible to `load` broken tenants:
# * `ignore` a tenant
# * removes its `metadata` file locally
# * `load` the same tenant
# * ensure that its status is `Broken`
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
def test_ignored_tenant_stays_broken_without_metadata(
    neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
):
    neon_env_builder.enable_remote_storage(
        remote_storage_kind=remote_storage_kind,
        test_name="test_ignored_tenant_stays_broken_without_metadata",
    )
    env = neon_env_builder.init_start()
    pageserver_http = env.pageserver.http_client()
    pg = env.postgres.create_start("main")

    tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0])
    timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0])

    # ignore the tenant and remove its metadata
    pageserver_http.tenant_ignore(tenant_id)
    tenant_timeline_dir = env.repo_dir / "tenants" / str(tenant_id) / "timelines" / str(timeline_id)
    metadata_removed = False
    for dir_entry in tenant_timeline_dir.iterdir():
        if dir_entry.name == "metadata":
            # Found the timeline's metadata file. Remove it
            dir_entry.unlink()
            metadata_removed = True
    assert metadata_removed, f"Failed to find metadata file in {tenant_timeline_dir}"

    env.pageserver.allowed_errors.append(".*could not load tenant .*?: failed to load metadata.*")

    # now, load it from the local files and expect it to be broken due to inability to load tenant files into memory
    pageserver_http.tenant_load(tenant_id=tenant_id)
    wait_until_tenant_state(pageserver_http, tenant_id, "Broken", 5)


# Tests that attach never works on a tenant, ignored or not, as long as it's present locally.
# Similarly, tests that it's not possible to schedule a `load` for a tenant that's not ignored.
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
def test_load_attach_negatives(
    neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
):
    neon_env_builder.enable_remote_storage(
        remote_storage_kind=remote_storage_kind,
        test_name="test_load_attach_negatives",
    )
    env = neon_env_builder.init_start()
    pageserver_http = env.pageserver.http_client()
    pg = env.postgres.create_start("main")

    tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0])

    env.pageserver.allowed_errors.append(".*tenant .*? already exists, state:.*")
    with pytest.raises(
        expected_exception=PageserverApiException,
        match=f"tenant {tenant_id} already exists, state: Active",
    ):
        pageserver_http.tenant_load(tenant_id)

    with pytest.raises(
        expected_exception=PageserverApiException,
        match=f"tenant {tenant_id} already exists, state: Active",
    ):
        pageserver_http.tenant_attach(tenant_id)

    pageserver_http.tenant_ignore(tenant_id)

    env.pageserver.allowed_errors.append(
        ".*Cannot attach tenant .*?, local tenant directory already exists.*"
    )
    with pytest.raises(
        expected_exception=PageserverApiException,
        match=f"Cannot attach tenant {tenant_id}, local tenant directory already exists",
    ):
        pageserver_http.tenant_attach(tenant_id)


@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
def test_ignore_while_attaching(
    neon_env_builder: NeonEnvBuilder,
    remote_storage_kind: RemoteStorageKind,
):
    neon_env_builder.enable_remote_storage(
        remote_storage_kind=remote_storage_kind,
        test_name="test_ignore_while_attaching",
    )

    env = neon_env_builder.init_start()
    pageserver_http = env.pageserver.http_client()
    pg = env.postgres.create_start("main")

    pageserver_http = env.pageserver.http_client()

    tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0])
    timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0])

    data_id = 1
    data_secret = "very secret secret"
    insert_test_data(pageserver_http, tenant_id, timeline_id, data_id, data_secret, pg)

    tenants_before_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()]

    # Detach it
    pageserver_http.tenant_detach(tenant_id)
    # And re-attach, but stop the attach task_mgr task from completing
    pageserver_http.configure_failpoints([("attach-before-activate", "return(5000)")])
    pageserver_http.tenant_attach(tenant_id)
    # Run ignore on the tenant, thereby cancelling the attach.
    # XXX This should take priority over attach, i.e., it should cancel the attach task.
    # But neither the failpoint, nor the proper storage_sync download functions,
    # are sensitive to task_mgr::shutdown.
    # This problem is tracked in https://github.com/neondatabase/neon/issues/2996 .
    # So, for now, effectively, this ignore here will block until the attach task completes.
    pageserver_http.tenant_ignore(tenant_id)

    # Cannot attach it due to some local files existing
    env.pageserver.allowed_errors.append(
        ".*Cannot attach tenant .*?, local tenant directory already exists.*"
    )
    with pytest.raises(
        expected_exception=PageserverApiException,
        match=f"Cannot attach tenant {tenant_id}, local tenant directory already exists",
    ):
        pageserver_http.tenant_attach(tenant_id)

    tenants_after_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()]
    assert tenant_id not in tenants_after_ignore, "Ignored tenant should be missing"
    assert len(tenants_after_ignore) + 1 == len(
        tenants_before_ignore
    ), "Only ignored tenant should be missing"

    # But we can load it from the local files; that will restore the attach.
    pageserver_http.tenant_load(tenant_id)

    wait_until_tenant_state(pageserver_http, tenant_id, "Active", 5)

    pg.stop()
    pg.start()
    ensure_test_data(data_id, data_secret, pg)


def insert_test_data(
    pageserver_http: PageserverHttpClient,
    tenant_id: TenantId,
    timeline_id: TimelineId,
    data_id: int,
    data: str,
    pg: Postgres,
):
    with pg.cursor() as cur:
        cur.execute(
            f"""
            CREATE TABLE test(id int primary key, secret text);
            INSERT INTO test VALUES ({data_id}, '{data}');
            """
        )
        current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))

    # wait until pageserver receives that data
    wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn)

    # run checkpoint manually to be sure that data landed in remote storage
    pageserver_http.timeline_checkpoint(tenant_id, timeline_id)

    # wait until pageserver successfully uploaded a checkpoint to remote storage
    log.info("waiting for checkpoint upload of the to-be-ignored tenant's data")
    wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn)


def ensure_test_data(data_id: int, data: str, pg: Postgres):
    with pg.cursor() as cur:
        assert (
            query_scalar(cur, f"SELECT secret FROM test WHERE id = {data_id};") == data
        ), "Should have timeline data back"