neon/test_runner/regress/test_storage_controller.py

import json
import time
from collections import defaultdict
from datetime import datetime, timezone
from typing import Any, Dict, List, Union

import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
    NeonEnv,
    NeonEnvBuilder,
    PgBin,
    StorageControllerApiException,
    TokenScope,
)
from fixtures.pageserver.http import PageserverHttpClient
from fixtures.pageserver.utils import (
    MANY_SMALL_LAYERS_TENANT_CONFIG,
    enable_remote_storage_versioning,
    list_prefix,
    remote_storage_delete_key,
    tenant_delete_wait_completed,
    timeline_delete_wait_completed,
)
from fixtures.pg_version import PgVersion
from fixtures.remote_storage import RemoteStorageKind, s3_storage
from fixtures.types import TenantId, TenantShardId, TimelineId
from fixtures.utils import run_pg_bench_small, subprocess_capture, wait_until
from mypy_boto3_s3.type_defs import (
    ObjectTypeDef,
)
from pytest_httpserver import HTTPServer
from werkzeug.wrappers.request import Request
from werkzeug.wrappers.response import Response


def get_node_shard_counts(env: NeonEnv, tenant_ids):
    counts: defaultdict[str, int] = defaultdict(int)
    for tid in tenant_ids:
        for shard in env.storage_controller.locate(tid):
            counts[shard["node_id"]] += 1
    return counts


def test_storage_controller_smoke(
    neon_env_builder: NeonEnvBuilder,
):
    """
    Test the basic lifecycle of a storage controller:
    - Restarting
    - Restarting a pageserver
    - Creating and deleting tenants and timelines
    - Marking a pageserver offline
    """

    neon_env_builder.num_pageservers = 3
    env = neon_env_builder.init_configs()

    for pageserver in env.pageservers:
        # This test detaches tenants during migration, which can race with deletion queue operations,
        # during detach we only do an advisory flush, we don't wait for it.
        pageserver.allowed_errors.extend([".*Dropped remote consistent LSN updates.*"])

    # Start services by hand so that we can skip a pageserver (this will start + register later)
    env.broker.try_start()
    env.storage_controller.start()
    env.pageservers[0].start()
    env.pageservers[1].start()
    for sk in env.safekeepers:
        sk.start()

    # The pageservers we started should have registered with the sharding service on startup
    nodes = env.storage_controller.node_list()
    assert len(nodes) == 2
    assert set(n["id"] for n in nodes) == {env.pageservers[0].id, env.pageservers[1].id}

    # Starting an additional pageserver should register successfully
    env.pageservers[2].start()
    nodes = env.storage_controller.node_list()
    assert len(nodes) == 3
    assert set(n["id"] for n in nodes) == {ps.id for ps in env.pageservers}

    # Use a multiple of pageservers to get nice even number of shards on each one
    tenant_shard_count = len(env.pageservers) * 4
    tenant_count = len(env.pageservers) * 2
    shards_per_tenant = tenant_shard_count // tenant_count
    tenant_ids = set(TenantId.generate() for i in range(0, tenant_count))

    # Creating several tenants should spread out across the pageservers
    for tid in tenant_ids:
        env.neon_cli.create_tenant(tid, shard_count=shards_per_tenant)

    # Repeating a creation should be idempotent (we are just testing it doesn't return an error)
    env.storage_controller.tenant_create(
        tenant_id=next(iter(tenant_ids)), shard_count=shards_per_tenant
    )

    for node_id, count in get_node_shard_counts(env, tenant_ids).items():
        # we used a multiple of pagservers for the total shard count,
        # so expect equal number on all pageservers
        assert count == tenant_shard_count / len(
            env.pageservers
        ), f"Node {node_id} has bad count {count}"

    # Creating and deleting timelines should work, using identical API to pageserver
    timeline_crud_tenant = next(iter(tenant_ids))
    timeline_id = TimelineId.generate()
    env.storage_controller.pageserver_api().timeline_create(
        pg_version=PgVersion.NOT_SET, tenant_id=timeline_crud_tenant, new_timeline_id=timeline_id
    )
    timelines = env.storage_controller.pageserver_api().timeline_list(timeline_crud_tenant)
    assert len(timelines) == 2
    assert timeline_id in set(TimelineId(t["timeline_id"]) for t in timelines)
    #    virtual_ps_http.timeline_delete(tenant_id=timeline_crud_tenant, timeline_id=timeline_id)
    timeline_delete_wait_completed(
        env.storage_controller.pageserver_api(), timeline_crud_tenant, timeline_id
    )
    timelines = env.storage_controller.pageserver_api().timeline_list(timeline_crud_tenant)
    assert len(timelines) == 1
    assert timeline_id not in set(TimelineId(t["timeline_id"]) for t in timelines)

    # Marking a pageserver offline should migrate tenants away from it.
    env.storage_controller.node_configure(env.pageservers[0].id, {"availability": "Offline"})

    def node_evacuated(node_id: int) -> None:
        counts = get_node_shard_counts(env, tenant_ids)
        assert counts[node_id] == 0

    wait_until(10, 1, lambda: node_evacuated(env.pageservers[0].id))

    # Marking pageserver active should not migrate anything to it
    # immediately
    env.storage_controller.node_configure(env.pageservers[0].id, {"availability": "Active"})
    time.sleep(1)
    assert get_node_shard_counts(env, tenant_ids)[env.pageservers[0].id] == 0

    # Restarting a pageserver should not detach any tenants (i.e. /re-attach works)
    before_restart = env.pageservers[1].http_client().tenant_list_locations()
    env.pageservers[1].stop()
    env.pageservers[1].start()
    after_restart = env.pageservers[1].http_client().tenant_list_locations()
    assert len(after_restart) == len(before_restart)

    # Locations should be the same before & after restart, apart from generations
    for _shard_id, tenant in after_restart["tenant_shards"]:
        del tenant["generation"]
    for _shard_id, tenant in before_restart["tenant_shards"]:
        del tenant["generation"]
    assert before_restart == after_restart

    # Delete all the tenants
    for tid in tenant_ids:
        tenant_delete_wait_completed(env.storage_controller.pageserver_api(), tid, 10)

    env.storage_controller.consistency_check()

    # Set a scheduling policy on one node, create all the tenants, observe
    # that the scheduling policy is respected.
    env.storage_controller.node_configure(env.pageservers[1].id, {"scheduling": "Draining"})

    # Create some fresh tenants
    tenant_ids = set(TenantId.generate() for i in range(0, tenant_count))
    for tid in tenant_ids:
        env.neon_cli.create_tenant(tid, shard_count=shards_per_tenant)

    counts = get_node_shard_counts(env, tenant_ids)
    # Nothing should have been scheduled on the node in Draining
    assert counts[env.pageservers[1].id] == 0
    assert counts[env.pageservers[0].id] == tenant_shard_count // 2
    assert counts[env.pageservers[2].id] == tenant_shard_count // 2

    env.storage_controller.consistency_check()


def test_node_status_after_restart(
    neon_env_builder: NeonEnvBuilder,
):
    neon_env_builder.num_pageservers = 2
    env = neon_env_builder.init_start()

    # Initially we have two online pageservers
    nodes = env.storage_controller.node_list()
    assert len(nodes) == 2

    env.pageservers[1].stop()
    env.storage_controller.allowed_errors.extend([".*Could not scan node"])

    env.storage_controller.stop()
    env.storage_controller.start()

    def is_ready():
        assert env.storage_controller.ready() is True

    wait_until(30, 1, is_ready)

    # We loaded nodes from database on restart
    nodes = env.storage_controller.node_list()
    assert len(nodes) == 2

    # We should still be able to create a tenant, because the pageserver which is still online
    # should have had its availabilty state set to Active.
    env.storage_controller.tenant_create(TenantId.generate())

    env.storage_controller.consistency_check()


def test_storage_controller_passthrough(
    neon_env_builder: NeonEnvBuilder,
):
    """
    For simple timeline/tenant GET APIs that don't require coordination across
    shards, the sharding service implements a proxy to shard zero.  This test
    calls those APIs.
    """
    neon_env_builder.num_pageservers = 2
    env = neon_env_builder.init_start()

    # We will talk to storage controller as if it was a pageserver, using the pageserver
    # HTTP client
    client = PageserverHttpClient(env.storage_controller_port, lambda: True)
    timelines = client.timeline_list(tenant_id=env.initial_tenant)
    assert len(timelines) == 1

    status = client.tenant_status(env.initial_tenant)
    assert TenantId(status["id"]) == env.initial_tenant
    assert set(TimelineId(t) for t in status["timelines"]) == {
        env.initial_timeline,
    }
    assert status["state"]["slug"] == "Active"

    env.storage_controller.consistency_check()


def test_storage_controller_restart(neon_env_builder: NeonEnvBuilder):
    env = neon_env_builder.init_start()
    tenant_a = env.initial_tenant
    tenant_b = TenantId.generate()
    env.storage_controller.tenant_create(tenant_b)
    env.pageserver.tenant_detach(tenant_a)

    # TODO: extend this test to use multiple pageservers, and check that locations don't move around
    # on restart.

    # Storage controller restart
    env.storage_controller.stop()
    env.storage_controller.start()

    observed = set(TenantId(tenant["id"]) for tenant in env.pageserver.http_client().tenant_list())

    # Tenant A should still be attached
    assert tenant_a not in observed

    # Tenant B should remain detached
    assert tenant_b in observed

    # Pageserver restart
    env.pageserver.stop()
    env.pageserver.start()

    # Same assertions as above: restarting either service should not perturb things
    observed = set(TenantId(tenant["id"]) for tenant in env.pageserver.http_client().tenant_list())
    assert tenant_a not in observed
    assert tenant_b in observed

    env.storage_controller.consistency_check()


@pytest.mark.parametrize("warm_up", [True, False])
def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: bool):
    """
    We onboard tenants to the sharding service by treating it as a 'virtual pageserver'
    which provides the /location_config API.  This is similar to creating a tenant,
    but imports the generation number.
    """

    neon_env_builder.num_pageservers = 2

    # Start services by hand so that we can skip registration on one of the pageservers
    env = neon_env_builder.init_configs()
    env.broker.try_start()
    env.storage_controller.start()

    # This is the pageserver where we'll initially create the tenant.  Run it in emergency
    # mode so that it doesn't talk to storage controller, and do not register it.
    env.pageservers[0].allowed_errors.append(".*Emergency mode!.*")
    env.pageservers[0].start(
        overrides=("--pageserver-config-override=control_plane_emergency_mode=true",),
    )
    origin_ps = env.pageservers[0]

    # This is the pageserver managed by the sharding service, where the tenant
    # will be attached after onboarding
    env.pageservers[1].start()
    dest_ps = env.pageservers[1]
    virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True)

    for sk in env.safekeepers:
        sk.start()

    # Create a tenant directly via pageserver HTTP API, skipping the storage controller
    tenant_id = TenantId.generate()
    generation = 123
    origin_ps.http_client().tenant_create(tenant_id, generation=generation)

    # As if doing a live migration, first configure origin into stale mode
    r = origin_ps.http_client().tenant_location_conf(
        tenant_id,
        {
            "mode": "AttachedStale",
            "secondary_conf": None,
            "tenant_conf": {},
            "generation": generation,
        },
    )
    assert len(r["shards"]) == 1

    if warm_up:
        origin_ps.http_client().tenant_heatmap_upload(tenant_id)

        # We expect to be called via live migration code, which may try to configure the tenant into secondary
        # mode before attaching it.
        virtual_ps_http.tenant_location_conf(
            tenant_id,
            {
                "mode": "Secondary",
                "secondary_conf": {"warm": True},
                "tenant_conf": {},
                "generation": None,
            },
        )

        virtual_ps_http.tenant_secondary_download(tenant_id)

    # Call into storage controller to onboard the tenant
    generation += 1
    r = virtual_ps_http.tenant_location_conf(
        tenant_id,
        {
            "mode": "AttachedMulti",
            "secondary_conf": None,
            "tenant_conf": {},
            "generation": generation,
        },
    )
    assert len(r["shards"]) == 1

    # As if doing a live migration, detach the original pageserver
    origin_ps.http_client().tenant_location_conf(
        tenant_id,
        {
            "mode": "Detached",
            "secondary_conf": None,
            "tenant_conf": {},
            "generation": None,
        },
    )

    # As if doing a live migration, call into the storage controller to
    # set it to AttachedSingle: this is a no-op, but we test it because the
    # cloud control plane may call this for symmetry with live migration to
    # an individual pageserver
    r = virtual_ps_http.tenant_location_conf(
        tenant_id,
        {
            "mode": "AttachedSingle",
            "secondary_conf": None,
            "tenant_conf": {},
            "generation": generation,
        },
    )
    assert len(r["shards"]) == 1

    # We should see the tenant is now attached to the pageserver managed
    # by the sharding service
    origin_tenants = origin_ps.http_client().tenant_list()
    assert len(origin_tenants) == 0
    dest_tenants = dest_ps.http_client().tenant_list()
    assert len(dest_tenants) == 1
    assert TenantId(dest_tenants[0]["id"]) == tenant_id

    # sharding service advances generation by 1 when it first attaches.  We started
    # with a nonzero generation so this equality also proves that the generation
    # was properly carried over during onboarding.
    assert dest_tenants[0]["generation"] == generation + 1

    # The onboarded tenant should survive a restart of sharding service
    env.storage_controller.stop()
    env.storage_controller.start()

    # The onboarded tenant should surviev a restart of pageserver
    dest_ps.stop()
    dest_ps.start()

    # Having onboarded via /location_config, we should also be able to update the
    # TenantConf part of LocationConf, without inadvertently resetting the generation
    modified_tenant_conf = {"max_lsn_wal_lag": 1024 * 1024 * 1024 * 100}
    dest_tenant_before_conf_change = dest_ps.http_client().tenant_status(tenant_id)

    # The generation has moved on since we onboarded
    assert generation != dest_tenant_before_conf_change["generation"]

    r = virtual_ps_http.tenant_location_conf(
        tenant_id,
        {
            "mode": "AttachedSingle",
            "secondary_conf": None,
            "tenant_conf": modified_tenant_conf,
            # This is intentionally a stale generation
            "generation": generation,
        },
    )
    assert len(r["shards"]) == 1
    dest_tenant_after_conf_change = dest_ps.http_client().tenant_status(tenant_id)
    assert (
        dest_tenant_after_conf_change["generation"] == dest_tenant_before_conf_change["generation"]
    )
    dest_tenant_conf_after = dest_ps.http_client().tenant_config(tenant_id)
    assert dest_tenant_conf_after.tenant_specific_overrides == modified_tenant_conf

    env.storage_controller.consistency_check()


def test_storage_controller_compute_hook(
    httpserver: HTTPServer,
    neon_env_builder: NeonEnvBuilder,
    httpserver_listen_address,
):
    """
    Test that the sharding service calls out to the configured HTTP endpoint on attachment changes
    """

    # We will run two pageserver to migrate and check that the storage controller sends notifications
    # when migrating.
    neon_env_builder.num_pageservers = 2
    (host, port) = httpserver_listen_address
    neon_env_builder.control_plane_compute_hook_api = f"http://{host}:{port}/notify"

    # Set up fake HTTP notify endpoint
    notifications = []

    handle_params = {"status": 200}

    def handler(request: Request):
        status = handle_params["status"]
        log.info(f"Notify request[{status}]: {request}")
        notifications.append(request.json)
        return Response(status=status)

    httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler)

    # Start running
    env = neon_env_builder.init_start()

    # We will to an unclean migration, which will result in deletion queue warnings
    env.pageservers[0].allowed_errors.append(".*Dropped remote consistent LSN updates for tenant.*")

    # Initial notification from tenant creation
    assert len(notifications) == 1
    expect: Dict[str, Union[List[Dict[str, int]], str, None, int]] = {
        "tenant_id": str(env.initial_tenant),
        "stripe_size": None,
        "shards": [{"node_id": int(env.pageservers[0].id), "shard_number": 0}],
    }
    assert notifications[0] == expect

    env.storage_controller.node_configure(env.pageservers[0].id, {"availability": "Offline"})

    def node_evacuated(node_id: int) -> None:
        counts = get_node_shard_counts(env, [env.initial_tenant])
        assert counts[node_id] == 0

    wait_until(10, 1, lambda: node_evacuated(env.pageservers[0].id))

    # Additional notification from migration
    log.info(f"notifications: {notifications}")
    expect = {
        "tenant_id": str(env.initial_tenant),
        "stripe_size": None,
        "shards": [{"node_id": int(env.pageservers[1].id), "shard_number": 0}],
    }

    def received_migration_notification():
        assert len(notifications) == 2
        assert notifications[1] == expect

    wait_until(20, 0.25, received_migration_notification)

    # When we restart, we should re-emit notifications for all tenants
    env.storage_controller.stop()
    env.storage_controller.start()

    def received_restart_notification():
        assert len(notifications) == 3
        assert notifications[2] == expect

    wait_until(10, 1, received_restart_notification)

    # Splitting a tenant should cause its stripe size to become visible in the compute notification
    env.storage_controller.tenant_shard_split(env.initial_tenant, shard_count=2)
    expect = {
        "tenant_id": str(env.initial_tenant),
        "stripe_size": 32768,
        "shards": [
            {"node_id": int(env.pageservers[1].id), "shard_number": 0},
            {"node_id": int(env.pageservers[1].id), "shard_number": 1},
        ],
    }

    def received_split_notification():
        assert len(notifications) == 4
        assert notifications[3] == expect

    wait_until(10, 1, received_split_notification)

    # If the compute hook is unavailable, that should not block creating a tenant and
    # creating a timeline.  This simulates a control plane refusing to accept notifications
    handle_params["status"] = 423
    degraded_tenant_id = TenantId.generate()
    degraded_timeline_id = TimelineId.generate()
    env.storage_controller.tenant_create(degraded_tenant_id)
    env.storage_controller.pageserver_api().timeline_create(
        PgVersion.NOT_SET, degraded_tenant_id, degraded_timeline_id
    )

    # Ensure we hit the handler error path
    env.storage_controller.allowed_errors.append(
        ".*Failed to notify compute of attached pageserver.*tenant busy.*"
    )
    env.storage_controller.allowed_errors.append(".*Reconcile error.*tenant busy.*")
    assert notifications[-1] is not None
    assert notifications[-1]["tenant_id"] == str(degraded_tenant_id)

    env.storage_controller.consistency_check()


def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder):
    """
    Verify that occasional-use debug APIs work as expected.  This is a lightweight test
    that just hits the endpoints to check that they don't bitrot.
    """

    neon_env_builder.num_pageservers = 2
    env = neon_env_builder.init_start()

    tenant_id = TenantId.generate()
    env.storage_controller.tenant_create(tenant_id, shard_count=2, shard_stripe_size=8192)

    # Check that the consistency check passes on a freshly setup system
    env.storage_controller.consistency_check()

    # These APIs are intentionally not implemented as methods on NeonStorageController, as
    # they're just for use in unanticipated circumstances.

    # Initial tenant (1 shard) and the one we just created (2 shards) should be visible
    response = env.storage_controller.request(
        "GET",
        f"{env.storage_controller_api}/debug/v1/tenant",
        headers=env.storage_controller.headers(TokenScope.ADMIN),
    )
    assert len(response.json()) == 3

    # Scheduler should report the expected nodes and shard counts
    response = env.storage_controller.request(
        "GET", f"{env.storage_controller_api}/debug/v1/scheduler"
    )
    # Two nodes, in a dict of node_id->node
    assert len(response.json()["nodes"]) == 2
    assert sum(v["shard_count"] for v in response.json()["nodes"].values()) == 3
    assert all(v["may_schedule"] for v in response.json()["nodes"].values())

    response = env.storage_controller.request(
        "POST",
        f"{env.storage_controller_api}/debug/v1/node/{env.pageservers[1].id}/drop",
        headers=env.storage_controller.headers(TokenScope.ADMIN),
    )
    assert len(env.storage_controller.node_list()) == 1

    response = env.storage_controller.request(
        "POST",
        f"{env.storage_controller_api}/debug/v1/tenant/{tenant_id}/drop",
        headers=env.storage_controller.headers(TokenScope.ADMIN),
    )

    # Tenant drop should be reflected in dump output
    response = env.storage_controller.request(
        "GET",
        f"{env.storage_controller_api}/debug/v1/tenant",
        headers=env.storage_controller.headers(TokenScope.ADMIN),
    )
    assert len(response.json()) == 1

    # Check that the 'drop' APIs didn't leave things in a state that would fail a consistency check: they're
    # meant to be unclean wrt the pageserver state, but not leave a broken storage controller behind.
    env.storage_controller.consistency_check()


def test_storage_controller_s3_time_travel_recovery(
    neon_env_builder: NeonEnvBuilder,
    pg_bin: PgBin,
):
    """
    Test for S3 time travel
    """

    remote_storage_kind = s3_storage()
    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)

    # Mock S3 doesn't have versioning enabled by default, enable it
    # (also do it before there is any writes to the bucket)
    if remote_storage_kind == RemoteStorageKind.MOCK_S3:
        remote_storage = neon_env_builder.pageserver_remote_storage
        assert remote_storage, "remote storage not configured"
        enable_remote_storage_versioning(remote_storage)

    neon_env_builder.num_pageservers = 1

    env = neon_env_builder.init_start()
    virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True)

    tenant_id = TenantId.generate()
    env.storage_controller.tenant_create(
        tenant_id,
        shard_count=2,
        shard_stripe_size=8192,
        tenant_config=MANY_SMALL_LAYERS_TENANT_CONFIG,
    )

    # Check that the consistency check passes
    env.storage_controller.consistency_check()

    branch_name = "main"
    timeline_id = env.neon_cli.create_timeline(
        branch_name,
        tenant_id=tenant_id,
    )
    # Write some nontrivial amount of data into the endpoint and wait until it is uploaded
    with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
        run_pg_bench_small(pg_bin, endpoint.connstr())
        endpoint.safe_psql("CREATE TABLE created_foo(id integer);")
        # last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)

    # Give the data time to be uploaded
    time.sleep(4)

    # Detach the tenant
    virtual_ps_http.tenant_location_conf(
        tenant_id,
        {
            "mode": "Detached",
            "secondary_conf": None,
            "tenant_conf": {},
            "generation": None,
        },
    )

    time.sleep(4)
    ts_before_disaster = datetime.now(tz=timezone.utc).replace(tzinfo=None)
    time.sleep(4)

    # Simulate a "disaster": delete some random files from remote storage for one of the shards
    assert env.pageserver_remote_storage
    shard_id_for_list = "0002"
    objects: List[ObjectTypeDef] = list_prefix(
        env.pageserver_remote_storage,
        f"tenants/{tenant_id}-{shard_id_for_list}/timelines/{timeline_id}/",
    ).get("Contents", [])
    assert len(objects) > 1
    log.info(f"Found {len(objects)} objects in remote storage")
    should_delete = False
    for obj in objects:
        obj_key = obj["Key"]
        should_delete = not should_delete
        if not should_delete:
            log.info(f"Keeping key on remote storage: {obj_key}")
            continue
        log.info(f"Deleting key from remote storage: {obj_key}")
        remote_storage_delete_key(env.pageserver_remote_storage, obj_key)
        pass

    time.sleep(4)
    ts_after_disaster = datetime.now(tz=timezone.utc).replace(tzinfo=None)
    time.sleep(4)

    # Do time travel recovery
    virtual_ps_http.tenant_time_travel_remote_storage(
        tenant_id, ts_before_disaster, ts_after_disaster, shard_counts=[2]
    )
    time.sleep(4)

    # Attach the tenant again
    virtual_ps_http.tenant_location_conf(
        tenant_id,
        {
            "mode": "AttachedSingle",
            "secondary_conf": None,
            "tenant_conf": {},
            "generation": 100,
        },
    )

    with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
        endpoint.safe_psql("SELECT * FROM created_foo;")

    env.storage_controller.consistency_check()


def test_storage_controller_auth(neon_env_builder: NeonEnvBuilder):
    neon_env_builder.auth_enabled = True
    env = neon_env_builder.init_start()
    svc = env.storage_controller
    api = env.storage_controller_api

    tenant_id = TenantId.generate()
    body: Dict[str, Any] = {"new_tenant_id": str(tenant_id)}

    env.storage_controller.allowed_errors.append(".*Unauthorized.*")
    env.storage_controller.allowed_errors.append(".*Forbidden.*")

    # No token
    with pytest.raises(
        StorageControllerApiException,
        match="Unauthorized: missing authorization header",
    ):
        svc.request("POST", f"{env.storage_controller_api}/v1/tenant", json=body)

    # Token with incorrect scope
    with pytest.raises(
        StorageControllerApiException,
        match="Forbidden: JWT authentication error",
    ):
        svc.request(
            "POST", f"{api}/v1/tenant", json=body, headers=svc.headers(TokenScope.SAFEKEEPER_DATA)
        )

    # Token with correct scope
    svc.request(
        "POST", f"{api}/v1/tenant", json=body, headers=svc.headers(TokenScope.PAGE_SERVER_API)
    )

    # Token with admin scope should also be permitted
    svc.request("POST", f"{api}/v1/tenant", json=body, headers=svc.headers(TokenScope.ADMIN))

    # No token
    with pytest.raises(
        StorageControllerApiException,
        match="Unauthorized: missing authorization header",
    ):
        svc.request("GET", f"{api}/debug/v1/tenant")

    # Token with incorrect scope
    with pytest.raises(
        StorageControllerApiException,
        match="Forbidden: JWT authentication error",
    ):
        svc.request(
            "GET", f"{api}/debug/v1/tenant", headers=svc.headers(TokenScope.GENERATIONS_API)
        )

    # No token
    with pytest.raises(
        StorageControllerApiException,
        match="Unauthorized: missing authorization header",
    ):
        svc.request("POST", f"{api}/upcall/v1/re-attach")

    # Token with incorrect scope
    with pytest.raises(
        StorageControllerApiException,
        match="Forbidden: JWT authentication error",
    ):
        svc.request(
            "POST", f"{api}/upcall/v1/re-attach", headers=svc.headers(TokenScope.PAGE_SERVER_API)
        )


def test_storage_controller_tenant_conf(neon_env_builder: NeonEnvBuilder):
    """
    Validate the pageserver-compatible API endpoints for setting and getting tenant conf, without
    supplying the whole LocationConf.
    """

    env = neon_env_builder.init_start()
    tenant_id = env.initial_tenant

    http = env.storage_controller.pageserver_api()

    default_value = "7days"
    new_value = "1h"
    http.set_tenant_config(tenant_id, {"pitr_interval": new_value})

    # Ensure the change landed on the storage controller
    readback_controller = http.tenant_config(tenant_id)
    assert readback_controller.effective_config["pitr_interval"] == new_value
    assert readback_controller.tenant_specific_overrides["pitr_interval"] == new_value

    # Ensure the change made it down to the pageserver
    readback_ps = env.pageservers[0].http_client().tenant_config(tenant_id)
    assert readback_ps.effective_config["pitr_interval"] == new_value
    assert readback_ps.tenant_specific_overrides["pitr_interval"] == new_value

    # Omitting a value clears it.  This looks different in storage controller
    # vs. pageserver API calls, because pageserver has defaults.
    http.set_tenant_config(tenant_id, {})
    readback_controller = http.tenant_config(tenant_id)
    assert readback_controller.effective_config["pitr_interval"] is None
    assert readback_controller.tenant_specific_overrides["pitr_interval"] is None
    readback_ps = env.pageservers[0].http_client().tenant_config(tenant_id)
    assert readback_ps.effective_config["pitr_interval"] == default_value
    assert "pitr_interval" not in readback_ps.tenant_specific_overrides

    env.storage_controller.consistency_check()


class Failure:
    pageserver_id: int

    def apply(self, env: NeonEnv):
        raise NotImplementedError()

    def clear(self, env: NeonEnv):
        raise NotImplementedError()


class NodeStop(Failure):
    def __init__(self, pageserver_id, immediate):
        self.pageserver_id = pageserver_id
        self.immediate = immediate

    def apply(self, env: NeonEnv):
        pageserver = env.get_pageserver(self.pageserver_id)
        pageserver.stop(immediate=self.immediate)

    def clear(self, env: NeonEnv):
        pageserver = env.get_pageserver(self.pageserver_id)
        pageserver.start()


class PageserverFailpoint(Failure):
    def __init__(self, failpoint, pageserver_id):
        self.failpoint = failpoint
        self.pageserver_id = pageserver_id

    def apply(self, env: NeonEnv):
        pageserver = env.get_pageserver(self.pageserver_id)
        pageserver.http_client().configure_failpoints((self.failpoint, "return(1)"))

    def clear(self, env: NeonEnv):
        pageserver = env.get_pageserver(self.pageserver_id)
        pageserver.http_client().configure_failpoints((self.failpoint, "off"))


def build_node_to_tenants_map(env: NeonEnv) -> dict[int, list[TenantId]]:
    tenants = env.storage_controller.tenant_list()

    node_to_tenants: dict[int, list[TenantId]] = {}
    for t in tenants:
        for node_id, loc_state in t["observed"]["locations"].items():
            if (
                loc_state is not None
                and "conf" in loc_state
                and loc_state["conf"] is not None
                and loc_state["conf"]["mode"] == "AttachedSingle"
            ):
                crnt = node_to_tenants.get(int(node_id), [])
                crnt.append(TenantId(t["tenant_shard_id"]))
                node_to_tenants[int(node_id)] = crnt

    return node_to_tenants


@pytest.mark.parametrize(
    "failure",
    [
        NodeStop(pageserver_id=1, immediate=False),
        NodeStop(pageserver_id=1, immediate=True),
        PageserverFailpoint(pageserver_id=1, failpoint="get-utilization-http-handler"),
    ],
)
def test_storage_controller_heartbeats(
    neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, failure: Failure
):
    neon_env_builder.num_pageservers = 2
    env = neon_env_builder.init_configs()
    env.start()

    # Default log allow list permits connection errors, but this test will use error responses on
    # the utilization endpoint.
    env.storage_controller.allowed_errors.append(
        ".*Call to node.*management API.*failed.*failpoint.*"
    )

    # Initially we have two online pageservers
    nodes = env.storage_controller.node_list()
    assert len(nodes) == 2
    assert all([n["availability"] == "Active" for n in nodes])

    # ... then we create two tenants and write some data into them
    def create_tenant(tid: TenantId):
        env.storage_controller.tenant_create(tid)

        branch_name = "main"
        env.neon_cli.create_timeline(
            branch_name,
            tenant_id=tid,
        )

        with env.endpoints.create_start("main", tenant_id=tid) as endpoint:
            run_pg_bench_small(pg_bin, endpoint.connstr())
            endpoint.safe_psql("CREATE TABLE created_foo(id integer);")

    tenant_ids = [TenantId.generate(), TenantId.generate()]
    for tid in tenant_ids:
        create_tenant(tid)

    # ... expecting that each tenant will be placed on a different node
    def tenants_placed():
        node_to_tenants = build_node_to_tenants_map(env)
        log.info(f"{node_to_tenants=}")

        # Check that all the tenants have been attached
        assert sum((len(ts) for ts in node_to_tenants.values())) == len(tenant_ids)
        # Check that each node got one tenant
        assert all((len(ts) == 1 for ts in node_to_tenants.values()))

    wait_until(10, 1, tenants_placed)

    # ... then we apply the failure
    offline_node_id = failure.pageserver_id
    online_node_id = (set(range(1, len(env.pageservers) + 1)) - {offline_node_id}).pop()
    env.get_pageserver(offline_node_id).allowed_errors.append(
        # In the case of the failpoint failure, the impacted pageserver
        # still believes it has the tenant attached since location
        # config calls into it will fail due to being marked offline.
        ".*Dropped remote consistent LSN updates.*",
    )

    failure.apply(env)

    # ... expecting the heartbeats to mark it offline
    def node_offline():
        nodes = env.storage_controller.node_list()
        log.info(f"{nodes=}")
        target = next(n for n in nodes if n["id"] == offline_node_id)
        assert target["availability"] == "Offline"

    # A node is considered offline if the last successful heartbeat
    # was more than 10 seconds ago (hardcoded in the storage controller).
    wait_until(20, 1, node_offline)

    # .. expecting the tenant on the offline node to be migrated
    def tenant_migrated():
        node_to_tenants = build_node_to_tenants_map(env)
        log.info(f"{node_to_tenants=}")
        assert set(node_to_tenants[online_node_id]) == set(tenant_ids)

    wait_until(10, 1, tenant_migrated)

    # ... then we clear the failure
    failure.clear(env)

    # ... expecting the offline node to become active again
    def node_online():
        nodes = env.storage_controller.node_list()
        target = next(n for n in nodes if n["id"] == offline_node_id)
        assert target["availability"] == "Active"

    wait_until(10, 1, node_online)

    time.sleep(5)

    # ... then we create a new tenant
    tid = TenantId.generate()
    env.storage_controller.tenant_create(tid)

    # ... expecting it to be placed on the node that just came back online
    tenants = env.storage_controller.tenant_list()
    newest_tenant = next(t for t in tenants if t["tenant_shard_id"] == str(tid))
    locations = list(newest_tenant["observed"]["locations"].keys())
    locations = [int(node_id) for node_id in locations]
    assert locations == [offline_node_id]

    # ... expecting the storage controller to reach a consistent state
    def storage_controller_consistent():
        env.storage_controller.consistency_check()

    wait_until(10, 1, storage_controller_consistent)


def test_storage_controller_re_attach(neon_env_builder: NeonEnvBuilder):
    """
    Exercise the behavior of the /re-attach endpoint on pageserver startup when
    pageservers have a mixture of attached and secondary locations
    """

    neon_env_builder.num_pageservers = 2
    env = neon_env_builder.init_configs()
    env.start()

    # We'll have two tenants.
    tenant_a = TenantId.generate()
    env.neon_cli.create_tenant(tenant_a, placement_policy='{"Attached":1}')
    tenant_b = TenantId.generate()
    env.neon_cli.create_tenant(tenant_b, placement_policy='{"Attached":1}')

    # Each pageserver will have one attached and one secondary location
    env.storage_controller.tenant_shard_migrate(
        TenantShardId(tenant_a, 0, 0), env.pageservers[0].id
    )
    env.storage_controller.tenant_shard_migrate(
        TenantShardId(tenant_b, 0, 0), env.pageservers[1].id
    )

    # Hard-fail a pageserver
    victim_ps = env.pageservers[1]
    survivor_ps = env.pageservers[0]
    victim_ps.stop(immediate=True)

    # Heatbeater will notice it's offline, and consequently attachments move to the other pageserver
    def failed_over():
        locations = survivor_ps.http_client().tenant_list_locations()["tenant_shards"]
        log.info(f"locations: {locations}")
        assert len(locations) == 2
        assert all(loc[1]["mode"] == "AttachedSingle" for loc in locations)

    # We could pre-empty this by configuring the node to Offline, but it's preferable to test
    # the realistic path we would take when a node restarts uncleanly.
    # The delay here will be ~NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL in neon_local
    wait_until(30, 1, failed_over)

    reconciles_before_restart = env.storage_controller.get_metric_value(
        "storage_controller_reconcile_complete_total", filter={"status": "ok"}
    )

    # Restart the failed pageserver
    victim_ps.start()

    # We expect that the re-attach call correctly tipped off the pageserver that its locations
    # are all secondaries now.
    locations = victim_ps.http_client().tenant_list_locations()["tenant_shards"]
    assert len(locations) == 2
    assert all(loc[1]["mode"] == "Secondary" for loc in locations)

    # We expect that this situation resulted from the re_attach call, and not any explicit
    # Reconciler runs: assert that the reconciliation count has not gone up since we restarted.
    reconciles_after_restart = env.storage_controller.get_metric_value(
        "storage_controller_reconcile_complete_total", filter={"status": "ok"}
    )
    assert reconciles_after_restart == reconciles_before_restart


def test_storage_controller_shard_scheduling_policy(neon_env_builder: NeonEnvBuilder):
    """
    Check that emergency hooks for disabling rogue tenants' reconcilers work as expected.
    """
    env = neon_env_builder.init_configs()
    env.start()

    tenant_id = TenantId.generate()

    env.storage_controller.allowed_errors.extend(
        [
            # We will intentionally cause reconcile errors
            ".*Reconcile error.*",
            # Message from using a scheduling policy
            ".*Scheduling is disabled by policy.*",
            ".*Skipping reconcile for policy.*",
            # Message from a node being offline
            ".*Call to node .* management API .* failed",
        ]
    )

    # Stop pageserver so that reconcile cannot complete
    env.pageserver.stop()

    env.storage_controller.tenant_create(tenant_id, placement_policy="Detached")

    # Try attaching it: we should see reconciles failing
    env.storage_controller.tenant_policy_update(
        tenant_id,
        {
            "placement": {"Attached": 0},
        },
    )

    def reconcile_errors() -> int:
        return int(
            env.storage_controller.get_metric_value(
                "storage_controller_reconcile_complete_total", filter={"status": "error"}
            )
            or 0
        )

    def reconcile_ok() -> int:
        return int(
            env.storage_controller.get_metric_value(
                "storage_controller_reconcile_complete_total", filter={"status": "ok"}
            )
            or 0
        )

    def assert_errors_gt(n) -> int:
        e = reconcile_errors()
        assert e > n
        return e

    errs = wait_until(10, 1, lambda: assert_errors_gt(0))

    # Try reconciling again, it should fail again
    with pytest.raises(StorageControllerApiException):
        env.storage_controller.reconcile_all()
    errs = wait_until(10, 1, lambda: assert_errors_gt(errs))

    # Configure the tenant to disable reconciles
    env.storage_controller.tenant_policy_update(
        tenant_id,
        {
            "scheduling": "Stop",
        },
    )

    # Try reconciling again, it should not cause an error (silently skip)
    env.storage_controller.reconcile_all()
    assert reconcile_errors() == errs

    # Start the pageserver and re-enable reconciles
    env.pageserver.start()
    env.storage_controller.tenant_policy_update(
        tenant_id,
        {
            "scheduling": "Active",
        },
    )

    def assert_ok_gt(n) -> int:
        o = reconcile_ok()
        assert o > n
        return o

    # We should see a successful reconciliation
    wait_until(10, 1, lambda: assert_ok_gt(0))

    # And indeed the tenant should be attached
    assert len(env.pageserver.http_client().tenant_list_locations()["tenant_shards"]) == 1


def test_storcon_cli(neon_env_builder: NeonEnvBuilder):
    """
    The storage controller command line interface (storcon-cli) is an internal tool.  Most tests
    just use the APIs directly: this test exercises some basics of the CLI as a regression test
    that the client remains usable as the server evolves.
    """
    output_dir = neon_env_builder.test_output_dir
    shard_count = 4
    env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count)
    base_args = [env.neon_binpath / "storcon_cli", "--api", env.storage_controller_api]

    def storcon_cli(args):
        """
        CLI wrapper: returns stdout split into a list of non-empty strings
        """
        (output_path, stdout, status_code) = subprocess_capture(
            output_dir,
            [str(s) for s in base_args + args],
            echo_stderr=True,
            echo_stdout=True,
            env={},
            check=False,
            capture_stdout=True,
            timeout=10,
        )
        if status_code:
            log.warning(f"Command {args} failed")
            log.warning(f"Output at: {output_path}")

            raise RuntimeError("CLI failure (check logs for stderr)")

        assert stdout is not None
        return [line.strip() for line in stdout.split("\n") if line.strip()]

    # List nodes
    node_lines = storcon_cli(["nodes"])
    # Table header, footer, and one line of data
    assert len(node_lines) == 5
    assert "localhost" in node_lines[3]

    # Pause scheduling onto a node
    storcon_cli(["node-configure", "--node-id", "1", "--scheduling", "pause"])
    assert "Pause" in storcon_cli(["nodes"])[3]

    # Make a node offline
    storcon_cli(["node-configure", "--node-id", "1", "--availability", "offline"])
    assert "Offline" in storcon_cli(["nodes"])[3]

    # List tenants
    tenant_lines = storcon_cli(["tenants"])
    assert len(tenant_lines) == 5
    assert str(env.initial_tenant) in tenant_lines[3]

    # Setting scheduling policies intentionally result in warnings, they're for rare use.
    env.storage_controller.allowed_errors.extend(
        [".*Skipping reconcile for policy.*", ".*Scheduling is disabled by policy.*"]
    )

    # Describe a tenant
    tenant_lines = storcon_cli(["tenant-describe", "--tenant-id", str(env.initial_tenant)])
    assert len(tenant_lines) == 3 + shard_count * 2
    assert str(env.initial_tenant) in tenant_lines[3]

    # Pause changes on a tenant
    storcon_cli(["tenant-policy", "--tenant-id", str(env.initial_tenant), "--scheduling", "stop"])
    assert "Stop" in storcon_cli(["tenants"])[3]

    # Change a tenant's placement
    storcon_cli(
        ["tenant-policy", "--tenant-id", str(env.initial_tenant), "--placement", "secondary"]
    )
    assert "Secondary" in storcon_cli(["tenants"])[3]

    # Modify a tenant's config
    storcon_cli(
        [
            "tenant-config",
            "--tenant-id",
            str(env.initial_tenant),
            "--config",
            json.dumps({"pitr_interval": "1m"}),
        ]
    )

    # Quiesce any background reconciliation before doing consistency check
    env.storage_controller.reconcile_until_idle(timeout_secs=10)
    env.storage_controller.consistency_check()