import concurrent.futures
import random
import time

import pytest
from collections import defaultdict
from fixtures.common_types import TenantId, TenantShardId, TimelineId
from fixtures.compute_reconfigure import ComputeReconfigure
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
    NeonEnvBuilder, StorageControllerApiException, NeonEnv
)
from fixtures.pageserver.http import PageserverHttpClient
from fixtures.pg_version import PgVersion
from typing import Dict

def get_consistent_node_shard_counts(env: NeonEnv, total_shards) -> "defaultdict[int, int]":
    """
    Count attached shards per node, using only shards whose intent matches reality.

    The storage controller's tenant list is scanned once. For every shard we record
    the node it is observed to be attached to and the node the controller intends it
    to be attached to. A shard is counted only when both are known and equal.

    :param env: test environment; only `env.storage_controller.tenant_list()` is used.
    :param total_shards: number of shards expected to have matching intent and
        observed attachment.
    :return: a `defaultdict(int)` mapping node id to the number of shards attached
        there. Keys are integer node ids (they compare equal to `int(node_id)` of the
        observed location), so looking up a string key never finds an entry and
        merely yields the default of 0.
    :raises AssertionError: if the number of consistently placed shards differs from
        `total_shards`.
    """
    # Location modes that count as "this node has the shard attached".
    attached_modes = {"AttachedSingle", "AttachedMulti", "AttachedStale"}

    intent = {}
    observed = {}

    # Full placement picture per shard; only used for the debug log line below.
    tenant_placement: defaultdict[str, Dict] = defaultdict(
        lambda: {
            "observed": {"attached": None, "secondary": []},
            "intent": {"attached": None, "secondary": []},
        }
    )

    for t in env.storage_controller.tenant_list():
        tenant_shard_id = t["tenant_shard_id"]
        placement = tenant_placement[tenant_shard_id]

        for node_id, loc_state in t["observed"]["locations"].items():
            # Locations without a known configuration tell us nothing about placement.
            if loc_state is None or "conf" not in loc_state or loc_state["conf"] is None:
                continue

            mode = loc_state["conf"]["mode"]
            if mode in attached_modes:
                # Observed node ids are dictionary keys (strings): normalise to int.
                observed[tenant_shard_id] = int(node_id)
                placement["observed"]["attached"] = int(node_id)
            elif mode == "Secondary":
                placement["observed"]["secondary"].append(int(node_id))

        if "attached" in t["intent"]:
            intent[tenant_shard_id] = t["intent"]["attached"]
            placement["intent"]["attached"] = t["intent"]["attached"]

        if "secondary" in t["intent"]:
            placement["intent"]["secondary"] += t["intent"]["secondary"]

    log.info(f"{tenant_placement=}")

    # Keep only the shards for which the intended and observed attachment agree.
    matching = {
        tid: intent[tid] for tid in observed if tid in intent and intent[tid] == observed[tid]
    }
    assert len(matching) == total_shards

    attached_per_node: defaultdict[int, int] = defaultdict(int)
    for node_id in matching.values():
        attached_per_node[node_id] += 1

    return attached_per_node

def assert_consistent_balanced_attachments(env: NeonEnv, total_shards):
    """
    Assert that attachments are consistent and evenly spread over the pageservers.

    Consistency (intent == observed for `total_shards` shards) is asserted by
    `get_consistent_node_shard_counts`. The balance check is delegated to
    `assert_shard_counts_balanced` rather than duplicated here.

    The per-node counts only contain nodes that hold at least one shard, so a plain
    min/max comparison cannot see a node that ended up empty. The shared helper
    additionally requires an entry for every pageserver, which closes that gap.

    :param env: test environment providing the storage controller and pageservers.
    :param total_shards: number of shards expected to be consistently attached.
    :raises AssertionError: if placement is inconsistent or unbalanced.
    """
    attached_per_node = get_consistent_node_shard_counts(env, total_shards)
    assert_shard_counts_balanced(env, attached_per_node, total_shards)

def retryable_node_operation(op, ps_id, max_attempts, backoff):
    """
    Call `op(ps_id)`, retrying when the storage controller API rejects the call.

    :param op: callable taking the pageserver id as its only argument.
    :param ps_id: pageserver id handed to `op`.
    :param max_attempts: how many times `op` may be tried; if it is not positive,
        `op` is never called and the function returns immediately.
    :param backoff: seconds to sleep between a failed attempt and the next one.
    :raises StorageControllerApiException: the error of the final attempt, once all
        attempts are used up. Any other exception from `op` propagates at once.
    """
    attempts_left = max_attempts
    while attempts_left > 0:
        try:
            op(ps_id)
        except StorageControllerApiException as err:
            attempts_left -= 1
            log.info(f"Operation failed ({attempts_left} attempts left): {err}")

            # Out of attempts: surface the last failure to the caller.
            if attempts_left == 0:
                raise err

            time.sleep(backoff)
        else:
            # First successful call ends the retry loop.
            return

def poll_node_status(env, node_id, desired_scheduling_policy, max_attempts, backoff):
    """
    Poll the storage controller until a node reports the desired scheduling policy.

    Both a status call that fails and a status call that reports a different policy
    consume one attempt; `backoff` seconds are slept between attempts.

    :param env: test environment; `env.storage_controller.node_status` is queried.
    :param node_id: id of the node to poll.
    :param desired_scheduling_policy: value expected in the status' "scheduling" field.
    :param max_attempts: number of polls allowed; if it is not positive, nothing is
        polled and the function returns.
    :param backoff: seconds to sleep between polls.
    :raises AssertionError: if the last allowed poll still reports another policy.
    :raises StorageControllerApiException: if the last allowed poll fails.
    """
    log.info(f"Polling {node_id} for {desired_scheduling_policy} scheduling policy")

    while max_attempts > 0:
        try:
            policy = env.storage_controller.node_status(node_id)["scheduling"]
        except StorageControllerApiException as e:
            # The status call itself failed: treat it like any other unsuccessful poll.
            max_attempts -= 1
            log.info(f"Status call failed ({max_attempts} retries left): {e}")

            if max_attempts == 0:
                raise e

            time.sleep(backoff)
            continue

        if policy == desired_scheduling_policy:
            return

        # The node answered, but has not reached the desired policy yet.
        max_attempts -= 1
        log.info(f"Status call returned {policy=} ({max_attempts} attempts left)")

        if max_attempts == 0:
            raise AssertionError(
                f"Status for {node_id=} did not reach {desired_scheduling_policy=}"
            )

        time.sleep(backoff)

def assert_shard_counts_balanced(env: NeonEnv, shard_counts, total_shards):
    """
    Assert that `shard_counts` covers every pageserver and is evenly spread.

    :param env: test environment; only the number of `env.pageservers` is used.
    :param shard_counts: mapping of node id to number of attached shards.
    :param total_shards: total shard count, used to scale the allowed imbalance.
    :raises AssertionError: if a pageserver has no entry, or if the gap between the
        busiest and the emptiest node exceeds 5% of `total_shards` (rounded down).
    """
    # One entry per pageserver means that every node holds some attached shards.
    assert len(shard_counts) == len(env.pageservers)

    per_node = shard_counts.values()
    fewest = min(per_node)
    most = max(per_node)

    # Tolerate a small imbalance so that the check is not flaky.
    allowed_spread = int(total_shards * (5 / 100))
    assert most - fewest <= allowed_spread

@pytest.mark.timeout(3600)  # super long running test: should go down as we optimize
def test_storage_controller_many_tenants(
    neon_env_builder: NeonEnvBuilder, compute_reconfigure_listener: ComputeReconfigure
):
    """
    Check that we cope well with a not-totally-trivial number of tenants.

    This is checking for:
    - Obvious concurrency bugs from issuing many tenant creations/modifications
      concurrently.
    - Obvious scaling bugs like O(N^2) scaling that would be so slow that even
      a basic test starts failing from slowness.

    This is _not_ a comprehensive scale test: just a basic sanity check that
    we don't fall over for a thousand shards.

    Phases:
    1. Create many tenants concurrently through the storage controller API.
    2. Fire a seeded-random mixture of timeline creations, shard migrations and
       passthrough reads at a subset of them.
    3. Time an idle `reconcile_all` call and a storage controller restart.
    4. Do a rolling restart of the pageservers using drain and fill, checking
       placement after each step.
    """

    neon_env_builder.num_pageservers = 5
    neon_env_builder.storage_controller_config = {
        # Default neon_local uses a small timeout: use a longer one to tolerate longer pageserver restarts.
        # TODO: tune this down as restarts get faster (https://github.com/neondatabase/neon/pull/7553), to
        # guard against regressions in restart time.
        "max_unavailable": "300s"
    }
    neon_env_builder.control_plane_compute_hook_api = (
        compute_reconfigure_listener.control_plane_compute_hook_api
    )

    # A small sleep on each call into the notify hook, to simulate the latency of doing a database write
    compute_reconfigure_listener.register_on_notify(lambda body: time.sleep(0.01))

    env = neon_env_builder.init_start()

    # We will intentionally stress reconciler concurrency, which triggers a warning when lots
    # of shards are hitting the delayed path.
    env.storage_controller.allowed_errors.append(".*Many shards are waiting to reconcile")

    # TODO: explain
    env.storage_controller.allowed_errors.append(".*Scheduling error when draining pageserver.*")

    for ps in env.pageservers:
        # This can happen because when we do a loop over all pageservers and mark them offline/active,
        # reconcilers might get cancelled, and the next reconcile can follow a not-so-elegant path of
        # bumping generation before other attachments are detached.
        #
        # We could clean this up by making reconcilers respect the .observed of their predecessor, if
        # we spawn with a wait for the predecessor.
        ps.allowed_errors.append(".*Dropped remote consistent LSN updates.*")

        # Storage controller is allowed to drop pageserver requests when the cancellation token
        # for a Reconciler fires.
        ps.allowed_errors.append(".*request was dropped before completing.*")

    # Total tenants
    tenant_count = 4000

    # Shards per tenant
    shard_count = 2
    stripe_size = 1024

    # NOTE(review): the `+ 1` presumably accounts for a single-shard initial tenant created
    # by `init_start()` on top of the tenants created below -- confirm against the fixture.
    total_shards = tenant_count * shard_count + 1

    tenants = {TenantId.generate() for _i in range(0, tenant_count)}

    virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True)

    def check_memory():
        # Shards should be _cheap_ in memory, as we will have very many of them
        expect_memory_per_shard = 128 * 1024

        rss = env.storage_controller.get_metric_value("process_resident_memory_bytes")
        assert rss is not None
        log.info(f"Resident memory: {rss} ({ rss / (shard_count * tenant_count)} per shard)")
        assert rss < expect_memory_per_shard * shard_count * tenant_count

    # We use a fixed seed to make the test somewhat reproducible: we want a randomly
    # chosen order in the sense that it's arbitrary, but not in the sense that it should change every run.
    rng = random.Random(1234)

    # Issue more concurrent operations than the storage controller's reconciler concurrency semaphore
    # permits, to ensure that we are stressing that.
    api_concurrency = 135

    # We will create tenants directly via API, not via neon_local, to avoid any false
    # serialization of operations in neon_local (it e.g. loads/saves a config file on each call)
    with concurrent.futures.ThreadPoolExecutor(max_workers=api_concurrency) as executor:
        futs = []
        t1 = time.time()
        for tenant_id in tenants:
            f = executor.submit(
                env.storage_controller.tenant_create,
                tenant_id,
                shard_count,
                stripe_size,
                # Upload heatmaps fast, so that secondary downloads happen promptly, enabling
                # the controller's optimization migrations to proceed promptly.
                # TODO: update other test with this and use reconcile_till_idle
                tenant_config={"heatmap_period": "10s"},
                placement_policy={"Attached": 1},
            )
            futs.append(f)

        # Wait for creations to finish
        for f in futs:
            f.result()

        # Sample the clock once, so that the duration and the rate in the log line agree.
        elapsed = time.time() - t1
        log.info(f"Created {len(tenants)} tenants in {elapsed}, {len(tenants) / elapsed}/s")

        run_ops = api_concurrency * 4
        assert run_ops < len(tenants)
        op_tenants = list(tenants)[0:run_ops]

        # Generate a mixture of operations and dispatch them all concurrently
        futs = []
        for tenant_id in op_tenants:
            op = rng.choice([0, 1, 2])
            if op == 0:
                # A fan-out write operation to all shards in a tenant (timeline creation)
                f = executor.submit(
                    virtual_ps_http.timeline_create,
                    PgVersion.NOT_SET,
                    tenant_id,
                    TimelineId.generate(),
                )
            elif op == 1:
                # A reconciler operation: migrate a shard.
                shard_number = rng.randint(0, shard_count - 1)
                tenant_shard_id = TenantShardId(tenant_id, shard_number, shard_count)
                dest_ps_id = rng.choice([ps.id for ps in env.pageservers])
                f = executor.submit(
                    env.storage_controller.tenant_shard_migrate, tenant_shard_id, dest_ps_id
                )
            else:
                # op == 2: a passthrough read to shard zero
                f = executor.submit(virtual_ps_http.tenant_status, tenant_id)

            futs.append(f)

        # Wait for mixed ops to finish
        for f in futs:
            f.result()

    # Consistency check is safe here: all the previous operations waited for reconcile before completing
    env.storage_controller.consistency_check()
    check_memory()

    # This loop waits for reconcile_all to indicate no pending work, and then calls it once more to time
    # how long the call takes when idle: this iterates over shards while doing no I/O and should be reliably fast: if
    # it isn't, that's a sign that we have made some algorithmic mistake (e.g. O(N**2) scheduling)
    #
    # We do not require that the system is quiescent already here, although at present in this point in the test
    # that may be the case.
    while True:
        t1 = time.time()
        reconcilers = env.storage_controller.reconcile_all()
        if reconcilers == 0:
            # Time how long a no-op background reconcile takes: this measures how long it takes to
            # loop over all the shards looking for work to do.
            runtime = time.time() - t1
            log.info(f"No-op call to reconcile_all took {runtime}s")
            assert runtime < 1
            break

    # Restart the storage controller
    env.storage_controller.stop()
    env.storage_controller.start()

    # See how long the controller takes to pass its readiness check.  This should be fast because
    # all the nodes are online: offline pageservers are the only thing that's allowed to delay
    # startup.
    readiness_period = env.storage_controller.wait_until_ready()
    assert readiness_period < 5

    # Consistency check is safe here: the storage controller's restart should not have caused any reconcilers
    # to run, as it was in a stable state before restart.  If it did, that's a bug.
    env.storage_controller.consistency_check()
    check_memory()

    shard_counts = get_consistent_node_shard_counts(env, total_shards)
    log.info(f"Shard counts before rolling restart: {shard_counts}")

    # Restart pageservers: this exercises the /re-attach API
    for ps in env.pageservers:
        retryable_node_operation(
            env.storage_controller.node_drain, ps.id, max_attempts=3, backoff=2
        )
        poll_node_status(env, ps.id, "PauseForRestart", max_attempts=24, backoff=5)

        shard_counts = get_consistent_node_shard_counts(env, total_shards)
        log.info(f"Shard counts after draining node {ps.id}: {shard_counts}")
        # Assert that we've drained the node.  The counts are keyed by integer node id: looking
        # up a string key would always produce the default of 0 and make this check vacuous.
        assert shard_counts.get(int(ps.id), 0) == 0
        # Assert that those shards actually went somewhere
        assert sum(shard_counts.values()) == total_shards

        ps.restart()
        poll_node_status(env, ps.id, "Active", max_attempts=24, backoff=1)

        retryable_node_operation(
            env.storage_controller.node_fill, ps.id, max_attempts=3, backoff=2
        )
        poll_node_status(env, ps.id, "Active", max_attempts=24, backoff=5)

        shard_counts = get_consistent_node_shard_counts(env, total_shards)
        log.info(f"Shard counts after filling node {ps.id}: {shard_counts}")

        assert_consistent_balanced_attachments(env, total_shards)

        env.storage_controller.reconcile_until_idle()
        env.storage_controller.consistency_check()

    # Consistency check is safe here: restarting pageservers should not have caused any Reconcilers to spawn,
    # as they were not offline long enough to trigger any scheduling changes.
    env.storage_controller.consistency_check()
    check_memory()

    # Stop the storage controller before tearing down fixtures, because it otherwise might log
    # errors trying to call our `ComputeReconfigure`.
    env.storage_controller.stop()