eviction: regression test + distinguish layer write from map insert (#4005)

This patch adds a regression test for the threshold-based layer
eviction.
The test asserts the basic invariant that, if left alone, the residence
statuses will stabilize, with some layers resident and some layers
evicted.
This covers both the last-access-time-threshold-based eviction itself and the
"imitate access" hacks that we added recently.

The aggressive `period` and `threshold` values revealed a subtle bug
which is also fixed in this patch.
The symptom was that, without the Rust changes of this patch, there
would be occasional test failures due to `WARN... unexpectedly
downloading` log messages.
These log messages were caused by the "imitate access" calls of the
eviction task.
But the whole point of the "imitate access" hack was to prevent
eviction of the layers that we access there.
After some digging, I found the root cause, which is the following race
condition:

1. Compaction: write out an L1 layer from several L0 layers. This records a
   `LayerCreate` residence event with the current timestamp.
2. Eviction: the "imitate access" pass runs the logical size calculation. It
   accesses the L0 layers, because the new L1 layer is not yet in the layer map.
3. Compaction: grab the layer map lock, add the new L1 to the layer map, remove
   the L0s, release the layer map lock.
4. Eviction: observes the new L1 layer, whose only activity timestamp is the
   `LayerCreate` event.

The L1 layer had no chance of being accessed until after (3).
So, if enough time passes between (1) and (3), then (4) will observe a
layer with `now - last_activity > threshold` and evict it.
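
To make the timing concrete, here is a minimal, self-contained model of that race. The struct, the numbers, and the eviction check are illustrative stand-ins, not the pageserver's actual code; the point is only that a layer whose sole activity timestamp is its creation event looks idle for the entire gap between (1) and (3).

```rust
use std::time::{Duration, Instant};

// Toy stand-in: the layer's only activity timestamp is its creation event.
struct Layer {
    last_activity: Instant,
}

// Toy stand-in for the threshold check the eviction task performs.
fn wants_to_evict(layer: &Layer, now: Instant, threshold: Duration) -> bool {
    now.duration_since(layer.last_activity) > threshold
}

fn main() {
    let threshold = Duration::from_secs(5);

    // (1) Compaction writes out the L1 and records `LayerCreate` with the
    //     current timestamp.
    let l1 = Layer { last_activity: Instant::now() };

    // (2)/(3) The L1 only becomes visible in the layer map once compaction
    //         re-acquires the layer map lock; simulate a gap longer than the
    //         threshold between creation and map insertion.
    let observed_at = l1.last_activity + Duration::from_secs(7);

    // (4) Eviction runs right after the insert and sees a layer whose only
    //     timestamp is already older than the threshold, even though nothing
    //     could have accessed the layer before the insert.
    assert!(wants_to_evict(&l1, observed_at, threshold));
}
```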

The fix is to require the first `record_residence_event` to happen while
we already hold the layer map lock.
The API requires a reference to a `BatchedUpdates` as a witness that we are
holding the layer map lock.
That is not foolproof; e.g., a new call site for `insert_historic` could
simply forget to record the residence event.
It would be nice to prevent this at the type level.
In the meantime, we have a rate-limited log message to warn us if such an
implementation error sneaks in in the future.
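
For illustration, here is a minimal, self-contained sketch of the witness idea. `Layer`, `LayerMap`, and `BatchedUpdates` are toy stand-ins that merely mirror the names above; the real signatures and bodies in the pageserver differ. The only point is that `record_residence_event` demands a `&BatchedUpdates`, which can only exist while a batched layer map update, and hence the lock, is in progress.

```rust
use std::cell::Cell;
use std::time::Instant;

struct Layer {
    last_activity: Cell<Option<Instant>>,
}

impl Layer {
    // Requiring a `&BatchedUpdates` witness means this can only be called by
    // code that is inside a batched layer map update, i.e., while the layer
    // map lock is held.
    fn record_residence_event(&self, _witness: &BatchedUpdates<'_>) {
        self.last_activity.set(Some(Instant::now()));
    }
}

struct LayerMap {
    layers: Vec<Layer>,
}

impl LayerMap {
    fn batch_update(&mut self) -> BatchedUpdates<'_> {
        BatchedUpdates { map: self }
    }
}

// Exclusively borrows the map for its whole lifetime: holding a
// `BatchedUpdates` stands in for "the layer map lock is held".
struct BatchedUpdates<'a> {
    map: &'a mut LayerMap,
}

impl<'a> BatchedUpdates<'a> {
    fn insert_historic(&mut self, layer: Layer) {
        self.map.layers.push(layer);
    }
}

fn main() {
    let mut map = LayerMap { layers: Vec::new() };
    let new_l1 = Layer { last_activity: Cell::new(None) };

    // Compaction's insertion path: the residence event is recorded while the
    // batched update (and thus the lock) is still held, closing the window
    // between "layer created" and "layer visible in the map".
    let mut updates = map.batch_update();
    new_l1.record_residence_event(&updates);
    updates.insert_historic(new_l1);

    // Note: a call site that forgets `record_residence_event` still compiles,
    // which is the gap the new rate-limited warning is meant to catch.
    assert!(map.layers[0].last_activity.get().is_some());
}
```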

fixes https://github.com/neondatabase/neon/issues/3593
fixes https://github.com/neondatabase/neon/issues/3942

---------

Co-authored-by: Joonas Koivunen <joonas@neon.tech>
Author: Christian Schwarz
Date: 2023-05-04 16:16:48 +02:00
Committed by: GitHub
Parent: b5d64a1e32
Commit: 7dd9553bbb
11 changed files with 415 additions and 64 deletions


@@ -292,6 +292,12 @@ def port_distributor(worker_base_port: int) -> PortDistributor:
    return PortDistributor(base_port=worker_base_port, port_number=WORKER_PORT_NUM)


@pytest.fixture(scope="session")
def httpserver_listen_address(port_distributor: PortDistributor):
    port = port_distributor.get_port()
    return ("localhost", port)


@pytest.fixture(scope="function")
def default_broker(
    port_distributor: PortDistributor,


@@ -24,13 +24,6 @@ from pytest_httpserver import HTTPServer
from werkzeug.wrappers.request import Request
from werkzeug.wrappers.response import Response


@pytest.fixture(scope="session")
def httpserver_listen_address(port_distributor: PortDistributor):
    port = port_distributor.get_port()
    return ("localhost", port)


# ==============================================================================
# Storage metrics tests
# ==============================================================================


@@ -0,0 +1,179 @@
import time
from dataclasses import dataclass
from typing import List, Set, Tuple

from fixtures.log_helper import log
from fixtures.neon_fixtures import (
    NeonEnvBuilder,
    PgBin,
    RemoteStorageKind,
    last_flush_lsn_upload,
)
from fixtures.pageserver.http import LayerMapInfo
from fixtures.types import TimelineId
from pytest_httpserver import HTTPServer


# NB: basic config change tests are in test_tenant_conf.py
def test_threshold_based_eviction(
    request,
    httpserver: HTTPServer,
    httpserver_listen_address,
    pg_bin: PgBin,
    neon_env_builder: NeonEnvBuilder,
):
    neon_env_builder.enable_remote_storage(RemoteStorageKind.LOCAL_FS, f"{request.node.name}")

    # Start with metrics collection enabled, so that the eviction task
    # imitates its accesses. We'll use a non-existent endpoint to make it fail.
    # The synthetic size calculation will run regardless.
    host, port = httpserver_listen_address
    neon_env_builder.pageserver_config_override = f"""
        metric_collection_interval="1s"
        synthetic_size_calculation_interval="2s"
        metric_collection_endpoint="http://{host}:{port}/nonexistent"
    """
    metrics_refused_log_line = ".*metrics endpoint refused the sent metrics.*/nonexistent.*"

    env = neon_env_builder.init_start()
    env.pageserver.allowed_errors.append(metrics_refused_log_line)

    tenant_id, timeline_id = env.initial_tenant, env.initial_timeline
    assert isinstance(timeline_id, TimelineId)

    ps_http = env.pageserver.http_client()
    assert ps_http.tenant_config(tenant_id).effective_config["eviction_policy"] == {
        "kind": "NoEviction"
    }

    eviction_threshold = 5
    eviction_period = 1
    ps_http.set_tenant_config(
        tenant_id,
        {
            "eviction_policy": {
                "kind": "LayerAccessThreshold",
                "threshold": f"{eviction_threshold}s",
                "period": f"{eviction_period}s",
            },
        },
    )
    assert ps_http.tenant_config(tenant_id).effective_config["eviction_policy"] == {
        "kind": "LayerAccessThreshold",
        "threshold": f"{eviction_threshold}s",
        "period": f"{eviction_period}s",
    }

    # restart because changing tenant config is not instant
    env.pageserver.stop()
    env.pageserver.start()
    assert ps_http.tenant_config(tenant_id).effective_config["eviction_policy"] == {
        "kind": "LayerAccessThreshold",
        "threshold": f"{eviction_threshold}s",
        "period": f"{eviction_period}s",
    }

    # create a bunch of L1s, only the least of which will need to be resident
    compaction_threshold = 3  # create L1 layers quickly
    ps_http.patch_tenant_config_client_side(
        tenant_id,
        inserts={
            # Disable gc and compaction to avoid on-demand downloads from their side.
            # The only on-demand downloads should be from the eviction task's "imitate access" functions.
            "gc_period": "0s",
            "compaction_period": "0s",
            # low checkpoint_distance so that pgbench creates many layers
            "checkpoint_distance": 1024**2,
            # Low compaction target size to create many L1's with tight key ranges.
            # This is so that the "imitate access" calls don't download all the layers.
            "compaction_target_size": 1 * 1024**2,  # all keys into one L1
            # Turn L0's into L1's fast.
            "compaction_threshold": compaction_threshold,
            # Prevent compaction from collapsing L1 delta layers into image layers. We want many layers here.
            "image_creation_threshold": 100,
            # Much larger so that the synthetic size calculation worker, which is part of metric collection,
            # computes logical size for initdb_lsn every time, instead of some moving lsn as we insert data.
            # This makes the set of downloaded layers predictable,
            # thereby allowing the residence statuses to stabilize below.
            "gc_horizon": 1024**4,
        },
    )

    # create a bunch of layers
    with env.endpoints.create_start("main", tenant_id=tenant_id) as pg:
        pg_bin.run(["pgbench", "-i", "-s", "3", pg.connstr()])
        last_flush_lsn_upload(env, pg, tenant_id, timeline_id)
    # wrap up and shut down safekeepers so that no more layers will be created after the final checkpoint
    for sk in env.safekeepers:
        sk.stop()
    ps_http.timeline_checkpoint(tenant_id, timeline_id)

    # wait for evictions and assert that they stabilize
    @dataclass
    class ByLocalAndRemote:
        remote_layers: Set[str]
        local_layers: Set[str]

    class MapInfoProjection:
        def __init__(self, info: LayerMapInfo):
            self.info = info

        def by_local_and_remote(self) -> ByLocalAndRemote:
            return ByLocalAndRemote(
                remote_layers={
                    layer.layer_file_name for layer in self.info.historic_layers if layer.remote
                },
                local_layers={
                    layer.layer_file_name for layer in self.info.historic_layers if not layer.remote
                },
            )

        def __eq__(self, other):
            if not isinstance(other, MapInfoProjection):
                return False
            return self.by_local_and_remote() == other.by_local_and_remote()

        def __repr__(self) -> str:
            out = ["MapInfoProjection:"]
            for layer in sorted(self.info.historic_layers, key=lambda layer: layer.layer_file_name):
                remote = "R" if layer.remote else "L"
                out += [f" {remote} {layer.layer_file_name}"]
            return "\n".join(out)

    observation_window = 8 * eviction_threshold
    consider_stable_when_no_change_for_seconds = 3 * eviction_threshold
    poll_interval = eviction_threshold / 3

    started_waiting_at = time.time()
    map_info_changes: List[Tuple[float, MapInfoProjection]] = []
    while time.time() - started_waiting_at < observation_window:
        current = (
            time.time(),
            MapInfoProjection(ps_http.layer_map_info(tenant_id, timeline_id)),
        )
        last = map_info_changes[-1] if map_info_changes else (0, None)
        if last[1] is None or current[1] != last[1]:
            map_info_changes.append(current)
            log.info("change in layer map\n before: %s\n after: %s", last, current)
        else:
            stable_for = current[0] - last[0]
            log.info("residencies stable for %s", stable_for)
            if stable_for > consider_stable_when_no_change_for_seconds:
                break
        time.sleep(poll_interval)

    log.info("len(map_info_changes)=%s", len(map_info_changes))

    # TODO: can we be more precise here? E.g., require we're stable _within_ X*threshold,
    # instead of what we do here, i.e., stable _for at least_ X*threshold toward the end of the observation window
    assert (
        stable_for > consider_stable_when_no_change_for_seconds
    ), "layer residencies did not become stable within the observation window"

    post = map_info_changes[-1][1].by_local_and_remote()
    assert len(post.remote_layers) > 0, "some layers should be evicted once it's stabilized"
    assert len(post.local_layers) > 0, "the imitate accesses should keep some layers resident"

    assert env.pageserver.log_contains(
        metrics_refused_log_line
    ), "ensure the metrics collection worker ran"