From 463b6a26b5bc1b73898d7bb75a53e3d448def97d Mon Sep 17 00:00:00 2001
From: Joonas Koivunen
Date: Thu, 25 Jan 2024 15:38:28 +0200
Subject: [PATCH] test: show relative order eviction with "fast growing tenant" (#6377)

Refactor out the test_disk_usage_eviction tenant creation and add a
custom case with 4 tenants: 3 created with pgbench scale=1 and 1
created with pgbench scale=4.

Because the tenants are created in scale order [1, 1, 1, 4], this setup
is simple enough to demonstrate the problem with using absolute access
times: a disk usage based eviction run disproportionately targets the
*first* scale=1 tenant(s), while the larger tenant created last does
not lose anything.

This test is not enough to show the difference between `relative_equal`
and `relative_spare` (the fudge factor); a much larger scale would be
needed for "the large tenant", but that would make debug mode tests
slower.

Cc: #5304
---
 test_runner/fixtures/neon_fixtures.py    |   4 +-
 .../regress/test_disk_usage_eviction.py  | 182 +++++++++++++-----
 2 files changed, 142 insertions(+), 44 deletions(-)
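
For illustration only: the toy model below is not the pageserver's
eviction code, and every name in it (Layer, absolute_order,
relative_order, the i/len(...) rank formula) is made up for this note.
It only models the ordering difference described above: a single queue
sorted by absolute access time drains the earliest-created tenants
first, while a relative (per-tenant) order takes the coldest layers of
every tenant before any one tenant is emptied.

    # eviction_order_sketch.py -- simplified model, not the real implementation
    from collections import defaultdict
    from dataclasses import dataclass
    from typing import Dict, List


    @dataclass
    class Layer:
        tenant: str
        accessed_at: int  # last access time; smaller means colder


    def absolute_order(layers: List[Layer]) -> List[Layer]:
        # one global queue ordered by absolute access time: tenants that were
        # created (and thus last accessed) earliest lose all their layers first
        return sorted(layers, key=lambda layer: layer.accessed_at)


    def relative_order(layers: List[Layer]) -> List[Layer]:
        # rank each layer by how cold it is *within its own tenant*
        # (0.0 = coldest, close to 1.0 = hottest), so every tenant gives up
        # its coldest layers before any single tenant is drained completely
        per_tenant: Dict[str, List[Layer]] = defaultdict(list)
        for layer in layers:
            per_tenant[layer.tenant].append(layer)

        ranked = []
        for tenant_layers in per_tenant.values():
            tenant_layers.sort(key=lambda layer: layer.accessed_at)
            for i, layer in enumerate(tenant_layers):
                ranked.append((i / len(tenant_layers), layer))
        return [layer for _, layer in sorted(ranked, key=lambda pair: pair[0])]


    if __name__ == "__main__":
        # three small tenants created (and accessed) first, one large tenant last
        layers = [Layer(f"small{i}", i * 10 + t) for i in range(3) for t in range(4)]
        layers += [Layer("large", 30 + t) for t in range(16)]
        print([layer.tenant for layer in absolute_order(layers)[:8]])  # only small0 and small1
        print([layer.tenant for layer in relative_order(layers)[:8]])  # every tenant shows up

`relative_spare` additionally involves a fudge factor (as noted above);
that is not modelled here, and this test does not exercise the
difference either.
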
"compaction_target_size": f"{layer_size}", - } - ) - - with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: - pg_bin.run(["pgbench", "-i", f"-s{scale}", endpoint.connstr()]) - wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) - - timelines.append((tenant_id, timeline_id)) + timelines.append(pgbench_init_tenant(layer_size, scale, env, pg_bin)) # stop the safekeepers to avoid on-demand downloads caused by # initial logical size calculation triggered by walreceiver connection status @@ -266,25 +258,13 @@ def _eviction_env( # after stopping the safekeepers, we know that no new WAL will be coming in for tenant_id, timeline_id in timelines: - pageserver_http = env.get_tenant_pageserver(tenant_id).http_client() - - pageserver_http.timeline_checkpoint(tenant_id, timeline_id) - wait_for_upload_queue_empty(pageserver_http, tenant_id, timeline_id) - tl_info = pageserver_http.timeline_detail(tenant_id, timeline_id) - assert tl_info["last_record_lsn"] == tl_info["disk_consistent_lsn"] - assert tl_info["disk_consistent_lsn"] == tl_info["remote_consistent_lsn"] - pgbench_init_lsns[tenant_id] = Lsn(tl_info["last_record_lsn"]) - - layers = pageserver_http.layer_map_info(tenant_id, timeline_id) - log.info(f"{layers}") - assert ( - len(layers.historic_layers) >= 10 - ), "evictions happen at layer granularity, but we often assert at byte-granularity" + pgbench_init_lsns[tenant_id] = finish_tenant_creation(env, tenant_id, timeline_id, 10) eviction_env = EvictionEnv( timelines=timelines, neon_env=env, - pageserver_http=pageserver_http, + # this last tenant http client works for num_pageservers=1 + pageserver_http=env.get_tenant_pageserver(timelines[-1][0]).http_client(), layer_size=layer_size, pg_bin=pg_bin, pgbench_init_lsns=pgbench_init_lsns, @@ -293,6 +273,49 @@ def _eviction_env( return eviction_env +def pgbench_init_tenant( + layer_size: int, scale: int, env: NeonEnv, pg_bin: PgBin +) -> Tuple[TenantId, TimelineId]: + tenant_id, timeline_id = env.neon_cli.create_tenant( + conf={ + "gc_period": "0s", + "compaction_period": "0s", + "checkpoint_distance": f"{layer_size}", + "image_creation_threshold": "100", + "compaction_target_size": f"{layer_size}", + } + ) + + with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: + pg_bin.run(["pgbench", "-i", f"-s{scale}", endpoint.connstr()]) + wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) + + return (tenant_id, timeline_id) + + +def finish_tenant_creation( + env: NeonEnv, + tenant_id: TenantId, + timeline_id: TimelineId, + min_expected_layers: int, +) -> Lsn: + pageserver_http = env.get_tenant_pageserver(tenant_id).http_client() + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + wait_for_upload_queue_empty(pageserver_http, tenant_id, timeline_id) + tl_info = pageserver_http.timeline_detail(tenant_id, timeline_id) + assert tl_info["last_record_lsn"] == tl_info["disk_consistent_lsn"] + assert tl_info["disk_consistent_lsn"] == tl_info["remote_consistent_lsn"] + pgbench_init_lsn = Lsn(tl_info["last_record_lsn"]) + + layers = pageserver_http.layer_map_info(tenant_id, timeline_id) + # log.info(f"{layers}") + assert ( + len(layers.historic_layers) >= min_expected_layers + ), "evictions happen at layer granularity, but we often assert at byte-granularity" + + return pgbench_init_lsn + + @pytest.fixture def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> EvictionEnv: return _eviction_env(request, neon_env_builder, pg_bin, num_pageservers=1) @@ -598,9 +621,82 @@ def 
@@ -598,9 +621,82 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv, order: EvictionOrder):
     assert abs_diff < 0.05
 
 
+@pytest.mark.parametrize(
+    "order",
+    [
+        EvictionOrder.ABSOLUTE_ORDER,
+        EvictionOrder.RELATIVE_ORDER_EQUAL,
+        EvictionOrder.RELATIVE_ORDER_SPARE,
+    ],
+)
+def test_fast_growing_tenant(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, order: EvictionOrder):
+    """
+    Create the smaller tenants first and finally a single larger tenant.
+    Assert that with relative order modes, the disk usage based eviction is
+    more fair towards the smaller tenants.
+    """
+    env = neon_env_builder.init_configs()
+    env.start()
+    env.pageserver.allowed_errors.append(r".* running disk usage based eviction due to pressure.*")
+
+    # initial_tenant and initial_timeline do not exist
+
+    # create N tenants the same fashion as EvictionEnv
+    layer_size = 5 * 1024**2
+    timelines = []
+    for scale in [1, 1, 1, 4]:
+        timelines.append((pgbench_init_tenant(layer_size, scale, env, pg_bin), scale))
+
+    env.neon_cli.safekeeper_stop()
+
+    for (tenant_id, timeline_id), scale in timelines:
+        min_expected_layers = 4 if scale == 1 else 10
+        finish_tenant_creation(env, tenant_id, timeline_id, min_expected_layers)
+
+    tenant_layers = count_layers_per_tenant(env.pageserver, map(lambda x: x[0], timelines))
+    (total_on_disk, _, _) = poor_mans_du(env, map(lambda x: x[0], timelines), env.pageserver, False)
+
+    # cut 10 percent
+    response = env.pageserver.http_client().disk_usage_eviction_run(
+        {"evict_bytes": total_on_disk // 10, "eviction_order": order.config()}
+    )
+    log.info(f"{response}")
+
+    after_tenant_layers = count_layers_per_tenant(env.pageserver, map(lambda x: x[0], timelines))
+
+    ratios = []
+    for i, ((tenant_id, _timeline_id), _scale) in enumerate(timelines):
+        # we expect the oldest to suffer most
+        originally, after = tenant_layers[tenant_id], after_tenant_layers[tenant_id]
+        log.info(f"{i + 1}th tenant went from {originally} -> {after}")
+        ratio = after / originally
+        ratios.append(ratio)
+
+    assert (
+        len(ratios) == 4
+    ), "rest of the assertions expect 3 + 1 timelines, ratios, scales, all in order"
+    log.info(f"{ratios}")
+
+    if order == EvictionOrder.ABSOLUTE_ORDER:
+        # first tenant loses most
+        assert ratios[0] <= ratios[1], "first should lose the most"
+        assert ratios[1] < ratios[2], "second should lose some"
+        assert ratios[1] < 1.0
+        assert ratios[2] <= ratios[3], "third might not lose"
+        assert ratios[3] == 1.0, "tenant created last does not lose"
+    elif order == EvictionOrder.RELATIVE_ORDER_EQUAL:
+        assert all(x < 1.0 for x in ratios), "all tenants lose layers"
+    elif order == EvictionOrder.RELATIVE_ORDER_SPARE:
+        # with different layer sizes and pg versions, there are different combinations
+        assert len([x for x in ratios if x < 1.0]) >= 2, "require 2..4 tenants to lose layers"
+        assert ratios[3] < 1.0, "largest tenant always loses layers"
+    else:
+        raise RuntimeError(f"unimplemented {order}")
+
+
 def poor_mans_du(
     env: NeonEnv,
-    timelines: list[Tuple[TenantId, TimelineId]],
+    timelines: Iterable[Tuple[TenantId, TimelineId]],
     pageserver: NeonPageserver,
     verbose: bool = False,
 ) -> Tuple[int, int, int]: