From caf868e27481017f19e19d70b4d84495eeb7d07c Mon Sep 17 00:00:00 2001
From: Joonas Koivunen
Date: Fri, 2 Feb 2024 19:46:47 +0200
Subject: [PATCH 1/2] test: assert we eventually free space (#6536)

In `test_statvfs_pressure_{usage,min_avail_bytes}` we now race against the
initial logical size calculation, which downloads the layers on demand.
First wait out the initial logical sizes, then change the final asserts to
be "eventual", which is not great, but it is faster than failing and
retrying.

This issue seems to happen only in debug-mode tests.

Fixes: #6510
---
 test_runner/fixtures/pageserver/http.py   | 13 ++++++++
 .../regress/test_disk_usage_eviction.py    | 31 ++++++++++++++----
 test_runner/regress/test_timeline_size.py | 32 +++----------------
 3 files changed, 43 insertions(+), 33 deletions(-)

diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index 1a8765d830..92e5027a9f 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -831,3 +831,16 @@ class PageserverHttpClient(requests.Session):
         self.put(
             f"http://localhost:{self.port}/v1/deletion_queue/flush?execute={'true' if execute else 'false'}"
         ).raise_for_status()
+
+    def timeline_wait_logical_size(self, tenant_id: TenantId, timeline_id: TimelineId) -> int:
+        detail = self.timeline_detail(
+            tenant_id,
+            timeline_id,
+            include_non_incremental_logical_size=True,
+            force_await_initial_logical_size=True,
+        )
+        current_logical_size = detail["current_logical_size"]
+        non_incremental = detail["current_logical_size_non_incremental"]
+        assert current_logical_size == non_incremental
+        assert isinstance(current_logical_size, int)
+        return current_logical_size
diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py
index 6a4f0edbea..dcbf8a5025 100644
--- a/test_runner/regress/test_disk_usage_eviction.py
+++ b/test_runner/regress/test_disk_usage_eviction.py
@@ -155,6 +155,15 @@ class EvictionEnv:
         mock_behavior,
         eviction_order: EvictionOrder,
     ):
+        """
+        Starts the pageserver with a mocked statvfs setup. The startup is
+        problematic because the initial logical size calculations, which
+        require layers, duel with the disk usage-based eviction task.
+
+        Returns after the initial logical sizes are complete, but the phase of
+        the disk usage eviction task is unknown; it might need to run one more
+        iteration before assertions can be made.
+        """
         disk_usage_config = {
             "period": period,
             "max_usage_pct": max_usage_pct,
@@ -183,9 +192,15 @@
             ),
         )
 
+        # we now do initial logical size calculation on startup, which on debug builds can fight with disk usage based eviction
+        for tenant_id, timeline_id in self.timelines:
+            pageserver_http = self.neon_env.get_tenant_pageserver(tenant_id).http_client()
+            pageserver_http.timeline_wait_logical_size(tenant_id, timeline_id)
+
         def statvfs_called():
             assert pageserver.log_contains(".*running mocked statvfs.*")
 
+        # we most likely have already completed multiple runs
         wait_until(10, 1, statvfs_called)
 

@@ -789,9 +804,11 @@ def test_statvfs_pressure_usage(eviction_env: EvictionEnv):
 
     wait_until(10, 1, relieved_log_message)
 
-    post_eviction_total_size, _, _ = env.timelines_du(env.pageserver)
+    def less_than_max_usage_pct():
+        post_eviction_total_size, _, _ = env.timelines_du(env.pageserver)
+        assert post_eviction_total_size < 0.33 * total_size, "we requested max 33% usage"
 
-    assert post_eviction_total_size <= 0.33 * total_size, "we requested max 33% usage"
+    wait_until(2, 2, less_than_max_usage_pct)
 
 
 def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv):
@@ -831,11 +848,13 @@
 
     wait_until(10, 1, relieved_log_message)
 
-    post_eviction_total_size, _, _ = env.timelines_du(env.pageserver)
+    def more_than_min_avail_bytes_freed():
+        post_eviction_total_size, _, _ = env.timelines_du(env.pageserver)
+        assert (
+            total_size - post_eviction_total_size >= min_avail_bytes
+        ), f"we requested at least {min_avail_bytes} worth of free space"
 
-    assert (
-        total_size - post_eviction_total_size >= min_avail_bytes
-    ), "we requested at least min_avail_bytes worth of free space"
+    wait_until(2, 2, more_than_min_avail_bytes_freed)
 
 
 def test_secondary_mode_eviction(eviction_env_ha: EvictionEnv):
diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py
index 4c5cb32caa..303aabb58d 100644
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -20,7 +20,7 @@ from fixtures.neon_fixtures import (
     VanillaPostgres,
     wait_for_last_flush_lsn,
 )
-from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
+from fixtures.pageserver.http import PageserverApiException
 from fixtures.pageserver.utils import (
     assert_tenant_state,
     timeline_delete_wait_completed,
@@ -40,7 +40,7 @@ def test_timeline_size(neon_simple_env: NeonEnv):
     new_timeline_id = env.neon_cli.create_branch("test_timeline_size", "empty")
 
     client = env.pageserver.http_client()
-    wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id)
+    client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id)
 
     endpoint_main = env.endpoints.create_start("test_timeline_size")
     log.info("postgres is running on 'test_timeline_size' branch")
@@ -73,7 +73,7 @@ def test_timeline_size_createdropdb(neon_simple_env: NeonEnv):
     new_timeline_id = env.neon_cli.create_branch("test_timeline_size_createdropdb", "empty")
 
     client = env.pageserver.http_client()
-    wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id)
+    client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id)
     timeline_details = client.timeline_detail(
         env.initial_tenant, new_timeline_id, include_non_incremental_logical_size=True
     )
@@ -153,7 +153,7 @@ def test_timeline_size_quota_on_startup(neon_env_builder: NeonEnvBuilder):
     client = env.pageserver.http_client()
     new_timeline_id = env.neon_cli.create_branch("test_timeline_size_quota_on_startup")
 
-    wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id)
+    client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id)
 
     endpoint_main = env.endpoints.create(
         "test_timeline_size_quota_on_startup",
@@ -219,7 +219,7 @@ def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder):
     client = env.pageserver.http_client()
     new_timeline_id = env.neon_cli.create_branch("test_timeline_size_quota")
 
-    wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id)
+    client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id)
 
     endpoint_main = env.endpoints.create(
         "test_timeline_size_quota",
@@ -715,28 +715,6 @@ def assert_physical_size_invariants(sizes: TimelinePhysicalSizeValues):
     # XXX would be nice to assert layer file physical storage utilization here as well, but we can only do that for LocalFS
 
 
-# Timeline logical size initialization is an asynchronous background task that runs once,
-# try a few times to ensure it's activated properly
-def wait_for_timeline_size_init(
-    client: PageserverHttpClient, tenant: TenantId, timeline: TimelineId
-):
-    for i in range(10):
-        timeline_details = client.timeline_detail(
-            tenant, timeline, include_non_incremental_logical_size=True
-        )
-        current_logical_size = timeline_details["current_logical_size"]
-        non_incremental = timeline_details["current_logical_size_non_incremental"]
-        if current_logical_size == non_incremental:
-            return
-        log.info(
-            f"waiting for current_logical_size of a timeline to be calculated, iteration {i}: {current_logical_size} vs {non_incremental}"
-        )
-        time.sleep(1)
-    raise Exception(
-        f"timed out while waiting for current_logical_size of a timeline to reach its non-incremental value, details: {timeline_details}"
-    )
-
-
 def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
     """
     Tenants warmuping up opportunistically will wait for one another's logical size calculations to complete
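
Note on the "eventual" asserts above: they lean on the `wait_until` fixture helper, which retries a
callable that raises `AssertionError` until it passes or the attempts run out. Below is a minimal,
self-contained sketch of that pattern, not the fixture's actual implementation; the names and the
0.33 threshold mirror the test changes above, and the stand-in values are illustrative only.

import time


def wait_until(iterations: int, interval: float, func):
    # Retry func() until it stops raising AssertionError, sleeping `interval`
    # seconds between attempts; re-raise the last failure if it never passes.
    # Assumes iterations >= 1.
    last_failure = None
    for _ in range(iterations):
        try:
            return func()
        except AssertionError as e:
            last_failure = e
            time.sleep(interval)
    raise last_failure


# Stand-ins for the values env.timelines_du(env.pageserver) yields in the real test.
total_size = 100
post_eviction_total_size = 30


def less_than_max_usage_pct():
    assert post_eviction_total_size < 0.33 * total_size, "we requested max 33% usage"


# Instead of asserting once, give the disk usage eviction task a couple more
# iterations to catch up before treating the condition as a failure.
wait_until(2, 2, less_than_max_usage_pct)
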
From 2e5eab69c6161bfbf380df355f1ab195171d8601 Mon Sep 17 00:00:00 2001
From: John Spray
Date: Fri, 2 Feb 2024 18:20:18 +0000
Subject: [PATCH 2/2] tests: remove test_gc_cutoff (#6587)

This test became flaky when postgres retry handling was fixed to use
backoff delays -- each iteration in this test's loop was taking much
longer, because pgbench doesn't fail until postgres has given up on
retrying its requests to the pageserver.

We are just removing it, because the condition it tests is no longer
risky: we reload all metadata from remote storage on restart, so
crashing directly between making local changes and doing remote uploads
isn't interesting any more.

Closes: https://github.com/neondatabase/neon/issues/2856
Closes: https://github.com/neondatabase/neon/issues/5329
---
 pageserver/src/tenant/timeline.rs     |  4 ---
 test_runner/regress/test_gc_cutoff.py | 47 ---------------------------
 2 files changed, 51 deletions(-)
 delete mode 100644 test_runner/regress/test_gc_cutoff.py

diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index e779f6f32e..0ffe0b6418 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -4388,10 +4388,6 @@ impl Timeline {
 
         guard.finish_gc_timeline(&gc_layers);
 
-        if result.layers_removed != 0 {
-            fail_point!("after-timeline-gc-removed-layers");
-        }
-
         #[cfg(feature = "testing")]
         {
             result.doomed_layers = gc_layers;
diff --git a/test_runner/regress/test_gc_cutoff.py b/test_runner/regress/test_gc_cutoff.py
deleted file mode 100644
index 284a8c3563..0000000000
--- a/test_runner/regress/test_gc_cutoff.py
+++ /dev/null
@@ -1,47 +0,0 @@
-import subprocess
-
-import pytest
-from fixtures.neon_fixtures import NeonEnvBuilder, PgBin
-
-
-# Test gc_cutoff
-#
-# This test sets fail point at the end of GC, and checks that pageserver
-# normally restarts after it. Also, there should be GC ERRORs in the log,
-# but the fixture checks the log for any unexpected ERRORs after every
-# test anyway, so it doesn't need any special attention here.
-@pytest.mark.timeout(600)
-def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
-    env = neon_env_builder.init_start(
-        initial_tenant_conf={
-            "gc_period": "10 s",
-            "gc_horizon": f"{1024 ** 2}",
-            "checkpoint_distance": f"{1024 ** 2}",
-            "compaction_period": "5 s",
-            # set PITR interval to be small, so we can do GC
-            "pitr_interval": "1 s",
-            "compaction_threshold": "3",
-            "image_creation_threshold": "2",
-        }
-    )
-
-    pageserver_http = env.pageserver.http_client()
-
-    # Use aggressive GC and checkpoint settings, so that we also exercise GC during the test
-    tenant_id = env.initial_tenant
-    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
-    connstr = endpoint.connstr(options="-csynchronous_commit=off")
-    pg_bin.run_capture(["pgbench", "-i", "-s10", connstr])
-
-    pageserver_http.configure_failpoints(("after-timeline-gc-removed-layers", "exit"))
-
-    # Because this test does a rapid series of restarts of the same node, it's possible that
-    # we are restarted again before we can clean up deletion lists form the previous generation,
-    # resulting in a subsequent startup logging a warning.
-    env.pageserver.allowed_errors.append(".*Dropping stale deletions for tenant.*")
-
-    for _ in range(5):
-        with pytest.raises(subprocess.SubprocessError):
-            pg_bin.run_capture(["pgbench", "-P1", "-N", "-c5", "-T500", "-Mprepared", connstr])
-        env.pageserver.stop()
-        env.pageserver.start(extra_env_vars={"FAILPOINTS": "after-timeline-gc-removed-layers=exit"})