From 811eb88b89207be4342e2e8d4a7d6fc2328e6141 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 8 Jul 2024 21:06:34 +0100 Subject: [PATCH] tests: stabilize test_timeline_size_quota_on_startup (#8255) ## Problem `test_timeline_size_quota_on_startup` assumed that writing data beyond the size limit would always be blocked. This is not so: the limit is only enforced if feedback makes it back from the pageserver to the safekeeper + compute. Closes: https://github.com/neondatabase/neon/issues/6562 ## Summary of changes - Modify the test to wait for the pageserver to catch up. The size limit was never actually being enforced robustly, the original version of this test was just writing much more than 30MB and about 98% of the time getting lucky such that the feedback happened to arrive before the test's for loop was done. - If the test fails, log the logical size as seen by the pageserver. --- test_runner/regress/test_timeline_size.py | 46 +++++++++++++++++------ 1 file changed, 35 insertions(+), 11 deletions(-) diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index f47356839c..5e9a42f6b4 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -152,10 +152,12 @@ def test_timeline_size_quota_on_startup(neon_env_builder: NeonEnvBuilder): client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id) + size_limit_mb = 30 + endpoint_main = env.endpoints.create( "test_timeline_size_quota_on_startup", # Set small limit for the test - config_lines=["neon.max_cluster_size=30MB"], + config_lines=[f"neon.max_cluster_size={size_limit_mb}MB"], ) endpoint_main.start() @@ -165,17 +167,39 @@ def test_timeline_size_quota_on_startup(neon_env_builder: NeonEnvBuilder): # Insert many rows. 
This query must fail because of space limit try: - for _i in range(5000): - cur.execute( - """ - INSERT INTO foo - SELECT 'long string to consume some space' || g - FROM generate_series(1, 100) g - """ - ) - # If we get here, the timeline size limit failed - log.error("Query unexpectedly succeeded") + def write_rows(count): + for _i in range(count): + cur.execute( + """ + INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 100) g + """ + ) + + # Write some data that exceeds limit, then let the pageserver ingest it to guarantee that some feedback has made it to + # the safekeeper, then try to write some more. We expect either the initial writes or the ones after + # the wait_for_last_flush_lsn to generate an exception. + # + # Without the wait_for_last_flush_lsn, the size limit sometimes isn't enforced (see https://github.com/neondatabase/neon/issues/6562) + write_rows(2500) + wait_for_last_flush_lsn(env, endpoint_main, env.initial_tenant, new_timeline_id) + logical_size = env.pageserver.http_client().timeline_detail( + env.initial_tenant, new_timeline_id + )["current_logical_size"] + assert logical_size > size_limit_mb * 1024 * 1024 + write_rows(2500) + + # If we get here, the timeline size limit failed. Find out from the pageserver how large it + # thinks the timeline is. + wait_for_last_flush_lsn(env, endpoint_main, env.initial_tenant, new_timeline_id) + logical_size = env.pageserver.http_client().timeline_detail( + env.initial_tenant, new_timeline_id + )["current_logical_size"] + log.error( + f"Query unexpectedly succeeded, pageserver logical size is {logical_size}" + ) raise AssertionError() except psycopg2.errors.DiskFull as err: