From 6a1903987ac9e700ab24f8dd22e2ea22b30c87e7 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 29 Sep 2023 09:15:43 +0100
Subject: [PATCH] tests: use approximate equality in
 test_get_tenant_size_with_multiple_branches (#5411)

## Problem

This test has been flaky for a long time.

As far as I can tell, the test was simply wrong to expect postgres
activity to result in deterministic sizes: making the match fuzzy is not
a hack, it's just matching the reality that postgres doesn't promise to
write exactly the same number of pages every time it runs a given query.

## Summary of changes

Equalities now tolerate up to 4 pages different. This is big enough to
tolerate the deltas we've seen in practice.

Closes: https://github.com/neondatabase/neon/issues/2962
---
 test_runner/regress/test_tenant_size.py | 27 ++++++++++++++++++++-----
 1 file changed, 22 insertions(+), 5 deletions(-)

diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py
index 49a6ca5a53..7cea301a9c 100644
--- a/test_runner/regress/test_tenant_size.py
+++ b/test_runner/regress/test_tenant_size.py
@@ -15,7 +15,7 @@ from fixtures.pageserver.utils import (
     timeline_delete_wait_completed,
     wait_until_tenant_active,
 )
-from fixtures.pg_version import PgVersion, xfail_on_postgres
+from fixtures.pg_version import PgVersion
 from fixtures.types import Lsn, TenantId, TimelineId
 
 
@@ -532,7 +532,24 @@ def test_single_branch_get_tenant_size_grows(
     assert size_after == prev, "size after restarting pageserver should not have changed"
 
 
-@xfail_on_postgres(PgVersion.V15, reason="Test significantly more flaky on Postgres 15")
+def assert_size_approx_equal(size_a, size_b):
+    """
+    Tests that evaluate sizes are checking the pageserver space consumption
+    that sits many layers below the user input.  The exact space needed
+    varies slightly depending on postgres behavior.
+
+    Rather than expecting postgres to be determinstic and occasionally
+    failing the test, we permit sizes for the same data to vary by a few pages.
+    """
+
+    # Determined empirically from examples of equality failures: they differ
+    # by page multiples of 8272, and usually by 1-3 pages.  Tolerate 4 to avoid
+    # failing on outliers from that observed range.
+    threshold = 4 * 8272
+
+    assert size_a == pytest.approx(size_b, abs=threshold)
+
+
 def test_get_tenant_size_with_multiple_branches(
     neon_env_builder: NeonEnvBuilder, test_output_dir: Path
 ):
@@ -573,7 +590,7 @@ def test_get_tenant_size_with_multiple_branches(
     )
 
     size_after_first_branch = http_client.tenant_size(tenant_id)
-    assert size_after_first_branch == size_at_branch
+    assert_size_approx_equal(size_after_first_branch, size_at_branch)
 
     first_branch_endpoint = env.endpoints.create_start("first-branch", tenant_id=tenant_id)
 
@@ -599,7 +616,7 @@ def test_get_tenant_size_with_multiple_branches(
         "second-branch", main_branch_name, tenant_id
     )
     size_after_second_branch = http_client.tenant_size(tenant_id)
-    assert size_after_second_branch == size_after_continuing_on_main
+    assert_size_approx_equal(size_after_second_branch, size_after_continuing_on_main)
 
     second_branch_endpoint = env.endpoints.create_start("second-branch", tenant_id=tenant_id)
 
@@ -635,7 +652,7 @@ def test_get_tenant_size_with_multiple_branches(
     # tenant_size but so far this has been reliable, even though at least gc
     # and tenant_size race for the same locks
     size_after = http_client.tenant_size(tenant_id)
-    assert size_after == size_after_thinning_branch
+    assert_size_approx_equal(size_after, size_after_thinning_branch)
 
     size_debug_file_before = open(test_output_dir / "size_debug_before.html", "w")
     size_debug = http_client.tenant_size_debug(tenant_id)