From bc5ec43056773f4a6742fb64dbff681392b02dd3 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas
Date: Tue, 18 Oct 2022 18:23:27 +0300
Subject: [PATCH] Fix flaky physical-size tests in test_timeline_size.py.

These two tests, test_timeline_physical_size_post_compaction and
test_timeline_physical_size_post_gc, assumed that after you have waited
for the WAL from a bulk insertion to arrive, and you run a cycle of
checkpoint and compaction, no new layer files are created. That matters
because if a new layer file is created while we are calculating the
incremental and non-incremental physical sizes, they might differ.

However, the tests used a very small checkpoint_distance, so even a
small amount of WAL generated in PostgreSQL could cause a new layer
file to be created. Autovacuum can kick in at any time and do just
that. That caused occasional failures in these tests. I was able to
reproduce it reliably by adding a long delay between the incremental
and non-incremental size calculations:

```
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -129,6 +129,9 @@ async fn build_timeline_info(
         }
     };
     let current_physical_size = Some(timeline.get_physical_size());
+    if include_non_incremental_physical_size {
+        std::thread::sleep(std::time::Duration::from_millis(60000));
+    }
 
     let info = TimelineInfo {
         tenant_id: timeline.tenant_id,
```

To fix, disable autovacuum for the table. Autovacuum could still kick
in for other tables, e.g. catalog tables, but that seems less likely to
generate enough WAL to cause a new layer file to be flushed.

If this continues to be a problem in the future, we could simply retry
the physical size call a few times if there's a mismatch. A mismatch
could happen every once in a while, but it's very unlikely to happen
more than once or twice in a row.
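As a rough sketch of that fallback (not part of this patch), a retry
helper in the test harness could look something like the snippet below.
The `timeline_detail` method, its keyword argument, and the
`current_physical_size_non_incremental` field name are assumptions
about the pageserver HTTP client and its response shape, not a verified
API:

```
# Hypothetical sketch only: the client method, its keyword argument, and the
# response field names are assumed, and would need to match the real
# pageserver HTTP client used by the test suite.
def get_consistent_physical_size(client, tenant_id, timeline_id, attempts=3):
    """Fetch incremental and non-incremental physical sizes, retrying until they agree."""
    incremental = non_incremental = None
    for _ in range(attempts):
        detail = client.timeline_detail(
            tenant_id, timeline_id, include_non_incremental_physical_size=True
        )
        incremental = detail["current_physical_size"]
        non_incremental = detail["current_physical_size_non_incremental"]
        if incremental == non_incremental:
            return incremental
    raise AssertionError(
        f"physical sizes still differ after {attempts} attempts: "
        f"{incremental} != {non_incremental}"
    )
```

The tests would then assert on the single returned value instead of
comparing the two sizes from one response.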
Fixes https://github.com/neondatabase/neon/issues/2212
---
 test_runner/regress/test_timeline_size.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py
index d26d5f3afa..d783f897f9 100644
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -270,9 +270,15 @@ def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder
     new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_compaction")
     pg = env.postgres.create_start("test_timeline_physical_size_post_compaction")
 
+    # We don't want autovacuum to run on the table, while we are calculating the
+    # physical size, because that could cause a new layer to be created and a
+    # mismatch between the incremental and non-incremental size. (If that still
+    # happens, because of some other background activity or autovacuum on other
+    # tables, we could simply retry the size calculations. It's unlikely that
+    # that would happen more than once.)
     pg.safe_psql_many(
         [
-            "CREATE TABLE foo (t text)",
+            "CREATE TABLE foo (t text) WITH (autovacuum_enabled = off)",
             """INSERT INTO foo
             SELECT 'long string to consume some space' || g
             FROM generate_series(1, 100000) g""",
@@ -297,9 +303,10 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder):
     new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_gc")
     pg = env.postgres.create_start("test_timeline_physical_size_post_gc")
 
+    # Like in test_timeline_physical_size_post_compaction, disable autovacuum
     pg.safe_psql_many(
         [
-            "CREATE TABLE foo (t text)",
+            "CREATE TABLE foo (t text) WITH (autovacuum_enabled = off)",
             """INSERT INTO foo
             SELECT 'long string to consume some space' || g
             FROM generate_series(1, 100000) g""",