From 7da47d8a0aa65785c90fb1cdd096f3416b1e49ab Mon Sep 17 00:00:00 2001
From: Thang Pham <thang@neon.tech>
Date: Fri, 12 Aug 2022 14:28:50 +0700
Subject: [PATCH] Fix timeline physical size flaky tests (#2244)

Resolves #2212.

- use `wait_for_last_flush_lsn` in `test_timeline_physical_size_*` tests

## Context
The tests need to wait for the pageserver to catch up with the compute's last flush LSN because `LayerFlushThread` threads may still be running during the timeline physical size API call. These threads flush new layers to disk and hence update the physical size, which can result in a mismatch between the physical size reported by the API and the actual physical size on disk.

### Note
The `LayerFlushThread` threads run **concurrently**, so it's possible that the above mismatch still occurs even with this patch. However, making the tests wait until the pageserver has finished processing all the WAL (processing only, not flushing) before calculating the physical size should help reduce the "flakiness" significantly.
---
 test_runner/batch_others/test_timeline_size.py | 16 +++++++++++++++-
 test_runner/fixtures/neon_fixtures.py          |  6 ++++++
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/test_runner/batch_others/test_timeline_size.py b/test_runner/batch_others/test_timeline_size.py
index c736893f99..6e1168e38f 100644
--- a/test_runner/batch_others/test_timeline_size.py
+++ b/test_runner/batch_others/test_timeline_size.py
@@ -4,7 +4,7 @@ from uuid import UUID
 import re
 import psycopg2.extras
 import psycopg2.errors
-from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres, assert_timeline_local
+from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres, assert_timeline_local, wait_for_last_flush_lsn
 from fixtures.log_helper import log
 import time
 
@@ -192,6 +192,8 @@ def test_timeline_physical_size_init(neon_simple_env: NeonEnv):
            FROM generate_series(1, 1000) g""",
     ])
 
+    wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id)
+
     # restart the pageserer to force calculating timeline's initial physical size
     env.pageserver.stop()
     env.pageserver.start()
@@ -211,7 +213,9 @@ def test_timeline_physical_size_post_checkpoint(neon_simple_env: NeonEnv):
            FROM generate_series(1, 1000) g""",
     ])
 
+    wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id)
     env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}")
+
     assert_physical_size(env, env.initial_tenant, new_timeline_id)
 
 
@@ -232,8 +236,10 @@ def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder
            FROM generate_series(1, 100000) g""",
     ])
 
+    wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id)
     env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}")
     env.pageserver.safe_psql(f"compact {env.initial_tenant.hex} {new_timeline_id.hex}")
+
     assert_physical_size(env, env.initial_tenant, new_timeline_id)
 
 
@@ -254,15 +260,21 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder):
            SELECT 'long string to consume some space' || g
            FROM generate_series(1, 100000) g""",
     ])
+
+    wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id)
     env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}")
+
     pg.safe_psql("""
         INSERT INTO foo
             SELECT 'long string to consume some space' || g
             FROM generate_series(1, 100000) g
     """)
+
+    wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id)
     env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}")
 
     env.pageserver.safe_psql(f"do_gc {env.initial_tenant.hex} {new_timeline_id.hex} 0")
+
     assert_physical_size(env, env.initial_tenant, new_timeline_id)
 
 
@@ -279,6 +291,7 @@ def test_timeline_physical_size_metric(neon_simple_env: NeonEnv):
            FROM generate_series(1, 100000) g""",
     ])
 
+    wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id)
     env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}")
 
     # get the metrics and parse the metric for the current timeline's physical size
@@ -319,6 +332,7 @@ def test_tenant_physical_size(neon_simple_env: NeonEnv):
             f"INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, {n_rows}) g",
         ])
 
+        wait_for_last_flush_lsn(env, pg, tenant, timeline)
         env.pageserver.safe_psql(f"checkpoint {tenant.hex} {timeline.hex}")
 
         timeline_total_size += get_timeline_physical_size(timeline)
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 3b87f290b8..d5b0af3813 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -2475,3 +2475,9 @@ def wait_for_last_record_lsn(pageserver_http_client: NeonPageserverHttpClient,
         time.sleep(1)
     raise Exception("timed out while waiting for last_record_lsn to reach {}, was {}".format(
         lsn_to_hex(lsn), lsn_to_hex(current_lsn)))
+
+
+def wait_for_last_flush_lsn(env: NeonEnv, pg: Postgres, tenant: uuid.UUID, timeline: uuid.UUID):
+    """Wait until the pageserver's last_record_lsn reaches the compute's current WAL flush LSN."""
+    last_flush_lsn = lsn_from_hex(pg.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
+    wait_for_last_record_lsn(env.pageserver.http_client(), tenant, timeline, last_flush_lsn)