Fix timeline physical size flaky tests (#2244)

Resolves #2212. - use `wait_for_last_flush_lsn` in `test_timeline_physical_size_*` tests ## Context Need to wait for the pageserver to catch up with the compute's last flush LSN because during the timeline physical size API call, it's possible that there are running `LayerFlushThread` threads. These threads flush new layers into disk and hence update the physical size. This results in a mismatch between the physical size reported by the API and the actual physical size on disk. ### Note The `LayerFlushThread` threads are processed **concurrently**, so it's possible that the above error still persists even with this patch. However, making the tests wait to finish processing all the WALs (not flushing) before calculating the physical size should help reduce the "flakiness" significantly
2026-01-07 21:42:56 +00:00 · 2022-08-12 14:28:50 +07:00
parent dc52436a8f
commit 7da47d8a0a
2 changed files with 21 additions and 1 deletions
--- a/test_runner/batch_others/test_timeline_size.py
+++ b/test_runner/batch_others/test_timeline_size.py
@@ -4,7 +4,7 @@ from uuid import UUID
 import re
 import psycopg2.extras
 import psycopg2.errors
-from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres, assert_timeline_local
+from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres, assert_timeline_local, wait_for_last_flush_lsn
 from fixtures.log_helper import log
 import time

@@ -192,6 +192,8 @@ def test_timeline_physical_size_init(neon_simple_env: NeonEnv):
           FROM generate_series(1, 1000) g""",
    ])

+    wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id)
+
    # restart the pageserer to force calculating timeline's initial physical size
    env.pageserver.stop()
    env.pageserver.start()
@@ -211,7 +213,9 @@ def test_timeline_physical_size_post_checkpoint(neon_simple_env: NeonEnv):
           FROM generate_series(1, 1000) g""",
    ])

+    wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id)
    env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}")
+
    assert_physical_size(env, env.initial_tenant, new_timeline_id)


@@ -232,8 +236,10 @@ def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder
           FROM generate_series(1, 100000) g""",
    ])

+    wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id)
    env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}")
    env.pageserver.safe_psql(f"compact {env.initial_tenant.hex} {new_timeline_id.hex}")
+
    assert_physical_size(env, env.initial_tenant, new_timeline_id)


@@ -254,15 +260,21 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder):
           SELECT 'long string to consume some space' || g
           FROM generate_series(1, 100000) g""",
    ])
+
+    wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id)
    env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}")
+
    pg.safe_psql("""
        INSERT INTO foo
            SELECT 'long string to consume some space' || g
            FROM generate_series(1, 100000) g
    """)
+
+    wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id)
    env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}")

    env.pageserver.safe_psql(f"do_gc {env.initial_tenant.hex} {new_timeline_id.hex} 0")
+
    assert_physical_size(env, env.initial_tenant, new_timeline_id)


@@ -279,6 +291,7 @@ def test_timeline_physical_size_metric(neon_simple_env: NeonEnv):
           FROM generate_series(1, 100000) g""",
    ])

+    wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id)
    env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}")

    # get the metrics and parse the metric for the current timeline's physical size
@@ -319,6 +332,7 @@ def test_tenant_physical_size(neon_simple_env: NeonEnv):
            f"INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, {n_rows}) g",
        ])

+        wait_for_last_flush_lsn(env, pg, tenant, timeline)
        env.pageserver.safe_psql(f"checkpoint {tenant.hex} {timeline.hex}")

        timeline_total_size += get_timeline_physical_size(timeline)
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -2475,3 +2475,9 @@ def wait_for_last_record_lsn(pageserver_http_client: NeonPageserverHttpClient,
        time.sleep(1)
    raise Exception("timed out while waiting for last_record_lsn to reach {}, was {}".format(
        lsn_to_hex(lsn), lsn_to_hex(current_lsn)))
+
+
+def wait_for_last_flush_lsn(env: NeonEnv, pg: Postgres, tenant: uuid.UUID, timeline: uuid.UUID):
+    """Wait for pageserver to catch up the latest flush LSN"""
+    last_flush_lsn = lsn_from_hex(pg.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
+    wait_for_last_record_lsn(env.pageserver.http_client(), tenant, timeline, last_flush_lsn)