From bc5ec43056773f4a6742fb64dbff681392b02dd3 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas
Date: Tue, 18 Oct 2022 18:23:27 +0300
Subject: [PATCH] Fix flaky physical-size tests in test_timeline_size.py.

These two tests, test_timeline_physical_size_post_compaction and
test_timeline_physical_size_post_gc, assumed that after you have waited
for the WAL from a bulk insertion to arrive, and you run a cycle of
checkpoint and compaction, no new layer files are created. That matters
because if a new layer file is created while we are calculating the
incremental and non-incremental physical sizes, they might differ.

However, the tests used a very small checkpoint_distance, so even a
small amount of WAL generated in PostgreSQL could cause a new layer
file to be created. Autovacuum can kick in at any time and do just
that. That caused occasional failures in these tests. I was able to
reproduce it reliably by adding a long delay between the incremental
and non-incremental size calculations:

```
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -129,6 +129,9 @@ async fn build_timeline_info(
         }
     };
     let current_physical_size = Some(timeline.get_physical_size());
+    if include_non_incremental_physical_size {
+        std::thread::sleep(std::time::Duration::from_millis(60000));
+    }
 
     let info = TimelineInfo {
         tenant_id: timeline.tenant_id,
```

To fix, disable autovacuum for the table. Autovacuum could still kick
in for other tables, e.g. catalog tables, but that seems less likely to
generate enough WAL to cause a new layer file to be flushed.

If this continues to be a problem in the future, we could simply retry
the physical size call a few times if there's a mismatch. A mismatch
could happen every once in a while, but it's very unlikely to happen
more than once or twice in a row.
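As a rough sketch of that fallback (not part of this patch), a retry
helper in the test harness could look something like the snippet below.
The `timeline_detail` method, its keyword argument, and the
`current_physical_size_non_incremental` field name are assumptions
about the pageserver HTTP client and its response shape, not a verified
API:

```
# Hypothetical sketch only: the client method, its keyword argument, and the
# response field names are assumed, and would need to match the real
# pageserver HTTP client used by the test suite.
def get_consistent_physical_size(client, tenant_id, timeline_id, attempts=3):
    """Fetch incremental and non-incremental physical sizes, retrying until they agree."""
    incremental = non_incremental = None
    for _ in range(attempts):
        detail = client.timeline_detail(
            tenant_id, timeline_id, include_non_incremental_physical_size=True
        )
        incremental = detail["current_physical_size"]
        non_incremental = detail["current_physical_size_non_incremental"]
        if incremental == non_incremental:
            return incremental
    raise AssertionError(
        f"physical sizes still differ after {attempts} attempts: "
        f"{incremental} != {non_incremental}"
    )
```

The tests would then assert on the single returned value instead of
comparing the two sizes from one response.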
Fixes https://github.com/neondatabase/neon/issues/2212
---
 test_runner/regress/test_timeline_size.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py
index d26d5f3afa..d783f897f9 100644
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -270,9 +270,15 @@ def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder
     new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_compaction")
     pg = env.postgres.create_start("test_timeline_physical_size_post_compaction")
 
+    # We don't want autovacuum to run on the table, while we are calculating the
+    # physical size, because that could cause a new layer to be created and a
+    # mismatch between the incremental and non-incremental size. (If that still
+    # happens, because of some other background activity or autovacuum on other
+    # tables, we could simply retry the size calculations. It's unlikely that
+    # that would happen more than once.)
     pg.safe_psql_many(
         [
-            "CREATE TABLE foo (t text)",
+            "CREATE TABLE foo (t text) WITH (autovacuum_enabled = off)",
             """INSERT INTO foo
             SELECT 'long string to consume some space' || g
             FROM generate_series(1, 100000) g""",
@@ -297,9 +303,10 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder):
     new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_gc")
     pg = env.postgres.create_start("test_timeline_physical_size_post_gc")
 
+    # Like in test_timeline_physical_size_post_compaction, disable autovacuum
     pg.safe_psql_many(
         [
-            "CREATE TABLE foo (t text)",
+            "CREATE TABLE foo (t text) WITH (autovacuum_enabled = off)",
             """INSERT INTO foo
             SELECT 'long string to consume some space' || g
             FROM generate_series(1, 100000) g""",