Mirror of https://github.com/neondatabase/neon.git, synced 2026-01-08 14:02:55 +00:00
Fix timeline physical size flaky tests (#2244)
Resolves #2212.

- Use `wait_for_last_flush_lsn` in the `test_timeline_physical_size_*` tests.

## Context

The tests need to wait for the pageserver to catch up with the compute's last flush LSN, because `LayerFlushThread` threads may still be running when the timeline physical size API is called. These threads flush new layers to disk and thereby change the physical size, so the size reported by the API can disagree with the actual size on disk.

### Note

The `LayerFlushThread` threads run **concurrently**, so the error described above can still occur even with this patch. However, making the tests wait until all the WAL has been processed (not flushed) before calculating the physical size should reduce the flakiness significantly.
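For illustration, the wait-then-measure pattern this change applies to each test looks roughly like the sketch below. It reuses names that appear in the diff (`wait_for_last_flush_lsn`, `assert_physical_size`, `env`, `pg`); the surrounding setup is assumed, so treat this as an outline of the flow rather than the exact test code.

```python
# Hedged sketch of the pattern the patch applies to each flaky test.
# `env` (NeonEnv), `pg` (the compute Postgres), and `new_timeline_id` are assumed
# to be set up by the test; `wait_for_last_flush_lsn` and `assert_physical_size`
# are the helpers visible in the diff below.
def check_physical_size_without_flush_race(env, pg, new_timeline_id):
    # Generate some WAL on the compute.
    pg.safe_psql(
        "CREATE TABLE foo AS "
        "SELECT 'long string to consume some space' || g FROM generate_series(1, 1000) g"
    )

    # The fix: wait until the pageserver's last_record_lsn reaches the compute's
    # flush LSN, so no LayerFlushThread is still writing layers while we measure.
    wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id)

    # Force in-memory layers to disk, then compare the API-reported physical size
    # with the size actually on disk.
    env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}")
    assert_physical_size(env, env.initial_tenant, new_timeline_id)
```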
@@ -4,7 +4,7 @@ from uuid import UUID
 import re
 import psycopg2.extras
 import psycopg2.errors
-from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres, assert_timeline_local
+from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres, assert_timeline_local, wait_for_last_flush_lsn
 from fixtures.log_helper import log
 import time
 
@@ -192,6 +192,8 @@ def test_timeline_physical_size_init(neon_simple_env: NeonEnv):
 FROM generate_series(1, 1000) g""",
 ])
 
+wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id)
+
 # restart the pageserer to force calculating timeline's initial physical size
 env.pageserver.stop()
 env.pageserver.start()
@@ -211,7 +213,9 @@ def test_timeline_physical_size_post_checkpoint(neon_simple_env: NeonEnv):
 FROM generate_series(1, 1000) g""",
 ])
 
+wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id)
 env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}")
 
 assert_physical_size(env, env.initial_tenant, new_timeline_id)
+
 
@@ -232,8 +236,10 @@ def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder
 FROM generate_series(1, 100000) g""",
 ])
 
+wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id)
 env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}")
 env.pageserver.safe_psql(f"compact {env.initial_tenant.hex} {new_timeline_id.hex}")
 
 assert_physical_size(env, env.initial_tenant, new_timeline_id)
+
 
@@ -254,15 +260,21 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder):
 SELECT 'long string to consume some space' || g
 FROM generate_series(1, 100000) g""",
 ])
 
+wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id)
+env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}")
+
 pg.safe_psql("""
 INSERT INTO foo
 SELECT 'long string to consume some space' || g
 FROM generate_series(1, 100000) g
 """)
 
+wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id)
+env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}")
+
 env.pageserver.safe_psql(f"do_gc {env.initial_tenant.hex} {new_timeline_id.hex} 0")
 
 assert_physical_size(env, env.initial_tenant, new_timeline_id)
 
 
@@ -279,6 +291,7 @@ def test_timeline_physical_size_metric(neon_simple_env: NeonEnv):
 FROM generate_series(1, 100000) g""",
 ])
 
+wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id)
 env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}")
 
 # get the metrics and parse the metric for the current timeline's physical size
@@ -319,6 +332,7 @@ def test_tenant_physical_size(neon_simple_env: NeonEnv):
 f"INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, {n_rows}) g",
 ])
 
+wait_for_last_flush_lsn(env, pg, tenant, timeline)
 env.pageserver.safe_psql(f"checkpoint {tenant.hex} {timeline.hex}")
 
 timeline_total_size += get_timeline_physical_size(timeline)
@@ -2475,3 +2475,9 @@ def wait_for_last_record_lsn(pageserver_http_client: NeonPageserverHttpClient,
 time.sleep(1)
 raise Exception("timed out while waiting for last_record_lsn to reach {}, was {}".format(
 lsn_to_hex(lsn), lsn_to_hex(current_lsn)))
+
+
+def wait_for_last_flush_lsn(env: NeonEnv, pg: Postgres, tenant: uuid.UUID, timeline: uuid.UUID):
+    """Wait for pageserver to catch up the latest flush LSN"""
+    last_flush_lsn = lsn_from_hex(pg.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
+    wait_for_last_record_lsn(env.pageserver.http_client(), tenant, timeline, last_flush_lsn)
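Only the tail of `wait_for_last_record_lsn` is visible in the last hunk. Under the assumption that the pageserver HTTP client exposes a timeline-detail call reporting the timeline's `last_record_lsn` (the call name and response layout below are illustrative, not taken from this diff), the whole polling helper plausibly has this shape:

```python
import time
import uuid


def lsn_from_hex(lsn_hex: str) -> int:
    """Parse a Postgres-style 'X/Y' LSN into an integer."""
    hi, lo = lsn_hex.split("/")
    return (int(hi, 16) << 32) + int(lo, 16)


def lsn_to_hex(lsn: int) -> str:
    """Format an integer LSN back into the 'X/Y' form."""
    return "{:X}/{:X}".format(lsn >> 32, lsn & 0xFFFFFFFF)


def wait_for_last_record_lsn_sketch(pageserver_http_client, tenant: uuid.UUID,
                                    timeline: uuid.UUID, lsn: int,
                                    timeout_seconds: int = 30) -> int:
    """Poll the pageserver until its last_record_lsn reaches `lsn`, or time out."""
    current_lsn = 0
    for _ in range(timeout_seconds):
        # `timeline_detail` and the response keys are assumptions about the
        # pageserver HTTP API, used here only to make the sketch concrete.
        detail = pageserver_http_client.timeline_detail(tenant, timeline)
        current_lsn = lsn_from_hex(detail["local"]["last_record_lsn"])
        if current_lsn >= lsn:
            return current_lsn
        time.sleep(1)
    raise Exception("timed out while waiting for last_record_lsn to reach {}, was {}".format(
        lsn_to_hex(lsn), lsn_to_hex(current_lsn)))
```

The new `wait_for_last_flush_lsn` in the diff then simply reads `pg_current_wal_flush_lsn()` on the compute and hands the result to this polling loop.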