Fix test_remote_storage_upload_queue_retries flakiness (#6898)

* decrease checkpointing and compaction targets so that even more layer
  files are generated
* write 10 thousand rows 2 times instead of writing 20 thousand rows 1
  time, so that there is more to GC. Before, the layer file count was noisily
  jumping between 1 and 0; now it jumps between 19 and 20. The 0 caused an
  assertion error that was responsible for most of the test's flakiness.
* use a larger timeout for the "churn while failpoints are active" thread:
  this is mostly so that the test stays robust on systems under heavier load

Fixes #3051
Author: Arpad Müller
Date: 2024-02-26 13:21:40 +01:00
Committed by: GitHub
Parent: 256058f2ab
Commit: 51a43b121c


@@ -228,9 +228,9 @@ def test_remote_storage_upload_queue_retries(
     tenant_id, timeline_id = env.neon_cli.create_tenant(
         conf={
             # small checkpointing and compaction targets to ensure we generate many upload operations
-            "checkpoint_distance": f"{128 * 1024}",
+            "checkpoint_distance": f"{64 * 1024}",
             "compaction_threshold": "1",
-            "compaction_target_size": f"{128 * 1024}",
+            "compaction_target_size": f"{64 * 1024}",
             # no PITR horizon, we specify the horizon when we request on-demand GC
             "pitr_interval": "0s",
             # disable background compaction and GC. We invoke it manually when we want it to happen.
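
Note: as I understand the pageserver tenant config, checkpoint_distance caps how much WAL is buffered in the open layer before a new L0 layer is flushed, and compaction_target_size bounds the size of layers produced by compaction, so halving both should roughly double the number of layer files the same workload produces. A back-of-the-envelope sketch only; the per-row WAL figure is a made-up placeholder:

    # rough estimate only; BYTES_OF_WAL_PER_ROW is a hypothetical placeholder
    BYTES_OF_WAL_PER_ROW = 100
    rows_written = 2 * 4000  # two upsert passes over FOO_ROWS_COUNT rows

    for checkpoint_distance in (128 * 1024, 64 * 1024):
        approx_l0_layers = rows_written * BYTES_OF_WAL_PER_ROW // checkpoint_distance
        print(f"checkpoint_distance={checkpoint_distance}: ~{approx_l0_layers} L0 layers")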
@@ -256,21 +256,24 @@ def test_remote_storage_upload_queue_retries(
         ]
     )
 
+    FOO_ROWS_COUNT = 4000
+
     def overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data):
         # create initial set of layers & upload them with failpoints configured
-        endpoint.safe_psql_many(
-            [
-                f"""
-                INSERT INTO foo (id, val)
-                SELECT g, '{data}'
-                FROM generate_series(1, 20000) g
-                ON CONFLICT (id) DO UPDATE
-                SET val = EXCLUDED.val
-                """,
-                # to ensure that GC can actually remove some layers
-                "VACUUM foo",
-            ]
-        )
+        for _v in range(2):
+            endpoint.safe_psql_many(
+                [
+                    f"""
+                    INSERT INTO foo (id, val)
+                    SELECT g, '{data}'
+                    FROM generate_series(1, {FOO_ROWS_COUNT}) g
+                    ON CONFLICT (id) DO UPDATE
+                    SET val = EXCLUDED.val
+                    """,
+                    # to ensure that GC can actually remove some layers
+                    "VACUUM foo",
+                ]
+            )
         wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
 
     def get_queued_count(file_kind, op_kind):
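
Note: running the upsert twice means every row version written by the first pass is superseded by the second, so after VACUUM and a flush the pageserver is left with data it can actually garbage-collect, which is the point of this change. A standalone sketch of the same write pattern outside the test harness; psycopg2, the connection string, and the table definition are illustrative assumptions, not part of the test:

    import psycopg2  # illustrative; the test drives this through endpoint.safe_psql_many

    ROWS = 4000  # mirrors FOO_ROWS_COUNT
    conn = psycopg2.connect("postgresql://localhost/postgres")  # hypothetical DSN
    conn.autocommit = True  # VACUUM cannot run inside a transaction block
    with conn.cursor() as cur:
        cur.execute("CREATE TABLE IF NOT EXISTS foo (id integer PRIMARY KEY, val text)")
        for pass_no in range(2):
            # the second pass overwrites every row from the first, creating garbage
            cur.execute(
                """
                INSERT INTO foo (id, val)
                SELECT g, %s FROM generate_series(1, %s) g
                ON CONFLICT (id) DO UPDATE SET val = EXCLUDED.val
                """,
                (f"pass-{pass_no}", ROWS),
            )
            cur.execute("VACUUM foo")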
@@ -333,7 +336,7 @@ def test_remote_storage_upload_queue_retries(
     # The churn thread doesn't make progress once it blocks on the first wait_completion() call,
     # so, give it some time to wrap up.
-    churn_while_failpoints_active_thread.join(30)
+    churn_while_failpoints_active_thread.join(60)
     assert not churn_while_failpoints_active_thread.is_alive()
     assert churn_thread_result[0]
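
Note: the bump from 30 to 60 seconds only changes how long the main thread is willing to wait. Thread.join(timeout) returns silently when the timeout elapses, which is why the explicit is_alive() assertion follows it. A minimal sketch of the pattern, with the churn body and result passing simplified and the failpoint setup elided:

    import threading

    churn_thread_result = [False]  # assumption: the thread reports success through this list

    def churn_while_failpoints_active(result):
        # ... exercise uploads while failpoints force the queue to retry ...
        result[0] = True

    churn_while_failpoints_active_thread = threading.Thread(
        target=churn_while_failpoints_active, args=[churn_thread_result]
    )
    churn_while_failpoints_active_thread.start()
    # ... configure failpoints, wait, clear failpoints (not shown) ...
    churn_while_failpoints_active_thread.join(60)  # returns even if the thread is still running
    assert not churn_while_failpoints_active_thread.is_alive()
    assert churn_thread_result[0]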
@@ -365,7 +368,7 @@ def test_remote_storage_upload_queue_retries(
     log.info("restarting postgres to validate")
     endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
     with endpoint.cursor() as cur:
-        assert query_scalar(cur, "SELECT COUNT(*) FROM foo WHERE val = 'd'") == 20000
+        assert query_scalar(cur, "SELECT COUNT(*) FROM foo WHERE val = 'd'") == FOO_ROWS_COUNT
 
 
 def test_remote_timeline_client_calls_started_metric(
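
Note: the validation above uses query_scalar, which is, as far as I can tell, the test-suite helper for running a query that returns a single value. A minimal sketch of the assumed behaviour, not the actual fixtures implementation:

    def query_scalar(cur, query):
        # assumed behaviour: execute a query that yields exactly one row with
        # one column and return that single value
        cur.execute(query)
        return cur.fetchone()[0]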