Fix test_pageserver_http_get_wal_receiver_success flaky test. (#1786)

Fixes #1768.

## Context

Previously, to test the `get_wal_receiver` API, we ran some DB transactions and then called the API to check the latest message's LSN from the WAL receiver. However, this test was flaky because it's not guaranteed that the WAL receiver has received the latest WAL from the postgres/safekeeper at the time the API call is made.

This PR resolves the above issue by adding "poll and wait" logic that waits until the latest data can be retrieved from the WAL receiver.

This PR also fixes a bug where the test compared two hex LSN strings directly; they must be converted to numbers before the comparison. See: https://github.com/neondatabase/neon/issues/1768#issuecomment-1133752122.
This commit is contained in:
Thang Pham
2022-05-27 13:33:53 -04:00
committed by GitHub
parent cb8bf1beb6
commit 757746b571

View File

@@ -1,11 +1,14 @@
from typing import Optional
from uuid import uuid4, UUID
import pytest
from fixtures.utils import lsn_from_hex
from fixtures.zenith_fixtures import (
DEFAULT_BRANCH_NAME,
ZenithEnv,
ZenithEnvBuilder,
ZenithPageserverHttpClient,
ZenithPageserverApiException,
wait_until,
)
@@ -73,18 +76,35 @@ def test_pageserver_http_get_wal_receiver_success(zenith_simple_env: ZenithEnv):
tenant_id, timeline_id = env.zenith_cli.create_tenant()
pg = env.postgres.create_start(DEFAULT_BRANCH_NAME, tenant_id=tenant_id)
res = client.wal_receiver_get(tenant_id, timeline_id)
assert list(res.keys()) == [
"thread_id",
"wal_producer_connstr",
"last_received_msg_lsn",
"last_received_msg_ts",
]
def expect_updated_msg_lsn(prev_msg_lsn: Optional[int]) -> int:
    """Query the WAL receiver and return its last received message LSN.

    The hex LSN reported by `wal_receiver_get` is converted with
    `lsn_from_hex` so that it can be compared with `prev_msg_lsn`.

    Args:
        prev_msg_lsn: LSN returned by an earlier call, or None to skip
            the "LSN has advanced" check.

    Returns:
        The last received message's LSN, as converted by `lsn_from_hex`.

    Raises:
        AssertionError: if the response fields differ from the expected
            list, if no LSN has been received yet, or if the LSN has not
            moved past `prev_msg_lsn`.
    """
    response = client.wal_receiver_get(tenant_id, timeline_id)

    # A successful `wal_receiver_get` response must contain exactly the
    # fields below, in this order.
    expected_fields = [
        "thread_id",
        "wal_producer_connstr",
        "last_received_msg_lsn",
        "last_received_msg_ts",
    ]
    assert list(response.keys()) == expected_fields

    raw_lsn = response["last_received_msg_lsn"]
    assert raw_lsn is not None, "the last received message's LSN is empty"

    last_msg_lsn = lsn_from_hex(raw_lsn)
    if prev_msg_lsn is not None:
        # Only check for progress when the caller supplied a baseline.
        assert prev_msg_lsn < last_msg_lsn, (
            f"the last received message's LSN {last_msg_lsn} hasn't been updated "
            f"compared to the previous message's LSN {prev_msg_lsn}"
        )
    return last_msg_lsn
# Wait to make sure that we get the latest WAL receiver data.
# We need to wait here because it's possible that we don't have access to
# the latest WAL during the time the `wal_receiver_get` API is called.
# See: https://github.com/neondatabase/neon/issues/1768.
lsn = wait_until(number_of_iterations=5, interval=1, func=lambda: expect_updated_msg_lsn(None))
# Make a DB modification then expect getting a new WAL receiver's data.
pg.safe_psql("CREATE TABLE t(key int primary key, value text)")
res2 = client.wal_receiver_get(tenant_id, timeline_id)
assert res2["last_received_msg_lsn"] > res["last_received_msg_lsn"]
wait_until(number_of_iterations=5, interval=1, func=lambda: expect_updated_msg_lsn(lsn))
def test_pageserver_http_api_client(zenith_simple_env: ZenithEnv):