mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-15 09:22:55 +00:00
Simplify test_wal_page_boundary_start test (#8214)
All the code to ensure the WAL record lands at a page boundary was unnecessary for reproducing the original problem. In fact, it's a pretty basic test that checks that outbound replication (= neon as publisher) still works after restarting the endpoint. It just used to be very broken before commit5ceccdc7de, which also added this test. To verify that: 1. Check out commitf3af5f4660(because the next commit,7dd58e1449, fixed the same bug in a different way, making it infeasible to revert the bug fix in an easy way) 2. Revert the bug fix from commit5ceccdc7dewith this: ``` diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index 7debb6325..9f03bbd99 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -1437,8 +1437,10 @@ XLogWalPropWrite(WalProposer *wp, char *buf, Size nbytes, XLogRecPtr recptr) * * https://github.com/neondatabase/neon/issues/5749 */ +#if 0 if (!wp->config->syncSafekeepers) XLogUpdateWalBuffers(buf, recptr, nbytes); +#endif while (nbytes > 0) { ``` 3. Run the test_wal_page_boundary_start regression test. It fails, as expected 4. Apply this commit to the test, and run it again. It still fails, with the same error mentioned in issue #5749: ``` PG:2024-06-30 20:49:08.805 GMT [1248196] STATEMENT: START_REPLICATION SLOT "sub1" LOGICAL 0/0 (proto_version '4', origin 'any', publication_names '"pub1"') PG:2024-06-30 21:37:52.567 GMT [1467972] LOG: starting logical decoding for slot "sub1" PG:2024-06-30 21:37:52.567 GMT [1467972] DETAIL: Streaming transactions committing after 0/1532330, reading WAL from 0/1531C78. PG:2024-06-30 21:37:52.567 GMT [1467972] STATEMENT: START_REPLICATION SLOT "sub1" LOGICAL 0/0 (proto_version '4', origin 'any', publication_names '"pub1"') PG:2024-06-30 21:37:52.567 GMT [1467972] LOG: logical decoding found consistent point at 0/1531C78 PG:2024-06-30 21:37:52.567 GMT [1467972] DETAIL: There are no running transactions. PG:2024-06-30 21:37:52.567 GMT [1467972] STATEMENT: START_REPLICATION SLOT "sub1" LOGICAL 0/0 (proto_version '4', origin 'any', publication_names '"pub1"') PG:2024-06-30 21:37:52.568 GMT [1467972] ERROR: could not find record while sending logically-decoded data: invalid contrecord length 312 (expected 6) at 0/1533FD8 ```
This commit is contained in:
committed by
Vlad Lazar
parent
bfae30a086
commit
0f55da3629
@@ -4,7 +4,6 @@ from random import choice
|
||||
from string import ascii_lowercase
|
||||
|
||||
import pytest
|
||||
from fixtures.common_types import Lsn
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import (
|
||||
AuxFileStore,
|
||||
@@ -13,7 +12,7 @@ from fixtures.neon_fixtures import (
|
||||
logical_replication_sync,
|
||||
wait_for_last_flush_lsn,
|
||||
)
|
||||
from fixtures.utils import query_scalar, wait_until
|
||||
from fixtures.utils import wait_until
|
||||
|
||||
|
||||
def random_string(n: int):
|
||||
@@ -326,12 +325,17 @@ FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of
|
||||
assert "could not receive data from WAL stream" not in logs
|
||||
|
||||
|
||||
# Test compute start at LSN page of which starts with contrecord
|
||||
# https://github.com/neondatabase/neon/issues/5749
|
||||
# Test replication of WAL record spanning page boundary (with contrecord) after
|
||||
# compute restart and WAL write of the page.
|
||||
#
|
||||
# See https://github.com/neondatabase/neon/issues/5749
|
||||
#
|
||||
# Most pages start with a contrecord, so we don't do anything special
|
||||
# to ensure that.
|
||||
@pytest.mark.parametrize(
|
||||
"pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.CrossValidation]
|
||||
)
|
||||
def test_wal_page_boundary_start(neon_simple_env: NeonEnv, vanilla_pg):
|
||||
def test_restart_endpoint(neon_simple_env: NeonEnv, vanilla_pg):
|
||||
env = neon_simple_env
|
||||
|
||||
env.neon_cli.create_branch("init")
|
||||
@@ -356,52 +360,6 @@ def test_wal_page_boundary_start(neon_simple_env: NeonEnv, vanilla_pg):
|
||||
logical_replication_sync(vanilla_pg, endpoint)
|
||||
vanilla_pg.stop()
|
||||
|
||||
with endpoint.cursor() as cur:
|
||||
# measure how much space logical message takes. Sometimes first attempt
|
||||
# creates huge message and then it stabilizes, have no idea why.
|
||||
for _ in range(3):
|
||||
lsn_before = Lsn(query_scalar(cur, "select pg_current_wal_lsn()"))
|
||||
log.info(f"current_lsn={lsn_before}")
|
||||
# Non-transactional logical message doesn't write WAL, only XLogInsert's
|
||||
# it, so use transactional. Which is a bit problematic as transactional
|
||||
# necessitates commit record. Alternatively we can do smth like
|
||||
# select neon_xlogflush(pg_current_wal_insert_lsn());
|
||||
# but isn't much better + that particular call complains on 'xlog flush
|
||||
# request 0/282C018 is not satisfied' as pg_current_wal_insert_lsn skips
|
||||
# page headers.
|
||||
payload = "blahblah"
|
||||
cur.execute(f"select pg_logical_emit_message(true, 'pref', '{payload}')")
|
||||
lsn_after_by_curr_wal_lsn = Lsn(query_scalar(cur, "select pg_current_wal_lsn()"))
|
||||
lsn_diff = lsn_after_by_curr_wal_lsn - lsn_before
|
||||
logical_message_base = lsn_after_by_curr_wal_lsn - lsn_before - len(payload)
|
||||
log.info(
|
||||
f"before {lsn_before}, after {lsn_after_by_curr_wal_lsn}, lsn diff is {lsn_diff}, base {logical_message_base}"
|
||||
)
|
||||
|
||||
# and write logical message spanning exactly as we want
|
||||
lsn_before = Lsn(query_scalar(cur, "select pg_current_wal_lsn()"))
|
||||
log.info(f"current_lsn={lsn_before}")
|
||||
curr_lsn = Lsn(query_scalar(cur, "select pg_current_wal_lsn()"))
|
||||
offs = int(curr_lsn) % 8192
|
||||
till_page = 8192 - offs
|
||||
payload_len = (
|
||||
till_page - logical_message_base - 8
|
||||
) # not sure why 8 is here, it is deduced from experiments
|
||||
log.info(f"current_lsn={curr_lsn}, offs {offs}, till_page {till_page}")
|
||||
|
||||
# payload_len above would go exactly till the page boundary; but we want contrecord, so make it slightly longer
|
||||
payload_len += 8
|
||||
|
||||
cur.execute(f"select pg_logical_emit_message(true, 'pref', 'f{'a' * payload_len}')")
|
||||
supposedly_contrecord_end = Lsn(query_scalar(cur, "select pg_current_wal_lsn()"))
|
||||
log.info(f"supposedly_page_boundary={supposedly_contrecord_end}")
|
||||
# The calculations to hit the page boundary are very fuzzy, so just
|
||||
# ignore test if we fail to reach it.
|
||||
if not (int(supposedly_contrecord_end) % 8192 == 32):
|
||||
pytest.skip("missed page boundary, bad luck")
|
||||
|
||||
cur.execute("insert into replication_example values (2, 3)")
|
||||
|
||||
wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
|
||||
endpoint.stop().start()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user