Fix on-demand SLRU download on standby starting at WAL segment boundary (#8031)

If a standby is started right after switching to a new WAL segment, the
request in the SLRU download request would point to the beginning of the
segment (e.g. 0/5000000), while the not-modified-since LSN would point
to just after the page header (e.g. 0/5000028). It's effectively the
same position, as there cannot be any WAL records in between, but the
pageserver rightly errors out on any request where the request LSN <
not-modified since LSN.

To fix, round down the not-modified since LSN to the beginning of the
page like the request LSN.

Fixes issue #8030
This commit is contained in:
Heikki Linnakangas
2024-06-13 00:31:31 +03:00
committed by GitHub
parent ad0ab3b81b
commit dc2ab4407f
2 changed files with 32 additions and 2 deletions

View File

@@ -3112,12 +3112,12 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
request_lsn = UINT64_MAX;
/*
* GetRedoStartLsn() returns LSN of basebackup. We know that the SLRU
* GetRedoStartLsn() returns LSN of the basebackup. We know that the SLRU
* segment has not changed since the basebackup, because in order to
* modify it, we would have had to download it already. And once
* downloaded, we never evict SLRU segments from local disk.
*/
not_modified_since = GetRedoStartLsn();
not_modified_since = nm_adjust_lsn(GetRedoStartLsn());
SlruKind kind;

View File

@@ -129,3 +129,33 @@ def test_ondemand_download_replica(neon_env_builder: NeonEnvBuilder, shard_count
cur_replica = conn_replica.cursor()
cur_replica.execute("SELECT * FROM clogtest")
assert cur_replica.fetchall() == [(1,), (3,)]
def test_ondemand_download_after_wal_switch(neon_env_builder: NeonEnvBuilder):
"""
Test on-demand SLRU download on standby, when starting right after
WAL segment switch.
This is a repro for a bug in how the LSN at WAL page/segment
boundary was handled (https://github.com/neondatabase/neon/issues/8030)
"""
tenant_conf = {
"lazy_slru_download": "true",
}
env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf)
endpoint = env.endpoints.create_start("main")
pg_conn = endpoint.connect()
cur = pg_conn.cursor()
# Create a test table
cur.execute("CREATE TABLE clogtest (id integer)")
cur.execute("INSERT INTO clogtest VALUES (1)")
# Start standby at WAL segment boundary
cur.execute("SELECT pg_switch_wal()")
lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_insert_lsn()"))
_endpoint_at_lsn = env.endpoints.create_start(
branch_name="main", endpoint_id="ep-at-lsn", lsn=lsn
)