From dc2ab4407f8b9636a6e570818154f21fde14b9ce Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 13 Jun 2024 00:31:31 +0300 Subject: [PATCH] Fix on-demand SLRU download on standby starting at WAL segment boundary (#8031) If a standby is started right after switching to a new WAL segment, the request in the SLRU download request would point to the beginning of the segment (e.g. 0/5000000), while the not-modified-since LSN would point to just after the page header (e.g. 0/5000028). It's effectively the same position, as there cannot be any WAL records in between, but the pageserver rightly errors out on any request where the request LSN < not-modified since LSN. To fix, round down the not-modified since LSN to the beginning of the page like the request LSN. Fixes issue #8030 --- pgxn/neon/pagestore_smgr.c | 4 +-- .../regress/test_ondemand_slru_download.py | 30 +++++++++++++++++++ 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 6305f2ec92..8edaf65639 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -3112,12 +3112,12 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf request_lsn = UINT64_MAX; /* - * GetRedoStartLsn() returns LSN of basebackup. We know that the SLRU + * GetRedoStartLsn() returns LSN of the basebackup. We know that the SLRU * segment has not changed since the basebackup, because in order to * modify it, we would have had to download it already. And once * downloaded, we never evict SLRU segments from local disk. */ - not_modified_since = GetRedoStartLsn(); + not_modified_since = nm_adjust_lsn(GetRedoStartLsn()); SlruKind kind; diff --git a/test_runner/regress/test_ondemand_slru_download.py b/test_runner/regress/test_ondemand_slru_download.py index 4af7dcdfc3..d6babe4393 100644 --- a/test_runner/regress/test_ondemand_slru_download.py +++ b/test_runner/regress/test_ondemand_slru_download.py @@ -129,3 +129,33 @@ def test_ondemand_download_replica(neon_env_builder: NeonEnvBuilder, shard_count cur_replica = conn_replica.cursor() cur_replica.execute("SELECT * FROM clogtest") assert cur_replica.fetchall() == [(1,), (3,)] + + +def test_ondemand_download_after_wal_switch(neon_env_builder: NeonEnvBuilder): + """ + Test on-demand SLRU download on standby, when starting right after + WAL segment switch. + + This is a repro for a bug in how the LSN at WAL page/segment + boundary was handled (https://github.com/neondatabase/neon/issues/8030) + """ + + tenant_conf = { + "lazy_slru_download": "true", + } + env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf) + + endpoint = env.endpoints.create_start("main") + pg_conn = endpoint.connect() + cur = pg_conn.cursor() + + # Create a test table + cur.execute("CREATE TABLE clogtest (id integer)") + cur.execute("INSERT INTO clogtest VALUES (1)") + + # Start standby at WAL segment boundary + cur.execute("SELECT pg_switch_wal()") + lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_insert_lsn()")) + _endpoint_at_lsn = env.endpoints.create_start( + branch_name="main", endpoint_id="ep-at-lsn", lsn=lsn + )