Refactor how the request LSNs are tracked in compute (#7377)

Instead of thinking in terms of 'latest' and 'lsn' of the request,
each request has two LSNs: the request LSN and 'not_modified_since'
LSN. The request is nominally made at the request LSN, that determines
what page version we want to see. But as a hint, we also include
'not_modified_since'. It tells the pageserver that the page has not
been modified since that LSN, which allows the pageserver to skip
waiting for newer WAL to arrive, and could allow more optimizations in
the future.

Refactor the internal functions to calculate the request LSN to
calculate both LSNs.

Sending two LSNs to the pageserver requires using the new protocol
version 2. The previous commit added the server support for it, but we
still default to the old protocol for compatibility with old
pageservers. The 'neon.protocol_version' GUC can be used to use the
new protocol.

The new protocol addresses one cause of issue #6211, although you can
still get the same error if you have a standby that is lagging behind
so that the page version it needs is genuinely GC'd away.
This commit is contained in:
Heikki Linnakangas
2024-04-25 19:45:45 +03:00
committed by Heikki Linnakangas
parent 4917f52c88
commit a2a44ea213
9 changed files with 377 additions and 202 deletions

View File

@@ -17,7 +17,14 @@ def test_read_validation(neon_simple_env: NeonEnv):
env = neon_simple_env
env.neon_cli.create_branch("test_read_validation", "empty")
endpoint = env.endpoints.create_start("test_read_validation")
endpoint = env.endpoints.create_start(
"test_read_validation",
# Use protocol version 2, because the code that constructs the V1 messages
# assumes that a primary always wants to read the latest version of a page,
# and therefore doesn't work with the test functions below to read an older
# page version.
config_lines=["neon.protocol_version=2"],
)
with closing(endpoint.connect()) as con:
with con.cursor() as c:
@@ -64,7 +71,7 @@ def test_read_validation(neon_simple_env: NeonEnv):
log.info("Cache is clear, reading stale page version")
c.execute(
f"select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, '{first[0]}'))"
f"select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, '{first[0]}', NULL))"
)
direct_first = c.fetchone()
assert first == direct_first, "Failed fetch page at historic lsn"
@@ -77,7 +84,7 @@ def test_read_validation(neon_simple_env: NeonEnv):
log.info("Cache is clear, reading latest page version without cache")
c.execute(
"select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, NULL))"
"select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, NULL, NULL))"
)
direct_latest = c.fetchone()
assert second == direct_latest, "Failed fetch page at latest lsn"
@@ -92,7 +99,7 @@ def test_read_validation(neon_simple_env: NeonEnv):
)
c.execute(
f"select lsn, lower, upper from page_header(get_raw_page_at_lsn({reln[0]}, {reln[1]}, {reln[2]}, 0, 0, '{first[0]}'))"
f"select lsn, lower, upper from page_header(get_raw_page_at_lsn({reln[0]}, {reln[1]}, {reln[2]}, 0, 0, '{first[0]}', NULL))"
)
direct_first = c.fetchone()
assert first == direct_first, "Failed fetch page at historic lsn using oid"
@@ -102,7 +109,7 @@ def test_read_validation(neon_simple_env: NeonEnv):
)
c.execute(
f"select lsn, lower, upper from page_header(get_raw_page_at_lsn({reln[0]}, {reln[1]}, {reln[2]}, 0, 0, NULL))"
f"select lsn, lower, upper from page_header(get_raw_page_at_lsn({reln[0]}, {reln[1]}, {reln[2]}, 0, 0, NULL, NULL))"
)
direct_latest = c.fetchone()
assert second == direct_latest, "Failed fetch page at latest lsn"
@@ -114,7 +121,7 @@ def test_read_validation(neon_simple_env: NeonEnv):
)
c.execute(
f"select lsn, lower, upper from page_header(get_raw_page_at_lsn({reln[0]}, {reln[1]}, {reln[2]}, 0, 0, '{first[0]}'))"
f"select lsn, lower, upper from page_header(get_raw_page_at_lsn({reln[0]}, {reln[1]}, {reln[2]}, 0, 0, '{first[0]}', NULL))"
)
direct_first = c.fetchone()
assert first == direct_first, "Failed fetch page at historic lsn using oid"
@@ -133,7 +140,14 @@ def test_read_validation_neg(neon_simple_env: NeonEnv):
env.pageserver.allowed_errors.append(".*invalid LSN\\(0\\) in request.*")
endpoint = env.endpoints.create_start("test_read_validation_neg")
endpoint = env.endpoints.create_start(
"test_read_validation_neg",
# Use protocol version 2, because the code that constructs the V1 messages
# assumes that a primary always wants to read the latest version of a page,
# and therefore doesn't work with the test functions below to read an older
# page version.
config_lines=["neon.protocol_version=2"],
)
with closing(endpoint.connect()) as con:
with con.cursor() as c:
@@ -143,7 +157,7 @@ def test_read_validation_neg(neon_simple_env: NeonEnv):
log.info("read a page of a missing relation")
try:
c.execute(
"select lsn, lower, upper from page_header(get_raw_page_at_lsn('Unknown', 'main', 0, '0/0'))"
"select lsn, lower, upper from page_header(get_raw_page_at_lsn('Unknown', 'main', 0, '0/0', NULL))"
)
raise AssertionError("query should have failed")
except UndefinedTable as e:
@@ -155,7 +169,7 @@ def test_read_validation_neg(neon_simple_env: NeonEnv):
log.info("read a page at lsn 0")
try:
c.execute(
"select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, '0/0'))"
"select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, '0/0', NULL))"
)
raise AssertionError("query should have failed")
except IoError as e:
@@ -164,22 +178,22 @@ def test_read_validation_neg(neon_simple_env: NeonEnv):
log.info("Pass NULL as an input")
expected = (None, None, None)
c.execute(
"select lsn, lower, upper from page_header(get_raw_page_at_lsn(NULL, 'main', 0, '0/0'))"
"select lsn, lower, upper from page_header(get_raw_page_at_lsn(NULL, 'main', 0, '0/0', NULL))"
)
assert c.fetchone() == expected, "Expected null output"
c.execute(
"select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', NULL, 0, '0/0'))"
"select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', NULL, 0, '0/0', NULL))"
)
assert c.fetchone() == expected, "Expected null output"
c.execute(
"select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', NULL, '0/0'))"
"select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', NULL, '0/0', NULL))"
)
assert c.fetchone() == expected, "Expected null output"
# This check is currently failing, reading beyond EOF is returning a 0-page
log.info("Read beyond EOF")
c.execute(
"select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 1, NULL))"
"select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 1, NULL, NULL))"
)

View File

@@ -173,7 +173,9 @@ def test_vm_bit_clear_on_heap_lock(neon_env_builder: NeonEnvBuilder):
# which changes the LSN on the page.
cur.execute("select get_raw_page( 'vmtest_lock', 'vm', 0 )")
vm_page_in_cache = (cur.fetchall()[0][0])[8:100].hex()
cur.execute("select get_raw_page_at_lsn( 'vmtest_lock', 'vm', 0, pg_current_wal_insert_lsn() )")
cur.execute(
"select get_raw_page_at_lsn( 'vmtest_lock', 'vm', 0, pg_current_wal_insert_lsn(), NULL )"
)
vm_page_at_pageserver = (cur.fetchall()[0][0])[8:100].hex()
assert vm_page_at_pageserver == vm_page_in_cache