From 540973eac42135e8aec39e5d7b2f0c7b7a405023 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 17 Sep 2021 18:56:05 +0300 Subject: [PATCH] Don't get confused on request of latest page version with very old LSN. If the 'latest' flag in the client request is true, the client wants the latest page version regardless of the LSN in the request. The LSN is just a hint in that case, indicating that the page hasn't been modified since since that LSN. The LSN can be very old, so it's possible that the page server has already garbage collected away the layer at that LSN. We tried to fetch the old layer and errored out if that happened. To fix, always fetch the data as of last-record-LSN, if 'latest' is set in the client request. We now only use the LSN to wait if the requested LSN hasn't been received and processed yet. Fixes https://github.com/zenithdb/zenith/issues/567 --- pageserver/src/page_service.rs | 56 +++++++++++++--- .../batch_others/test_old_request_lsn.py | 65 +++++++++++++++++++ 2 files changed, 112 insertions(+), 9 deletions(-) create mode 100644 test_runner/batch_others/test_old_request_lsn.py diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 1947e4b059..78e25df563 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -313,14 +313,52 @@ impl PageServerHandler { Ok(()) } - /// Helper function to wait for given lsn, or get the latest LSN if it's 0 + /// Helper function to handle the LSN from client request. /// - /// When invalid LSN is requested, it means "don't wait, return latest version of the page" - /// This is necessary for bootstrap. - fn wait_or_get_last_lsn(timeline: &dyn Timeline, lsn: Lsn) -> Result { - if lsn == Lsn(0) { - Ok(timeline.get_last_record_lsn()) + /// Each GetPage (and Exists and Nblocks) request includes information about + /// which version of the page is being requested. The client can request the + /// latest version of the page, or the version that's valid at a particular + /// LSN. The primary compute node will always request the latest page + /// version, while a standby will request a version at the LSN that it's + /// currently caught up to. + /// + /// In either case, if the page server hasn't received the WAL up to the + /// requested LSN yet, we will wait for it to arrive. The return value is + /// the LSN that should be used to look up the page versions. + fn wait_or_get_last_lsn(timeline: &dyn Timeline, lsn: Lsn, latest: bool) -> Result { + if latest { + // Latest page version was requested. If LSN is given, it is a hint + // to the page server that there have been no modifications to the + // page after that LSN. If we haven't received WAL up to that point, + // wait until it arrives. + let last_record_lsn = timeline.get_last_record_lsn(); + + // Note: this covers the special case that lsn == Lsn(0). That + // special case means "return the latest version whatever it is", + // and it's used for bootstrapping purposes, when the page server is + // connected directly to the compute node. That is needed because + // when you connect to the compute node, to receive the WAL, the + // walsender process will do a look up in the pg_authid catalog + // table for authentication. That poses a deadlock problem: the + // catalog table lookup will send a GetPage request, but the GetPage + // request will block in the page server because the recent WAL + // hasn't been received yet, and it cannot be received until the + // walsender completes the authentication and starts streaming the + // WAL. + if lsn <= last_record_lsn { + Ok(last_record_lsn) + } else { + timeline.wait_lsn(lsn)?; + // Since we waited for 'lsn' to arrive, that is now the last + // record LSN. (Or close enough for our purposes; the + // last-record LSN can advance immediately after we return + // anyway) + Ok(lsn) + } } else { + if lsn == Lsn(0) { + bail!("invalid LSN(0) in request"); + } timeline.wait_lsn(lsn)?; Ok(lsn) } @@ -332,7 +370,7 @@ impl PageServerHandler { req: &PagestreamExistsRequest, ) -> Result { let tag = RelishTag::Relation(req.rel); - let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn)?; + let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest)?; let exists = timeline.get_rel_exists(tag, lsn)?; @@ -347,7 +385,7 @@ impl PageServerHandler { req: &PagestreamNblocksRequest, ) -> Result { let tag = RelishTag::Relation(req.rel); - let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn)?; + let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest)?; let n_blocks = timeline.get_relish_size(tag, lsn)?; @@ -366,7 +404,7 @@ impl PageServerHandler { req: &PagestreamGetPageRequest, ) -> Result { let tag = RelishTag::Relation(req.rel); - let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn)?; + let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest)?; let page = timeline.get_page_at_lsn(tag, req.blkno, lsn)?; diff --git a/test_runner/batch_others/test_old_request_lsn.py b/test_runner/batch_others/test_old_request_lsn.py new file mode 100644 index 0000000000..fafa37fe5a --- /dev/null +++ b/test_runner/batch_others/test_old_request_lsn.py @@ -0,0 +1,65 @@ +from contextlib import closing + +from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver + +pytest_plugins = ("fixtures.zenith_fixtures") + +# +# Test where Postgres generates a lot of WAL, and it's garbage collected away, but +# no pages are evicted so that Postgres uses an old LSN in a GetPage request. +# We had a bug where the page server failed to find the page version because it +# thought it was already garbage collected away, because the LSN in the GetPage +# request was very old and the WAL from that time was indeed already removed. +# In reality, the LSN on a GetPage request coming from a primary server is +# just a hint that the page hasn't been modified since that LSN, and the page +# server should return the latest page version regardless of the LSN. +# +def test_old_request_lsn(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin): + # Create a branch for us + zenith_cli.run(["branch", "test_old_request_lsn", "empty"]) + pg = postgres.create_start('test_old_request_lsn') + print('postgres is running on test_old_request_lsn branch') + + pg_conn = pg.connect() + cur = pg_conn.cursor() + + # Get the timeline ID of our branch. We need it for the 'do_gc' command + cur.execute("SHOW zenith.zenith_timeline") + timeline = cur.fetchone()[0] + + psconn = pageserver.connect() + pscur = psconn.cursor() + + # Create table, and insert some rows. Make it big enough that it doesn't fit in + # shared_buffers. + cur.execute('CREATE TABLE foo (id int4 PRIMARY KEY, val int, t text)') + cur.execute(''' + INSERT INTO foo + SELECT g, 1, 'long string to consume some space' || g + FROM generate_series(1, 100000) g + ''') + + # Verify that the table is larger than shared_buffers, so that the SELECT below + # will cause GetPage requests. + cur.execute(''' + select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_ize + from pg_settings where name = 'shared_buffers' + ''') + row = cur.fetchone() + print("shared_buffers is {}, table size {}", row[0], row[1]); + assert int(row[0]) < int(row[1]) + + cur.execute('VACUUM foo'); + + # Make a lot of updates on a single row, generating a lot of WAL. Trigger + # garbage collections so that the page server will remove old page versions. + for i in range(10): + pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0") + for j in range(100): + cur.execute('UPDATE foo SET val = val + 1 WHERE id = 1;'); + + # All (or at least most of) the updates should've been on the same page, so + # that we haven't had to evict any dirty pages for a long time. Now run + # a query that sends GetPage@LSN requests with the old LSN. + cur.execute("SELECT COUNT(*), SUM(val) FROM foo"); + assert cur.fetchone() == (100000, 101000)