diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 1947e4b059..78e25df563 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -313,14 +313,52 @@ impl PageServerHandler { Ok(()) } - /// Helper function to wait for given lsn, or get the latest LSN if it's 0 + /// Helper function to handle the LSN from client request. /// - /// When invalid LSN is requested, it means "don't wait, return latest version of the page" - /// This is necessary for bootstrap. - fn wait_or_get_last_lsn(timeline: &dyn Timeline, lsn: Lsn) -> Result { - if lsn == Lsn(0) { - Ok(timeline.get_last_record_lsn()) + /// Each GetPage (and Exists and Nblocks) request includes information about + /// which version of the page is being requested. The client can request the + /// latest version of the page, or the version that's valid at a particular + /// LSN. The primary compute node will always request the latest page + /// version, while a standby will request a version at the LSN that it's + /// currently caught up to. + /// + /// In either case, if the page server hasn't received the WAL up to the + /// requested LSN yet, we will wait for it to arrive. The return value is + /// the LSN that should be used to look up the page versions. + fn wait_or_get_last_lsn(timeline: &dyn Timeline, lsn: Lsn, latest: bool) -> Result { + if latest { + // Latest page version was requested. If LSN is given, it is a hint + // to the page server that there have been no modifications to the + // page after that LSN. If we haven't received WAL up to that point, + // wait until it arrives. + let last_record_lsn = timeline.get_last_record_lsn(); + + // Note: this covers the special case that lsn == Lsn(0). That + // special case means "return the latest version whatever it is", + // and it's used for bootstrapping purposes, when the page server is + // connected directly to the compute node. That is needed because + // when you connect to the compute node, to receive the WAL, the + // walsender process will do a look up in the pg_authid catalog + // table for authentication. That poses a deadlock problem: the + // catalog table lookup will send a GetPage request, but the GetPage + // request will block in the page server because the recent WAL + // hasn't been received yet, and it cannot be received until the + // walsender completes the authentication and starts streaming the + // WAL. + if lsn <= last_record_lsn { + Ok(last_record_lsn) + } else { + timeline.wait_lsn(lsn)?; + // Since we waited for 'lsn' to arrive, that is now the last + // record LSN. (Or close enough for our purposes; the + // last-record LSN can advance immediately after we return + // anyway) + Ok(lsn) + } } else { + if lsn == Lsn(0) { + bail!("invalid LSN(0) in request"); + } timeline.wait_lsn(lsn)?; Ok(lsn) } @@ -332,7 +370,7 @@ impl PageServerHandler { req: &PagestreamExistsRequest, ) -> Result { let tag = RelishTag::Relation(req.rel); - let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn)?; + let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest)?; let exists = timeline.get_rel_exists(tag, lsn)?; @@ -347,7 +385,7 @@ impl PageServerHandler { req: &PagestreamNblocksRequest, ) -> Result { let tag = RelishTag::Relation(req.rel); - let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn)?; + let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest)?; let n_blocks = timeline.get_relish_size(tag, lsn)?; @@ -366,7 +404,7 @@ impl PageServerHandler { req: &PagestreamGetPageRequest, ) -> Result { let tag = RelishTag::Relation(req.rel); - let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn)?; + let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest)?; let page = timeline.get_page_at_lsn(tag, req.blkno, lsn)?; diff --git a/test_runner/batch_others/test_old_request_lsn.py b/test_runner/batch_others/test_old_request_lsn.py new file mode 100644 index 0000000000..fafa37fe5a --- /dev/null +++ b/test_runner/batch_others/test_old_request_lsn.py @@ -0,0 +1,65 @@ +from contextlib import closing + +from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver + +pytest_plugins = ("fixtures.zenith_fixtures") + +# +# Test where Postgres generates a lot of WAL, and it's garbage collected away, but +# no pages are evicted so that Postgres uses an old LSN in a GetPage request. +# We had a bug where the page server failed to find the page version because it +# thought it was already garbage collected away, because the LSN in the GetPage +# request was very old and the WAL from that time was indeed already removed. +# In reality, the LSN on a GetPage request coming from a primary server is +# just a hint that the page hasn't been modified since that LSN, and the page +# server should return the latest page version regardless of the LSN. +# +def test_old_request_lsn(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin): + # Create a branch for us + zenith_cli.run(["branch", "test_old_request_lsn", "empty"]) + pg = postgres.create_start('test_old_request_lsn') + print('postgres is running on test_old_request_lsn branch') + + pg_conn = pg.connect() + cur = pg_conn.cursor() + + # Get the timeline ID of our branch. We need it for the 'do_gc' command + cur.execute("SHOW zenith.zenith_timeline") + timeline = cur.fetchone()[0] + + psconn = pageserver.connect() + pscur = psconn.cursor() + + # Create table, and insert some rows. Make it big enough that it doesn't fit in + # shared_buffers. + cur.execute('CREATE TABLE foo (id int4 PRIMARY KEY, val int, t text)') + cur.execute(''' + INSERT INTO foo + SELECT g, 1, 'long string to consume some space' || g + FROM generate_series(1, 100000) g + ''') + + # Verify that the table is larger than shared_buffers, so that the SELECT below + # will cause GetPage requests. + cur.execute(''' + select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_ize + from pg_settings where name = 'shared_buffers' + ''') + row = cur.fetchone() + print("shared_buffers is {}, table size {}", row[0], row[1]); + assert int(row[0]) < int(row[1]) + + cur.execute('VACUUM foo'); + + # Make a lot of updates on a single row, generating a lot of WAL. Trigger + # garbage collections so that the page server will remove old page versions. + for i in range(10): + pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0") + for j in range(100): + cur.execute('UPDATE foo SET val = val + 1 WHERE id = 1;'); + + # All (or at least most of) the updates should've been on the same page, so + # that we haven't had to evict any dirty pages for a long time. Now run + # a query that sends GetPage@LSN requests with the old LSN. + cur.execute("SELECT COUNT(*), SUM(val) FROM foo"); + assert cur.fetchone() == (100000, 101000)