From 81dd4bc41e59523e7a5a2f52e8b65ff3f16834c5 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas
Date: Tue, 24 Aug 2021 15:55:04 +0300
Subject: [PATCH] Fix decoding XLOG_HEAP_DELETE and XLOG_HEAP_UPDATE records.

Because the t_cid field was missing from the XlHeapDelete struct that
corresponds to the PostgreSQL xl_heap_delete struct, the check for the
XLH_DELETE_ALL_VISIBLE_CLEARED flag did not work correctly.

The decoding of the XlHeapUpdate struct was also missing the t_cid field,
but that didn't cause any immediate problems, because in that struct the
t_cid field comes after all the fields that the page server cares about.
Fix it too, though, as it was an accident waiting to happen.

The bug was mostly hidden by the VM page handling in zenith_wallog_page,
which forcibly generates a full-page image (FPW) record whenever a VM page
is evicted:

    else if (forknum == VISIBILITYMAP_FORKNUM && !RecoveryInProgress())
    {
        /*
         * Always WAL-log vm.
         * We should never miss clearing visibility map bits.
         *
         * TODO Is it too bad for performance?
         * Hopefully we do not evict actively used vm too often.
         */
        XLogRecPtr recptr;

        recptr = log_newpage_copy(&reln->smgr_rnode.node, forknum,
                                  blocknum, buffer, false);
        XLogFlush(recptr);

        lsn = recptr;

But that was just hiding the issue: it is still visible if you have a
read-only node relying on the data in the page server, if you kill and
restart the primary node, or if you start a branch. In the included test
case, I use a new branch to expose it.

Fixes https://github.com/zenithdb/zenith/issues/461
---
 pageserver/src/waldecoder.rs             |  6 ++
 test_runner/batch_others/test_vm_bits.py | 79 ++++++++++++++++++++++++
 vendor/postgres                          |  2 +-
 3 files changed, 86 insertions(+), 1 deletion(-)
 create mode 100644 test_runner/batch_others/test_vm_bits.py

diff --git a/pageserver/src/waldecoder.rs b/pageserver/src/waldecoder.rs
index bc072ff657..5b9b4b4ebe 100644
--- a/pageserver/src/waldecoder.rs
+++ b/pageserver/src/waldecoder.rs
@@ -368,6 +368,8 @@ impl XlHeapMultiInsert {
 pub struct XlHeapDelete {
     pub xmax: TransactionId,
     pub offnum: OffsetNumber,
+    pub _padding: u16,
+    pub t_cid: u32,
     pub infobits_set: u8,
     pub flags: u8,
 }
@@ -377,6 +379,8 @@ impl XlHeapDelete {
         XlHeapDelete {
             xmax: buf.get_u32_le(),
             offnum: buf.get_u16_le(),
+            _padding: buf.get_u16_le(),
+            t_cid: buf.get_u32_le(),
             infobits_set: buf.get_u8(),
             flags: buf.get_u8(),
         }
@@ -390,6 +394,7 @@ pub struct XlHeapUpdate {
     pub old_offnum: OffsetNumber,
     pub old_infobits_set: u8,
     pub flags: u8,
+    pub t_cid: u32,
     pub new_xmax: TransactionId,
     pub new_offnum: OffsetNumber,
 }
@@ -401,6 +406,7 @@ impl XlHeapUpdate {
             old_xmax: buf.get_u32_le(),
             old_offnum: buf.get_u16_le(),
             old_infobits_set: buf.get_u8(),
             flags: buf.get_u8(),
+            t_cid: buf.get_u32_le(),
             new_xmax: buf.get_u32_le(),
             new_offnum: buf.get_u16_le(),
         }
diff --git a/test_runner/batch_others/test_vm_bits.py b/test_runner/batch_others/test_vm_bits.py
new file mode 100644
index 0000000000..92509fcbbb
--- /dev/null
+++ b/test_runner/batch_others/test_vm_bits.py
@@ -0,0 +1,79 @@
+from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
+
+pytest_plugins = ("fixtures.zenith_fixtures")
+
+#
+# Test that the VM bit is cleared correctly at a HEAP_DELETE and
+# HEAP_UPDATE record.
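+#
+# Background: the VM (visibility map) bit says that all tuples on the heap
+# page are visible to all transactions, which lets index-only scans skip
+# fetching the heap tuples. If a DELETE or UPDATE is replayed without
+# clearing the bit, an index-only scan can keep returning a deleted row.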
+#
+def test_vm_bit_clear(pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin, zenith_cli, base_dir):
+    # Create a branch for us
+    zenith_cli.run(["branch", "test_vm_bit_clear", "empty"])
+    pg = postgres.create_start('test_vm_bit_clear')
+
+    print("postgres is running on 'test_vm_bit_clear' branch")
+    pg_conn = pg.connect()
+    cur = pg_conn.cursor()
+
+    # Install extension containing function needed for test
+    cur.execute('CREATE EXTENSION zenith_test_utils')
+
+    # Create test tables and freeze them to set the VM bits.
+    cur.execute('CREATE TABLE vmtest_delete (id integer PRIMARY KEY)')
+    cur.execute('INSERT INTO vmtest_delete VALUES (1)')
+    cur.execute('VACUUM FREEZE vmtest_delete')
+
+    cur.execute('CREATE TABLE vmtest_update (id integer PRIMARY KEY)')
+    cur.execute('INSERT INTO vmtest_update SELECT g FROM generate_series(1, 1000) g')
+    cur.execute('VACUUM FREEZE vmtest_update')
+
+    # DELETE and UPDATE the rows.
+    cur.execute('DELETE FROM vmtest_delete WHERE id = 1')
+    cur.execute('UPDATE vmtest_update SET id = 5000 WHERE id = 1')
+
+    # Branch at this point, to test that later
+    zenith_cli.run(["branch", "test_vm_bit_clear_new", "test_vm_bit_clear"])
+
+    # Clear the buffer cache, to force the VM page to be re-fetched from
+    # the page server
+    cur.execute('SELECT clear_buffer_cache()')
+
+    # Check that an index-only scan doesn't see the deleted row. If the
+    # clearing of the VM bit was not replayed correctly, this would
+    # incorrectly return the deleted row.
+    cur.execute('''
+    set enable_seqscan=off;
+    set enable_indexscan=on;
+    set enable_bitmapscan=off;
+    ''')
+
+    cur.execute('SELECT * FROM vmtest_delete WHERE id = 1')
+    assert cur.fetchall() == []
+    cur.execute('SELECT * FROM vmtest_update WHERE id = 1')
+    assert cur.fetchall() == []
+
+    cur.close()
+
+    # Check the same thing on the branch that we created right after the DELETE.
+    #
+    # As of this writing, the code in smgrwrite() creates a full-page image whenever
+    # a dirty VM page is evicted. If the VM bit was not correctly cleared by the
+    # earlier WAL record, the full-page image hides the problem. Starting a new
+    # server at the right point-in-time avoids that full-page image.
+    pg_new = postgres.create_start('test_vm_bit_clear_new')
+
+    print("postgres is running on 'test_vm_bit_clear_new' branch")
+    pg_new_conn = pg_new.connect()
+    cur_new = pg_new_conn.cursor()
+
+    cur_new.execute('''
+    set enable_seqscan=off;
+    set enable_indexscan=on;
+    set enable_bitmapscan=off;
+    ''')
+
+    cur_new.execute('SELECT * FROM vmtest_delete WHERE id = 1')
+    assert cur_new.fetchall() == []
+    cur_new.execute('SELECT * FROM vmtest_update WHERE id = 1')
+    assert cur_new.fetchall() == []
diff --git a/vendor/postgres b/vendor/postgres
index 0d69e188c8..d6561d27d3 160000
--- a/vendor/postgres
+++ b/vendor/postgres
@@ -1 +1 @@
-Subproject commit 0d69e188c8d71af43bf72f9c3d263e0bcfd3d05f
+Subproject commit d6561d27d3db3619b1aa45c9f88c46bcb9de8c4c
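
For context, below is a sketch of the C-side record layout that the fixed
XlHeapDelete decoding assumes. The field order and the two alignment padding
bytes are inferred from the decoding order above (xmax, offnum, 2 padding
bytes, t_cid, infobits_set, flags) and from the upstream xl_heap_delete
definition; the authoritative struct is the one in the vendored PostgreSQL
sources (vendor/postgres), which this patch also bumps. The stand-in typedefs
are illustrative only.

    /*
     * Sketch (not copied from the vendored headers): the layout that
     * XlHeapDelete::decode() walks through, byte for byte.
     */
    #include <stddef.h>
    #include <stdint.h>

    /* stand-ins for the PostgreSQL typedefs */
    typedef uint32_t TransactionId;
    typedef uint16_t OffsetNumber;
    typedef uint32_t CommandId;

    typedef struct xl_heap_delete
    {
        TransactionId xmax;         /* xmax of the deleted tuple */
        OffsetNumber  offnum;       /* deleted tuple's offset on the page */
        /* 2 bytes of alignment padding here, because CommandId needs 4-byte
         * alignment; this is what the new `_padding: u16` field accounts for */
        CommandId     t_cid;        /* command id (added in the Zenith fork) */
        uint8_t       infobits_set; /* infomask bits to set */
        uint8_t       flags;        /* XLH_DELETE_* flags */
    } xl_heap_delete;

    /* Offsets the Rust decoder now reads at: 0, 4, 6 (padding), 8, 12, 13. */
    _Static_assert(offsetof(xl_heap_delete, t_cid) == 8, "t_cid at byte 8");
    _Static_assert(offsetof(xl_heap_delete, flags) == 13, "flags at byte 13");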