pageserver: only apply ClearVmBits on relevant shards (#9895)

# Problem VM (visibility map) pages are stored and managed as any regular relation page, in the VM fork of the main relation. They are also sharded like other pages. Regular WAL writes to the VM pages (typically performed by vacuum) are routed to the correct shard as usual. However, VM pages are also updated via `ClearVmBits` metadata records emitted when main relation pages are updated. These metadata records were sent to all shards, like other metadata records. This had the following effects: * On shards responsible for VM pages, the `ClearVmBits` applies as expected. * On shard 0, which knows about the VM relation and its size but doesn't necessarily have any VM pages, the `ClearVmBits` writes may have been applied without also having applied the explicit WAL writes to VM pages. * If VM pages are spread across multiple shards (unlikely with 256MB stripe size), all shards may have applied `ClearVmBits` if the pages fall within their local view of the relation size, even for pages they do not own. * On other shards, this caused a relation size cache miss and a DbDir and RelDir lookup before dropping the `ClearVmBits`. With many relations, this could cause significant CPU overhead. This is not believed to be a correctness problem, but this will be verified in #9914. Resolves #9855. # Changes Route `ClearVmBits` metadata records only to the shards responsible for the VM pages. Verification of the current VM handling and cleanup of incomplete VM pages on shard 0 (and potentially elsewhere) is left as follow-up work.
2026-01-04 12:02:55 +00:00 · 2024-11-27 20:44:24 +01:00
parent 9e3cb75bc7
commit da1daa2426
1 changed files with 54 additions and 17 deletions
--- a/libs/wal_decoder/src/decoder.rs
+++ b/libs/wal_decoder/src/decoder.rs
@@ -4,6 +4,7 @@
 use crate::models::*;
 use crate::serialized_batch::SerializedValueBatch;
 use bytes::{Buf, Bytes};
+use pageserver_api::key::rel_block_to_key;
 use pageserver_api::reltag::{RelTag, SlruKind};
 use pageserver_api::shard::ShardIdentity;
 use postgres_ffi::pg_constants;
@@ -32,7 +33,8 @@ impl InterpretedWalRecord {
            FlushUncommittedRecords::No
        };

-        let metadata_record = MetadataRecord::from_decoded(&decoded, next_record_lsn, pg_version)?;
+        let metadata_record =
+            MetadataRecord::from_decoded_filtered(&decoded, shard, next_record_lsn, pg_version)?;
        let batch = SerializedValueBatch::from_decoded_filtered(
            decoded,
            shard,
@@ -51,8 +53,13 @@ impl InterpretedWalRecord {
 }

 impl MetadataRecord {
-    fn from_decoded(
+    /// Builds a metadata record for this WAL record, if any.
+    ///
+    /// Only metadata records relevant for the given shard are emitted. Currently, most metadata
+    /// records are broadcast to all shards for simplicity, but this should be improved.
+    fn from_decoded_filtered(
        decoded: &DecodedWALRecord,
+        shard: &ShardIdentity,
        next_record_lsn: Lsn,
        pg_version: u32,
    ) -> anyhow::Result<Option<MetadataRecord>> {
@@ -61,26 +68,27 @@ impl MetadataRecord {
        let mut buf = decoded.record.clone();
        buf.advance(decoded.main_data_offset);

-        match decoded.xl_rmid {
+        // First, generate metadata records from the decoded WAL record.
+        let mut metadata_record = match decoded.xl_rmid {
            pg_constants::RM_HEAP_ID | pg_constants::RM_HEAP2_ID => {
-                Self::decode_heapam_record(&mut buf, decoded, pg_version)
+                Self::decode_heapam_record(&mut buf, decoded, pg_version)?
            }
-            pg_constants::RM_NEON_ID => Self::decode_neonmgr_record(&mut buf, decoded, pg_version),
+            pg_constants::RM_NEON_ID => Self::decode_neonmgr_record(&mut buf, decoded, pg_version)?,
            // Handle other special record types
-            pg_constants::RM_SMGR_ID => Self::decode_smgr_record(&mut buf, decoded),
-            pg_constants::RM_DBASE_ID => Self::decode_dbase_record(&mut buf, decoded, pg_version),
+            pg_constants::RM_SMGR_ID => Self::decode_smgr_record(&mut buf, decoded)?,
+            pg_constants::RM_DBASE_ID => Self::decode_dbase_record(&mut buf, decoded, pg_version)?,
            pg_constants::RM_TBLSPC_ID => {
                tracing::trace!("XLOG_TBLSPC_CREATE/DROP is not handled yet");
-                Ok(None)
+                None
            }
-            pg_constants::RM_CLOG_ID => Self::decode_clog_record(&mut buf, decoded, pg_version),
+            pg_constants::RM_CLOG_ID => Self::decode_clog_record(&mut buf, decoded, pg_version)?,
            pg_constants::RM_XACT_ID => {
-                Self::decode_xact_record(&mut buf, decoded, next_record_lsn)
+                Self::decode_xact_record(&mut buf, decoded, next_record_lsn)?
            }
            pg_constants::RM_MULTIXACT_ID => {
-                Self::decode_multixact_record(&mut buf, decoded, pg_version)
+                Self::decode_multixact_record(&mut buf, decoded, pg_version)?
            }
-            pg_constants::RM_RELMAP_ID => Self::decode_relmap_record(&mut buf, decoded),
+            pg_constants::RM_RELMAP_ID => Self::decode_relmap_record(&mut buf, decoded)?,
            // This is an odd duck. It needs to go to all shards.
            // Since it uses the checkpoint image (that's initialized from CHECKPOINT_KEY
            // in WalIngest::new), we have to send the whole DecodedWalRecord::record to
@@ -89,19 +97,48 @@ impl MetadataRecord {
            // Alternatively, one can make the checkpoint part of the subscription protocol
            // to the pageserver. This should work fine, but can be done at a later point.
            pg_constants::RM_XLOG_ID => {
-                Self::decode_xlog_record(&mut buf, decoded, next_record_lsn)
+                Self::decode_xlog_record(&mut buf, decoded, next_record_lsn)?
            }
            pg_constants::RM_LOGICALMSG_ID => {
-                Self::decode_logical_message_record(&mut buf, decoded)
+                Self::decode_logical_message_record(&mut buf, decoded)?
            }
-            pg_constants::RM_STANDBY_ID => Self::decode_standby_record(&mut buf, decoded),
-            pg_constants::RM_REPLORIGIN_ID => Self::decode_replorigin_record(&mut buf, decoded),
+            pg_constants::RM_STANDBY_ID => Self::decode_standby_record(&mut buf, decoded)?,
+            pg_constants::RM_REPLORIGIN_ID => Self::decode_replorigin_record(&mut buf, decoded)?,
            _unexpected => {
                // TODO: consider failing here instead of blindly doing something without
                // understanding the protocol
-                Ok(None)
+                None
+            }
+        };
+
+        // Next, filter the metadata record by shard.
+
+        // Route VM page updates to the shards that own them. VM pages are stored in the VM fork
+        // of the main relation. These are sharded and managed just like regular relation pages.
+        // See: https://github.com/neondatabase/neon/issues/9855
+        if let Some(
+            MetadataRecord::Heapam(HeapamRecord::ClearVmBits(ref mut clear_vm_bits))
+            | MetadataRecord::Neonrmgr(NeonrmgrRecord::ClearVmBits(ref mut clear_vm_bits)),
+        ) = metadata_record
+        {
+            let is_local_vm_page = |heap_blk| {
+                let vm_blk = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blk);
+                shard.is_key_local(&rel_block_to_key(clear_vm_bits.vm_rel, vm_blk))
+            };
+            // Send the old and new VM page updates to their respective shards.
+            clear_vm_bits.old_heap_blkno = clear_vm_bits
+                .old_heap_blkno
+                .filter(|&blkno| is_local_vm_page(blkno));
+            clear_vm_bits.new_heap_blkno = clear_vm_bits
+                .new_heap_blkno
+                .filter(|&blkno| is_local_vm_page(blkno));
+            // If neither VM page belongs to this shard, discard the record.
+            if clear_vm_bits.old_heap_blkno.is_none() && clear_vm_bits.new_heap_blkno.is_none() {
+                metadata_record = None
            }
        }
+
+        Ok(metadata_record)
    }

    fn decode_heapam_record(