diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index ed409d3130..5a0894cd1b 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -19,6 +19,7 @@ use crate::metrics::WAL_INGEST; use crate::pgdatadir_mapping::*; use crate::tenant::Timeline; use crate::walingest::WalIngest; +use crate::walrecord::decode_wal_record; use crate::walrecord::DecodedWALRecord; use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::pg_constants; @@ -310,11 +311,13 @@ async fn import_wal( let mut nrecords = 0; let mut modification = tline.begin_modification(last_lsn); - let mut decoded = DecodedWALRecord::default(); while last_lsn <= endpoint { if let Some((lsn, recdata)) = waldecoder.poll_decode()? { + let mut decoded = DecodedWALRecord::default(); + decode_wal_record(recdata, &mut decoded, tline.pg_version)?; + walingest - .ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx) + .ingest_record(decoded, lsn, &mut modification, ctx) .await?; WAL_INGEST.records_committed.inc(); @@ -449,11 +452,12 @@ pub async fn import_wal_from_tar( waldecoder.feed_bytes(&bytes[offset..]); let mut modification = tline.begin_modification(last_lsn); - let mut decoded = DecodedWALRecord::default(); while last_lsn <= end_lsn { if let Some((lsn, recdata)) = waldecoder.poll_decode()? { + let mut decoded = DecodedWALRecord::default(); + decode_wal_record(recdata, &mut decoded, tline.pg_version)?; walingest - .ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx) + .ingest_record(decoded, lsn, &mut modification, ctx) .await?; modification.commit(ctx).await?; last_lsn = lsn; diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index edcbac970b..c26abca1f7 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -12,7 +12,7 @@ use crate::keyspace::{KeySpace, KeySpaceAccum}; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id; use crate::walrecord::NeonWalRecord; use crate::{aux_file, repository::*}; -use anyhow::{bail, ensure, Context}; +use anyhow::{ensure, Context}; use bytes::{Buf, Bytes, BytesMut}; use enum_map::Enum; use pageserver_api::key::{ @@ -168,7 +168,9 @@ impl Timeline { DatadirModification { tline: self, pending_lsns: Vec::new(), - pending_updates: HashMap::new(), + pending_metadata_pages: HashMap::new(), + pending_data_pages: Vec::new(), + pending_zero_data_pages: Default::default(), pending_deletions: Vec::new(), pending_nblocks: 0, pending_directory_entries: Vec::new(), @@ -1031,10 +1033,24 @@ pub struct DatadirModification<'a> { // The put-functions add the modifications here, and they are flushed to the // underlying key-value store by the 'finish' function. pending_lsns: Vec, - pending_updates: HashMap>, pending_deletions: Vec<(Range, Lsn)>, pending_nblocks: i64, + /// Metadata writes, indexed by key so that they can be read from not-yet-committed modifications + /// while ingesting subsequent records. See [`Self::is_data_key`] for the definition of 'metadata'. + pending_metadata_pages: HashMap>, + + /// Data writes, ready to be flushed into an ephemeral layer. See [`Self::is_data_key`] for + /// which keys are stored here. + pending_data_pages: Vec<(CompactKey, Lsn, usize, Value)>, + + // Sometimes during ingest, for example when extending a relation, we would like to write a zero page. However, + // if we encounter a write from postgres in the same wal record, we will drop this entry. + // + // Unlike other 'pending' fields, this does not last until the next call to commit(): it is flushed + // at the end of each wal record, and all these writes implicitly are at lsn Self::lsn + pending_zero_data_pages: HashSet, + /// For special "directory" keys that store key-value maps, track the size of the map /// if it was updated in this modification. pending_directory_entries: Vec<(DirectoryKind, usize)>, @@ -1058,6 +1074,10 @@ impl<'a> DatadirModification<'a> { self.pending_bytes } + pub(crate) fn has_dirty_data_pages(&self) -> bool { + (!self.pending_data_pages.is_empty()) || (!self.pending_zero_data_pages.is_empty()) + } + /// Set the current lsn pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> { ensure!( @@ -1066,6 +1086,10 @@ impl<'a> DatadirModification<'a> { lsn, self.lsn ); + + // If we are advancing LSN, then state from previous wal record should have been flushed. + assert!(self.pending_zero_data_pages.is_empty()); + if lsn > self.lsn { self.pending_lsns.push(self.lsn); self.lsn = lsn; @@ -1073,6 +1097,17 @@ impl<'a> DatadirModification<'a> { Ok(()) } + /// In this context, 'metadata' means keys that are only read by the pageserver internally, and 'data' means + /// keys that represent literal blocks that postgres can read. So data includes relation blocks and + /// SLRU blocks, which are read directly by postgres, and everything else is considered metadata. + /// + /// The distinction is important because data keys are handled on a fast path where dirty writes are + /// not readable until this modification is committed, whereas metadata keys are visible for read + /// via [`Self::get`] as soon as their record has been ingested. + fn is_data_key(key: &Key) -> bool { + key.is_rel_block_key() || key.is_slru_block_key() + } + /// Initialize a completely new repository. /// /// This inserts the directory metadata entries that are assumed to @@ -1180,6 +1215,31 @@ impl<'a> DatadirModification<'a> { Ok(()) } + pub(crate) fn put_rel_page_image_zero(&mut self, rel: RelTag, blknum: BlockNumber) { + self.pending_zero_data_pages + .insert(rel_block_to_key(rel, blknum).to_compact()); + self.pending_bytes += ZERO_PAGE.len(); + } + + pub(crate) fn put_slru_page_image_zero( + &mut self, + kind: SlruKind, + segno: u32, + blknum: BlockNumber, + ) { + self.pending_zero_data_pages + .insert(slru_block_to_key(kind, segno, blknum).to_compact()); + self.pending_bytes += ZERO_PAGE.len(); + } + + /// Call this at the end of each WAL record. + pub(crate) fn on_record_end(&mut self) { + let pending_zero_data_pages = std::mem::take(&mut self.pending_zero_data_pages); + for key in pending_zero_data_pages { + self.put_data(key, Value::Image(ZERO_PAGE.clone())); + } + } + /// Store a relmapper file (pg_filenode.map) in the repository pub async fn put_relmap_file( &mut self, @@ -1778,7 +1838,7 @@ impl<'a> DatadirModification<'a> { /// retains all the metadata, but data pages are flushed. That's again OK /// for bulk import, where you are just loading data pages and won't try to /// modify the same pages twice. - pub async fn flush(&mut self, ctx: &RequestContext) -> anyhow::Result<()> { + pub(crate) async fn flush(&mut self, ctx: &RequestContext) -> anyhow::Result<()> { // Unless we have accumulated a decent amount of changes, it's not worth it // to scan through the pending_updates list. let pending_nblocks = self.pending_nblocks; @@ -1789,31 +1849,11 @@ impl<'a> DatadirModification<'a> { let mut writer = self.tline.writer().await; // Flush relation and SLRU data blocks, keep metadata. - let mut retained_pending_updates = HashMap::<_, Vec<_>>::new(); - for (key, values) in self.pending_updates.drain() { - if !key.is_valid_key_on_write_path() { - bail!( - "the request contains data not supported by pageserver at TimelineWriter::put: {}", key - ); - } - let mut write_batch = Vec::new(); - for (lsn, value_ser_size, value) in values { - if key.is_rel_block_key() || key.is_slru_block_key() { - // This bails out on first error without modifying pending_updates. - // That's Ok, cf this function's doc comment. - write_batch.push((key.to_compact(), lsn, value_ser_size, value)); - } else { - retained_pending_updates.entry(key).or_default().push(( - lsn, - value_ser_size, - value, - )); - } - } - writer.put_batch(write_batch, ctx).await?; - } + let pending_data_pages = std::mem::take(&mut self.pending_data_pages); - self.pending_updates = retained_pending_updates; + // This bails out on first error without modifying pending_updates. + // That's Ok, cf this function's doc comment. + writer.put_batch(pending_data_pages, ctx).await?; self.pending_bytes = 0; if pending_nblocks != 0 { @@ -1834,29 +1874,31 @@ impl<'a> DatadirModification<'a> { /// All the modifications in this atomic update are stamped by the specified LSN. /// pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> { + // Commit should never be called mid-wal-record + assert!(self.pending_zero_data_pages.is_empty()); + let mut writer = self.tline.writer().await; let pending_nblocks = self.pending_nblocks; self.pending_nblocks = 0; - if !self.pending_updates.is_empty() { - // Ordering: the items in this batch do not need to be in any global order, but values for - // a particular Key must be in Lsn order relative to one another. InMemoryLayer relies on - // this to do efficient updates to its index. - let batch: Vec<(CompactKey, Lsn, usize, Value)> = self - .pending_updates + // Ordering: the items in this batch do not need to be in any global order, but values for + // a particular Key must be in Lsn order relative to one another. InMemoryLayer relies on + // this to do efficient updates to its index. + let mut write_batch = std::mem::take(&mut self.pending_data_pages); + + write_batch.extend( + self.pending_metadata_pages .drain() .flat_map(|(key, values)| { - values.into_iter().map(move |(lsn, val_ser_size, value)| { - if !key.is_valid_key_on_write_path() { - bail!("the request contains data not supported by pageserver at TimelineWriter::put: {}", key); - } - Ok((key.to_compact(), lsn, val_ser_size, value)) - }) - }) - .collect::>>()?; + values + .into_iter() + .map(move |(lsn, value_size, value)| (key, lsn, value_size, value)) + }), + ); - writer.put_batch(batch, ctx).await?; + if !write_batch.is_empty() { + writer.put_batch(write_batch, ctx).await?; } if !self.pending_deletions.is_empty() { @@ -1887,33 +1929,58 @@ impl<'a> DatadirModification<'a> { } pub(crate) fn len(&self) -> usize { - self.pending_updates.len() + self.pending_deletions.len() + self.pending_metadata_pages.len() + + self.pending_data_pages.len() + + self.pending_deletions.len() } - // Internal helper functions to batch the modifications - + /// Read a page from the Timeline we are writing to. For metadata pages, this passes through + /// a cache in Self, which makes writes earlier in this modification visible to WAL records later + /// in the modification. + /// + /// For data pages, reads pass directly to the owning Timeline: any ingest code which reads a data + /// page must ensure that the pages they read are already committed in Timeline, for example + /// DB create operations are always preceded by a call to commit(). This is special cased because + /// it's rare: all the 'normal' WAL operations will only read metadata pages such as relation sizes, + /// and not data pages. async fn get(&self, key: Key, ctx: &RequestContext) -> Result { - // Have we already updated the same key? Read the latest pending updated - // version in that case. - // - // Note: we don't check pending_deletions. It is an error to request a - // value that has been removed, deletion only avoids leaking storage. - if let Some(values) = self.pending_updates.get(&key) { - if let Some((_, _, value)) = values.last() { - return if let Value::Image(img) = value { - Ok(img.clone()) - } else { - // Currently, we never need to read back a WAL record that we - // inserted in the same "transaction". All the metadata updates - // work directly with Images, and we never need to read actual - // data pages. We could handle this if we had to, by calling - // the walredo manager, but let's keep it simple for now. - Err(PageReconstructError::Other(anyhow::anyhow!( - "unexpected pending WAL record" - ))) - }; + if !Self::is_data_key(&key) { + // Have we already updated the same key? Read the latest pending updated + // version in that case. + // + // Note: we don't check pending_deletions. It is an error to request a + // value that has been removed, deletion only avoids leaking storage. + if let Some(values) = self.pending_metadata_pages.get(&key.to_compact()) { + if let Some((_, _, value)) = values.last() { + return if let Value::Image(img) = value { + Ok(img.clone()) + } else { + // Currently, we never need to read back a WAL record that we + // inserted in the same "transaction". All the metadata updates + // work directly with Images, and we never need to read actual + // data pages. We could handle this if we had to, by calling + // the walredo manager, but let's keep it simple for now. + Err(PageReconstructError::Other(anyhow::anyhow!( + "unexpected pending WAL record" + ))) + }; + } + } + } else { + // This is an expensive check, so we only do it in debug mode. If reading a data key, + // this key should never be present in pending_data_pages. We ensure this by committing + // modifications before ingesting DB create operations, which are the only kind that reads + // data pages during ingest. + if cfg!(debug_assertions) { + for (dirty_key, _, _, _) in &self.pending_data_pages { + debug_assert!(&key.to_compact() != dirty_key); + } + + debug_assert!(!self.pending_zero_data_pages.contains(&key.to_compact())) } } + + // Metadata page cache miss, or we're reading a data page. let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn); self.tline.get(key, lsn, ctx).await } @@ -1925,11 +1992,40 @@ impl<'a> DatadirModification<'a> { } fn put(&mut self, key: Key, val: Value) { - let values = self.pending_updates.entry(key).or_default(); + if Self::is_data_key(&key) { + self.put_data(key.to_compact(), val) + } else { + self.put_metadata(key.to_compact(), val) + } + } + + fn put_data(&mut self, key: CompactKey, val: Value) { + let val_serialized_size = val.serialized_size().unwrap() as usize; + + // If this page was previously zero'd in the same WalRecord, then drop the previous zero page write. This + // is an optimization that avoids persisting both the zero page generated by us (e.g. during a relation extend), + // and the subsequent postgres-originating write + if self.pending_zero_data_pages.remove(&key) { + self.pending_bytes -= ZERO_PAGE.len(); + } + + self.pending_bytes += val_serialized_size; + self.pending_data_pages + .push((key, self.lsn, val_serialized_size, val)) + } + + fn put_metadata(&mut self, key: CompactKey, val: Value) { + let values = self.pending_metadata_pages.entry(key).or_default(); // Replace the previous value if it exists at the same lsn if let Some((last_lsn, last_value_ser_size, last_value)) = values.last_mut() { if *last_lsn == self.lsn { + // Update the pending_bytes contribution from this entry, and update the serialized size in place + self.pending_bytes -= *last_value_ser_size; *last_value_ser_size = val.serialized_size().unwrap() as usize; + self.pending_bytes += *last_value_ser_size; + + // Use the latest value, this replaces any earlier write to the same (key,lsn), such as much + // have been generated by synthesized zero page writes prior to the first real write to a page. *last_value = val; return; } diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index f31ab4b1e8..2c19e5b19f 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -692,8 +692,13 @@ impl InMemoryLayer { let vec_map = inner.index.entry(key).or_default(); let old = vec_map.append_or_update_last(lsn, index_entry).unwrap().0; if old.is_some() { - // We already had an entry for this LSN. That's odd.. - warn!("Key {} at {} already exists", key, lsn); + // This should not break anything, but is unexpected: ingestion code aims to filter out + // multiple writes to the same key at the same LSN. This happens in cases where our + // ingenstion code generates some write like an empty page, and we see a write from postgres + // to the same key in the same wal record. If one such write makes it through, we + // index the most recent write, implicitly ignoring the earlier write. We log a warning + // because this case is unexpected, and we would like tests to fail if this happens. + warn!("Key {} at {} written twice at same LSN", key, lsn); } } diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 0114473eda..cee259e2e0 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -31,7 +31,7 @@ use crate::{ task_mgr::{TaskKind, WALRECEIVER_RUNTIME}, tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo}, walingest::WalIngest, - walrecord::DecodedWALRecord, + walrecord::{decode_wal_record, DecodedWALRecord}, }; use postgres_backend::is_expected_io_error; use postgres_connection::PgConnectionConfig; @@ -312,10 +312,25 @@ pub(super) async fn handle_walreceiver_connection( waldecoder.feed_bytes(data); { - let mut decoded = DecodedWALRecord::default(); let mut modification = timeline.begin_modification(startlsn); let mut uncommitted_records = 0; let mut filtered_records = 0; + + async fn commit( + modification: &mut DatadirModification<'_>, + uncommitted: &mut u64, + filtered: &mut u64, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + WAL_INGEST + .records_committed + .inc_by(*uncommitted - *filtered); + modification.commit(ctx).await?; + *uncommitted = 0; + *filtered = 0; + Ok(()) + } + while let Some((lsn, recdata)) = waldecoder.poll_decode()? { // It is important to deal with the aligned records as lsn in getPage@LSN is // aligned and can be several bytes bigger. Without this alignment we are @@ -324,9 +339,28 @@ pub(super) async fn handle_walreceiver_connection( return Err(WalReceiverError::Other(anyhow!("LSN not aligned"))); } + // Deserialize WAL record + let mut decoded = DecodedWALRecord::default(); + decode_wal_record(recdata, &mut decoded, modification.tline.pg_version)?; + + if decoded.is_dbase_create_copy(timeline.pg_version) + && uncommitted_records > 0 + { + // Special case: legacy PG database creations operate by reading pages from a 'template' database: + // these are the only kinds of WAL record that require reading data blocks while ingesting. Ensure + // all earlier writes of data blocks are visible by committing any modification in flight. + commit( + &mut modification, + &mut uncommitted_records, + &mut filtered_records, + &ctx, + ) + .await?; + } + // Ingest the records without immediately committing them. let ingested = walingest - .ingest_record(recdata, lsn, &mut modification, &mut decoded, &ctx) + .ingest_record(decoded, lsn, &mut modification, &ctx) .await .with_context(|| format!("could not ingest record at {lsn}"))?; if !ingested { @@ -349,21 +383,25 @@ pub(super) async fn handle_walreceiver_connection( || modification.approx_pending_bytes() > DatadirModification::MAX_PENDING_BYTES { - WAL_INGEST - .records_committed - .inc_by(uncommitted_records - filtered_records); - modification.commit(&ctx).await?; - uncommitted_records = 0; - filtered_records = 0; + commit( + &mut modification, + &mut uncommitted_records, + &mut filtered_records, + &ctx, + ) + .await?; } } // Commit the remaining records. if uncommitted_records > 0 { - WAL_INGEST - .records_committed - .inc_by(uncommitted_records - filtered_records); - modification.commit(&ctx).await?; + commit( + &mut modification, + &mut uncommitted_records, + &mut filtered_records, + &ctx, + ) + .await?; } } diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 8ccd20adb1..2d3841881b 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -57,6 +57,7 @@ use utils::lsn::Lsn; pub struct WalIngest { shard: ShardIdentity, + pg_version: u32, checkpoint: CheckPoint, checkpoint_modified: bool, warn_ingest_lag: WarnIngestLag, @@ -82,6 +83,7 @@ impl WalIngest { Ok(WalIngest { shard: *timeline.get_shard_identity(), + pg_version: timeline.pg_version, checkpoint, checkpoint_modified: false, warn_ingest_lag: WarnIngestLag { @@ -104,10 +106,9 @@ impl WalIngest { /// pub async fn ingest_record( &mut self, - recdata: Bytes, + decoded: DecodedWALRecord, lsn: Lsn, modification: &mut DatadirModification<'_>, - decoded: &mut DecodedWALRecord, ctx: &RequestContext, ) -> anyhow::Result { WAL_INGEST.records_received.inc(); @@ -115,7 +116,12 @@ impl WalIngest { let prev_len = modification.len(); modification.set_lsn(lsn)?; - decode_wal_record(recdata, decoded, pg_version)?; + + if decoded.is_dbase_create_copy(self.pg_version) { + // Records of this type should always be preceded by a commit(), as they + // rely on reading data pages back from the Timeline. + assert!(!modification.has_dirty_data_pages()); + } let mut buf = decoded.record.clone(); buf.advance(decoded.main_data_offset); @@ -133,11 +139,11 @@ impl WalIngest { pg_constants::RM_HEAP_ID | pg_constants::RM_HEAP2_ID => { // Heap AM records need some special handling, because they modify VM pages // without registering them with the standard mechanism. - self.ingest_heapam_record(&mut buf, modification, decoded, ctx) + self.ingest_heapam_record(&mut buf, modification, &decoded, ctx) .await?; } pg_constants::RM_NEON_ID => { - self.ingest_neonrmgr_record(&mut buf, modification, decoded, ctx) + self.ingest_neonrmgr_record(&mut buf, modification, &decoded, ctx) .await?; } // Handle other special record types @@ -325,7 +331,7 @@ impl WalIngest { } pg_constants::RM_RELMAP_ID => { let xlrec = XlRelmapUpdate::decode(&mut buf); - self.ingest_relmap_page(modification, &xlrec, decoded, ctx) + self.ingest_relmap_page(modification, &xlrec, &decoded, ctx) .await?; } pg_constants::RM_XLOG_ID => { @@ -470,7 +476,7 @@ impl WalIngest { continue; } - self.ingest_decoded_block(modification, lsn, decoded, blk, ctx) + self.ingest_decoded_block(modification, lsn, &decoded, blk, ctx) .await?; } @@ -486,6 +492,8 @@ impl WalIngest { // until commit() is called to flush the data into the repository and update // the latest LSN. + modification.on_record_end(); + Ok(modification.len() > prev_len) } @@ -557,6 +565,7 @@ impl WalIngest { page_set_lsn(&mut image, lsn) } assert_eq!(image.len(), BLCKSZ as usize); + self.put_rel_page_image(modification, rel, blk.blkno, image.freeze(), ctx) .await?; } else { @@ -1195,7 +1204,7 @@ impl WalIngest { if rec.blkno % pg_constants::SLOTS_PER_FSM_PAGE != 0 { // Tail of last remaining FSM page has to be zeroed. // We are not precise here and instead of digging in FSM bitmap format just clear the whole page. - modification.put_rel_page_image(rel, fsm_physical_page_no, ZERO_PAGE.clone())?; + modification.put_rel_page_image_zero(rel, fsm_physical_page_no); fsm_physical_page_no += 1; } let nblocks = get_relsize(modification, rel, ctx).await?; @@ -1217,7 +1226,7 @@ impl WalIngest { if rec.blkno % pg_constants::VM_HEAPBLOCKS_PER_PAGE != 0 { // Tail of last remaining vm page has to be zeroed. // We are not precise here and instead of digging in VM bitmap format just clear the whole page. - modification.put_rel_page_image(rel, vm_page_no, ZERO_PAGE.clone())?; + modification.put_rel_page_image_zero(rel, vm_page_no); vm_page_no += 1; } let nblocks = get_relsize(modification, rel, ctx).await?; @@ -1687,7 +1696,7 @@ impl WalIngest { continue; } - modification.put_rel_page_image(rel, gap_blknum, ZERO_PAGE.clone())?; + modification.put_rel_page_image_zero(rel, gap_blknum); } } Ok(()) @@ -1753,7 +1762,7 @@ impl WalIngest { // fill the gap with zeros for gap_blknum in old_nblocks..blknum { - modification.put_slru_page_image(kind, segno, gap_blknum, ZERO_PAGE.clone())?; + modification.put_slru_page_image_zero(kind, segno, gap_blknum); } } Ok(()) @@ -1827,21 +1836,25 @@ mod tests { walingest .put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 2"), &ctx) .await?; + m.on_record_end(); m.commit(&ctx).await?; let mut m = tline.begin_modification(Lsn(0x30)); walingest .put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 3"), &ctx) .await?; + m.on_record_end(); m.commit(&ctx).await?; let mut m = tline.begin_modification(Lsn(0x40)); walingest .put_rel_page_image(&mut m, TESTREL_A, 1, test_img("foo blk 1 at 4"), &ctx) .await?; + m.on_record_end(); m.commit(&ctx).await?; let mut m = tline.begin_modification(Lsn(0x50)); walingest .put_rel_page_image(&mut m, TESTREL_A, 2, test_img("foo blk 2 at 5"), &ctx) .await?; + m.on_record_end(); m.commit(&ctx).await?; assert_current_logical_size(&tline, Lsn(0x50)); @@ -1983,6 +1996,7 @@ mod tests { walingest .put_rel_page_image(&mut m, TESTREL_A, 1, test_img("foo blk 1"), &ctx) .await?; + m.on_record_end(); m.commit(&ctx).await?; assert_eq!( tline @@ -2008,6 +2022,7 @@ mod tests { walingest .put_rel_page_image(&mut m, TESTREL_A, 1500, test_img("foo blk 1500"), &ctx) .await?; + m.on_record_end(); m.commit(&ctx).await?; assert_eq!( tline @@ -2409,7 +2424,6 @@ mod tests { .await .unwrap(); let mut modification = tline.begin_modification(startpoint); - let mut decoded = DecodedWALRecord::default(); println!("decoding {} bytes", bytes.len() - xlogoff); // Decode and ingest wal. We process the wal in chunks because @@ -2417,8 +2431,10 @@ mod tests { for chunk in bytes[xlogoff..].chunks(50) { decoder.feed_bytes(chunk); while let Some((lsn, recdata)) = decoder.poll_decode().unwrap() { + let mut decoded = DecodedWALRecord::default(); + decode_wal_record(recdata, &mut decoded, modification.tline.pg_version).unwrap(); walingest - .ingest_record(recdata, lsn, &mut modification, &mut decoded, &ctx) + .ingest_record(decoded, lsn, &mut modification, &ctx) .instrument(span.clone()) .await .unwrap(); diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs index edddcefbe1..0c4d575de8 100644 --- a/pageserver/src/walrecord.rs +++ b/pageserver/src/walrecord.rs @@ -160,6 +160,30 @@ pub struct DecodedWALRecord { pub origin_id: u16, } +impl DecodedWALRecord { + /// Check if this WAL record represents a legacy "copy" database creation, which populates new relations + /// by reading other existing relations' data blocks. This is more complex to apply than new-style database + /// creations which simply include all the desired blocks in the WAL, so we need a helper function to detect this case. + pub(crate) fn is_dbase_create_copy(&self, pg_version: u32) -> bool { + if self.xl_rmid == pg_constants::RM_DBASE_ID { + let info = self.xl_info & pg_constants::XLR_RMGR_INFO_MASK; + match pg_version { + 14 => { + // Postgres 14 database creations are always the legacy kind + info == postgres_ffi::v14::bindings::XLOG_DBASE_CREATE + } + 15 => info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY, + 16 => info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY, + _ => { + panic!("Unsupported postgres version {pg_version}") + } + } + } else { + false + } + } +} + #[repr(C)] #[derive(Debug, Clone, Copy)] pub struct RelFileNode {