From a9fda8c8327b39c9d543bf22c02186c279cc152a Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 23 Apr 2024 14:03:33 +0100 Subject: [PATCH] pageserver: fix vectored read aux key handling (#7404) ## Problem Vectored get would descend into ancestor timelines for aux files. This is not the behaviour of the legacy read path and blocks cutting over to the vectored read path. Fixes https://github.com/neondatabase/neon/issues/7379 ## Summary of Changes Treat non inherited keys specially in vectored get. At the point when we want to descend into the ancestor mark all pending non inherited keys as errored out at the key level. Note that this diverges from the standard vectored get behaviour for missing keys which is a top level error. This divergence is required to avoid blocking compaction in case such an error is encountered when compaction aux files keys. I'm pretty sure the bug I just described predates the vectored get implementation, but it's still worth fixing. --- libs/pageserver_api/src/key.rs | 8 ++-- libs/pageserver_api/src/keyspace.rs | 53 ++++++++++++++++++++++++--- pageserver/src/tenant.rs | 57 +++++++++++++++++++++++++++++ pageserver/src/tenant/timeline.rs | 45 ++++++++++++++++++++++- 4 files changed, 152 insertions(+), 11 deletions(-) diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 852670af2c..1d66dd8878 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -48,11 +48,11 @@ impl Key { } } - pub fn next(&self) -> Key { + pub const fn next(&self) -> Key { self.add(1) } - pub fn add(&self, x: u32) -> Key { + pub const fn add(&self, x: u32) -> Key { let mut key = *self; let r = key.field6.overflowing_add(x); @@ -475,12 +475,14 @@ pub const AUX_FILES_KEY: Key = Key { // Reverse mappings for a few Keys. // These are needed by WAL redo manager. +pub const NON_INHERITED_RANGE: Range = AUX_FILES_KEY..AUX_FILES_KEY.next(); + // AUX_FILES currently stores only data for logical replication (slots etc), and // we don't preserve these on a branch because safekeepers can't follow timeline // switch (and generally it likely should be optional), so ignore these. #[inline(always)] pub fn is_inherited_key(key: Key) -> bool { - key != AUX_FILES_KEY + !NON_INHERITED_RANGE.contains(&key) } #[inline(always)] diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs index 05fa4562e1..78e4a3d735 100644 --- a/libs/pageserver_api/src/keyspace.rs +++ b/libs/pageserver_api/src/keyspace.rs @@ -94,12 +94,13 @@ impl KeySpace { /// Remove all keys in `other` from `self`. /// This can involve splitting or removing of existing ranges. - pub fn remove_overlapping_with(&mut self, other: &KeySpace) { + /// Returns the removed keyspace + pub fn remove_overlapping_with(&mut self, other: &KeySpace) -> KeySpace { let (self_start, self_end) = match (self.start(), self.end()) { (Some(start), Some(end)) => (start, end), _ => { // self is empty - return; + return KeySpace::default(); } }; @@ -112,30 +113,37 @@ impl KeySpace { .skip_while(|range| self_start >= range.end) .take_while(|range| self_end > range.start); + let mut removed_accum = KeySpaceRandomAccum::new(); for range in other_ranges { while let Some(overlap_at) = self.overlaps_at(range) { let overlapped = self.ranges[overlap_at].clone(); if overlapped.start < range.start && overlapped.end <= range.end { // Higher part of the range is completely overlapped. + removed_accum.add_range(range.start..self.ranges[overlap_at].end); self.ranges[overlap_at].end = range.start; } if overlapped.start >= range.start && overlapped.end > range.end { // Lower part of the range is completely overlapped. + removed_accum.add_range(self.ranges[overlap_at].start..range.end); self.ranges[overlap_at].start = range.end; } if overlapped.start < range.start && overlapped.end > range.end { // Middle part of the range is overlapped. + removed_accum.add_range(range.clone()); self.ranges[overlap_at].end = range.start; self.ranges .insert(overlap_at + 1, range.end..overlapped.end); } if overlapped.start >= range.start && overlapped.end <= range.end { // Whole range is overlapped + removed_accum.add_range(self.ranges[overlap_at].clone()); self.ranges.remove(overlap_at); } } } + + removed_accum.to_keyspace() } pub fn start(&self) -> Option { @@ -553,7 +561,16 @@ mod tests { Key::from_i128(11)..Key::from_i128(13), ], }; - key_space1.remove_overlapping_with(&key_space2); + let removed = key_space1.remove_overlapping_with(&key_space2); + let removed_expected = KeySpace { + ranges: vec![ + Key::from_i128(2)..Key::from_i128(3), + Key::from_i128(6)..Key::from_i128(7), + Key::from_i128(11)..Key::from_i128(12), + ], + }; + assert_eq!(removed, removed_expected); + assert_eq!( key_space1.ranges, vec![ @@ -583,7 +600,17 @@ mod tests { Key::from_i128(14)..Key::from_i128(17), ], }; - key_space1.remove_overlapping_with(&key_space2); + + let removed = key_space1.remove_overlapping_with(&key_space2); + let removed_expected = KeySpace { + ranges: vec![ + Key::from_i128(3)..Key::from_i128(5), + Key::from_i128(8)..Key::from_i128(10), + Key::from_i128(14)..Key::from_i128(15), + ], + }; + assert_eq!(removed, removed_expected); + assert_eq!( key_space1.ranges, vec![ @@ -610,7 +637,11 @@ mod tests { Key::from_i128(15)..Key::from_i128(17), ], }; - key_space1.remove_overlapping_with(&key_space2); + + let removed = key_space1.remove_overlapping_with(&key_space2); + let removed_expected = KeySpace::default(); + assert_eq!(removed, removed_expected); + assert_eq!( key_space1.ranges, vec![ @@ -637,7 +668,17 @@ mod tests { let key_space2 = KeySpace { ranges: vec![Key::from_i128(9)..Key::from_i128(19)], }; - key_space1.remove_overlapping_with(&key_space2); + + let removed = key_space1.remove_overlapping_with(&key_space2); + let removed_expected = KeySpace { + ranges: vec![ + Key::from_i128(9)..Key::from_i128(10), + Key::from_i128(12)..Key::from_i128(15), + Key::from_i128(17)..Key::from_i128(19), + ], + }; + assert_eq!(removed, removed_expected); + assert_eq!( key_space1.ranges, vec![ diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 15be6df637..098bad71fb 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3859,6 +3859,7 @@ mod tests { use crate::DEFAULT_PG_VERSION; use bytes::BytesMut; use hex_literal::hex; + use pageserver_api::key::NON_INHERITED_RANGE; use pageserver_api::keyspace::KeySpace; use rand::{thread_rng, Rng}; use tests::timeline::{GetVectoredError, ShutdownMode}; @@ -4658,6 +4659,62 @@ mod tests { Ok(()) } + #[tokio::test] + async fn test_get_vectored_aux_files() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_get_vectored_aux_files")?; + + let (tenant, ctx) = harness.load().await; + let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx) + .await?; + let tline = tline.raw_timeline().unwrap(); + + let mut modification = tline.begin_modification(Lsn(0x1000)); + modification.put_file("foo/bar1", b"content1", &ctx).await?; + modification.set_lsn(Lsn(0x1008))?; + modification.put_file("foo/bar2", b"content2", &ctx).await?; + modification.commit(&ctx).await?; + + let child_timeline_id = TimelineId::generate(); + tenant + .branch_timeline_test( + tline, + child_timeline_id, + Some(tline.get_last_record_lsn()), + &ctx, + ) + .await?; + + let child_timeline = tenant + .get_timeline(child_timeline_id, true) + .expect("Should have the branched timeline"); + + let aux_keyspace = KeySpace { + ranges: vec![NON_INHERITED_RANGE], + }; + let read_lsn = child_timeline.get_last_record_lsn(); + + let vectored_res = child_timeline + .get_vectored_impl(aux_keyspace.clone(), read_lsn, &ctx) + .await; + + child_timeline + .validate_get_vectored_impl(&vectored_res, aux_keyspace, read_lsn, &ctx) + .await; + + let images = vectored_res?; + let mut key = NON_INHERITED_RANGE.start; + while key < NON_INHERITED_RANGE.end { + assert!(matches!( + images[&key], + Err(PageReconstructError::MissingKey(_)) + )); + key = key.next(); + } + + Ok(()) + } + // Test that vectored get handles layer gaps correctly // by advancing into the next ancestor timeline if required. // diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index fa7d219fb0..fb5ee0a8fa 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -16,7 +16,7 @@ use enumset::EnumSet; use fail::fail_point; use once_cell::sync::Lazy; use pageserver_api::{ - key::AUX_FILES_KEY, + key::{AUX_FILES_KEY, NON_INHERITED_RANGE}, keyspace::KeySpaceAccum, models::{ CompactionAlgorithm, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, @@ -943,7 +943,13 @@ impl Timeline { Err(MissingKey(MissingKeyError { stuck_at_lsn: false, .. - })) => return Err(GetVectoredError::MissingKey(key)), + })) if !NON_INHERITED_RANGE.contains(&key) => { + // The vectored read path handles non inherited keys specially. + // If such a a key cannot be reconstructed from the current timeline, + // the vectored read path returns a key level error as opposed to a top + // level error. + return Err(GetVectoredError::MissingKey(key)); + } _ => { values.insert(key, block); key = key.next(); @@ -3024,6 +3030,41 @@ impl Timeline { .await?; keyspace.remove_overlapping_with(&completed); + + // Do not descend into the ancestor timeline for aux files. + // We don't return a blanket [`GetVectoredError::MissingKey`] to avoid + // stalling compaction. + // TODO(chi): this will need to be updated for aux files v2 storage + if keyspace.overlaps(&NON_INHERITED_RANGE) { + let removed = keyspace.remove_overlapping_with(&KeySpace { + ranges: vec![NON_INHERITED_RANGE], + }); + + for range in removed.ranges { + let mut key = range.start; + while key < range.end { + reconstruct_state.on_key_error( + key, + PageReconstructError::MissingKey(MissingKeyError { + stuck_at_lsn: false, + key, + shard: self.shard_identity.get_shard_number(&key), + cont_lsn, + request_lsn, + ancestor_lsn: None, + traversal_path: Vec::default(), + backtrace: if cfg!(test) { + Some(std::backtrace::Backtrace::force_capture()) + } else { + None + }, + }), + ); + key = key.next(); + } + } + } + if keyspace.total_size() == 0 || timeline.ancestor_timeline.is_none() { break; }