From 6810d2aa53b7b7646013d2f236d155a4f1b4721d Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Mon, 20 May 2024 14:24:18 -0400 Subject: [PATCH] feat(pageserver): do not read past image layers for vectored get (#7773) ## Problem Part of https://github.com/neondatabase/neon/issues/7462 On the metadata keyspace, vectored get will not stop if a key is not found, and will read past the image layer. However, this behavior differs from single get, because if a key does not exist in the image layer, it means that the key did not exist in the past, or has been deleted. This pull request fixes it by recording image layer coverage during the vectored get process and stopping when the full keyspace is covered by an image layer. A corresponding test case is added to ensure that generating an image layer reduces the number of delta layers visited. This optimization (or bug fix) also applies to rel block keyspaces. If a key is missing, we can know it's missing once the first image layer is reached. The page server will not attempt to read lower layers, which potentially incurs layer downloads + evictions. --------- Signed-off-by: Alex Chi Z --- libs/pageserver_api/src/keyspace.rs | 2 +- pageserver/src/tenant.rs | 372 +++++++++++++++++- pageserver/src/tenant/storage_layer.rs | 26 +- .../src/tenant/storage_layer/image_layer.rs | 4 + pageserver/src/tenant/timeline.rs | 79 +++- 5 files changed, 461 insertions(+), 22 deletions(-) diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs index c0c4710a00..12c6dc3a6d 100644 --- a/libs/pageserver_api/src/keyspace.rs +++ b/libs/pageserver_api/src/keyspace.rs @@ -307,7 +307,7 @@ impl KeySpace { } /// Merge another keyspace into the current one. - /// Note: the keyspaces must not ovelap (enforced via assertions) + /// Note: the keyspaces must not overlap (enforced via assertions). To merge overlapping key ranges, use `KeySpaceRandomAccum`. 
pub fn merge(&mut self, other: &KeySpace) { let all_ranges = self .ranges diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index e598e9d2e3..1a66f2c919 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3968,7 +3968,7 @@ mod tests { use crate::tenant::harness::*; use crate::tenant::timeline::CompactFlags; use crate::DEFAULT_PG_VERSION; - use bytes::BytesMut; + use bytes::{Bytes, BytesMut}; use hex_literal::hex; use pageserver_api::key::{AUX_KEY_PREFIX, NON_INHERITED_RANGE}; use pageserver_api::keyspace::KeySpace; @@ -5996,4 +5996,374 @@ mod tests { Some(&bytes::Bytes::from_static(b"last")) ); } + + #[tokio::test] + async fn test_metadata_image_creation() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_metadata_image_creation")?; + let (tenant, ctx) = harness.load().await; + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await?; + + const NUM_KEYS: usize = 1000; + const STEP: usize = 10000; // random update + scan base_key + idx * STEP + + let cancel = CancellationToken::new(); + + let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + base_key.field1 = AUX_KEY_PREFIX; + let mut test_key = base_key; + let mut lsn = Lsn(0x10); + + async fn scan_with_statistics( + tline: &Timeline, + keyspace: &KeySpace, + lsn: Lsn, + ctx: &RequestContext, + ) -> anyhow::Result<(BTreeMap>, usize)> { + let mut reconstruct_state = ValuesReconstructState::default(); + let res = tline + .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx) + .await?; + Ok((res, reconstruct_state.get_delta_layers_visited() as usize)) + } + + #[allow(clippy::needless_range_loop)] + for blknum in 0..NUM_KEYS { + lsn = Lsn(lsn.0 + 0x10); + test_key.field6 = (blknum * STEP) as u32; + let mut writer = tline.writer().await; + writer + .put( + test_key, + lsn, + &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), + &ctx, + ) + .await?; + 
writer.finish_write(lsn); + drop(writer); + } + + let keyspace = KeySpace::single(base_key..base_key.add((NUM_KEYS * STEP) as u32)); + + for iter in 1..=10 { + for _ in 0..NUM_KEYS { + lsn = Lsn(lsn.0 + 0x10); + let blknum = thread_rng().gen_range(0..NUM_KEYS); + test_key.field6 = (blknum * STEP) as u32; + let mut writer = tline.writer().await; + writer + .put( + test_key, + lsn, + &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), + &ctx, + ) + .await?; + writer.finish_write(lsn); + drop(writer); + } + + tline.freeze_and_flush().await?; + + if iter % 5 == 0 { + let (_, before_delta_file_accessed) = + scan_with_statistics(&tline, &keyspace, lsn, &ctx).await?; + tline + .compact( + &cancel, + { + let mut flags = EnumSet::new(); + flags.insert(CompactFlags::ForceImageLayerCreation); + flags.insert(CompactFlags::ForceRepartition); + flags + }, + &ctx, + ) + .await?; + let (_, after_delta_file_accessed) = + scan_with_statistics(&tline, &keyspace, lsn, &ctx).await?; + assert!(after_delta_file_accessed < before_delta_file_accessed, "after_delta_file_accessed={after_delta_file_accessed}, before_delta_file_accessed={before_delta_file_accessed}"); + // Given that we already produced an image layer, there should be no delta layer needed for the scan, but still setting a low threshold there for unforeseen circumstances. 
+ assert!( + after_delta_file_accessed <= 2, + "after_delta_file_accessed={after_delta_file_accessed}" + ); + } + } + + Ok(()) + } + + #[tokio::test] + async fn test_vectored_missing_data_key_reads() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_vectored_missing_data_key_reads")?; + let (tenant, ctx) = harness.load().await; + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await?; + + let cancel = CancellationToken::new(); + + let base_key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + let base_key_child = Key::from_hex("000000000033333333444444445500000001").unwrap(); + let base_key_nonexist = Key::from_hex("000000000033333333444444445500000002").unwrap(); + + let mut lsn = Lsn(0x20); + + { + let mut writer = tline.writer().await; + writer + .put(base_key, lsn, &Value::Image(test_img("data key 1")), &ctx) + .await?; + writer.finish_write(lsn); + drop(writer); + + tline.freeze_and_flush().await?; // this will create an image layer + } + + let child = tenant + .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(lsn), &ctx) + .await + .unwrap(); + + lsn.0 += 0x10; + + { + let mut writer = child.writer().await; + writer + .put( + base_key_child, + lsn, + &Value::Image(test_img("data key 2")), + &ctx, + ) + .await?; + writer.finish_write(lsn); + drop(writer); + + child.freeze_and_flush().await?; // this will create a delta + + { + // update the partitioning to include the test key space, otherwise they + // will be dropped by image layer creation + let mut guard = child.partitioning.lock().await; + let ((partitioning, _), partition_lsn) = &mut *guard; + partitioning + .parts + .push(KeySpace::single(base_key..base_key_nonexist)); // exclude the nonexist key + *partition_lsn = lsn; + } + + child + .compact( + &cancel, + { + let mut set = EnumSet::empty(); + set.insert(CompactFlags::ForceImageLayerCreation); + set + }, + &ctx, + ) + .await?; // force create an image layer for the 
keys, TODO: check if the image layer is created + } + + async fn get_vectored_impl_wrapper( + tline: &Arc, + key: Key, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result, GetVectoredError> { + let mut reconstruct_state = ValuesReconstructState::new(); + let mut res = tline + .get_vectored_impl( + KeySpace::single(key..key.next()), + lsn, + &mut reconstruct_state, + ctx, + ) + .await?; + Ok(res.pop_last().map(|(k, v)| { + assert_eq!(k, key); + v.unwrap() + })) + } + + // test vectored get on parent timeline + assert_eq!( + get_vectored_impl_wrapper(&tline, base_key, lsn, &ctx).await?, + Some(test_img("data key 1")) + ); + assert!(get_vectored_impl_wrapper(&tline, base_key_child, lsn, &ctx) + .await + .unwrap_err() + .is_missing_key_error()); + assert!( + get_vectored_impl_wrapper(&tline, base_key_nonexist, lsn, &ctx) + .await + .unwrap_err() + .is_missing_key_error() + ); + + // test vectored get on child timeline + assert_eq!( + get_vectored_impl_wrapper(&child, base_key, lsn, &ctx).await?, + Some(test_img("data key 1")) + ); + assert_eq!( + get_vectored_impl_wrapper(&child, base_key_child, lsn, &ctx).await?, + Some(test_img("data key 2")) + ); + assert!( + get_vectored_impl_wrapper(&child, base_key_nonexist, lsn, &ctx) + .await + .unwrap_err() + .is_missing_key_error() + ); + + Ok(()) + } + + #[tokio::test] + async fn test_vectored_missing_metadata_key_reads() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_vectored_missing_metadata_key_reads")?; + let (tenant, ctx) = harness.load().await; + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await?; + + let cancel = CancellationToken::new(); + + let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + let mut base_key_child = Key::from_hex("000000000033333333444444445500000001").unwrap(); + let mut base_key_nonexist = Key::from_hex("000000000033333333444444445500000002").unwrap(); + base_key.field1 = AUX_KEY_PREFIX; + 
base_key_child.field1 = AUX_KEY_PREFIX; + base_key_nonexist.field1 = AUX_KEY_PREFIX; + + let mut lsn = Lsn(0x20); + + { + let mut writer = tline.writer().await; + writer + .put( + base_key, + lsn, + &Value::Image(test_img("metadata key 1")), + &ctx, + ) + .await?; + writer.finish_write(lsn); + drop(writer); + + tline.freeze_and_flush().await?; // this will create an image layer + + tline + .compact( + &cancel, + { + let mut set = EnumSet::empty(); + set.insert(CompactFlags::ForceImageLayerCreation); + set.insert(CompactFlags::ForceRepartition); + set + }, + &ctx, + ) + .await?; // force create an image layer for metadata keys + tenant + .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx) + .await?; + } + + let child = tenant + .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(lsn), &ctx) + .await + .unwrap(); + + lsn.0 += 0x10; + + { + let mut writer = child.writer().await; + writer + .put( + base_key_child, + lsn, + &Value::Image(test_img("metadata key 2")), + &ctx, + ) + .await?; + writer.finish_write(lsn); + drop(writer); + + child.freeze_and_flush().await?; + + child + .compact( + &cancel, + { + let mut set = EnumSet::empty(); + set.insert(CompactFlags::ForceImageLayerCreation); + set.insert(CompactFlags::ForceRepartition); + set + }, + &ctx, + ) + .await?; // force create an image layer for metadata keys + tenant + .gc_iteration(Some(child.timeline_id), 0, Duration::ZERO, &cancel, &ctx) + .await?; + } + + async fn get_vectored_impl_wrapper( + tline: &Arc, + key: Key, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result, GetVectoredError> { + let mut reconstruct_state = ValuesReconstructState::new(); + let mut res = tline + .get_vectored_impl( + KeySpace::single(key..key.next()), + lsn, + &mut reconstruct_state, + ctx, + ) + .await?; + Ok(res.pop_last().map(|(k, v)| { + assert_eq!(k, key); + v.unwrap() + })) + } + + // test vectored get on parent timeline + assert_eq!( + get_vectored_impl_wrapper(&tline, base_key, lsn, &ctx).await?, + 
Some(test_img("metadata key 1")) + ); + assert_eq!( + get_vectored_impl_wrapper(&tline, base_key_child, lsn, &ctx).await?, + None + ); + assert_eq!( + get_vectored_impl_wrapper(&tline, base_key_nonexist, lsn, &ctx).await?, + None + ); + + // test vectored get on child timeline + assert_eq!( + get_vectored_impl_wrapper(&child, base_key, lsn, &ctx).await?, + None + ); + assert_eq!( + get_vectored_impl_wrapper(&child, base_key_child, lsn, &ctx).await?, + Some(test_img("metadata key 2")) + ); + assert_eq!( + get_vectored_impl_wrapper(&child, base_key_nonexist, lsn, &ctx).await?, + None + ); + + Ok(()) + } } diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 4c8a518551..9ccf20c0d4 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -113,14 +113,17 @@ impl From for ValueReconstructState { } } -/// Bag of data accumulated during a vectored get. +/// Bag of data accumulated during a vectored get. pub(crate) struct ValuesReconstructState { /// The keys will be removed after `get_vectored` completes. The caller outside `Timeline` /// should not expect to get anything from this hashmap. pub(crate) keys: HashMap>, - + /// The keys which are already retrieved keys_done: KeySpaceRandomAccum, + /// The keys covered by the image layers + keys_with_image_coverage: Option>, + // Statistics that are still accessible as a caller of `get_vectored_impl`. layers_visited: u32, delta_layers_visited: u32, @@ -131,6 +134,7 @@ impl ValuesReconstructState { Self { keys: HashMap::new(), keys_done: KeySpaceRandomAccum::new(), + keys_with_image_coverage: None, layers_visited: 0, delta_layers_visited: 0, } @@ -186,6 +190,16 @@ impl ValuesReconstructState { } } + /// On hitting image layer, we can mark all keys in this range as done, because + /// if the image layer does not contain a key, it is deleted/never added. 
+ pub(crate) fn on_image_layer_visited(&mut self, key_range: &Range) { + let prev_val = self.keys_with_image_coverage.replace(key_range.clone()); + assert_eq!( + prev_val, None, + "should consume the keyspace before the next iteration" + ); + } + /// Update the state collected for a given key. /// Returns true if this was the last value needed for the key and false otherwise. /// @@ -248,8 +262,12 @@ impl ValuesReconstructState { /// Returns the key space describing the keys that have /// been marked as completed since the last call to this function. - pub(crate) fn consume_done_keys(&mut self) -> KeySpace { - self.keys_done.consume_keyspace() + /// Returns individual keys done, and the image layer coverage. + pub(crate) fn consume_done_keys(&mut self) -> (KeySpace, Option>) { + ( + self.keys_done.consume_keyspace(), + self.keys_with_image_coverage.take(), + ) } } diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 6ea452b993..becd1e7a6d 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -158,6 +158,7 @@ pub struct ImageLayerInner { index_start_blk: u32, index_root_blk: u32, + key_range: Range, lsn: Lsn, file: VirtualFile, @@ -419,6 +420,7 @@ impl ImageLayerInner { file, file_id, max_vectored_read_bytes, + key_range: actual_summary.key_range, })) } @@ -478,6 +480,8 @@ impl ImageLayerInner { self.do_reads_and_update_state(reads, reconstruct_state, ctx) .await; + reconstruct_state.on_image_layer_visited(&self.key_range); + Ok(()) } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index e6b58b7166..7f2a41d90c 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -21,7 +21,7 @@ use pageserver_api::{ AUX_FILES_KEY, KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE, }, - keyspace::{KeySpaceAccum, 
SparseKeyPartitioning}, + keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning}, models::{ AtomicAuxFilePolicy, AuxFilePolicy, CompactionAlgorithm, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, InMemoryLayerInfo, LayerMapInfo, @@ -348,8 +348,8 @@ pub struct Timeline { // though let's keep them both for better error visibility. pub initdb_lsn: Lsn, - /// When did we last calculate the partitioning? - partitioning: tokio::sync::Mutex<((KeyPartitioning, SparseKeyPartitioning), Lsn)>, + /// When did we last calculate the partitioning? Make it pub to test cases. + pub(super) partitioning: tokio::sync::Mutex<((KeyPartitioning, SparseKeyPartitioning), Lsn)>, /// Configuration: how often should the partitioning be recalculated. repartition_threshold: u64, @@ -483,6 +483,11 @@ impl GcCutoffs { } } +pub(crate) struct TimelineVisitOutcome { + completed_keyspace: KeySpace, + image_covered_keyspace: KeySpace, +} + /// An error happened in a get() operation. 
#[derive(thiserror::Error, Debug)] pub(crate) enum PageReconstructError { @@ -507,6 +512,13 @@ pub(crate) enum PageReconstructError { MissingKey(MissingKeyError), } +impl GetVectoredError { + #[cfg(test)] + pub(crate) fn is_missing_key_error(&self) -> bool { + matches!(self, Self::MissingKey(_)) + } +} + #[derive(Debug)] pub struct MissingKeyError { key: Key, @@ -3300,12 +3312,15 @@ impl Timeline { let mut cont_lsn = Lsn(request_lsn.0 + 1); - loop { + let missing_keyspace = loop { if self.cancel.is_cancelled() { return Err(GetVectoredError::Cancelled); } - let completed = Self::get_vectored_reconstruct_data_timeline( + let TimelineVisitOutcome { + completed_keyspace: completed, + image_covered_keyspace, + } = Self::get_vectored_reconstruct_data_timeline( timeline, keyspace.clone(), cont_lsn, @@ -3324,12 +3339,31 @@ impl Timeline { ranges: vec![NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE], }); - // Keyspace is fully retrieved, no ancestor timeline, or metadata scan (where we do not look - // into ancestor timelines). TODO: is there any other metadata which we want to inherit? - if keyspace.total_raw_size() == 0 || timeline.ancestor_timeline.is_none() { - break; + // Keyspace is fully retrieved + if keyspace.is_empty() { + break None; } + // Not fully retrieved but no ancestor timeline. + if timeline.ancestor_timeline.is_none() { + break Some(keyspace); + } + + // Now we check whether there are keys covered by the image layer that do not exist in the + // image layer, which means that those keys do not exist. + + // The block below will stop the vectored search if any of the keys encountered an image layer + // which did not contain a snapshot for said key. Since we have already removed all completed + // keys from `keyspace`, we expect there to be no overlap between it and the image covered key + // space. If that's not the case, we had at least one key encounter a gap in the image layer + // and stop the search as a result of that. 
+ let removed = keyspace.remove_overlapping_with(&image_covered_keyspace); + if !removed.is_empty() { + break Some(removed); + } + // If we reached this point, `remove_overlapping_with` should not have made any change to the + // keyspace. + // Take the min to avoid reconstructing a page with data newer than request Lsn. cont_lsn = std::cmp::min(Lsn(request_lsn.0 + 1), Lsn(timeline.ancestor_lsn.0 + 1)); timeline_owned = timeline @@ -3337,14 +3371,14 @@ impl Timeline { .await .map_err(GetVectoredError::GetReadyAncestorError)?; timeline = &*timeline_owned; - } + }; - if keyspace.total_raw_size() != 0 { + if let Some(missing_keyspace) = missing_keyspace { return Err(GetVectoredError::MissingKey(MissingKeyError { - key: keyspace.start().unwrap(), /* better if we can store the full keyspace */ + key: missing_keyspace.start().unwrap(), /* better if we can store the full keyspace */ shard: self .shard_identity - .get_shard_number(&keyspace.start().unwrap()), + .get_shard_number(&missing_keyspace.start().unwrap()), cont_lsn, request_lsn, ancestor_lsn: Some(timeline.ancestor_lsn), @@ -3369,6 +3403,9 @@ impl Timeline { /// /// At each iteration pop the top of the fringe (the layer with the highest Lsn) /// and get all the required reconstruct data from the layer in one go. + /// + /// Returns the completed keyspace and the keyspaces with image coverage. The caller + /// decides how to deal with these two keyspaces. 
async fn get_vectored_reconstruct_data_timeline( timeline: &Timeline, keyspace: KeySpace, @@ -3376,20 +3413,27 @@ impl Timeline { reconstruct_state: &mut ValuesReconstructState, cancel: &CancellationToken, ctx: &RequestContext, - ) -> Result { + ) -> Result { let mut unmapped_keyspace = keyspace.clone(); let mut fringe = LayerFringe::new(); let mut completed_keyspace = KeySpace::default(); + let mut image_covered_keyspace = KeySpaceRandomAccum::new(); loop { if cancel.is_cancelled() { return Err(GetVectoredError::Cancelled); } - let keys_done_last_step = reconstruct_state.consume_done_keys(); + let (keys_done_last_step, keys_with_image_coverage) = + reconstruct_state.consume_done_keys(); unmapped_keyspace.remove_overlapping_with(&keys_done_last_step); completed_keyspace.merge(&keys_done_last_step); + if let Some(keys_with_image_coverage) = keys_with_image_coverage { + unmapped_keyspace + .remove_overlapping_with(&KeySpace::single(keys_with_image_coverage.clone())); + image_covered_keyspace.add_range(keys_with_image_coverage); + } // Do not descent any further if the last layer we visited // completed all keys in the keyspace it inspected. This is not @@ -3467,7 +3511,10 @@ impl Timeline { } } - Ok(completed_keyspace) + Ok(TimelineVisitOutcome { + completed_keyspace, + image_covered_keyspace: image_covered_keyspace.consume_keyspace(), + }) } /// # Cancel-safety