diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs index c0c4710a00..12c6dc3a6d 100644 --- a/libs/pageserver_api/src/keyspace.rs +++ b/libs/pageserver_api/src/keyspace.rs @@ -307,7 +307,7 @@ impl KeySpace { } /// Merge another keyspace into the current one. - /// Note: the keyspaces must not ovelap (enforced via assertions) + /// Note: the keyspaces must not overlap (enforced via assertions). To merge overlapping key ranges, use `KeySpaceRandomAccum`. pub fn merge(&mut self, other: &KeySpace) { let all_ranges = self .ranges diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index e598e9d2e3..1a66f2c919 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3968,7 +3968,7 @@ mod tests { use crate::tenant::harness::*; use crate::tenant::timeline::CompactFlags; use crate::DEFAULT_PG_VERSION; - use bytes::BytesMut; + use bytes::{Bytes, BytesMut}; use hex_literal::hex; use pageserver_api::key::{AUX_KEY_PREFIX, NON_INHERITED_RANGE}; use pageserver_api::keyspace::KeySpace; @@ -5996,4 +5996,374 @@ mod tests { Some(&bytes::Bytes::from_static(b"last")) ); } + + #[tokio::test] + async fn test_metadata_image_creation() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_metadata_image_creation")?; + let (tenant, ctx) = harness.load().await; + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await?; + + const NUM_KEYS: usize = 1000; + const STEP: usize = 10000; // random update + scan base_key + idx * STEP + + let cancel = CancellationToken::new(); + + let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + base_key.field1 = AUX_KEY_PREFIX; + let mut test_key = base_key; + let mut lsn = Lsn(0x10); + + async fn scan_with_statistics( + tline: &Timeline, + keyspace: &KeySpace, + lsn: Lsn, + ctx: &RequestContext, + ) -> anyhow::Result<(BTreeMap>, usize)> { + let mut reconstruct_state = ValuesReconstructState::default(); + 
let res = tline + .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx) + .await?; + Ok((res, reconstruct_state.get_delta_layers_visited() as usize)) + } + + #[allow(clippy::needless_range_loop)] + for blknum in 0..NUM_KEYS { + lsn = Lsn(lsn.0 + 0x10); + test_key.field6 = (blknum * STEP) as u32; + let mut writer = tline.writer().await; + writer + .put( + test_key, + lsn, + &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), + &ctx, + ) + .await?; + writer.finish_write(lsn); + drop(writer); + } + + let keyspace = KeySpace::single(base_key..base_key.add((NUM_KEYS * STEP) as u32)); + + for iter in 1..=10 { + for _ in 0..NUM_KEYS { + lsn = Lsn(lsn.0 + 0x10); + let blknum = thread_rng().gen_range(0..NUM_KEYS); + test_key.field6 = (blknum * STEP) as u32; + let mut writer = tline.writer().await; + writer + .put( + test_key, + lsn, + &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), + &ctx, + ) + .await?; + writer.finish_write(lsn); + drop(writer); + } + + tline.freeze_and_flush().await?; + + if iter % 5 == 0 { + let (_, before_delta_file_accessed) = + scan_with_statistics(&tline, &keyspace, lsn, &ctx).await?; + tline + .compact( + &cancel, + { + let mut flags = EnumSet::new(); + flags.insert(CompactFlags::ForceImageLayerCreation); + flags.insert(CompactFlags::ForceRepartition); + flags + }, + &ctx, + ) + .await?; + let (_, after_delta_file_accessed) = + scan_with_statistics(&tline, &keyspace, lsn, &ctx).await?; + assert!(after_delta_file_accessed < before_delta_file_accessed, "after_delta_file_accessed={after_delta_file_accessed}, before_delta_file_accessed={before_delta_file_accessed}"); + // Given that we already produced an image layer, there should be no delta layer needed for the scan, but still setting a low threshold there for unforeseen circumstances. 
+ assert!( + after_delta_file_accessed <= 2, + "after_delta_file_accessed={after_delta_file_accessed}" + ); + } + } + + Ok(()) + } + + #[tokio::test] + async fn test_vectored_missing_data_key_reads() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_vectored_missing_data_key_reads")?; + let (tenant, ctx) = harness.load().await; + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await?; + + let cancel = CancellationToken::new(); + + let base_key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + let base_key_child = Key::from_hex("000000000033333333444444445500000001").unwrap(); + let base_key_nonexist = Key::from_hex("000000000033333333444444445500000002").unwrap(); + + let mut lsn = Lsn(0x20); + + { + let mut writer = tline.writer().await; + writer + .put(base_key, lsn, &Value::Image(test_img("data key 1")), &ctx) + .await?; + writer.finish_write(lsn); + drop(writer); + + tline.freeze_and_flush().await?; // this will create an image layer + } + + let child = tenant + .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(lsn), &ctx) + .await + .unwrap(); + + lsn.0 += 0x10; + + { + let mut writer = child.writer().await; + writer + .put( + base_key_child, + lsn, + &Value::Image(test_img("data key 2")), + &ctx, + ) + .await?; + writer.finish_write(lsn); + drop(writer); + + child.freeze_and_flush().await?; // this will create a delta + + { + // update the partitioning to include the test key space, otherwise they + // will be dropped by image layer creation + let mut guard = child.partitioning.lock().await; + let ((partitioning, _), partition_lsn) = &mut *guard; + partitioning + .parts + .push(KeySpace::single(base_key..base_key_nonexist)); // exclude the nonexistent key + *partition_lsn = lsn; + } + + child + .compact( + &cancel, + { + let mut set = EnumSet::empty(); + set.insert(CompactFlags::ForceImageLayerCreation); + set + }, + &ctx, + ) + .await?; // force create an image layer for the 
keys, TODO: check if the image layer is created + } + + async fn get_vectored_impl_wrapper( + tline: &Arc, + key: Key, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result, GetVectoredError> { + let mut reconstruct_state = ValuesReconstructState::new(); + let mut res = tline + .get_vectored_impl( + KeySpace::single(key..key.next()), + lsn, + &mut reconstruct_state, + ctx, + ) + .await?; + Ok(res.pop_last().map(|(k, v)| { + assert_eq!(k, key); + v.unwrap() + })) + } + + // test vectored get on parent timeline + assert_eq!( + get_vectored_impl_wrapper(&tline, base_key, lsn, &ctx).await?, + Some(test_img("data key 1")) + ); + assert!(get_vectored_impl_wrapper(&tline, base_key_child, lsn, &ctx) + .await + .unwrap_err() + .is_missing_key_error()); + assert!( + get_vectored_impl_wrapper(&tline, base_key_nonexist, lsn, &ctx) + .await + .unwrap_err() + .is_missing_key_error() + ); + + // test vectored get on child timeline + assert_eq!( + get_vectored_impl_wrapper(&child, base_key, lsn, &ctx).await?, + Some(test_img("data key 1")) + ); + assert_eq!( + get_vectored_impl_wrapper(&child, base_key_child, lsn, &ctx).await?, + Some(test_img("data key 2")) + ); + assert!( + get_vectored_impl_wrapper(&child, base_key_nonexist, lsn, &ctx) + .await + .unwrap_err() + .is_missing_key_error() + ); + + Ok(()) + } + + #[tokio::test] + async fn test_vectored_missing_metadata_key_reads() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_vectored_missing_metadata_key_reads")?; + let (tenant, ctx) = harness.load().await; + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await?; + + let cancel = CancellationToken::new(); + + let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + let mut base_key_child = Key::from_hex("000000000033333333444444445500000001").unwrap(); + let mut base_key_nonexist = Key::from_hex("000000000033333333444444445500000002").unwrap(); + base_key.field1 = AUX_KEY_PREFIX; + 
base_key_child.field1 = AUX_KEY_PREFIX; + base_key_nonexist.field1 = AUX_KEY_PREFIX; + + let mut lsn = Lsn(0x20); + + { + let mut writer = tline.writer().await; + writer + .put( + base_key, + lsn, + &Value::Image(test_img("metadata key 1")), + &ctx, + ) + .await?; + writer.finish_write(lsn); + drop(writer); + + tline.freeze_and_flush().await?; // this will create an image layer + + tline + .compact( + &cancel, + { + let mut set = EnumSet::empty(); + set.insert(CompactFlags::ForceImageLayerCreation); + set.insert(CompactFlags::ForceRepartition); + set + }, + &ctx, + ) + .await?; // force create an image layer for metadata keys + tenant + .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx) + .await?; + } + + let child = tenant + .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(lsn), &ctx) + .await + .unwrap(); + + lsn.0 += 0x10; + + { + let mut writer = child.writer().await; + writer + .put( + base_key_child, + lsn, + &Value::Image(test_img("metadata key 2")), + &ctx, + ) + .await?; + writer.finish_write(lsn); + drop(writer); + + child.freeze_and_flush().await?; + + child + .compact( + &cancel, + { + let mut set = EnumSet::empty(); + set.insert(CompactFlags::ForceImageLayerCreation); + set.insert(CompactFlags::ForceRepartition); + set + }, + &ctx, + ) + .await?; // force create an image layer for metadata keys + tenant + .gc_iteration(Some(child.timeline_id), 0, Duration::ZERO, &cancel, &ctx) + .await?; + } + + async fn get_vectored_impl_wrapper( + tline: &Arc, + key: Key, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result, GetVectoredError> { + let mut reconstruct_state = ValuesReconstructState::new(); + let mut res = tline + .get_vectored_impl( + KeySpace::single(key..key.next()), + lsn, + &mut reconstruct_state, + ctx, + ) + .await?; + Ok(res.pop_last().map(|(k, v)| { + assert_eq!(k, key); + v.unwrap() + })) + } + + // test vectored get on parent timeline + assert_eq!( + get_vectored_impl_wrapper(&tline, base_key, lsn, &ctx).await?, + 
Some(test_img("metadata key 1")) + ); + assert_eq!( + get_vectored_impl_wrapper(&tline, base_key_child, lsn, &ctx).await?, + None + ); + assert_eq!( + get_vectored_impl_wrapper(&tline, base_key_nonexist, lsn, &ctx).await?, + None + ); + + // test vectored get on child timeline + assert_eq!( + get_vectored_impl_wrapper(&child, base_key, lsn, &ctx).await?, + None + ); + assert_eq!( + get_vectored_impl_wrapper(&child, base_key_child, lsn, &ctx).await?, + Some(test_img("metadata key 2")) + ); + assert_eq!( + get_vectored_impl_wrapper(&child, base_key_nonexist, lsn, &ctx).await?, + None + ); + + Ok(()) + } } diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 4c8a518551..9ccf20c0d4 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -113,14 +113,17 @@ impl From for ValueReconstructState { } } -/// Bag of data accumulated during a vectored get. +/// Bag of data accumulated during a vectored get. pub(crate) struct ValuesReconstructState { /// The keys will be removed after `get_vectored` completes. The caller outside `Timeline` /// should not expect to get anything from this hashmap. pub(crate) keys: HashMap>, - + /// The keys which are already retrieved keys_done: KeySpaceRandomAccum, + /// The keys covered by the image layers + keys_with_image_coverage: Option>, + // Statistics that are still accessible as a caller of `get_vectored_impl`. layers_visited: u32, delta_layers_visited: u32, @@ -131,6 +134,7 @@ impl ValuesReconstructState { Self { keys: HashMap::new(), keys_done: KeySpaceRandomAccum::new(), + keys_with_image_coverage: None, layers_visited: 0, delta_layers_visited: 0, } @@ -186,6 +190,16 @@ impl ValuesReconstructState { } } + /// On hitting image layer, we can mark all keys in this range as done, because + /// if the image layer does not contain a key, it is deleted/never added. 
+ pub(crate) fn on_image_layer_visited(&mut self, key_range: &Range) { + let prev_val = self.keys_with_image_coverage.replace(key_range.clone()); + assert_eq!( + prev_val, None, + "should consume the keyspace before the next iteration" + ); + } + /// Update the state collected for a given key. /// Returns true if this was the last value needed for the key and false otherwise. /// @@ -248,8 +262,12 @@ impl ValuesReconstructState { /// Returns the key space describing the keys that have /// been marked as completed since the last call to this function. - pub(crate) fn consume_done_keys(&mut self) -> KeySpace { - self.keys_done.consume_keyspace() + /// Returns individual keys done, and the image layer coverage. + pub(crate) fn consume_done_keys(&mut self) -> (KeySpace, Option>) { + ( + self.keys_done.consume_keyspace(), + self.keys_with_image_coverage.take(), + ) } } diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 6ea452b993..becd1e7a6d 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -158,6 +158,7 @@ pub struct ImageLayerInner { index_start_blk: u32, index_root_blk: u32, + key_range: Range, lsn: Lsn, file: VirtualFile, @@ -419,6 +420,7 @@ impl ImageLayerInner { file, file_id, max_vectored_read_bytes, + key_range: actual_summary.key_range, })) } @@ -478,6 +480,8 @@ impl ImageLayerInner { self.do_reads_and_update_state(reads, reconstruct_state, ctx) .await; + reconstruct_state.on_image_layer_visited(&self.key_range); + Ok(()) } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index e6b58b7166..7f2a41d90c 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -21,7 +21,7 @@ use pageserver_api::{ AUX_FILES_KEY, KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE, }, - keyspace::{KeySpaceAccum, 
SparseKeyPartitioning}, + keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning}, models::{ AtomicAuxFilePolicy, AuxFilePolicy, CompactionAlgorithm, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, InMemoryLayerInfo, LayerMapInfo, @@ -348,8 +348,8 @@ pub struct Timeline { // though let's keep them both for better error visibility. pub initdb_lsn: Lsn, - /// When did we last calculate the partitioning? - partitioning: tokio::sync::Mutex<((KeyPartitioning, SparseKeyPartitioning), Lsn)>, + /// When did we last calculate the partitioning? Exposed as `pub(super)` so that test cases can access it. + pub(super) partitioning: tokio::sync::Mutex<((KeyPartitioning, SparseKeyPartitioning), Lsn)>, /// Configuration: how often should the partitioning be recalculated. repartition_threshold: u64, @@ -483,6 +483,11 @@ impl GcCutoffs { } } +pub(crate) struct TimelineVisitOutcome { + completed_keyspace: KeySpace, + image_covered_keyspace: KeySpace, +} + /// An error happened in a get() operation. 
#[derive(thiserror::Error, Debug)] pub(crate) enum PageReconstructError { @@ -507,6 +512,13 @@ pub(crate) enum PageReconstructError { MissingKey(MissingKeyError), } +impl GetVectoredError { + #[cfg(test)] + pub(crate) fn is_missing_key_error(&self) -> bool { + matches!(self, Self::MissingKey(_)) + } +} + #[derive(Debug)] pub struct MissingKeyError { key: Key, @@ -3300,12 +3312,15 @@ impl Timeline { let mut cont_lsn = Lsn(request_lsn.0 + 1); - loop { + let missing_keyspace = loop { if self.cancel.is_cancelled() { return Err(GetVectoredError::Cancelled); } - let completed = Self::get_vectored_reconstruct_data_timeline( + let TimelineVisitOutcome { + completed_keyspace: completed, + image_covered_keyspace, + } = Self::get_vectored_reconstruct_data_timeline( timeline, keyspace.clone(), cont_lsn, @@ -3324,12 +3339,31 @@ impl Timeline { ranges: vec![NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE], }); - // Keyspace is fully retrieved, no ancestor timeline, or metadata scan (where we do not look - // into ancestor timelines). TODO: is there any other metadata which we want to inherit? - if keyspace.total_raw_size() == 0 || timeline.ancestor_timeline.is_none() { - break; + // Keyspace is fully retrieved + if keyspace.is_empty() { + break None; } + // Not fully retrieved but no ancestor timeline. + if timeline.ancestor_timeline.is_none() { + break Some(keyspace); + } + + // Now check whether any keys are covered by an image layer but do not exist in that + // image layer, which means that those keys do not exist. + + // The block below will stop the vectored search if any of the keys encountered an image layer + // which did not contain a snapshot for said key. Since we have already removed all completed + // keys from `keyspace`, we expect there to be no overlap between it and the image covered key + // space. If that's not the case, we had at least one key encounter a gap in the image layer + // and stop the search as a result of that. 
+ let removed = keyspace.remove_overlapping_with(&image_covered_keyspace); + if !removed.is_empty() { + break Some(removed); + } + // If we reached this point, `remove_overlapping_with` should not have made any change to the + // keyspace. + // Take the min to avoid reconstructing a page with data newer than request Lsn. cont_lsn = std::cmp::min(Lsn(request_lsn.0 + 1), Lsn(timeline.ancestor_lsn.0 + 1)); timeline_owned = timeline @@ -3337,14 +3371,14 @@ impl Timeline { .await .map_err(GetVectoredError::GetReadyAncestorError)?; timeline = &*timeline_owned; - } + }; - if keyspace.total_raw_size() != 0 { + if let Some(missing_keyspace) = missing_keyspace { return Err(GetVectoredError::MissingKey(MissingKeyError { - key: keyspace.start().unwrap(), /* better if we can store the full keyspace */ + key: missing_keyspace.start().unwrap(), /* better if we can store the full keyspace */ shard: self .shard_identity - .get_shard_number(&keyspace.start().unwrap()), + .get_shard_number(&missing_keyspace.start().unwrap()), cont_lsn, request_lsn, ancestor_lsn: Some(timeline.ancestor_lsn), @@ -3369,6 +3403,9 @@ impl Timeline { /// /// At each iteration pop the top of the fringe (the layer with the highest Lsn) /// and get all the required reconstruct data from the layer in one go. + /// + /// Returns the completed keyspace and the keyspaces with image coverage. The caller + /// decides how to deal with these two keyspaces. 
async fn get_vectored_reconstruct_data_timeline( timeline: &Timeline, keyspace: KeySpace, @@ -3376,20 +3413,27 @@ impl Timeline { reconstruct_state: &mut ValuesReconstructState, cancel: &CancellationToken, ctx: &RequestContext, - ) -> Result { + ) -> Result { let mut unmapped_keyspace = keyspace.clone(); let mut fringe = LayerFringe::new(); let mut completed_keyspace = KeySpace::default(); + let mut image_covered_keyspace = KeySpaceRandomAccum::new(); loop { if cancel.is_cancelled() { return Err(GetVectoredError::Cancelled); } - let keys_done_last_step = reconstruct_state.consume_done_keys(); + let (keys_done_last_step, keys_with_image_coverage) = + reconstruct_state.consume_done_keys(); unmapped_keyspace.remove_overlapping_with(&keys_done_last_step); completed_keyspace.merge(&keys_done_last_step); + if let Some(keys_with_image_coverage) = keys_with_image_coverage { + unmapped_keyspace + .remove_overlapping_with(&KeySpace::single(keys_with_image_coverage.clone())); + image_covered_keyspace.add_range(keys_with_image_coverage); + } // Do not descent any further if the last layer we visited // completed all keys in the keyspace it inspected. This is not @@ -3467,7 +3511,10 @@ impl Timeline { } } - Ok(completed_keyspace) + Ok(TimelineVisitOutcome { + completed_keyspace, + image_covered_keyspace: image_covered_keyspace.consume_keyspace(), + }) } /// # Cancel-safety