diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index ea6115853e..2511de00d5 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -80,7 +80,7 @@ impl Key { } /// Get the range of metadata keys. - pub fn metadata_key_range() -> Range { + pub const fn metadata_key_range() -> Range { Key { field1: METADATA_KEY_BEGIN_PREFIX, field2: 0, @@ -572,14 +572,17 @@ pub const AUX_FILES_KEY: Key = Key { // Reverse mappings for a few Keys. // These are needed by WAL redo manager. +/// Non inherited range for vectored get. pub const NON_INHERITED_RANGE: Range = AUX_FILES_KEY..AUX_FILES_KEY.next(); +/// Sparse keyspace range for vectored get. Missing key error will be ignored for this range. +pub const NON_INHERITED_SPARSE_RANGE: Range = Key::metadata_key_range(); // AUX_FILES currently stores only data for logical replication (slots etc), and // we don't preserve these on a branch because safekeepers can't follow timeline // switch (and generally it likely should be optional), so ignore these. #[inline(always)] pub fn is_inherited_key(key: Key) -> bool { - !NON_INHERITED_RANGE.contains(&key) + !NON_INHERITED_RANGE.contains(&key) && !NON_INHERITED_SPARSE_RANGE.contains(&key) } #[inline(always)] diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index d8019b08e2..903bad34cc 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -194,6 +194,11 @@ pub(crate) struct GetVectoredLatency { map: EnumMap>, } +#[allow(dead_code)] +pub(crate) struct ScanLatency { + map: EnumMap>, +} + impl GetVectoredLatency { // Only these task types perform vectored gets. Filter all other tasks out to reduce total // cardinality of the metric. @@ -204,6 +209,48 @@ impl GetVectoredLatency { } } +impl ScanLatency { + // Only these task types perform vectored gets. Filter all other tasks out to reduce total + // cardinality of the metric. + const TRACKED_TASK_KINDS: [TaskKind; 1] = [TaskKind::PageRequestHandler]; + + pub(crate) fn for_task_kind(&self, task_kind: TaskKind) -> Option<&Histogram> { + self.map[task_kind].as_ref() + } +} + +pub(crate) struct ScanLatencyOngoingRecording<'a> { + parent: &'a Histogram, + start: std::time::Instant, +} + +impl<'a> ScanLatencyOngoingRecording<'a> { + pub(crate) fn start_recording(parent: &'a Histogram) -> ScanLatencyOngoingRecording<'a> { + let start = Instant::now(); + ScanLatencyOngoingRecording { parent, start } + } + + pub(crate) fn observe(self, throttled: Option) { + let elapsed = self.start.elapsed(); + let ex_throttled = if let Some(throttled) = throttled { + elapsed.checked_sub(throttled) + } else { + Some(elapsed) + }; + if let Some(ex_throttled) = ex_throttled { + self.parent.observe(ex_throttled.as_secs_f64()); + } else { + use utils::rate_limit::RateLimit; + static LOGGED: Lazy> = + Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); + let mut rate_limit = LOGGED.lock().unwrap(); + rate_limit.call(|| { + warn!("error deducting time spent throttled; this message is logged at a global rate limit"); + }); + } + } +} + pub(crate) static GET_VECTORED_LATENCY: Lazy = Lazy::new(|| { let inner = register_histogram_vec!( "pageserver_get_vectored_seconds", @@ -227,6 +274,29 @@ pub(crate) static GET_VECTORED_LATENCY: Lazy = Lazy::new(|| } }); +pub(crate) static SCAN_LATENCY: Lazy = Lazy::new(|| { + let inner = register_histogram_vec!( + "pageserver_scan_seconds", + "Time spent in scan, excluding time spent in timeline_get_throttle.", + &["task_kind"], + CRITICAL_OP_BUCKETS.into(), + ) + .expect("failed to define a metric"); + + ScanLatency { + map: EnumMap::from_array(std::array::from_fn(|task_kind_idx| { + let task_kind = ::from_usize(task_kind_idx); + + if ScanLatency::TRACKED_TASK_KINDS.contains(&task_kind) { + let task_kind = task_kind.into(); + Some(inner.with_label_values(&[task_kind])) + } else { + None + } + })), + } +}); + pub(crate) struct PageCacheMetricsForTaskKind { pub read_accesses_materialized_page: IntCounter, pub read_accesses_immutable: IntCounter, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 8fa484e7b2..c39c21c6dd 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3925,7 +3925,7 @@ mod tests { use crate::DEFAULT_PG_VERSION; use bytes::BytesMut; use hex_literal::hex; - use pageserver_api::key::NON_INHERITED_RANGE; + use pageserver_api::key::{AUX_KEY_PREFIX, NON_INHERITED_RANGE}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::CompactionAlgorithm; use rand::{thread_rng, Rng}; @@ -4791,15 +4791,7 @@ mod tests { .await; let images = vectored_res?; - let mut key = NON_INHERITED_RANGE.start; - while key < NON_INHERITED_RANGE.end { - assert!(matches!( - images[&key], - Err(PageReconstructError::MissingKey(_)) - )); - key = key.next(); - } - + assert!(images.is_empty()); Ok(()) } @@ -5500,4 +5492,116 @@ mod tests { Ok(()) } + + #[tokio::test] + async fn test_metadata_scan() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_metadata_scan")?; + let (tenant, ctx) = harness.load().await; + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await?; + + const NUM_KEYS: usize = 1000; + const STEP: usize = 100; // random update + scan base_key + idx * STEP + + let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + base_key.field1 = AUX_KEY_PREFIX; + let mut test_key = base_key; + + // Track when each page was last modified. Used to assert that + // a read sees the latest page version. + let mut updated = [Lsn(0); NUM_KEYS]; + + let mut lsn = Lsn(0x10); + #[allow(clippy::needless_range_loop)] + for blknum in 0..NUM_KEYS { + lsn = Lsn(lsn.0 + 0x10); + test_key.field6 = (blknum * STEP) as u32; + let mut writer = tline.writer().await; + writer + .put( + test_key, + lsn, + &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), + &ctx, + ) + .await?; + writer.finish_write(lsn); + updated[blknum] = lsn; + drop(writer); + } + + let keyspace = KeySpace::single(base_key..base_key.add((NUM_KEYS * STEP) as u32)); + + for _ in 0..10 { + // Read all the blocks + for (blknum, last_lsn) in updated.iter().enumerate() { + test_key.field6 = (blknum * STEP) as u32; + assert_eq!( + tline.get(test_key, lsn, &ctx).await?, + test_img(&format!("{} at {}", blknum, last_lsn)) + ); + } + + let mut cnt = 0; + for (key, value) in tline + .get_vectored_impl( + keyspace.clone(), + lsn, + ValuesReconstructState::default(), + &ctx, + ) + .await? + { + let blknum = key.field6 as usize; + let value = value?; + assert!(blknum % STEP == 0); + let blknum = blknum / STEP; + assert_eq!( + value, + test_img(&format!("{} at {}", blknum, updated[blknum])) + ); + cnt += 1; + } + + assert_eq!(cnt, NUM_KEYS); + + for _ in 0..NUM_KEYS { + lsn = Lsn(lsn.0 + 0x10); + let blknum = thread_rng().gen_range(0..NUM_KEYS); + test_key.field6 = (blknum * STEP) as u32; + let mut writer = tline.writer().await; + writer + .put( + test_key, + lsn, + &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), + &ctx, + ) + .await?; + writer.finish_write(lsn); + drop(writer); + updated[blknum] = lsn; + } + + // Perform a cycle of flush, compact, and GC + let cutoff = tline.get_last_record_lsn(); + tline + .update_gc_info( + Vec::new(), + cutoff, + Duration::ZERO, + &CancellationToken::new(), + &ctx, + ) + .await?; + tline.freeze_and_flush().await?; + tline + .compact(&CancellationToken::new(), EnumSet::empty(), &ctx) + .await?; + tline.gc().await?; + } + + Ok(()) + } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 19228bc1f1..c7a5598cec 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -16,7 +16,10 @@ use enumset::EnumSet; use fail::fail_point; use once_cell::sync::Lazy; use pageserver_api::{ - key::{AUX_FILES_KEY, NON_INHERITED_RANGE}, + key::{ + AUX_FILES_KEY, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE, + NON_INHERITED_SPARSE_RANGE, + }, keyspace::{KeySpaceAccum, SparseKeyPartitioning}, models::{ CompactionAlgorithm, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, @@ -55,7 +58,6 @@ use std::{ ops::ControlFlow, }; -use crate::tenant::timeline::logical_size::CurrentLogicalSize; use crate::tenant::{ layer_map::{LayerMap, SearchResult}, metadata::TimelineMetadata, @@ -77,6 +79,9 @@ use crate::{ use crate::{ disk_usage_eviction_task::EvictionCandidate, tenant::storage_layer::delta_layer::DeltaEntry, }; +use crate::{ + metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize, +}; use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind}; use crate::{ pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind}, @@ -885,16 +890,15 @@ impl Timeline { value } } - None => { - error!( - "Expected {}, but singular vectored get returned nothing", - key - ); - Err(PageReconstructError::Other(anyhow!( - "Singular vectored get did not return a value for {}", - key - ))) - } + None => Err(PageReconstructError::MissingKey(MissingKeyError { + key, + shard: self.shard_identity.get_shard_number(&key), + cont_lsn: Lsn(0), + request_lsn: lsn, + ancestor_lsn: None, + traversal_path: Vec::new(), + backtrace: None, + })), } } } @@ -1044,6 +1048,70 @@ impl Timeline { res } + /// Scan the keyspace and return all existing key-values in the keyspace. This currently uses vectored + /// get underlying. Normal vectored get would throw an error when a key in the keyspace is not found + /// during the search, but for the scan interface, it returns all existing key-value pairs, and does + /// not expect each single key in the key space will be found. The semantics is closer to the RocksDB + /// scan iterator interface. We could optimize this interface later to avoid some checks in the vectored + /// get path to maintain and split the probing and to-be-probe keyspace. We also need to ensure that + /// the scan operation will not cause OOM in the future. + #[allow(dead_code)] + pub(crate) async fn scan( + &self, + keyspace: KeySpace, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result>, GetVectoredError> { + if !lsn.is_valid() { + return Err(GetVectoredError::InvalidLsn(lsn)); + } + + trace!( + "key-value scan request for {:?}@{} from task kind {:?}", + keyspace, + lsn, + ctx.task_kind() + ); + + // We should generalize this into Keyspace::contains in the future. + for range in &keyspace.ranges { + if range.start.field1 < METADATA_KEY_BEGIN_PREFIX + || range.end.field1 >= METADATA_KEY_END_PREFIX + { + return Err(GetVectoredError::Other(anyhow::anyhow!( + "only metadata keyspace can be scanned" + ))); + } + } + + let start = crate::metrics::SCAN_LATENCY + .for_task_kind(ctx.task_kind()) + .map(ScanLatencyOngoingRecording::start_recording); + + // start counting after throttle so that throttle time + // is always less than observation time + let throttled = self + .timeline_get_throttle + // assume scan = 1 quota for now until we find a better way to process this + .throttle(ctx, 1) + .await; + + let vectored_res = self + .get_vectored_impl( + keyspace.clone(), + lsn, + ValuesReconstructState::default(), + ctx, + ) + .await; + + if let Some(recording) = start { + recording.observe(throttled); + } + + vectored_res + } + /// Not subject to [`Self::timeline_get_throttle`]. pub(super) async fn get_vectored_sequential_impl( &self, @@ -1052,6 +1120,7 @@ impl Timeline { ctx: &RequestContext, ) -> Result>, GetVectoredError> { let mut values = BTreeMap::new(); + for range in keyspace.ranges { let mut key = range.start; while key != range.end { @@ -1064,12 +1133,16 @@ impl Timeline { Err(Cancelled | AncestorStopping(_)) => { return Err(GetVectoredError::Cancelled) } - // we only capture stuck_at_lsn=false now until we figure out https://github.com/neondatabase/neon/issues/7380 - Err(MissingKey(err)) if !NON_INHERITED_RANGE.contains(&key) => { - // The vectored read path handles non inherited keys specially. - // If such a a key cannot be reconstructed from the current timeline, - // the vectored read path returns a key level error as opposed to a top - // level error. + Err(MissingKey(_)) + if NON_INHERITED_RANGE.contains(&key) + || NON_INHERITED_SPARSE_RANGE.contains(&key) => + { + // Ignore missing key error for aux key range. TODO: currently, we assume non_inherited_range == aux_key_range. + // When we add more types of keys into the page server, we should revisit this part of code and throw errors + // accordingly. + key = key.next(); + } + Err(MissingKey(err)) => { return Err(GetVectoredError::MissingKey(err)); } Err(Other(err)) @@ -1157,6 +1230,11 @@ impl Timeline { lsn: Lsn, ctx: &RequestContext, ) { + if keyspace.overlaps(&Key::metadata_key_range()) { + // skip validation for metadata key range + return; + } + let sequential_res = self .get_vectored_sequential_impl(keyspace.clone(), lsn, ctx) .await; @@ -3209,36 +3287,12 @@ impl Timeline { // Do not descend into the ancestor timeline for aux files. // We don't return a blanket [`GetVectoredError::MissingKey`] to avoid // stalling compaction. - // TODO(chi): this will need to be updated for aux files v2 storage - if keyspace.overlaps(&NON_INHERITED_RANGE) { - let removed = keyspace.remove_overlapping_with(&KeySpace { - ranges: vec![NON_INHERITED_RANGE], - }); - - for range in removed.ranges { - let mut key = range.start; - while key < range.end { - reconstruct_state.on_key_error( - key, - PageReconstructError::MissingKey(MissingKeyError { - key, - shard: self.shard_identity.get_shard_number(&key), - cont_lsn, - request_lsn, - ancestor_lsn: None, - traversal_path: Vec::default(), - backtrace: if cfg!(test) { - Some(std::backtrace::Backtrace::force_capture()) - } else { - None - }, - }), - ); - key = key.next(); - } - } - } + keyspace.remove_overlapping_with(&KeySpace { + ranges: vec![NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE], + }); + // Keyspace is fully retrieved, no ancestor timeline, or metadata scan (where we do not look + // into ancestor timelines). TODO: is there any other metadata which we want to inherit? if keyspace.total_raw_size() == 0 || timeline.ancestor_timeline.is_none() { break; }