From e69ff3fc00ab8be31e8f69eb3726da1b83d84180 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas
Date: Mon, 22 Apr 2024 19:40:08 +0300
Subject: [PATCH] Refactor updating relation size cache on reads (#7376)

Instead of trusting that a request with latest == true means that the
request LSN was at least last_record_lsn, remember explicitly when the
relation cache was initialized. Incidentally, this allows updating the
relation size cache also on reads from read-only endpoints, when the
endpoint is at a relatively recent LSN (more recent than the end of the
timeline when the timeline was loaded in the pageserver).

Add a comment to wait_or_get_last_lsn() that it might be better to use
an older LSN when possible. Note that doing that would be unsafe
without the relation cache changes in this commit!
---
 pageserver/src/page_service.rs      |  5 +++++
 pageserver/src/pgdatadir_mapping.rs | 29 +++++++++++++++--------------
 pageserver/src/tenant/timeline.rs   | 17 +++++++++++++++--
 3 files changed, 35 insertions(+), 16 deletions(-)

diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index 3b9a30ba4c..62782d8dd3 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -874,6 +874,11 @@ impl PageServerHandler {
         // walsender completes the authentication and starts streaming the
         // WAL.
         if lsn <= last_record_lsn {
+            // It might be better to use max(lsn, latest_gc_cutoff_lsn) instead of
+            // last_record_lsn. That would give the same result, since we know that
+            // there haven't been any modifications since 'lsn'. Using an older LSN
+            // might be faster, because that could allow skipping recent layers when
+            // finding the page.
             lsn = last_record_lsn;
         } else {
             timeline
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index 351a766b10..4a9682dcac 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -252,16 +252,8 @@ impl Timeline {
         let mut buf = version.get(self, key, ctx).await?;
         let nblocks = buf.get_u32_le();
 
-        if latest {
-            // Update relation size cache only if "latest" flag is set.
-            // This flag is set by compute when it is working with most recent version of relation.
-            // Typically master compute node always set latest=true.
-            // Please notice, that even if compute node "by mistake" specifies old LSN but set
-            // latest=true, then it can not cause cache corruption, because with latest=true
-            // pageserver choose max(request_lsn, last_written_lsn) and so cached value will be
-            // associated with most recent value of LSN.
-            self.update_cached_rel_size(tag, version.get_lsn(), nblocks);
-        }
+        self.update_cached_rel_size(tag, version.get_lsn(), nblocks);
+
         Ok(nblocks)
     }
 
@@ -817,7 +809,7 @@ impl Timeline {
     /// Get cached size of relation if it not updated after specified LSN
     pub fn get_cached_rel_size(&self, tag: &RelTag, lsn: Lsn) -> Option<BlockNumber> {
         let rel_size_cache = self.rel_size_cache.read().unwrap();
-        if let Some((cached_lsn, nblocks)) = rel_size_cache.get(tag) {
+        if let Some((cached_lsn, nblocks)) = rel_size_cache.map.get(tag) {
             if lsn >= *cached_lsn {
                 return Some(*nblocks);
             }
@@ -828,7 +820,16 @@ impl Timeline {
     /// Update cached relation size if there is no more recent update
     pub fn update_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
         let mut rel_size_cache = self.rel_size_cache.write().unwrap();
-        match rel_size_cache.entry(tag) {
+
+        if lsn < rel_size_cache.complete_as_of {
+            // Do not cache old values. It's safe to cache the size on read, as long
+            // as the read was at an LSN at or after the point where we started WAL
+            // ingestion. Reasoning: we never evict values from the cache, so if the
+            // relation size changed after 'lsn', the new value is already in the cache.
+            return;
+        }
+
+        match rel_size_cache.map.entry(tag) {
             hash_map::Entry::Occupied(mut entry) => {
                 let cached_lsn = entry.get_mut();
                 if lsn >= cached_lsn.0 {
@@ -844,13 +845,13 @@ impl Timeline {
     /// Store cached relation size
     pub fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
         let mut rel_size_cache = self.rel_size_cache.write().unwrap();
-        rel_size_cache.insert(tag, (lsn, nblocks));
+        rel_size_cache.map.insert(tag, (lsn, nblocks));
     }
 
     /// Remove cached relation size
     pub fn remove_cached_rel_size(&self, tag: &RelTag) {
         let mut rel_size_cache = self.rel_size_cache.write().unwrap();
-        rel_size_cache.remove(tag);
+        rel_size_cache.map.remove(tag);
     }
 }
 
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index e707c3b244..fa7d219fb0 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -182,6 +182,16 @@ pub(crate) struct AuxFilesState {
     pub(crate) n_deltas: usize,
 }
 
+/// The relation size cache caches relation sizes at the end of the timeline. It speeds up WAL
+/// ingestion considerably, because WAL ingestion needs to check, for most records, whether the
+/// record implicitly extends the relation. At startup, `complete_as_of` is initialized to the
+/// current end of the timeline (disk_consistent_lsn). On reads of relation sizes, it is used to
+/// check whether the read value can also update the cache; see [`Timeline::update_cached_rel_size`].
+pub(crate) struct RelSizeCache {
+    pub(crate) complete_as_of: Lsn,
+    pub(crate) map: HashMap<RelTag, (Lsn, BlockNumber)>,
+}
+
 pub struct Timeline {
     conf: &'static PageServerConf,
     tenant_conf: Arc>,
@@ -324,7 +334,7 @@ pub struct Timeline {
     pub walreceiver: Mutex>,
 
     /// Relation size cache
-    pub rel_size_cache: RwLock<HashMap<RelTag, (Lsn, BlockNumber)>>,
+    pub(crate) rel_size_cache: RwLock<RelSizeCache>,
 
     download_all_remote_layers_task_info: RwLock>,
 
@@ -1951,7 +1961,10 @@ impl Timeline {
             last_image_layer_creation_check_at: AtomicLsn::new(0),
             last_received_wal: Mutex::new(None),
-            rel_size_cache: RwLock::new(HashMap::new()),
+            rel_size_cache: RwLock::new(RelSizeCache {
+                complete_as_of: disk_consistent_lsn,
+                map: HashMap::new(),
+            }),
             download_all_remote_layers_task_info: RwLock::new(None),
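
As an illustration of the cache-consistency rule this patch introduces, here is a minimal,
self-contained sketch. The names RelSizeCache, complete_as_of, and map come from the patch;
the plain u64/u32 aliases standing in for Lsn, RelTag, and BlockNumber, the missing locking,
and the free-standing get/update methods are simplifications for the example, not the
pageserver's real API.

// Sketch only: simplified stand-ins for pageserver types.
use std::collections::HashMap;

type Lsn = u64;
type RelTag = u32; // stand-in for the real composite relation tag
type BlockNumber = u32;

struct RelSizeCache {
    // Cached sizes are trustworthy for reads at or above this LSN; the pageserver
    // initializes it to the end of the timeline (disk_consistent_lsn) at load time.
    complete_as_of: Lsn,
    map: HashMap<RelTag, (Lsn, BlockNumber)>,
}

impl RelSizeCache {
    fn new(complete_as_of: Lsn) -> Self {
        Self { complete_as_of, map: HashMap::new() }
    }

    // Return the cached size if the cached entry is not newer than the read LSN.
    fn get(&self, tag: RelTag, lsn: Lsn) -> Option<BlockNumber> {
        match self.map.get(&tag) {
            Some((cached_lsn, nblocks)) if lsn >= *cached_lsn => Some(*nblocks),
            _ => None,
        }
    }

    // Update the cache from a size observed by a read at `lsn`. Reads below
    // `complete_as_of` are ignored: size changes from before that point were never
    // ingested through this cache, so the value read could be stale. At or above
    // `complete_as_of`, any later change is already in the cache (entries are never
    // evicted) with a newer LSN, and the `lsn >= entry.0` check preserves it.
    fn update(&mut self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
        if lsn < self.complete_as_of {
            return;
        }
        let entry = self.map.entry(tag).or_insert((lsn, nblocks));
        if lsn >= entry.0 {
            *entry = (lsn, nblocks);
        }
    }
}

fn main() {
    // Timeline loaded with disk_consistent_lsn = 100.
    let mut cache = RelSizeCache::new(100);

    // A read at LSN 120 (e.g. from a read-only endpoint ahead of the load point)
    // is allowed to populate the cache.
    cache.update(1, 120, 8);
    assert_eq!(cache.get(1, 130), Some(8));

    // A read at LSN 50 is ignored: the relation may have grown between LSN 50 and 100
    // without this cache ever seeing it, so caching the value could serve a stale size
    // to later reads.
    cache.update(2, 50, 999);
    assert_eq!(cache.get(2, 60), None);
    println!("ok");
}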
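
The new comment in wait_or_get_last_lsn() can likewise be read as a small decision rule,
sketched below under assumed names: effective_read_lsn and its parameters are hypothetical
stand-ins for illustration only, and the patch itself keeps clamping to last_record_lsn
rather than adopting the max(lsn, latest_gc_cutoff_lsn) alternative.

// Sketch only: illustrates the reasoning in the new comment, not the real function.
type Lsn = u64;

// If the requested LSN is not ahead of the last ingested record, the page can be served
// at a newer LSN without changing the answer: nothing was modified between `requested`
// and `last_record_lsn`. The comment only notes that max(requested, latest_gc_cutoff_lsn)
// would be equally correct and might be cheaper, because an older effective LSN can let
// the read skip recent layers.
fn effective_read_lsn(requested: Lsn, last_record_lsn: Lsn, latest_gc_cutoff_lsn: Lsn) -> Lsn {
    if requested <= last_record_lsn {
        let _alternative = requested.max(latest_gc_cutoff_lsn); // same result, possibly faster
        last_record_lsn // what the code does today
    } else {
        // Otherwise the caller must first wait for WAL up to `requested` to arrive.
        requested
    }
}

fn main() {
    assert_eq!(effective_read_lsn(80, 100, 60), 100);
    assert_eq!(effective_read_lsn(120, 100, 60), 120);
    println!("ok");
}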