From f0573f5991d99214e45f55308bf6b873394ae8e4 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Mon, 13 Mar 2023 14:07:35 +0200
Subject: [PATCH] Remove block cursor cache (#3740)

## Describe your changes

Do not pin current block in BlockCursor

## Issue ticket number and link

See #3712

There are places (see get_reconstruct_data) in our code when thread is
holding read layers lock and then try to read file and so lock page
cache slot. So we have edge in dependency graph layers->page cache slot.
At the same time (as Christian noticed) we can lock page cache slot in
BlockCursor and then try obtain shard lock on layers.
So there is backward edge in dependency graph page cache slot>layers
which forms loop and may cause deadlock.

There are three possible fixes of the problem:
1. Perform compaction under `layers` shared lock. See PR #3732. It fixes
the problem but make it not possible to append any data to pageserver
until compaction is completed.
2. Do not hold `layers` lock while accessing layers (not sure if it is
possible to do because it definitely introduce some new race
conditions).
3. Do not pin current pages in BockCursor (this PR).

My experiments shows that this cache in BlockCursor is not so useful:
the number of hits/misses for cursor cache on pgbench workload (-i -s
10/-c 10 -T 100/-c 10 -S -T 100):
```
hits: 163011
misses: 1023602
```
So number of cache misses is 10x times  larger.
And results for read-only pgbench are mostly the same:
```
with cache: 14581
w/out cache: 14429
```

## Checklist before requesting a review
- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.
---
 pageserver/src/tenant/block_io.rs       | 38 ++-----------------------
 pageserver/src/tenant/ephemeral_file.rs |  5 +---
 2 files changed, 4 insertions(+), 39 deletions(-)
diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs
index e3cc800447..10de34e3f6 100644
--- a/pageserver/src/tenant/block_io.rs
+++ b/pageserver/src/tenant/block_io.rs
@@ -51,9 +51,6 @@ where
 ///
 /// A "cursor" for efficiently reading multiple pages from a BlockReader
 ///
-/// A cursor caches the last accessed page, allowing for faster access if the
-/// same block is accessed repeatedly.
-///
 /// You can access the last page with `*cursor`. 'read_blk' returns 'self', so
 /// that in many cases you can use a BlockCursor as a drop-in replacement for
 /// the underlying BlockReader. For example:
@@ -73,8 +70,6 @@ where
     R: BlockReader,
 {
     reader: R,
-    /// last accessed page
-    cache: Option<(u32, R::BlockLease)>,
 }
 
 impl<R> BlockCursor<R>
@@ -82,40 +77,13 @@ where
     R: BlockReader,
 {
     pub fn new(reader: R) -> Self {
-        BlockCursor {
-            reader,
-            cache: None,
-        }
+        BlockCursor { reader }
     }
 
-    pub fn read_blk(&mut self, blknum: u32) -> Result<&Self, std::io::Error> {
-        // Fast return if this is the same block as before
-        if let Some((cached_blk, _buf)) = &self.cache {
-            if *cached_blk == blknum {
-                return Ok(self);
-            }
-        }
-
-        // Read the block from the underlying reader, and cache it
-        self.cache = None;
-        let buf = self.reader.read_blk(blknum)?;
-        self.cache = Some((blknum, buf));
-
-        Ok(self)
+    pub fn read_blk(&mut self, blknum: u32) -> Result<R::BlockLease, std::io::Error> {
+        self.reader.read_blk(blknum)
     }
 }
-
-impl<R> Deref for BlockCursor<R>
-where
-    R: BlockReader,
-{
-    type Target = [u8; PAGE_SZ];
-
-    fn deref(&self) -> &<Self as Deref>::Target {
-        &self.cache.as_ref().unwrap().1
-    }
-}
-
 static NEXT_ID: AtomicU64 = AtomicU64::new(1);
 
 /// An adapter for reading a (virtual) file using the page cache.
diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs
index c433e65ad2..4379438896 100644
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -2,9 +2,7 @@
 //! used to keep in-memory layers spilled on disk.
 
 use crate::config::PageServerConf;
-use crate::page_cache;
-use crate::page_cache::PAGE_SZ;
-use crate::page_cache::{ReadBufResult, WriteBufResult};
+use crate::page_cache::{self, ReadBufResult, WriteBufResult, PAGE_SZ};
 use crate::tenant::blob_io::BlobWriter;
 use crate::tenant::block_io::BlockReader;
 use crate::virtual_file::VirtualFile;
@@ -427,7 +425,6 @@ mod tests {
             let actual = cursor.read_blob(pos)?;
             assert_eq!(actual, expected);
         }
-        drop(cursor);
 
         // Test a large blob that spans multiple pages
         let mut large_data = Vec::new();