drop layer map lock before removing droppen popped frozen layer

This is a shot in the dark to fix the pageserver deadlock described in https://github.com/neondatabase/neon/issues/3712 The ASCII art in the comment added in this commit describes a potential deadlock that involves page cache and EphemeralFile. So, I guess it's a useful change by itself. But, I can't find a scenario with the current code base where we would actually arrive in the described deadlock. Let's see whether the deadlock from #3712 still reproduces with this fixx. I can't repro it locally. refs https://github.com/neondatabase/neon/issues/3712
2026-05-30 19:40:39 +00:00 · 2023-03-01 11:16:59 +01:00
5 changed files with 88 additions and 75 deletions
--- a/libs/tenant_size_model/src/lib.rs
+++ b/libs/tenant_size_model/src/lib.rs
@@ -37,15 +37,10 @@ pub struct Segment {
    /// LSN at this point
    pub lsn: u64,

-    pub parent_lsn: Option<u64>,
-
-    pub wal_from_parent: Option<u64>,
-
    /// Logical size at this node, if known.
    pub size: Option<u64>,

    /// If true, the segment from parent to this node is needed by `retention_period`
-    /// if wal_from_parent is needed by `retention_period`?
    pub needed: bool,
 }

--- a/libs/tenant_size_model/tests/tests.rs
+++ b/libs/tenant_size_model/tests/tests.rs
@@ -17,8 +17,6 @@ impl ScenarioBuilder {
        let init_segment = Segment {
            parent: None,
            lsn: 0,
-            parent_lsn: None,
-            wal_from_parent: None,
            size: Some(0),
            needed: false, // determined later
        };
@@ -38,8 +36,6 @@ impl ScenarioBuilder {
        let newseg = Segment {
            parent: Some(lastseg_id),
            lsn: lastseg.lsn + lsn_bytes,
-            parent_lsn: Some(lastseg.lsn),
-            wal_from_parent: Some(lsn_bytes),
            size: Some((lastseg.size.unwrap() as i64 + size_bytes) as u64),
            needed: false,
        };
--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -85,16 +85,23 @@ pub struct TimelineInputs {
    #[serde_as(as = "Option<serde_with::DisplayFromStr>")]
    pub ancestor_id: Option<TimelineId>,

+    #[serde_as(as = "serde_with::DisplayFromStr")]
    ancestor_lsn: Lsn,
+    #[serde_as(as = "serde_with::DisplayFromStr")]
    last_record: Lsn,
+    #[serde_as(as = "serde_with::DisplayFromStr")]
    latest_gc_cutoff: Lsn,
+    #[serde_as(as = "serde_with::DisplayFromStr")]
    horizon_cutoff: Lsn,
+    #[serde_as(as = "serde_with::DisplayFromStr")]
    pitr_cutoff: Lsn,

    /// Cutoff point based on GC settings
+    #[serde_as(as = "serde_with::DisplayFromStr")]
    next_gc_cutoff: Lsn,

    /// Cutoff point calculated from the user-supplied 'max_retention_period'
+    #[serde_as(as = "Option<serde_with::DisplayFromStr>")]
    retention_param_cutoff: Option<Lsn>,
 }

@@ -245,8 +252,6 @@ pub(super) async fn gather_inputs(
            segment: Segment {
                parent: None, // filled in later
                lsn: branch_start_lsn.0,
-                parent_lsn: None,
-                wal_from_parent: None,
                size: None, // filled in later
                needed: branch_start_needed,
            },
@@ -265,8 +270,6 @@ pub(super) async fn gather_inputs(
                segment: Segment {
                    parent: Some(parent),
                    lsn: lsn.0,
-                    parent_lsn: Some(segments[parent].segment.lsn),
-                    wal_from_parent: Some(lsn.0 - segments[parent].segment.lsn),
                    size: None,
                    needed: lsn > next_gc_cutoff,
                },
@@ -281,8 +284,6 @@ pub(super) async fn gather_inputs(
            segment: Segment {
                parent: Some(parent),
                lsn: last_record_lsn.0,
-                parent_lsn: Some(segments[parent].segment.lsn),
-                wal_from_parent: Some(last_record_lsn.0 - segments[parent].segment.lsn),
                size: None, // Filled in later, if necessary
                needed: true,
            },
@@ -612,32 +613,32 @@ fn verify_size_for_multiple_branches() {
  "timeline_inputs": [
    {
      "timeline_id": "20b129c9b50cff7213e6503a31b2a5ce",
-      "ancestor_lsn": 26033560,
-      "last_record": 35851472,
-      "latest_gc_cutoff": 23694408,
-      "horizon_cutoff": 35720400,
-      "pitr_cutoff": 35720400,
-      "next_gc_cutoff": 35720400,
+      "ancestor_lsn": "0/18D3D98",
+      "last_record": "0/2230CD0",
+      "latest_gc_cutoff": "0/1698C48",
+      "horizon_cutoff": "0/2210CD0",
+      "pitr_cutoff": "0/2210CD0",
+      "next_gc_cutoff": "0/2210CD0",
      "retention_param_cutoff": null
    },
    {
      "timeline_id": "454626700469f0a9914949b9d018e876",
-      "ancestor_lsn": 24566168,
-      "last_record": 25393008,
-      "latest_gc_cutoff": 23694408,
-      "horizon_cutoff": 25261936,
-      "pitr_cutoff": 25261936,
-      "next_gc_cutoff": 25261936,
+      "ancestor_lsn": "0/176D998",
+      "last_record": "0/1837770",
+      "latest_gc_cutoff": "0/1698C48",
+      "horizon_cutoff": "0/1817770",
+      "pitr_cutoff": "0/1817770",
+      "next_gc_cutoff": "0/1817770",
      "retention_param_cutoff": null
    },
    {
      "timeline_id": "cb5e3cbe60a4afc00d01880e1a37047f",
-      "ancestor_lsn": 0,
-      "last_record": 26033560,
-      "latest_gc_cutoff": 23694408,
-      "horizon_cutoff":25902488,
-      "pitr_cutoff":25902488,
-      "next_gc_cutoff":25902488,
+      "ancestor_lsn": "0/0",
+      "last_record": "0/18D3D98",
+      "latest_gc_cutoff": "0/1698C48",
+      "horizon_cutoff": "0/18B3D98",
+      "pitr_cutoff": "0/18B3D98",
+      "next_gc_cutoff": "0/18B3D98",
      "retention_param_cutoff": null
    }
  ]
@@ -687,13 +688,13 @@ fn verify_size_for_one_branch() {
  "timeline_inputs": [
    {
      "timeline_id": "f15ae0cf21cce2ba27e4d80c6709a6cd",
-      "ancestor_lsn": 0,
-      "last_record": 305547335776,
-      "latest_gc_cutoff": 305614444640,
-      "horizon_cutoff": 305614444640,
-      "pitr_cutoff": 305614444640,
-      "next_gc_cutoff": 305614444640,
-      "retention_param_cutoff": 0
+      "ancestor_lsn": "0/0",
+      "last_record": "47/280A5860",
+      "latest_gc_cutoff": "47/240A5860",
+      "horizon_cutoff": "47/240A5860",
+      "pitr_cutoff": "47/240A5860",
+      "next_gc_cutoff": "47/240A5860",
+      "retention_param_cutoff": "0/0"
    }
  ]
 }"#;
@@ -706,15 +707,12 @@ fn verify_size_for_one_branch() {
    println!("result: {:?}", serde_json::to_string(&res.segments));

    use utils::lsn::Lsn;
-    let latest_gc_cutoff_lsn: Lsn = Lsn(305614444640);
-    let last_lsn: Lsn = Lsn(305547335776);
+    let latest_gc_cutoff_lsn: Lsn = "47/240A5860".parse().unwrap();
+    let last_lsn: Lsn = "47/280A5860".parse().unwrap();
    println!(
-        "latest_gc_cutoff lsn {} is {}, last_lsn lsn {} is {}",
-        latest_gc_cutoff_lsn,
+        "latest_gc_cutoff lsn 47/240A5860 is {}, last_lsn lsn 47/280A5860 is {}",
        u64::from(latest_gc_cutoff_lsn),
-        last_lsn,
        u64::from(last_lsn)
    );
-
    assert_eq!(res.total_size, 220121784320);
 }
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -2479,7 +2479,7 @@ impl Timeline {

        // The new on-disk layers are now in the layer map. We can remove the
        // in-memory layer from the map now.
-        {
+       {
            let mut layers = self.layers.write().unwrap();
            let l = layers.frozen_layers.pop_front();

@@ -2489,6 +2489,43 @@ impl Timeline {
            assert!(LayerMap::compare_arced_layers(&l.unwrap(), &frozen_layer));

            // release lock on 'layers'
+            drop(layers);
+
+            // Only drop the ephemeral file after releasing the layer map lock.
+            // The reason is that the drop function needs to lock page cache slots.
+            // If there is another thread that is already holding a slot which we
+            // need to lock, and that thread is waiting for the layer map lock, we
+            // would have a deadlock:
+            //
+            //           wants               ┌─────────┐
+            //           us ────────────────►│  cache  │
+            //            ▲                  │  slot   │
+            //            │                  │    │    │
+            //   assigned │                  └────┼────┘
+            //            │                       │
+            //            │                       │assigned
+            //       ┌────┼───┐                   │
+            //       │    │   │                   ▼
+            //       │ layers │◄─────────────── them
+            //       │        │     wants
+            //       └────────┘
+            //
+            // How can this happen? I don't know. Basically we need to walk up
+            // the call graph for all PageCache::try_lock_for_read and PageCache::try_lock_for_write
+            // and check whether any of them holds onto the guard object that these methods return.
+            // The block_io::BlockReader trait implementations look like good candidates.
+            // For example, the BlockReader::read_blk impl for EphemeralFile returns the guard object
+            // for the cache slot straight to the caller.
+            // But, there's are many callers of BlockReader::read_blk, and rust-analyzer has no way
+            // to find just the <EphemeralFile as BlockReader>::read_blk callers.
+            //
+            // One obvious place is InMemoryLayer::get_value_reconstruct_data , which uses a BlockReader::block_cursor.
+            // That cursor object holds onto the most recently read page cache slot until cursor object is dropped.
+            // But, all relevant uses of InMemoryLayer::get_value_reconstruct_data are in Timeline::get_reconstruct_data,
+            // which already holds the layer map lock in shared mode. So, it can't cause this deadlock.
+            //
+            // Maybe this is a red herring?
+            drop(l);
        }

        fail_point!("checkpoint-after-sync");
--- a/test_runner/regress/test_tenant_size.py
+++ b/test_runner/regress/test_tenant_size.py
@@ -52,26 +52,12 @@ def test_empty_tenant_size(neon_simple_env: NeonEnv, test_output_dir: Path):
    expected_inputs = {
        "segments": [
            {
-                "segment": {
-                    "parent": None,
-                    "lsn": 23694408,
-                    "parent_lsn": None,
-                    "wal_from_parent": None,
-                    "size": 25362432,
-                    "needed": True,
-                },
+                "segment": {"parent": None, "lsn": 23694408, "size": 25362432, "needed": True},
                "timeline_id": f"{main_timeline_id}",
                "kind": "BranchStart",
            },
            {
-                "segment": {
-                    "parent": 0,
-                    "lsn": 23694528,
-                    "parent_lsn": 23694408,
-                    "wal_from_parent": 120,
-                    "size": None,
-                    "needed": True,
-                },
+                "segment": {"parent": 0, "lsn": 23694528, "size": None, "needed": True},
                "timeline_id": f"{main_timeline_id}",
                "kind": "BranchEnd",
            },
@@ -80,17 +66,16 @@ def test_empty_tenant_size(neon_simple_env: NeonEnv, test_output_dir: Path):
            {
                "timeline_id": f"{main_timeline_id}",
                "ancestor_id": None,
-                "ancestor_lsn": 0,
-                "last_record": 23694528,
-                "latest_gc_cutoff": 23694408,
-                "horizon_cutoff": 0,
-                "pitr_cutoff": 0,
-                "next_gc_cutoff": 0,
+                "ancestor_lsn": "0/0",
+                "last_record": "0/1698CC0",
+                "latest_gc_cutoff": "0/1698C48",
+                "horizon_cutoff": "0/0",
+                "pitr_cutoff": "0/0",
+                "next_gc_cutoff": "0/0",
                "retention_param_cutoff": None,
            }
        ],
    }
-
    expected_inputs = mask_model_inputs(expected_inputs)
    actual_inputs = mask_model_inputs(inputs)

@@ -656,11 +641,7 @@ def mask_model_inputs(x):
    if isinstance(x, dict):
        newx = {}
        for k, v in x.items():
-            if (
-                k in ["size", "wal_from_parent", "last_record"]
-                or k.endswith("lsn")
-                or k.endswith("cutoff")
-            ):
+            if k == "size":
                if v is None or v == 0:
                    # no change
                    newx[k] = v
@@ -668,6 +649,12 @@ def mask_model_inputs(x):
                    newx[k] = "<0"
                else:
                    newx[k] = ">0"
+            elif k.endswith("lsn") or k.endswith("cutoff") or k == "last_record":
+                if v is None or v == 0 or v == "0/0":
+                    # no change
+                    newx[k] = v
+                else:
+                    newx[k] = "masked"
            else:
                newx[k] = mask_model_inputs(v)
        return newx