Compare commits

..

1 Commits

Author SHA1 Message Date
Christian Schwarz
434af3771d drop layer map lock before removing droppen popped frozen layer
This is a shot in the dark to fix the pageserver deadlock
described in https://github.com/neondatabase/neon/issues/3712

The ASCII art in the comment added in this commit describes a potential
deadlock that involves page cache and EphemeralFile.
So, I guess it's a useful change by itself.

But, I can't find a scenario with the current code base where
we would actually arrive in the described deadlock.

Let's see whether the deadlock from #3712 still reproduces with this
fixx. I can't repro it locally.

refs https://github.com/neondatabase/neon/issues/3712
2023-03-01 11:16:59 +01:00
5 changed files with 88 additions and 75 deletions

View File

@@ -37,15 +37,10 @@ pub struct Segment {
/// LSN at this point
pub lsn: u64,
pub parent_lsn: Option<u64>,
pub wal_from_parent: Option<u64>,
/// Logical size at this node, if known.
pub size: Option<u64>,
/// If true, the segment from parent to this node is needed by `retention_period`
/// if wal_from_parent is needed by `retention_period`?
pub needed: bool,
}

View File

@@ -17,8 +17,6 @@ impl ScenarioBuilder {
let init_segment = Segment {
parent: None,
lsn: 0,
parent_lsn: None,
wal_from_parent: None,
size: Some(0),
needed: false, // determined later
};
@@ -38,8 +36,6 @@ impl ScenarioBuilder {
let newseg = Segment {
parent: Some(lastseg_id),
lsn: lastseg.lsn + lsn_bytes,
parent_lsn: Some(lastseg.lsn),
wal_from_parent: Some(lsn_bytes),
size: Some((lastseg.size.unwrap() as i64 + size_bytes) as u64),
needed: false,
};

View File

@@ -85,16 +85,23 @@ pub struct TimelineInputs {
#[serde_as(as = "Option<serde_with::DisplayFromStr>")]
pub ancestor_id: Option<TimelineId>,
#[serde_as(as = "serde_with::DisplayFromStr")]
ancestor_lsn: Lsn,
#[serde_as(as = "serde_with::DisplayFromStr")]
last_record: Lsn,
#[serde_as(as = "serde_with::DisplayFromStr")]
latest_gc_cutoff: Lsn,
#[serde_as(as = "serde_with::DisplayFromStr")]
horizon_cutoff: Lsn,
#[serde_as(as = "serde_with::DisplayFromStr")]
pitr_cutoff: Lsn,
/// Cutoff point based on GC settings
#[serde_as(as = "serde_with::DisplayFromStr")]
next_gc_cutoff: Lsn,
/// Cutoff point calculated from the user-supplied 'max_retention_period'
#[serde_as(as = "Option<serde_with::DisplayFromStr>")]
retention_param_cutoff: Option<Lsn>,
}
@@ -245,8 +252,6 @@ pub(super) async fn gather_inputs(
segment: Segment {
parent: None, // filled in later
lsn: branch_start_lsn.0,
parent_lsn: None,
wal_from_parent: None,
size: None, // filled in later
needed: branch_start_needed,
},
@@ -265,8 +270,6 @@ pub(super) async fn gather_inputs(
segment: Segment {
parent: Some(parent),
lsn: lsn.0,
parent_lsn: Some(segments[parent].segment.lsn),
wal_from_parent: Some(lsn.0 - segments[parent].segment.lsn),
size: None,
needed: lsn > next_gc_cutoff,
},
@@ -281,8 +284,6 @@ pub(super) async fn gather_inputs(
segment: Segment {
parent: Some(parent),
lsn: last_record_lsn.0,
parent_lsn: Some(segments[parent].segment.lsn),
wal_from_parent: Some(last_record_lsn.0 - segments[parent].segment.lsn),
size: None, // Filled in later, if necessary
needed: true,
},
@@ -612,32 +613,32 @@ fn verify_size_for_multiple_branches() {
"timeline_inputs": [
{
"timeline_id": "20b129c9b50cff7213e6503a31b2a5ce",
"ancestor_lsn": 26033560,
"last_record": 35851472,
"latest_gc_cutoff": 23694408,
"horizon_cutoff": 35720400,
"pitr_cutoff": 35720400,
"next_gc_cutoff": 35720400,
"ancestor_lsn": "0/18D3D98",
"last_record": "0/2230CD0",
"latest_gc_cutoff": "0/1698C48",
"horizon_cutoff": "0/2210CD0",
"pitr_cutoff": "0/2210CD0",
"next_gc_cutoff": "0/2210CD0",
"retention_param_cutoff": null
},
{
"timeline_id": "454626700469f0a9914949b9d018e876",
"ancestor_lsn": 24566168,
"last_record": 25393008,
"latest_gc_cutoff": 23694408,
"horizon_cutoff": 25261936,
"pitr_cutoff": 25261936,
"next_gc_cutoff": 25261936,
"ancestor_lsn": "0/176D998",
"last_record": "0/1837770",
"latest_gc_cutoff": "0/1698C48",
"horizon_cutoff": "0/1817770",
"pitr_cutoff": "0/1817770",
"next_gc_cutoff": "0/1817770",
"retention_param_cutoff": null
},
{
"timeline_id": "cb5e3cbe60a4afc00d01880e1a37047f",
"ancestor_lsn": 0,
"last_record": 26033560,
"latest_gc_cutoff": 23694408,
"horizon_cutoff":25902488,
"pitr_cutoff":25902488,
"next_gc_cutoff":25902488,
"ancestor_lsn": "0/0",
"last_record": "0/18D3D98",
"latest_gc_cutoff": "0/1698C48",
"horizon_cutoff": "0/18B3D98",
"pitr_cutoff": "0/18B3D98",
"next_gc_cutoff": "0/18B3D98",
"retention_param_cutoff": null
}
]
@@ -687,13 +688,13 @@ fn verify_size_for_one_branch() {
"timeline_inputs": [
{
"timeline_id": "f15ae0cf21cce2ba27e4d80c6709a6cd",
"ancestor_lsn": 0,
"last_record": 305547335776,
"latest_gc_cutoff": 305614444640,
"horizon_cutoff": 305614444640,
"pitr_cutoff": 305614444640,
"next_gc_cutoff": 305614444640,
"retention_param_cutoff": 0
"ancestor_lsn": "0/0",
"last_record": "47/280A5860",
"latest_gc_cutoff": "47/240A5860",
"horizon_cutoff": "47/240A5860",
"pitr_cutoff": "47/240A5860",
"next_gc_cutoff": "47/240A5860",
"retention_param_cutoff": "0/0"
}
]
}"#;
@@ -706,15 +707,12 @@ fn verify_size_for_one_branch() {
println!("result: {:?}", serde_json::to_string(&res.segments));
use utils::lsn::Lsn;
let latest_gc_cutoff_lsn: Lsn = Lsn(305614444640);
let last_lsn: Lsn = Lsn(305547335776);
let latest_gc_cutoff_lsn: Lsn = "47/240A5860".parse().unwrap();
let last_lsn: Lsn = "47/280A5860".parse().unwrap();
println!(
"latest_gc_cutoff lsn {} is {}, last_lsn lsn {} is {}",
latest_gc_cutoff_lsn,
"latest_gc_cutoff lsn 47/240A5860 is {}, last_lsn lsn 47/280A5860 is {}",
u64::from(latest_gc_cutoff_lsn),
last_lsn,
u64::from(last_lsn)
);
assert_eq!(res.total_size, 220121784320);
}

View File

@@ -2479,7 +2479,7 @@ impl Timeline {
// The new on-disk layers are now in the layer map. We can remove the
// in-memory layer from the map now.
{
{
let mut layers = self.layers.write().unwrap();
let l = layers.frozen_layers.pop_front();
@@ -2489,6 +2489,43 @@ impl Timeline {
assert!(LayerMap::compare_arced_layers(&l.unwrap(), &frozen_layer));
// release lock on 'layers'
drop(layers);
// Only drop the ephemeral file after releasing the layer map lock.
// The reason is that the drop function needs to lock page cache slots.
// If there is another thread that is already holding a slot which we
// need to lock, and that thread is waiting for the layer map lock, we
// would have a deadlock:
//
// wants ┌─────────┐
// us ────────────────►│ cache │
// ▲ │ slot │
// │ │ │ │
// assigned │ └────┼────┘
// │ │
// │ │assigned
// ┌────┼───┐ │
// │ │ │ ▼
// │ layers │◄─────────────── them
// │ │ wants
// └────────┘
//
// How can this happen? I don't know. Basically we need to walk up
// the call graph for all PageCache::try_lock_for_read and PageCache::try_lock_for_write
// and check whether any of them holds onto the guard object that these methods return.
// The block_io::BlockReader trait implementations look like good candidates.
// For example, the BlockReader::read_blk impl for EphemeralFile returns the guard object
// for the cache slot straight to the caller.
// But, there's are many callers of BlockReader::read_blk, and rust-analyzer has no way
// to find just the <EphemeralFile as BlockReader>::read_blk callers.
//
// One obvious place is InMemoryLayer::get_value_reconstruct_data , which uses a BlockReader::block_cursor.
// That cursor object holds onto the most recently read page cache slot until cursor object is dropped.
// But, all relevant uses of InMemoryLayer::get_value_reconstruct_data are in Timeline::get_reconstruct_data,
// which already holds the layer map lock in shared mode. So, it can't cause this deadlock.
//
// Maybe this is a red herring?
drop(l);
}
fail_point!("checkpoint-after-sync");

View File

@@ -52,26 +52,12 @@ def test_empty_tenant_size(neon_simple_env: NeonEnv, test_output_dir: Path):
expected_inputs = {
"segments": [
{
"segment": {
"parent": None,
"lsn": 23694408,
"parent_lsn": None,
"wal_from_parent": None,
"size": 25362432,
"needed": True,
},
"segment": {"parent": None, "lsn": 23694408, "size": 25362432, "needed": True},
"timeline_id": f"{main_timeline_id}",
"kind": "BranchStart",
},
{
"segment": {
"parent": 0,
"lsn": 23694528,
"parent_lsn": 23694408,
"wal_from_parent": 120,
"size": None,
"needed": True,
},
"segment": {"parent": 0, "lsn": 23694528, "size": None, "needed": True},
"timeline_id": f"{main_timeline_id}",
"kind": "BranchEnd",
},
@@ -80,17 +66,16 @@ def test_empty_tenant_size(neon_simple_env: NeonEnv, test_output_dir: Path):
{
"timeline_id": f"{main_timeline_id}",
"ancestor_id": None,
"ancestor_lsn": 0,
"last_record": 23694528,
"latest_gc_cutoff": 23694408,
"horizon_cutoff": 0,
"pitr_cutoff": 0,
"next_gc_cutoff": 0,
"ancestor_lsn": "0/0",
"last_record": "0/1698CC0",
"latest_gc_cutoff": "0/1698C48",
"horizon_cutoff": "0/0",
"pitr_cutoff": "0/0",
"next_gc_cutoff": "0/0",
"retention_param_cutoff": None,
}
],
}
expected_inputs = mask_model_inputs(expected_inputs)
actual_inputs = mask_model_inputs(inputs)
@@ -656,11 +641,7 @@ def mask_model_inputs(x):
if isinstance(x, dict):
newx = {}
for k, v in x.items():
if (
k in ["size", "wal_from_parent", "last_record"]
or k.endswith("lsn")
or k.endswith("cutoff")
):
if k == "size":
if v is None or v == 0:
# no change
newx[k] = v
@@ -668,6 +649,12 @@ def mask_model_inputs(x):
newx[k] = "<0"
else:
newx[k] = ">0"
elif k.endswith("lsn") or k.endswith("cutoff") or k == "last_record":
if v is None or v == 0 or v == "0/0":
# no change
newx[k] = v
else:
newx[k] = "masked"
else:
newx[k] = mask_model_inputs(v)
return newx