mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-14 17:02:56 +00:00
fix the cause of the flaky test failure
The following interleaving could lead to the subsequent panic.
The fix is to retain the entry in the map if we can't lock the slot, and
let `find_victim`'s call to self.remove_mapping() remove the given
(file_id, block_no).
```
T1: find_victim up to right before the `self.remove_mapping(old_key);` call
T2: `drop_buffers_for_immutable` for the EphemeralFile of `old_key.`
=> this removes the key from the mapping
T1: executes `self.remove_mapping(old_key)`, key already gone => panic
```
```
---- walingest::tests::test_large_rel stdout ----
2023-08-28T10:23:07.361671Z INFO skipping attempt to start flush_loop twice 0921208cfb20e4c120166d07667358e5/11223344556677881122334455667788
thread 'walingest::tests::test_large_rel' panicked at 'could not find old key in mapping', pageserver/src/page_cache.rs:674:22
stack backtrace:
0: rust_begin_unwind
at /rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/panicking.rs:593:5
1: core::panicking::panic_fmt
at /rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/core/src/panicking.rs:67:14
2: core::panicking::panic_display
at /rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/core/src/panicking.rs:150:5
3: core::panicking::panic_str
at /rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/core/src/panicking.rs:134:5
4: core::option::expect_failed
at /rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/core/src/option.rs:1932:5
5: core::option::Option<T>::expect
at /rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/core/src/option.rs:898:21
6: pageserver::page_cache::PageCache::remove_mapping
at ./src/page_cache.rs:674:22
7: pageserver::page_cache::PageCache::find_victim
at ./src/page_cache.rs:762:21
8: pageserver::page_cache::PageCache::lock_for_read
at ./src/page_cache.rs:502:17
9: pageserver::page_cache::PageCache::read_immutable_buf
at ./src/page_cache.rs:359:9
10: pageserver::tenant::ephemeral_file::EphemeralFile::write_blob::{{closure}}::Writer::push_bytes::{{closure}}
at ./src/tenant/ephemeral_file.rs:136:39
11: pageserver::tenant::ephemeral_file::EphemeralFile::write_blob::{{closure}}
at ./src/tenant/ephemeral_file.rs:197:35
12: pageserver::tenant::storage_layer::inmemory_layer::InMemoryLayer::put_value::{{closure}}
at ./src/tenant/storage_layer/inmemory_layer.rs:278:41
13: pageserver::tenant::timeline::Timeline::put_value::{{closure}}
at ./src/tenant/timeline.rs:2486:40
14: pageserver::tenant::timeline::TimelineWriter::put::{{closure}}
at ./src/tenant/timeline.rs:4618:44
15: pageserver::pgdatadir_mapping::DatadirModification::commit::{{closure}}
at ./src/pgdatadir_mapping.rs:1184:42
16: pageserver::walingest::tests::test_large_rel::{{closure}}
at ./src/walingest.rs:1661:24
```
This commit is contained in:
@@ -446,6 +446,7 @@ impl PageCache {
|
||||
self.size_metrics.current_bytes_immutable.sub_page_sz(1);
|
||||
inner.key = None;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
TryLockResult::Err(e @ TryLockError::Poisoned(_)) => {
|
||||
panic!("slot lock {slot_idx} poisoned: {e:?}")
|
||||
@@ -461,10 +462,13 @@ impl PageCache {
|
||||
// trying to claim this page.
|
||||
//
|
||||
// Avoid waiting and keep the page in the cache.
|
||||
// Sooner or later it'll become a victim and get evicted.
|
||||
// The `find_victim` will free up the slot.
|
||||
// Also, it will call `remove_mapping()` to remove the (file_id, block_no)
|
||||
// from the immutable_page_map; so, keep the entry in the map here, otherwise
|
||||
// the `remove_mapping()` call will panic.
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user