From 7e1996028cd3eb7f45ea48577c02e57bc9eb5f01 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 28 Aug 2023 17:42:57 +0200 Subject: [PATCH] fix the cause of the flaky test failure The following interleaving could lead to the subsequent panic. The fix is to retain the entry in the map if we can't lock the slot, and let `find_victim`'s call to self.remove_mapping() remove the given (file_id, block_no). ``` T1: find_victim up to right before the `self.remove_mapping(old_key);` call T2: `drop_buffers_for_immutable` for the EphemeralFile of `old_key.` => this removes the key from the mapping T1: executes `self.remove_mapping(old_key)`, key already gone => panic ``` ``` ---- walingest::tests::test_large_rel stdout ---- 2023-08-28T10:23:07.361671Z INFO skipping attempt to start flush_loop twice 0921208cfb20e4c120166d07667358e5/11223344556677881122334455667788 thread 'walingest::tests::test_large_rel' panicked at 'could not find old key in mapping', pageserver/src/page_cache.rs:674:22 stack backtrace: 0: rust_begin_unwind at /rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/panicking.rs:593:5 1: core::panicking::panic_fmt at /rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/core/src/panicking.rs:67:14 2: core::panicking::panic_display at /rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/core/src/panicking.rs:150:5 3: core::panicking::panic_str at /rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/core/src/panicking.rs:134:5 4: core::option::expect_failed at /rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/core/src/option.rs:1932:5 5: core::option::Option::expect at /rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/core/src/option.rs:898:21 6: pageserver::page_cache::PageCache::remove_mapping at ./src/page_cache.rs:674:22 7: pageserver::page_cache::PageCache::find_victim at ./src/page_cache.rs:762:21 8: pageserver::page_cache::PageCache::lock_for_read at ./src/page_cache.rs:502:17 9: pageserver::page_cache::PageCache::read_immutable_buf at ./src/page_cache.rs:359:9 10: pageserver::tenant::ephemeral_file::EphemeralFile::write_blob::{{closure}}::Writer::push_bytes::{{closure}} at ./src/tenant/ephemeral_file.rs:136:39 11: pageserver::tenant::ephemeral_file::EphemeralFile::write_blob::{{closure}} at ./src/tenant/ephemeral_file.rs:197:35 12: pageserver::tenant::storage_layer::inmemory_layer::InMemoryLayer::put_value::{{closure}} at ./src/tenant/storage_layer/inmemory_layer.rs:278:41 13: pageserver::tenant::timeline::Timeline::put_value::{{closure}} at ./src/tenant/timeline.rs:2486:40 14: pageserver::tenant::timeline::TimelineWriter::put::{{closure}} at ./src/tenant/timeline.rs:4618:44 15: pageserver::pgdatadir_mapping::DatadirModification::commit::{{closure}} at ./src/pgdatadir_mapping.rs:1184:42 16: pageserver::walingest::tests::test_large_rel::{{closure}} at ./src/walingest.rs:1661:24 ``` --- pageserver/src/page_cache.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index 4aa8ae9139..af7f282a1d 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -446,6 +446,7 @@ impl PageCache { self.size_metrics.current_bytes_immutable.sub_page_sz(1); inner.key = None; } + return false; } TryLockResult::Err(e @ TryLockError::Poisoned(_)) => { panic!("slot lock {slot_idx} poisoned: {e:?}") @@ -461,10 +462,13 @@ impl PageCache { // trying to claim this page. // // Avoid waiting and keep the page in the cache. - // Sooner or later it'll become a victim and get evicted. + // The `find_victim` will free up the slot. + // Also, it will call `remove_mapping()` to remove the (file_id, block_no) + // from the immutable_page_map; so, keep the entry in the map here, otherwise + // the `remove_mapping()` call will panic. + return true; } } - return false; }); }