fix the cause of the flaky test failure

The following interleaving could lead to the subsequent panic.
The fix is to retain the entry in the map if we can't lock the slot, and
let `find_victim`'s call to self.remove_mapping() remove the given
(file_id, block_no).

```
T1: find_victim up to right before the `self.remove_mapping(old_key);`  call
T2: `drop_buffers_for_immutable` for the EphemeralFile of `old_key.`
    => this removes the key from the mapping
T1: executes `self.remove_mapping(old_key)`, key already gone => panic
```

    ```

    ---- walingest::tests::test_large_rel stdout ----
    2023-08-28T10:23:07.361671Z  INFO skipping attempt to start flush_loop twice 0921208cfb20e4c120166d07667358e5/11223344556677881122334455667788
    thread 'walingest::tests::test_large_rel' panicked at 'could not find old key in mapping', pageserver/src/page_cache.rs:674:22
    stack backtrace:
       0: rust_begin_unwind
                 at /rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/panicking.rs:593:5
       1: core::panicking::panic_fmt
                 at /rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/core/src/panicking.rs:67:14
       2: core::panicking::panic_display
                 at /rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/core/src/panicking.rs:150:5
       3: core::panicking::panic_str
                 at /rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/core/src/panicking.rs:134:5
       4: core::option::expect_failed
                 at /rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/core/src/option.rs:1932:5
       5: core::option::Option<T>::expect
                 at /rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/core/src/option.rs:898:21
       6: pageserver::page_cache::PageCache::remove_mapping
                 at ./src/page_cache.rs:674:22
       7: pageserver::page_cache::PageCache::find_victim
                 at ./src/page_cache.rs:762:21
       8: pageserver::page_cache::PageCache::lock_for_read
                 at ./src/page_cache.rs:502:17
       9: pageserver::page_cache::PageCache::read_immutable_buf
                 at ./src/page_cache.rs:359:9
      10: pageserver::tenant::ephemeral_file::EphemeralFile::write_blob::{{closure}}::Writer::push_bytes::{{closure}}
                 at ./src/tenant/ephemeral_file.rs:136:39
      11: pageserver::tenant::ephemeral_file::EphemeralFile::write_blob::{{closure}}
                 at ./src/tenant/ephemeral_file.rs:197:35
      12: pageserver::tenant::storage_layer::inmemory_layer::InMemoryLayer::put_value::{{closure}}
                 at ./src/tenant/storage_layer/inmemory_layer.rs:278:41
      13: pageserver::tenant::timeline::Timeline::put_value::{{closure}}
                 at ./src/tenant/timeline.rs:2486:40
      14: pageserver::tenant::timeline::TimelineWriter::put::{{closure}}
                 at ./src/tenant/timeline.rs:4618:44
      15: pageserver::pgdatadir_mapping::DatadirModification::commit::{{closure}}
                 at ./src/pgdatadir_mapping.rs:1184:42
      16: pageserver::walingest::tests::test_large_rel::{{closure}}
                 at ./src/walingest.rs:1661:24
    ```
This commit is contained in:
Christian Schwarz
2023-08-28 17:42:57 +02:00
parent c60d6c6e64
commit 7e1996028c

View File

@@ -446,6 +446,7 @@ impl PageCache {
self.size_metrics.current_bytes_immutable.sub_page_sz(1);
inner.key = None;
}
return false;
}
TryLockResult::Err(e @ TryLockError::Poisoned(_)) => {
panic!("slot lock {slot_idx} poisoned: {e:?}")
@@ -461,10 +462,13 @@ impl PageCache {
// trying to claim this page.
//
// Avoid waiting and keep the page in the cache.
// Sooner or later it'll become a victim and get evicted.
// The `find_victim` will free up the slot.
// Also, it will call `remove_mapping()` to remove the (file_id, block_no)
// from the immutable_page_map; so, keep the entry in the map here, otherwise
// the `remove_mapping()` call will panic.
return true;
}
}
return false;
});
}