From babd2339cc5057e23bdc72f85a91fe17ceec2353 Mon Sep 17 00:00:00 2001 From: anastasia Date: Thu, 22 Apr 2021 15:51:44 +0300 Subject: [PATCH] [issue #56] Fix race at postgres instance + walreceiver start. Uses postgres/vendor issue_56 branch. TODO: rebase on main --- integration_tests/tests/test_pageserver.rs | 4 --- pageserver/src/page_cache.rs | 32 ++++++++++++---------- pageserver/src/restore_local_repo.rs | 2 +- pageserver/src/walreceiver.rs | 2 +- vendor/postgres | 2 +- 5 files changed, 20 insertions(+), 22 deletions(-) diff --git a/integration_tests/tests/test_pageserver.rs b/integration_tests/tests/test_pageserver.rs index 57ad0c186f..0d0d589c56 100644 --- a/integration_tests/tests/test_pageserver.rs +++ b/integration_tests/tests/test_pageserver.rs @@ -3,7 +3,6 @@ use control_plane::compute::ComputeControlPlane; use control_plane::local_env; use control_plane::local_env::PointInTime; use control_plane::storage::TestStorageControlPlane; -use std::{thread, time}; // XXX: force all redo at the end // -- restart + seqscan won't read deleted stuff @@ -97,9 +96,6 @@ fn test_pageserver_two_timelines() { node1.start().unwrap(); node2.start().unwrap(); - //give walreceiver time to connect - thread::sleep(time::Duration::from_secs(3)); - // check node1 node1.safe_psql( "postgres", diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index 7064467067..3849785a42 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -106,7 +106,6 @@ struct PageCacheShared { first_valid_lsn: u64, last_valid_lsn: u64, last_record_lsn: u64, - walreceiver_works: bool, } lazy_static! { @@ -170,7 +169,6 @@ fn init_page_cache() -> PageCache { first_valid_lsn: 0, last_valid_lsn: 0, last_record_lsn: 0, - walreceiver_works: false, }), valid_lsn_condvar: Condvar::new(), @@ -276,9 +274,17 @@ impl PageCache { // // Returns an 8k page image // - pub fn get_page_at_lsn(&self, tag: BufferTag, lsn: u64) -> anyhow::Result { + pub fn get_page_at_lsn(&self, tag: BufferTag, req_lsn: u64) -> anyhow::Result { self.num_getpage_requests.fetch_add(1, Ordering::Relaxed); + let mut lsn = req_lsn; + //When invalid LSN is requested, it means "don't wait, return latest version of the page" + //This is necessary for bootstrap. + //TODO should we use last_valid_lsn here instead of maxvalue? + if lsn == 0 + { + lsn = 0xffff_ffff_ffff_eeee; + } // Look up cache entry. If it's a page image, return that. If it's a WAL record, // ask the WAL redo service to reconstruct the page image from the WAL records. let minkey = CacheKey { tag, lsn: 0 }; @@ -292,16 +298,15 @@ impl PageCache { // There is a a race at postgres instance start // when we request a page before walsender established connection // and was able to stream the page. Just don't wait and return what we have. - // TODO is there any corner case when this is incorrect? - if !shared.walreceiver_works { + if req_lsn == 0 + { trace!( - " walreceiver doesn't work yet last_valid_lsn {}, requested {}", - shared.last_valid_lsn, - lsn - ); + "walsender hasn't started yet. Don't wait. last_valid_lsn {}, requested {}", + shared.last_valid_lsn, lsn); } - if shared.walreceiver_works { + if req_lsn != 0 + { while lsn > shared.last_valid_lsn { // TODO: Wait for the WAL receiver to catch up waited = true; @@ -325,6 +330,7 @@ impl PageCache { } } } + if waited { trace!("caught up now, continuing"); } @@ -532,16 +538,12 @@ impl PageCache { } // - pub fn advance_last_valid_lsn(&self, lsn: u64, from_walreceiver: bool) { + pub fn advance_last_valid_lsn(&self, lsn: u64) { let mut shared = self.shared.lock().unwrap(); // Can't move backwards. let oldlsn = shared.last_valid_lsn; if lsn >= oldlsn { - // Now we receive entries from walreceiver and should wait - if from_walreceiver { - shared.walreceiver_works = true; - } shared.last_valid_lsn = lsn; self.valid_lsn_condvar.notify_all(); diff --git a/pageserver/src/restore_local_repo.rs b/pageserver/src/restore_local_repo.rs index 5c39d805f6..aabb299280 100644 --- a/pageserver/src/restore_local_repo.rs +++ b/pageserver/src/restore_local_repo.rs @@ -325,7 +325,7 @@ fn restore_wal( } // Now that this record has been handled, let the page cache know that // it is up-to-date to this LSN - pcache.advance_last_valid_lsn(lsn, false); + pcache.advance_last_valid_lsn(lsn); last_lsn = lsn; } else { break; diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index 3e72b5e747..149d02e0cc 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -252,7 +252,7 @@ async fn walreceiver_main( // better reflect that, because GetPage@LSN requests might also point in the // middle of a record, if the request LSN was taken from the server's current // flush ptr. - pcache.advance_last_valid_lsn(endlsn, true); + pcache.advance_last_valid_lsn(endlsn); if !caught_up && endlsn >= end_of_wal { info!( diff --git a/vendor/postgres b/vendor/postgres index 2a9f68e665..67da6b1df6 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 2a9f68e665507420475f2a2fa3d1563dfc5502f3 +Subproject commit 67da6b1df6523ea325db33f2942e965d3ed5aa26