From 2fd4c390cbfccca419042a4104178cd91ff396ba Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 10 Jan 2022 14:41:15 +0300 Subject: [PATCH] Do not hold timelines lock during GC (#1089) * Do not hold timelines lock during GC refer #1087 * Add gc_cs mutex for preveting creation of new timelines during GC * Make clippy happy * Use Mutex<()> instead of Mutex for GC critical section --- pageserver/src/layered_repository.rs | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index c6e694f607..9064c7d173 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -127,7 +127,13 @@ pub struct LayeredRepository { conf: &'static PageServerConf, tenantid: ZTenantId, timelines: Mutex>, - + // This mutex prevents creation of new timelines during GC. + // Adding yet another mutex (in addition to `timelines`) is needed because holding + // `timelines` mutex during all GC iteration (especially with enforced checkpoint) + // may block for a long time `get_timeline`, `get_timelines_state`,... and other operations + // with timelines, which in turn may cause dropping replication connection, expiration of wait_for_lsn + // timeout... + gc_cs: Mutex<()>, walredo_mgr: Arc, /// Makes every timeline to backup their files to remote storage. upload_relishes: bool, @@ -186,6 +192,8 @@ impl Repository for LayeredRepository { // We need to hold this lock to prevent GC from starting at the same time. GC scans the directory to learn // about timelines, so otherwise a race condition is possible, where we create new timeline and GC // concurrently removes data that is needed by the new timeline. + let _gc_cs = self.gc_cs.lock().unwrap(); + let mut timelines = self.timelines.lock().unwrap(); let src_timeline = match self.get_or_init_timeline(src, &mut timelines)? { LayeredTimelineEntry::Local(timeline) => timeline, @@ -489,6 +497,7 @@ impl LayeredRepository { tenantid, conf, timelines: Mutex::new(HashMap::new()), + gc_cs: Mutex::new(()), walredo_mgr, upload_relishes, } @@ -575,7 +584,8 @@ impl LayeredRepository { let now = Instant::now(); // grab mutex to prevent new timelines from being created here. - // TODO: We will hold it for a long time + let _gc_cs = self.gc_cs.lock().unwrap(); + let mut timelines = self.timelines.lock().unwrap(); // Scan all timelines. For each timeline, remember the timeline ID and @@ -612,6 +622,7 @@ impl LayeredRepository { }; if let Some(ancestor_timeline) = &timeline.ancestor_timeline { + drop(timelines); let ancestor_timeline = match ancestor_timeline.local_or_schedule_download(self.tenantid) { Some(timeline) => timeline, @@ -635,6 +646,7 @@ impl LayeredRepository { else { all_branchpoints.insert((ancestor_timeline.timelineid, timeline.ancestor_lsn)); } + timelines = self.timelines.lock().unwrap(); } }