From 78ca8679e8a810b5e33b6f327da023325fd8bab3 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen
Date: Mon, 25 Mar 2024 15:32:47 +0000
Subject: [PATCH] doc: comment why this is not so easy

---
 pageserver/src/tenant/timeline/compaction.rs | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index 6168d7869a..8d52fa68b7 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -263,6 +263,22 @@ impl Timeline {
             }
         }
 
+        // Drop the read lock for now; in theory, gc could also remove the same layers we are
+        // now compacting. FIXME: how to prepare such a test case?
+        // 0. tenant with minimal pitr
+        // 1. create 10 layers
+        // 2. await on pausable_failpoint after dropping the read guard
+        // 3. delete all data, vacuum, checkpoint
+        // 4. gc
+        //
+        // Now gc deletes the layers, and when we finally get to writing our results back in
+        // finish_compact_batch, LayerManager::finish_compact_l0 will panic while deleting a
+        // layer which no longer exists, in LayerMap::remove_historic_noflush or LayerFileManager::remove.
+        //
+        // Is the easy solution just to make the deletions from compaction more lenient?
+        // Currently gc holds a write lock, so it cannot have this problem right now. If gc were
+        // loosened to take the read lock only momentarily and the write lock only for applying,
+        // it would have a similar issue in trying to gc layers which have just been compacted.
         stats.read_lock_held_compute_holes_micros = stats.read_lock_held_key_sort_micros.till_now();
         drop_rlock(guard);
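
To make the trade-off concrete, here is a minimal, self-contained sketch with toy types (`LayerKey` and `ToyLayerMap` are illustrative stand-ins, not the actual `LayerMap` / `LayerFileManager` API) contrasting the current strict removal, which panics when a layer is already gone, with the more lenient removal the comment asks about:

```rust
// Sketch only: toy types standing in for the real layer bookkeeping, showing the
// difference between strict removal (panics if the layer is already gone, e.g.
// because gc removed it concurrently) and lenient removal (tolerates it).
use std::collections::HashSet;

/// Stand-in for a layer file name / key (hypothetical, for illustration).
#[derive(Debug, PartialEq, Eq, Hash, Clone)]
struct LayerKey(&'static str);

/// Toy layer map holding the set of "historic" layers.
#[derive(Default)]
struct ToyLayerMap {
    historic: HashSet<LayerKey>,
}

impl ToyLayerMap {
    /// Strict removal: mirrors the behaviour of panicking when asked to delete a
    /// layer that is not present.
    #[allow(dead_code)]
    fn remove_strict(&mut self, key: &LayerKey) {
        assert!(self.historic.remove(key), "layer {key:?} not found");
    }

    /// Lenient removal: the "easy solution" floated in the comment. Reports whether
    /// the layer was still present, letting the caller log instead of panicking.
    fn remove_lenient(&mut self, key: &LayerKey) -> bool {
        self.historic.remove(key)
    }
}

fn main() {
    let mut map = ToyLayerMap::default();
    map.historic.insert(LayerKey("delta-0"));

    // Simulate gc racing ahead while compaction has dropped the read lock:
    // gc removes the layer first.
    assert!(map.remove_lenient(&LayerKey("delta-0")));

    // Compaction later tries to delete the same layer when writing back its results.
    // The lenient variant merely returns false; the strict variant would panic here.
    assert!(!map.remove_lenient(&LayerKey("delta-0")));
}
```

Whether returning a flag (rather than keeping the assertion) is acceptable depends on whether a missing layer at this point can ever signal a real bug rather than the gc/compaction race described above.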