improve comments in identify_levels

2026-01-16 01:42:55 +00:00 · 2024-05-16 12:25:21 +03:00
parent 6593ce05b8
commit 272416a29c
1 changed files with 77 additions and 15 deletions
--- a/pageserver/compaction/src/identify_levels.rs
+++ b/pageserver/compaction/src/identify_levels.rs
@@ -82,14 +82,53 @@ where
        return Ok(None);
    }

-    // Walk the ranges in LSN order.
+    // Our goal is to find an LSN boundary such that:
    //
-    // ----- end_lsn
-    //  |
-    //  |
-    //  v
+    // - There are no layers that overlap with the LSN. In other words, the LSN neatly
+    //   divides the layers into two sets: those that are above it, and those that are
+    //   below it.
    //
-    layers.sort_by_key(|l| l.lsn_range().end); // this isn't deterministic, can have multiple layers with same LSN range but different key dimension
+    // - All the layers above it have LSN range smaller than lsn_max_size
+    //
+    // Our strategy is to walk through the layers ordered by end LSNs, and maintain two
+    // values:
+    //
+    // - The current best LSN boundary. We have already determined that there are no
+    //   layers that overlap with it.
+    //
+    // - Candidate LSN boundary. We have not found any layers that overlap with this LSN
+    //   yet, but we might still see one later, as we continue iterating.
+    //
+    // Whenever we find that there are no layers that overlap with the current candidate,
+    // we make that the new "current best" and start building a new candidate. We start
+    // with the caller-supplied `end_lsn` as the current best LSN boundary. All the layers
+    // are below that LSN, so that's a valid stopping point. If we see a layer that is too
+    // large, we discard the candidate that we were building and return with the "current
+    // best" that we have seen so far.
+    //
+    // Here's an illustration of the state after processing layers A, B, C, and D:
+    //
+    //            A   B   C   D   E
+    // 10000                           <-- end_lsn passed by caller
+    //  9000      +
+    //  8000      |   +   +
+    //  7000      +   |   |
+    //  6000          |   |
+    //  5000          |   +
+    //  4000          +
+    //  3000                  +          <-- current_best_start_lsn
+    //  3000                  |    +
+    //  2000                  |    |
+    //  2000                  +    |     <--- candidate_start_lsn
+    //  1000                       +
+    //
+    // When we saw layer D with end_lsn==3000, and had not seen any layers that cross that
+    // LSN, we knew that that is a valid stopping point. It became the new
+    // current_best_start_lsn, and the layer's start_lsn (2000) became the new
+    // candidate_start_lsn. Because layer E overlaps with it, it is not a valid return
+    // value, but we will not know that until processing layer E.
+    //
+    layers.sort_by_key(|l| l.lsn_range().end);
    let mut candidate_start_lsn = end_lsn;
    let mut candidate_layers: Vec<L> = Vec::new();
    let mut current_best_start_lsn = end_lsn;
@@ -134,8 +173,8 @@ where

            // Is it small enough to be considered part of this level?
            if r.end.0 - r.start.0 > lsn_max_size {
-                // Too large, this layer belongs to next level. Stop.
-                // Due to the sorting bug pointed out above there could still be smaller layers at same key range
+                // Too large, this layer belongs to next level. Discard the candidate
+                // we were building and return with `current_best`.
                trace!(
                    "too large {}, size {} vs {}",
                    l.short_id(),
@@ -310,13 +349,11 @@ mod tests {
    }

    #[tokio::test]
-    async fn repro_identify_levels_bails_too_ealy_if_partitioned_keyspace_same_lsn() -> anyhow::Result<()> {
+    async fn test_disjoint_key_ranges_same_lsn() -> anyhow::Result<()> {
        tracing_subscriber::fmt::init(); // so that RUST_LOG=trace works
-        // It's sorting by end_lsn only, so, this `vec![]` is already sorted.
+
+        // `identify_levels` sorts by end_lsn only, so, this `vec![]` is already sorted.
        // (The sort implementation may shuffle the elements anyway, but, let's assume it doesn't.)
-        // The `identify_levels` loop will bails out at the first layer that is too large.
-        // , i.e., layer B. (log message "too large").
-        // That leaves layer C out of the level, even though it belongs to it.
        let max_size = 0x2000;
        let layers = vec![
            delta(0x1000..0x2000, Lsn(0x8000)..Lsn(0x9000)), // A
@@ -324,8 +361,33 @@ mod tests {
            delta(0x3000..0x4000, Lsn(0x8000)..Lsn(0x9000)), // C
        ];

-        let level = identify_level(layers.clone(), Lsn(0x10_000), max_size).await?.unwrap();
-        assert_eq!(level.layers.len(), 3);
+        // The layers are processed in order A, B, C
+        //
+        //                                A           B            C
+        //                       10000
+        //
+        //                        9000    +           +            +
+        //                                |           |            |
+        //                        8000    +           |            +
+        //                        7000                |
+        //                        6000                |
+        //                        5000                +
+        //                        4000
+        //
+        // current_best_start_lsn: 10000    9000
+        // current_best_layers:       []      []
+        // candidate_start_lsn:    10000    8000
+        // candidate_layers:          []     [A]
+        //
+        // A: Hooray, there are no crossing LSNs.
+        // B: Too large. Discard 'candidate' and return with 'current_best'
+        //
+
+        let level = identify_level(layers.clone(), Lsn(0x10_000), max_size)
+            .await?
+            .unwrap();
+        assert_eq!(level.lsn_range, Lsn(0x9_000)..Lsn(0x10_000));
+        assert_eq!(level.layers.len(), 0);
        Ok(())
    }