fix: cleanup of layers from the future can race with their re-creation (#5890)

fixes https://github.com/neondatabase/neon/issues/5878 obsoletes https://github.com/neondatabase/neon/issues/5879 Before this PR, it could happen that `load_layer_map` schedules removal of the future image layer. Then a later compaction run could re-create the same image layer, scheduling a PUT. Due to lack of an upload queue barrier, the PUT and DELETE could be re-ordered. The result was IndexPart referencing a non-existent object. ## Summary of changes * Add support to `pagectl` / Python tests to decode `IndexPart` * Rust * new `pagectl` Subcommand * `IndexPart::{from,to}_s3_bytes()` methods to internalize knowledge about encoding of `IndexPart` * Python * new `NeonCli` subclass * Add regression test * Rust * Ability to force repartitioning; required to ensure image layer creation at last_record_lsn * Python * The regression test. * Fix the issue * Insert an `UploadOp::Barrier` after scheduling the deletions.
2026-01-06 13:02:55 +00:00 · 2023-11-23 14:33:41 +01:00
parent 6afbadc90e
commit a0e61145c8
18 changed files with 615 additions and 56 deletions
--- a/test_runner/regress/test_layers_from_future.py
+++ b/test_runner/regress/test_layers_from_future.py
@@ -0,0 +1,222 @@
+import time
+
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import NeonEnvBuilder
+from fixtures.pageserver.types import (
+    DeltaLayerFileName,
+    ImageLayerFileName,
+    is_future_layer,
+)
+from fixtures.pageserver.utils import (
+    wait_for_last_record_lsn,
+    wait_for_upload_queue_empty,
+    wait_until_tenant_active,
+)
+from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
+from fixtures.types import Lsn
+from fixtures.utils import query_scalar, wait_until
+
+
+def test_issue_5878(neon_env_builder: NeonEnvBuilder):
+    """
+    Regression test for issue https://github.com/neondatabase/neon/issues/5878 .
+
+    Create a situation where IndexPart contains an image layer from a future
+    (i.e., image layer > IndexPart::disk_consistent_lsn).
+    Detach.
+    Attach.
+    Wait for tenant to finish load_layer_map (by waiting for it to become active).
+    Wait for any remote timeline client ops to finish that the attach started.
+    Integrity-check the index part.
+
+    Before fixing the issue, load_layer_map would schedule removal of the future
+    image layer. A compaction run could later re-create the image layer with
+    the same file name, scheduling a PUT.
+    Due to lack of an upload queue barrier, the PUT and DELETE could be re-ordered.
+    The result was IndexPart referencing a non-existent object.
+    """
+    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
+
+    env = neon_env_builder.init_start()
+
+    ps_http = env.pageserver.http_client()
+
+    l0_l1_threshold = 3
+    image_creation_threshold = 1
+
+    tenant_config = {
+        "gc_period": "0s",  # disable GC (shouldn't matter for this test but still)
+        "compaction_period": "0s",  # we want to control when compaction runs
+        "checkpoint_timeout": "24h",  # something we won't reach
+        "checkpoint_distance": f"{50 * (1024**2)}",  # something we won't reach, we checkpoint manually
+        "image_creation_threshold": f"{image_creation_threshold}",
+        "compaction_threshold": f"{l0_l1_threshold}",
+        "compaction_target_size": f"{128 * (1024**3)}",  # make it so that we only have 1 partition => image coverage for delta layers => enables gc of delta layers
+    }
+
+    tenant_id, timeline_id = env.neon_cli.create_tenant(conf=tenant_config)
+
+    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
+
+    def get_index_part():
+        assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
+        ip_path = env.pageserver_remote_storage.index_path(tenant_id, timeline_id)
+        return env.pagectl.dump_index_part(ip_path)
+
+    def get_future_layers():
+        ip = get_index_part()
+        future_layers = [
+            layer_file_name
+            for layer_file_name in ip.layer_metadata.keys()
+            if is_future_layer(layer_file_name, ip.disk_consistent_lsn)
+        ]
+        return future_layers
+
+    assert len(get_future_layers()) == 0
+
+    current = get_index_part()
+    assert len(set(current.layer_metadata.keys())) == 1
+    layer_file_name = list(current.layer_metadata.keys())[0]
+    assert isinstance(layer_file_name, DeltaLayerFileName)
+    assert layer_file_name.is_l0(), f"{layer_file_name}"
+
+    log.info("force image layer creation in the future by writing some data into in-memory layer")
+
+    # Create a number of layers in the tenant
+    with endpoint.cursor() as cur:
+        cur.execute("CREATE TABLE foo (t text)")
+        iters = l0_l1_threshold * image_creation_threshold
+        for i in range(0, iters):
+            cur.execute(
+                f"""
+                INSERT INTO foo
+                SELECT '{i}' || g
+                FROM generate_series(1, 10000) g
+                """
+            )
+            last_record_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
+            wait_for_last_record_lsn(ps_http, tenant_id, timeline_id, last_record_lsn)
+            # 0..iters-1: create a stack of delta layers
+            # iters: leave a non-empty in-memory layer which we'll use for image layer generation
+            if i < iters - 1:
+                ps_http.timeline_checkpoint(tenant_id, timeline_id, force_repartition=True)
+                assert (
+                    len(
+                        [
+                            layer
+                            for layer in ps_http.layer_map_info(
+                                tenant_id, timeline_id
+                            ).historic_layers
+                            if layer.kind == "Image"
+                        ]
+                    )
+                    == 0
+                )
+
+    endpoint.stop()
+
+    wait_for_upload_queue_empty(ps_http, tenant_id, timeline_id)
+
+    ip = get_index_part()
+    assert len(ip.layer_metadata.keys())
+    assert (
+        ip.disk_consistent_lsn < last_record_lsn
+    ), "sanity check for what above loop is supposed to do"
+
+    # create the image layer from the future
+    ps_http.timeline_compact(tenant_id, timeline_id, force_repartition=True)
+    assert (
+        len(
+            [
+                layer
+                for layer in ps_http.layer_map_info(tenant_id, timeline_id).historic_layers
+                if layer.kind == "Image"
+            ]
+        )
+        == 1
+    )
+    wait_for_upload_queue_empty(ps_http, tenant_id, timeline_id)
+    future_layers = get_future_layers()
+    assert len(future_layers) == 1
+    future_layer = future_layers[0]
+    assert isinstance(future_layer, ImageLayerFileName)
+    assert future_layer.lsn == last_record_lsn
+    log.info(
+        f"got layer from the future: lsn={future_layer.lsn} disk_consistent_lsn={ip.disk_consistent_lsn} last_record_lsn={last_record_lsn}"
+    )
+    assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
+    future_layer_path = env.pageserver_remote_storage.layer_path(
+        tenant_id, timeline_id, future_layer
+    )
+    log.info(f"future layer path: {future_layer_path}")
+    pre_stat = future_layer_path.stat()
+    time.sleep(1.1)  # so that we can use change in pre_stat.st_mtime to detect overwrites
+
+    # force removal of layers from the future
+    tenant_conf = ps_http.tenant_config(tenant_id)
+    ps_http.tenant_detach(tenant_id)
+    failpoint_name = "before-delete-layer-pausable"
+    ps_http.configure_failpoints((failpoint_name, "pause"))
+    ps_http.tenant_attach(tenant_id, tenant_conf.tenant_specific_overrides)
+    wait_until_tenant_active(ps_http, tenant_id)
+
+    # Ensure the IndexPart upload that unlinks the layer file finishes, i.e., doesn't clog the queue.
+    def future_layer_is_gone_from_index_part():
+        future_layers = set(get_future_layers())
+        assert future_layer not in future_layers
+
+    wait_until(10, 0.5, future_layer_is_gone_from_index_part)
+
+    # NB: the layer file is unlinked index part now, but, because we made the delete
+    # operation stuck, the layer file itself is still in the remote_storage
+    def delete_at_pause_point():
+        assert env.pageserver.log_contains(f".*{tenant_id}.*at failpoint.*{failpoint_name}")
+
+    wait_until(10, 0.5, delete_at_pause_point)
+    assert future_layer_path.exists()
+
+    # wait for re-ingestion of the WAL from safekeepers into the in-memory layer
+    # (this happens in parallel to the above)
+    wait_for_last_record_lsn(ps_http, tenant_id, timeline_id, last_record_lsn)
+
+    # re-do image layer generation
+    # This will produce the same image layer and queue an upload.
+    # However, we still have the deletion for the layer queued, stuck on the failpoint.
+    # An incorrect implementation would let the PUT execute before the DELETE.
+    # The later code in this test asserts that this doesn't happen.
+    ps_http.timeline_compact(tenant_id, timeline_id, force_repartition=True)
+
+    # Let things sit for some time; a good implementation makes no progress because
+    # we can't execute the PUT before the DELETE. A bad implementation would do that.
+    max_race_opportunity_window = 4
+    start = time.monotonic()
+    while True:
+        post_stat = future_layer_path.stat()
+        assert (
+            pre_stat.st_mtime == post_stat.st_mtime
+        ), "observed PUT overtake the stucked DELETE => bug isn't fixed yet"
+        if time.monotonic() - start > max_race_opportunity_window:
+            log.info(
+                "a correct implementation would never let the later PUT overtake the earlier DELETE"
+            )
+            break
+        time.sleep(1)
+
+    # Window has passed, unstuck the delete, let upload queue drain.
+    log.info("unstuck the DELETE")
+    ps_http.configure_failpoints(("before-delete-layer-pausable", "off"))
+
+    wait_for_upload_queue_empty(ps_http, tenant_id, timeline_id)
+
+    # Examine the resulting S3 state.
+    log.info("integrity-check the remote storage")
+    ip = get_index_part()
+    for layer_file_name in ip.layer_metadata.keys():
+        layer_path = env.pageserver_remote_storage.layer_path(
+            tenant_id, timeline_id, layer_file_name
+        )
+        assert layer_path.exists(), f"{layer_file_name.to_str()}"
+
+    log.info("assert that the overwritten layer won")
+    final_stat = future_layer_path.stat()
+    assert final_stat.st_mtime != pre_stat.st_mtime