fix(pageserver): ensure upload happens after delete (#9844)

## Problem

Follow-up to https://github.com/neondatabase/neon/pull/9682. That patch didn't fully address the problem: what if shutdown fails for whatever reason and we then reattach the tenant? Then we will still remove the future layer. The underlying problem is that the fix for #5878 gets voided because of the generation optimizations.

Of course, we also need to ensure that deletes happen after uploads, but note that we only schedule deletes when there are no ongoing upload tasks, so that's fine.

## Summary of changes

* Add a test case to reproduce the behavior (by changing the original test case to attach the same generation).
* If a layer upload happens after the deletion, drain the deletion queue before uploading.
* If blocked_deletion is enabled, directly remove the layer from the blocked_deletion queue.
* Local fs backend fix to avoid a race between deletion and preload.
* test_emergency_mode does not need to wait for uploads (and it's generally not possible to wait for uploads).
* ~~Optimize deletion executor to skip validation if there are no files to delete.~~ this doesn't work

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
2026-07-07 22:20:36 +00:00 · 2024-11-22 13:30:53 -05:00
parent 6f8b1eb5a6
commit c1937d073f
9 changed files with 184 additions and 42 deletions
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -4942,6 +4942,7 @@ def last_flush_lsn_upload(
    timeline_id: TimelineId,
    pageserver_id: int | None = None,
    auth_token: str | None = None,
+    wait_until_uploaded: bool = True,
 ) -> Lsn:
    """
    Wait for pageserver to catch to the latest flush LSN of given endpoint,
@@ -4955,7 +4956,9 @@ def last_flush_lsn_upload(
    for tenant_shard_id, pageserver in shards:
        ps_http = pageserver.http_client(auth_token=auth_token)
        wait_for_last_record_lsn(ps_http, tenant_shard_id, timeline_id, last_flush_lsn)
-        ps_http.timeline_checkpoint(tenant_shard_id, timeline_id, wait_until_uploaded=True)
+        ps_http.timeline_checkpoint(
+            tenant_shard_id, timeline_id, wait_until_uploaded=wait_until_uploaded
+        )
    return last_flush_lsn


@@ -4980,6 +4983,7 @@ def generate_uploads_and_deletions(
    timeline_id: TimelineId | None = None,
    data: str | None = None,
    pageserver: NeonPageserver,
+    wait_until_uploaded: bool = True,
 ):
    """
    Using the environment's default tenant + timeline, generate a load pattern
@@ -5002,7 +5006,12 @@ def generate_uploads_and_deletions(
        if init:
            endpoint.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)")
            last_flush_lsn_upload(
-                env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id
+                env,
+                endpoint,
+                tenant_id,
+                timeline_id,
+                pageserver_id=pageserver.id,
+                wait_until_uploaded=wait_until_uploaded,
            )

        def churn(data):
@@ -5025,7 +5034,12 @@ def generate_uploads_and_deletions(
            # in a state where there are "future layers" in remote storage that will generate deletions
            # after a restart.
            last_flush_lsn_upload(
-                env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id
+                env,
+                endpoint,
+                tenant_id,
+                timeline_id,
+                pageserver_id=pageserver.id,
+                wait_until_uploaded=wait_until_uploaded,
            )

        # Compaction should generate some GC-elegible layers
@@ -5041,4 +5055,4 @@ def generate_uploads_and_deletions(
        # background ingest, no more uploads pending, and therefore no non-determinism
        # in subsequent actions like pageserver restarts.
        flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id, pageserver.id)
-        ps_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=True)
+        ps_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=wait_until_uploaded)
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -794,7 +794,9 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
        if compact is not None:
            query["compact"] = "true" if compact else "false"

-        log.info(f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}")
+        log.info(
+            f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}, wait_until_uploaded={wait_until_uploaded}"
+        )
        res = self.put(
            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint",
            params=query,