fix(pageserver): ensure upload happens after delete (#9844)

## Problem

Follow-up to https://github.com/neondatabase/neon/pull/9682. That patch
didn't fully address the problem: what if shutdown fails for whatever
reason and then we reattach the tenant? Then we will still remove the
future layer. The underlying problem is that the fix for #5878 is
voided by the generation optimizations.

Of course, we also need to ensure that delete happens after uploads, but
note that we only schedule deletes when there are no ongoing upload
tasks, so that's fine.

## Summary of changes

* Add a test case to reproduce the behavior (by changing the original
test case to attach the same generation).
* If layer upload happens after the deletion, drain the deletion queue
before uploading.
* If blocked_deletion is enabled, directly remove the pending layer
deletion from the blocked_deletion queue.
* Local fs backend fix to avoid race between deletion and preload.
* test_emergency_mode does not need to wait for uploads (and it's
generally not possible to wait for uploads).
* ~~Optimize deletion executor to skip validation if there are no files
to delete.~~ this doesn't work

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
This commit is contained in:
Alex Chi Z.
2024-11-22 13:30:53 -05:00
committed by GitHub
parent 6f8b1eb5a6
commit c1937d073f
9 changed files with 184 additions and 42 deletions

View File

@@ -4942,6 +4942,7 @@ def last_flush_lsn_upload(
timeline_id: TimelineId,
pageserver_id: int | None = None,
auth_token: str | None = None,
wait_until_uploaded: bool = True,
) -> Lsn:
"""
Wait for pageserver to catch up to the latest flush LSN of given endpoint,
@@ -4955,7 +4956,9 @@ def last_flush_lsn_upload(
for tenant_shard_id, pageserver in shards:
ps_http = pageserver.http_client(auth_token=auth_token)
wait_for_last_record_lsn(ps_http, tenant_shard_id, timeline_id, last_flush_lsn)
ps_http.timeline_checkpoint(tenant_shard_id, timeline_id, wait_until_uploaded=True)
ps_http.timeline_checkpoint(
tenant_shard_id, timeline_id, wait_until_uploaded=wait_until_uploaded
)
return last_flush_lsn
@@ -4980,6 +4983,7 @@ def generate_uploads_and_deletions(
timeline_id: TimelineId | None = None,
data: str | None = None,
pageserver: NeonPageserver,
wait_until_uploaded: bool = True,
):
"""
Using the environment's default tenant + timeline, generate a load pattern
@@ -5002,7 +5006,12 @@ def generate_uploads_and_deletions(
if init:
endpoint.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)")
last_flush_lsn_upload(
env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id
env,
endpoint,
tenant_id,
timeline_id,
pageserver_id=pageserver.id,
wait_until_uploaded=wait_until_uploaded,
)
def churn(data):
@@ -5025,7 +5034,12 @@ def generate_uploads_and_deletions(
# in a state where there are "future layers" in remote storage that will generate deletions
# after a restart.
last_flush_lsn_upload(
env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id
env,
endpoint,
tenant_id,
timeline_id,
pageserver_id=pageserver.id,
wait_until_uploaded=wait_until_uploaded,
)
# Compaction should generate some GC-eligible layers
@@ -5041,4 +5055,4 @@ def generate_uploads_and_deletions(
# background ingest, no more uploads pending, and therefore no non-determinism
# in subsequent actions like pageserver restarts.
flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id, pageserver.id)
ps_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=True)
ps_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=wait_until_uploaded)

View File

@@ -794,7 +794,9 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
if compact is not None:
query["compact"] = "true" if compact else "false"
log.info(f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}")
log.info(
f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}, wait_until_uploaded={wait_until_uploaded}"
)
res = self.put(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint",
params=query,

View File

@@ -2,6 +2,7 @@ from __future__ import annotations
import time
import pytest
from fixtures.common_types import Lsn
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder, flush_ep_to_pageserver
@@ -19,7 +20,11 @@ from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
from fixtures.utils import query_scalar, wait_until
def test_issue_5878(neon_env_builder: NeonEnvBuilder):
@pytest.mark.parametrize(
"attach_mode",
["default_generation", "same_generation"],
)
def test_issue_5878(neon_env_builder: NeonEnvBuilder, attach_mode: str):
"""
Regression test for issue https://github.com/neondatabase/neon/issues/5878 .
@@ -168,11 +173,32 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
tenant_conf = ps_http.tenant_config(tenant_id)
generation_before_detach = get_generation_number()
env.pageserver.tenant_detach(tenant_id)
failpoint_name = "before-delete-layer-pausable"
failpoint_deletion_queue = "deletion-queue-before-execute-pause"
ps_http.configure_failpoints((failpoint_name, "pause"))
env.pageserver.tenant_attach(tenant_id, tenant_conf.tenant_specific_overrides)
generation_after_reattach = get_generation_number()
ps_http.configure_failpoints((failpoint_deletion_queue, "pause"))
if attach_mode == "default_generation":
env.pageserver.tenant_attach(tenant_id, tenant_conf.tenant_specific_overrides)
elif attach_mode == "same_generation":
# Attach with the same generation number -- this is possible with timeline offload and detach ancestor
env.pageserver.tenant_attach(
tenant_id,
tenant_conf.tenant_specific_overrides,
generation=generation_before_detach,
# We want to avoid the generation bump and don't want to talk with the storcon
override_storage_controller_generation=False,
)
else:
raise AssertionError(f"Unknown attach_mode: {attach_mode}")
# Get it from pageserver API instead of storcon API b/c we might not have attached using the storcon
# API if attach_mode == "same_generation"
tenant_location = env.pageserver.http_client().tenant_get_location(tenant_id)
generation_after_reattach = tenant_location["generation"]
if attach_mode == "same_generation":
# The generation number should be the same as before the detach
assert generation_before_detach == generation_after_reattach
wait_until_tenant_active(ps_http, tenant_id)
# Ensure the IndexPart upload that unlinks the layer file finishes, i.e., doesn't clog the queue.
@@ -182,15 +208,8 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
wait_until(10, 0.5, future_layer_is_gone_from_index_part)
# NB: the layer file is unlinked from the index part now, but, because we made the delete
# operation stuck, the layer file itself is still in the remote_storage
wait_until(
10,
0.5,
lambda: env.pageserver.assert_log_contains(
f".*{tenant_id}.*at failpoint.*{failpoint_name}"
),
)
# The deletion is already stuck at this point, but we don't necessarily hit the failpoint
# because deletions are batched.
future_layer_path = env.pageserver_remote_storage.remote_layer_path(
tenant_id, timeline_id, future_layer.to_str(), generation=generation_before_detach
)
@@ -224,11 +243,13 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
break
time.sleep(1)
# Window has passed, unstuck the delete, let upload queue drain.
# Window has passed, unstick the delete, let the deletion queue drain; the upload queue should
# have drained because we put these layer deletion operations into the deletion queue and
# have consumed the operation from the upload queue.
log.info("unstuck the DELETE")
ps_http.configure_failpoints(("before-delete-layer-pausable", "off"))
ps_http.configure_failpoints((failpoint_deletion_queue, "off"))
wait_for_upload_queue_empty(ps_http, tenant_id, timeline_id)
env.pageserver.http_client().deletion_queue_flush(True)
# Examine the resulting S3 state.
log.info("integrity-check the remote storage")
@@ -247,3 +268,12 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
final_stat = future_layer_path.stat()
log.info(f"future layer path: {future_layer_path}")
assert final_stat.st_mtime != pre_stat.st_mtime
# Ensure no weird errors in the end...
wait_for_upload_queue_empty(ps_http, tenant_id, timeline_id)
if attach_mode == "same_generation":
# we should have detected a racing upload and deferred it
env.pageserver.assert_log_contains(
"waiting for deletion queue flush to complete before uploading layer"
)

View File

@@ -459,7 +459,11 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
env.pageserver.start()
# The pageserver should provide service to clients
generate_uploads_and_deletions(env, init=False, pageserver=env.pageserver)
# Because the pageserver is in emergency mode, it will not attempt to validate the deletions required by the initial barrier;
# other files therefore cannot be uploaded, because uploads are waiting for the initial barrier to be validated.
generate_uploads_and_deletions(
env, init=False, pageserver=env.pageserver, wait_until_uploaded=False
)
# The pageserver should neither validate nor execute any deletions, it should have
# loaded the DeletionLists from before though