create the test case to reproduce the issue

Signed-off-by: Alex Chi Z <chi@neon.tech>
Author: Alex Chi Z
Date:   2024-11-18 15:57:53 -05:00
Parent: ada84400b7
Commit: 45f6111ad9

3 changed files with 45 additions and 18 deletions


@@ -10,6 +10,7 @@ use remote_storage::GenericRemoteStorage;
 use remote_storage::RemotePath;
 use remote_storage::TimeoutOrCancel;
 use remote_storage::MAX_KEYS_PER_DELETE;
+use utils::pausable_failpoint;
 use std::time::Duration;
 use tokio_util::sync::CancellationToken;
 use tracing::info;
@@ -90,6 +91,7 @@ impl Deleter {
     /// Block until everything in accumulator has been executed
     async fn flush(&mut self) -> Result<(), DeletionQueueError> {
         while !self.accumulator.is_empty() && !self.cancel.is_cancelled() {
+            pausable_failpoint!("deletion-queue-before-execute-pause");
             match self.remote_delete().await {
                 Ok(()) => {
                     // Note: we assume that the remote storage layer returns Ok(()) if some

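The pausable_failpoint! added above fires inside Deleter::flush(), right before remote_delete(), so a test can freeze the deletion executor while its accumulator still holds keys. A minimal sketch (not part of this commit; the helper names are made up) of driving it from a test through the pageserver HTTP fixture, using the same configure_failpoints call the regression test below relies on:

    # Illustrative helpers only: pause/resume the deletion executor via the new failpoint.
    def pause_deletion_executor(ps_http):
        # Freeze Deleter::flush() right before it issues the remote DELETEs.
        ps_http.configure_failpoints(("deletion-queue-before-execute-pause", "pause"))

    def resume_deletion_executor(ps_http):
        # Let the accumulated deletions proceed again.
        ps_http.configure_failpoints(("deletion-queue-before-execute-pause", "off"))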

@@ -2608,7 +2608,9 @@ impl Timeline {
         // See https://github.com/neondatabase/neon/issues/5878
         //
         // NB: generation numbers naturally protect against this because they disambiguate
-        // (1) and (4)
+        // (1) and (4) ONLY IF the generation number gets bumped. There are some cases where
+        // we load a tenant without bumping the generation number (e.g., detach ancestor
+        // and timeline offload/un-offload). In those cases, we need to rely on the barrier.
         self.remote_client.schedule_barrier()?;
         // Tenant::create_timeline will wait for these uploads to happen before returning, or
         // on retry.

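The comment above is the heart of what the new test exercises: remote layer object keys embed the generation, so when re-attach bumps the generation, a fresh upload and a stale delete can never target the same key, and cases (1) and (4) stay distinguishable. When the tenant is re-loaded without a bump (detach ancestor, timeline offload/un-offload), the keys coincide and only the scheduled barrier keeps the stale delete from racing the fresh upload. A hedged illustration reusing names from the test below (not runnable on its own):

    # The same layer name maps to two different remote objects only if the generation changed.
    path_old_gen = env.pageserver_remote_storage.remote_layer_path(
        tenant_id, timeline_id, future_layer.to_str(), generation=generation_before_detach
    )
    path_new_gen = env.pageserver_remote_storage.remote_layer_path(
        tenant_id, timeline_id, future_layer.to_str(), generation=generation_after_reattach
    )
    # Holds for attach_mode == "default_generation"; for "same_generation" the paths collide.
    assert path_old_gen != path_new_gen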

@@ -2,6 +2,7 @@ from __future__ import annotations
 import time
+import pytest
 from fixtures.common_types import Lsn
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder, flush_ep_to_pageserver
@@ -19,7 +20,11 @@ from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
 from fixtures.utils import query_scalar, wait_until
-def test_issue_5878(neon_env_builder: NeonEnvBuilder):
+@pytest.mark.parametrize(
+    "attach_mode",
+    ["default_generation", "same_generation"],
+)
+def test_issue_5878(neon_env_builder: NeonEnvBuilder, attach_mode: str):
     """
     Regression test for issue https://github.com/neondatabase/neon/issues/5878 .
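With this parametrization pytest collects the test twice, as test_issue_5878[default_generation] and test_issue_5878[same_generation] (the parameter strings become the test IDs); a single variant can be selected locally with the usual -k expression, e.g. pytest -k "test_issue_5878 and same_generation". This is standard pytest behaviour, not something added by this commit.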
@@ -168,11 +173,34 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
     tenant_conf = ps_http.tenant_config(tenant_id)
     generation_before_detach = get_generation_number()
     env.pageserver.tenant_detach(tenant_id)
-    failpoint_name = "before-delete-layer-pausable"
+    failpoint_deletion_queue = "deletion-queue-before-execute-pause"
+    failpoint_upload_queue = "before-delete-layer-pausable"
-    ps_http.configure_failpoints((failpoint_name, "pause"))
-    env.pageserver.tenant_attach(tenant_id, tenant_conf.tenant_specific_overrides)
-    generation_after_reattach = get_generation_number()
+    ps_http.configure_failpoints((failpoint_deletion_queue, "pause"))
+    ps_http.configure_failpoints((failpoint_upload_queue, "off"))
+    if attach_mode == "default_generation":
+        env.pageserver.tenant_attach(tenant_id, tenant_conf.tenant_specific_overrides)
+    elif attach_mode == "same_generation":
+        # Attach with the same generation number -- this is possible with timeline offload and detach ancestor
+        env.pageserver.tenant_attach(
+            tenant_id,
+            tenant_conf.tenant_specific_overrides,
+            generation=generation_before_detach,
+            # We want to avoid the generation bump and don't want to talk to the storcon
+            override_storage_controller_generation=False,
+        )
+    else:
+        raise AssertionError(f"Unknown attach_mode: {attach_mode}")
+    # Get the generation from the pageserver API instead of the storcon API, because we might not have
+    # attached through the storcon API if attach_mode == "same_generation".
+    tenant_location = env.pageserver.http_client().tenant_get_location(tenant_id)
+    generation_after_reattach = tenant_location["generation"]
+    if attach_mode == "same_generation":
+        # The generation number should be the same as before the detach
+        assert generation_before_detach == generation_after_reattach
     wait_until_tenant_active(ps_http, tenant_id)
     # Ensure the IndexPart upload that unlinks the layer file finishes, i.e., doesn't clog the queue.
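Note that the generation is read back from the pageserver itself rather than from the storage controller, because in the same_generation variant the attach bypasses the controller. A hypothetical helper (name invented here) capturing that pattern with the same tenant_get_location call used above:

    # Read the currently attached generation straight from the pageserver, so it works
    # whether or not the attach went through the storage controller.
    def attached_generation(pageserver, tenant_id):
        location = pageserver.http_client().tenant_get_location(tenant_id)
        return location["generation"]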
@@ -182,15 +210,8 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
     wait_until(10, 0.5, future_layer_is_gone_from_index_part)
     # NB: the layer file is now unlinked from the index part, but, because we made the delete
     # operation stuck, the layer file itself is still in remote storage.
-    wait_until(
-        10,
-        0.5,
-        lambda: env.pageserver.assert_log_contains(
-            f".*{tenant_id}.*at failpoint.*{failpoint_name}"
-        ),
-    )
+    # We have already made the deletion stuck at this point, but we don't necessarily hit the
+    # failpoint, because deletions are batched.
     future_layer_path = env.pageserver_remote_storage.remote_layer_path(
         tenant_id, timeline_id, future_layer.to_str(), generation=generation_before_detach
     )
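The log-based wait was dropped because the Deleter batches keys in its accumulator (up to MAX_KEYS_PER_DELETE) before calling remote_delete(), so this particular layer's delete can sit in the accumulator without the executor ever reaching the new failpoint. The robust check is to look at remote storage directly; a hedged sketch, assuming (as the LocalFsStorage import above suggests) that remote_layer_path resolves to a local filesystem path here:

    # With LocalFsStorage the "remote" object is an ordinary local file, so the paused
    # deletion can be observed directly instead of via pageserver log messages.
    assert future_layer_path.exists(), "future layer must still exist while deletes are paused"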
@@ -224,11 +245,13 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
             break
         time.sleep(1)
-    # Window has passed, unstuck the delete, let upload queue drain.
+    # The window has passed; unstick the delete and let the deletion queue drain. The upload queue
+    # should already have drained, because the layer deletion operations were handed off to the
+    # deletion queue when they were consumed from the upload queue.
     log.info("unstuck the DELETE")
-    ps_http.configure_failpoints(("before-delete-layer-pausable", "off"))
+    ps_http.configure_failpoints((failpoint_deletion_queue, "off"))
     wait_for_upload_queue_empty(ps_http, tenant_id, timeline_id)
     env.pageserver.http_client().deletion_queue_flush(True)
     # Examine the resulting S3 state.
     log.info("integrity-check the remote storage")