pageserver: fix race between deletion completion and incoming requests (#5941)

## Problem

This is a narrow race that can leave a stuck Stopping tenant behind while
emitting the log error "Missing InProgress marker during tenant upsert,
this is a bug". The interleaving (sketched in the toy example below the
list) is:

- Deletion request 1 puts the tenant into Stopping state and fires off the
  background part of DeleteTenantFlow.
- Deletion request 2 acquires a SlotGuard for the same tenant ID and leaves
  a TenantSlot::InProgress marker in place while it checks whether the
  tenant's state is acceptable.
- DeleteTenantFlow finishes and calls TenantsMap::remove, which removes the
  InProgress marker.
- Deletion request 2 calls SlotGuard::revert, which upserts the old value
  (the Tenant in Stopping state) and emits the telltale log message.
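
A minimal, self-contained sketch of that interleaving. The `Slot` enum, the
string keys, and the single-threaded sequencing are illustrative stand-ins
rather than the real `TenantsMap`/`TenantSlot`/`SlotGuard` types; only the map
operations that race are modelled:

```rust
// Toy model of the race: not the real pageserver types, just the map operations.
use std::collections::HashMap;

#[derive(Debug, PartialEq)]
enum Slot {
    Attached(&'static str), // stand-in for a Tenant stuck in Stopping state
    InProgress,             // stand-in for TenantSlot::InProgress(barrier)
}

fn main() {
    let mut map: HashMap<&str, Slot> = HashMap::new();

    // Request 1 has already put the tenant into Stopping state and spawned the
    // background part of DeleteTenantFlow.
    map.insert("tenant", Slot::Attached("Stopping"));

    // Request 2 acquires a SlotGuard: the old value is held aside and an
    // InProgress marker sits in the map while the tenant's state is checked.
    let old_value = map.insert("tenant", Slot::InProgress).unwrap();

    // The background DeleteTenantFlow finishes and calls remove(), which
    // (before this fix) removed whatever occupied the slot: here, that is
    // request 2's InProgress marker.
    map.remove("tenant");

    // Request 2 rejects the duplicate delete and calls SlotGuard::revert, which
    // upserts the old value. The InProgress marker it expected to replace is
    // gone ("Missing InProgress marker during tenant upsert, this is a bug"),
    // and a Tenant stuck in Stopping state is left behind with no operation
    // remaining to clean it up.
    map.insert("tenant", old_value);

    assert_eq!(map.get("tenant"), Some(&Slot::Attached("Stopping")));
}
```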

Closes: #5936 

## Summary of changes

- Add a regression test which uses pausable failpoints to reproduce this
scenario.
- TenantsMap::remove is only called by DeleteTenantFlow. Its return type now
  expresses the different possible slot states, in particular `InProgress`,
  which carries a barrier.
- In DeleteTenantFlow, if remove() returns such a barrier, wait on the barrier
  and then try removing again (see the sketch below).
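
For reference, a condensed sketch of the resulting shape. The toy `Barrier`
(backed by `tokio::sync::Notify`), the `u32` tenant key, and the
`remove_until_done` helper are assumptions made for illustration; they are not
the real `utils::completion::Barrier` or pageserver types:

```rust
use std::collections::HashMap;
use std::sync::Arc;
use tokio::sync::Notify;

/// Stand-in for a completion barrier that a waiter can await.
#[derive(Clone)]
struct Barrier(Arc<Notify>);

impl Barrier {
    async fn wait(&self) {
        self.0.notified().await;
    }
}

enum TenantSlot {
    Attached(String), // stand-in for Arc<Tenant>
    Secondary,
    InProgress(Barrier),
}

enum TenantsMapRemoveResult {
    Occupied(TenantSlot),
    Vacant,
    InProgress(Barrier),
}

struct TenantsMap(HashMap<u32, TenantSlot>);

impl TenantsMap {
    fn remove(&mut self, tenant_id: &u32) -> TenantsMapRemoveResult {
        // If another operation's InProgress marker occupies the slot, leave it
        // in place and hand back that operation's barrier instead.
        if let Some(TenantSlot::InProgress(barrier)) = self.0.get(tenant_id) {
            return TenantsMapRemoveResult::InProgress(barrier.clone());
        }
        match self.0.remove(tenant_id) {
            Some(slot) => TenantsMapRemoveResult::Occupied(slot),
            None => TenantsMapRemoveResult::Vacant,
        }
    }
}

/// Caller side, mirroring what the deletion flow now does: if a competing
/// operation holds the slot, wait for it to finish, then retry the removal.
async fn remove_until_done(map: &mut TenantsMap, tenant_id: u32) {
    loop {
        match map.remove(&tenant_id) {
            TenantsMapRemoveResult::InProgress(barrier) => barrier.wait().await,
            _ => break,
        }
    }
}

#[tokio::main]
async fn main() {
    let mut map = TenantsMap(HashMap::new());
    map.0.insert(1, TenantSlot::Attached("tenant in Stopping state".into()));
    remove_until_done(&mut map, 1).await;
    assert!(map.0.is_empty());
}
```

The key property is that `remove()` never removes a marker owned by another
in-flight operation; it returns that operation's barrier so the deletion flow
can wait and retry instead of racing with it.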

---------

Co-authored-by: Joonas Koivunen <joonas@neon.tech>
Author: John Spray
Date: 2023-11-29 09:32:26 +00:00 (committed by GitHub)
Commit: c48cc020bd (parent a15969714c)
4 changed files with 184 additions and 14 deletions


@@ -6,7 +6,7 @@ use pageserver_api::models::TenantState;
use remote_storage::{GenericRemoteStorage, RemotePath};
use tokio::sync::OwnedMutexGuard;
use tokio_util::sync::CancellationToken;
use tracing::{error, instrument, warn, Instrument, Span};
use tracing::{error, instrument, Instrument, Span};
use utils::{
    backoff, completion, crashsafe, fs_ext,
@@ -17,6 +17,7 @@ use crate::{
    config::PageServerConf,
    context::RequestContext,
    task_mgr::{self, TaskKind},
    tenant::mgr::{TenantSlot, TenantsMapRemoveResult},
    InitializationOrder,
};
@@ -287,6 +288,8 @@ impl DeleteTenantFlow {
    ) -> Result<(), DeleteTenantError> {
        span::debug_assert_current_span_has_tenant_id();
        pausable_failpoint!("tenant-delete-before-run");
        let mut guard = Self::prepare(&tenant).await?;
        if let Err(e) = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await {
@@ -538,16 +541,68 @@ impl DeleteTenantFlow {
            .context("cleanup_remaining_fs_traces")?;
        {
            let mut locked = tenants.write().unwrap();
            if locked.remove(&tenant.tenant_id).is_none() {
                warn!("Tenant got removed from tenants map during deletion");
            };
            pausable_failpoint!("tenant-delete-before-map-remove");
            // FIXME: we should not be modifying this from outside of mgr.rs.
            // This will go away when we simplify deletion (https://github.com/neondatabase/neon/issues/5080)
            crate::metrics::TENANT_MANAGER
                .tenant_slots
                .set(locked.len() as u64);
            // This block is simply removing the TenantSlot for this tenant. It requires a loop because
            // we might conflict with a TenantSlot::InProgress marker and need to wait for it.
            //
            // This complexity will go away when we simplify how deletion works:
            // https://github.com/neondatabase/neon/issues/5080
            loop {
                // Under the TenantMap lock, try to remove the tenant. We usually succeed, but if
                // we encounter an InProgress marker, yield the barrier it contains and wait on it.
                let barrier = {
                    let mut locked = tenants.write().unwrap();
                    let removed = locked.remove(&tenant.tenant_id);
                    // FIXME: we should not be modifying this from outside of mgr.rs.
                    // This will go away when we simplify deletion (https://github.com/neondatabase/neon/issues/5080)
                    crate::metrics::TENANT_MANAGER
                        .tenant_slots
                        .set(locked.len() as u64);
                    match removed {
                        TenantsMapRemoveResult::Occupied(TenantSlot::Attached(tenant)) => {
                            match tenant.current_state() {
                                TenantState::Stopping { .. } | TenantState::Broken { .. } => {
                                    // Expected: we put the tenant into stopping state before we start deleting it
                                }
                                state => {
                                    // Unexpected state
                                    tracing::warn!(
                                        "Tenant in unexpected state {state} after deletion"
                                    );
                                }
                            }
                            break;
                        }
                        TenantsMapRemoveResult::Occupied(TenantSlot::Secondary) => {
                            // This is unexpected: this secondary tenant should not have been created, and we
                            // are not in a position to shut it down from here.
                            tracing::warn!("Tenant transitioned to secondary mode while deleting!");
                            break;
                        }
                        TenantsMapRemoveResult::Occupied(TenantSlot::InProgress(_)) => {
                            unreachable!("TenantsMap::remove handles InProgress separately, should never return it here");
                        }
                        TenantsMapRemoveResult::Vacant => {
                            tracing::warn!(
                                "Tenant removed from TenantsMap before deletion completed"
                            );
                            break;
                        }
                        TenantsMapRemoveResult::InProgress(barrier) => {
                            // An InProgress entry was found, we must wait on its barrier
                            barrier
                        }
                    }
                };
                tracing::info!(
                    "Waiting for competing operation to complete before deleting state for tenant"
                );
                barrier.wait().await;
            }
        }
        *guard = Self::Finished;


@@ -122,6 +122,12 @@ fn exactly_one_or_none<'a>(
    }
}

pub(crate) enum TenantsMapRemoveResult {
    Occupied(TenantSlot),
    Vacant,
    InProgress(utils::completion::Barrier),
}

impl TenantsMap {
    /// Convenience function for typical usage, where we want to get a `Tenant` object, for
    /// working with attached tenants. If the TenantId is in the map but in Secondary state,
@@ -136,12 +142,28 @@ impl TenantsMap {
        }
    }

    pub(crate) fn remove(&mut self, tenant_id: &TenantId) -> Option<TenantSlot> {
    /// Only for use from DeleteTenantFlow. This method directly removes a TenantSlot from the map.
    ///
    /// The normal way to remove a tenant is using a SlotGuard, which will gracefully remove the guarded
    /// slot if the enclosed tenant is shut down.
    pub(crate) fn remove(&mut self, tenant_id: &TenantId) -> TenantsMapRemoveResult {
        use std::collections::btree_map::Entry;
        match self {
            TenantsMap::Initializing => None,
            TenantsMap::Initializing => TenantsMapRemoveResult::Vacant,
            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
                let key = exactly_one_or_none(m, tenant_id).map(|(k, _)| *k);
                key.and_then(|key| m.remove(&key))
                match key {
                    Some(key) => match m.entry(key) {
                        Entry::Occupied(entry) => match entry.get() {
                            TenantSlot::InProgress(barrier) => {
                                TenantsMapRemoveResult::InProgress(barrier.clone())
                            }
                            _ => TenantsMapRemoveResult::Occupied(entry.remove()),
                        },
                        Entry::Vacant(_entry) => TenantsMapRemoveResult::Vacant,
                    },
                    None => TenantsMapRemoveResult::Vacant,
                }
            }
        }
    }


@@ -263,6 +263,7 @@ class PageserverHttpClient(requests.Session):
    def tenant_delete(self, tenant_id: TenantId):
        res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}")
        self.verbose_error(res)
        return res

    def tenant_load(self, tenant_id: TenantId):
        res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/load")


@@ -1,3 +1,4 @@
import concurrent.futures
import enum
import os
import shutil
@@ -474,4 +475,95 @@ def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonE
    deletion.join()


# TODO test concurrent deletions with "hang" failpoint
def test_tenant_delete_concurrent(
    neon_env_builder: NeonEnvBuilder,
    pg_bin: PgBin,
):
    """
    Validate that concurrent delete requests to the same tenant behave correctly:
    exactly one should succeed.

    This is a reproducer for https://github.com/neondatabase/neon/issues/5936
    """
    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3)
    env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)
    ps_http = env.pageserver.http_client()
    tenant_id = env.initial_tenant
    timeline_id = env.initial_timeline

    # Populate some data
    with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
        run_pg_bench_small(pg_bin, endpoint.connstr())
        last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)

    CONFLICT_MESSAGE = "Precondition failed: Invalid state Stopping. Expected Active or Broken"

    env.pageserver.allowed_errors.extend(
        [
            # lucky race with stopping from flushing a layer we fail to schedule any uploads
            ".*layer flush task.+: could not flush frozen layer: update_metadata_file",
            # Errors logged from our 4xx requests
            f".*{CONFLICT_MESSAGE}.*",
        ]
    )

    BEFORE_REMOVE_FAILPOINT = "tenant-delete-before-map-remove"
    BEFORE_RUN_FAILPOINT = "tenant-delete-before-run"

    # We will let the initial delete run until right before it would remove
    # the tenant's TenantSlot. This pauses it in a state where the tenant
    # is visible in Stopping state, and concurrent requests should fail with 4xx.
    ps_http.configure_failpoints((BEFORE_REMOVE_FAILPOINT, "pause"))

    def delete_tenant():
        return ps_http.tenant_delete(tenant_id)

    def hit_remove_failpoint():
        assert env.pageserver.log_contains(f"at failpoint {BEFORE_REMOVE_FAILPOINT}")

    def hit_run_failpoint():
        assert env.pageserver.log_contains(f"at failpoint {BEFORE_RUN_FAILPOINT}")

    with concurrent.futures.ThreadPoolExecutor() as executor:
        background_200_req = executor.submit(delete_tenant)
        assert background_200_req.result(timeout=10).status_code == 202

        # Wait until the first request completes its work and is blocked on removing
        # the TenantSlot from tenant manager.
        wait_until(100, 0.1, hit_remove_failpoint)

        # Start another request: this should fail when it sees a tenant in Stopping state
        with pytest.raises(PageserverApiException, match=CONFLICT_MESSAGE):
            ps_http.tenant_delete(tenant_id)

        # Start another background request, which will pause after acquiring a TenantSlotGuard
        # but before completing.
        ps_http.configure_failpoints((BEFORE_RUN_FAILPOINT, "pause"))
        background_4xx_req = executor.submit(delete_tenant)
        wait_until(100, 0.1, hit_run_failpoint)

        # The TenantSlot is still present while the original request is hung before
        # final removal
        assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 1

        # Permit the original request to run to success
        ps_http.configure_failpoints((BEFORE_REMOVE_FAILPOINT, "off"))

        # Permit the duplicate background request to run to completion and fail.
        ps_http.configure_failpoints((BEFORE_RUN_FAILPOINT, "off"))
        with pytest.raises(PageserverApiException, match=CONFLICT_MESSAGE):
            background_4xx_req.result(timeout=10)

    # Physical deletion should have happened
    assert_prefix_empty(
        neon_env_builder,
        prefix="/".join(
            (
                "tenants",
                str(tenant_id),
            )
        ),
    )

    # Zero tenants remain (we deleted the default tenant)
    assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 0