Compare commits

...

26 Commits

Author SHA1 Message Date
Alex Chi Z
7b4251e9b5 use Tenant::shutdown
Signed-off-by: Alex Chi Z <chi@neon.tech>
2023-07-21 16:18:00 -04:00
Alex Chi Z
5f8bd41725 address comments
Signed-off-by: Alex Chi Z <chi@neon.tech>
2023-07-21 16:12:03 -04:00
Alex Chi Z
1b78f7cb4f Merge branch 'main' of https://github.com/neondatabase/neon into skyzh/rm-file-if-fail 2023-07-21 16:09:19 -04:00
Christian Schwarz
041ed130df aesthetic refactoring 2023-07-10 17:05:14 +02:00
Alex Chi Z
cb02f0827a adapt to async tenant operations
Signed-off-by: Alex Chi Z <chi@neon.tech>
2023-07-07 14:53:16 -04:00
Alex Chi Z
18aa5f31a3 Merge branch 'main' of https://github.com/neondatabase/neon into skyzh/rm-file-if-fail 2023-07-07 13:56:46 -04:00
Shany Pozin
14d5f76139 add block_on as set_broken became async 2023-07-02 15:22:58 +03:00
Shany Pozin
bd64c52fcc format 2023-07-02 14:43:35 +03:00
Shany Pozin
41211da2ad Merge branch 'main' into skyzh/rm-file-if-fail 2023-07-02 14:39:21 +03:00
Alex Chi
6942d3f5dc Merge branch 'main' of https://github.com/neondatabase/neon into skyzh/rm-file-if-fail 2023-05-26 14:01:37 -04:00
Alex Chi
240da9f4c8 fix ce
Signed-off-by: Alex Chi <iskyzh@gmail.com>
2023-05-23 15:22:55 -04:00
Alex Chi
5362687168 Merge branch 'main' of https://github.com/neondatabase/neon into skyzh/rm-file-if-fail 2023-05-23 15:22:09 -04:00
Alex Chi
21ea1a2a36 resolve comments
Signed-off-by: Alex Chi <iskyzh@gmail.com>
2023-05-23 12:01:47 -04:00
Alex Chi
56633521d3 fix fmt
Signed-off-by: Alex Chi <iskyzh@gmail.com>
2023-05-23 11:44:58 -04:00
Alex Chi
33d0627930 update comment
Signed-off-by: Alex Chi <iskyzh@gmail.com>
2023-05-23 11:30:23 -04:00
Alex Chi
f331b58bb3 trigger ci
Signed-off-by: Alex Chi <iskyzh@gmail.com>
2023-05-23 09:25:36 -04:00
Alex Chi
ee2c1a3ff4 apply comments
Signed-off-by: Alex Chi <iskyzh@gmail.com>
2023-05-22 14:31:45 -04:00
Alex Chi
b17aa6b45b use guard for stop tenant
Signed-off-by: Alex Chi <iskyzh@gmail.com>
2023-05-17 09:27:20 -04:00
Alex Chi
6048a41981 apply comments
Signed-off-by: Alex Chi <iskyzh@gmail.com>
2023-05-16 17:59:36 -04:00
Alex Chi Z
52e8c3c98f Update test_runner/regress/test_tenants.py
Co-authored-by: Alexander Bayandin <alexander@neon.tech>
2023-05-16 17:44:16 -04:00
Alex Chi Z
38602ef580 Update test_runner/regress/test_tenants.py
Co-authored-by: Alexander Bayandin <alexander@neon.tech>
2023-05-16 17:44:04 -04:00
Alex Chi
a93cd31554 fix typecheck
Signed-off-by: Alex Chi <iskyzh@gmail.com>
2023-05-16 16:10:56 -04:00
Alex Chi
eab349839c fix python style
Signed-off-by: Alex Chi <iskyzh@gmail.com>
2023-05-16 16:06:31 -04:00
Alex Chi
2e9ca129bc fix test cases
Signed-off-by: Alex Chi <iskyzh@gmail.com>
2023-05-16 16:02:18 -04:00
Alex Chi
1d47833304 add test cases
Signed-off-by: Alex Chi <iskyzh@gmail.com>
2023-05-16 15:32:56 -04:00
Alex Chi
964c672c61 remove tenant directory if fails to create
Signed-off-by: Alex Chi <iskyzh@gmail.com>
2023-05-16 15:22:48 -04:00
2 changed files with 132 additions and 31 deletions

View File

@@ -1,6 +1,8 @@
//! This module acts as a switchboard to access different repositories managed by this
//! page server.
use futures::Future;
use scopeguard::ScopeGuard;
use std::collections::{hash_map, HashMap};
use std::ffi::OsStr;
use std::path::Path;
@@ -14,7 +16,7 @@ use tokio::task::JoinSet;
use tracing::*;
use remote_storage::GenericRemoteStorage;
use utils::crashsafe;
use utils::{completion, crashsafe};
use crate::config::PageServerConf;
use crate::context::{DownloadBehavior, RequestContext};
@@ -339,24 +341,61 @@ pub async fn create_tenant(
remote_storage: Option<GenericRemoteStorage>,
ctx: &RequestContext,
) -> Result<Arc<Tenant>, TenantMapInsertError> {
tenant_map_insert(tenant_id, || {
tenant_map_insert(tenant_id, async {
// We're holding the tenants lock in write mode while doing local IO.
// If this section ever becomes contentious, introduce a new `TenantState::Creating`
// and do the work in that state.
let tenant_directory = super::create_tenant_files(conf, tenant_conf, &tenant_id, CreateTenantFilesMode::Create)?;
// TODO: tenant directory remains on disk if we bail out from here on.
// See https://github.com/neondatabase/neon/issues/4233
let tenant_directory = super::create_tenant_files(
conf,
tenant_conf,
&tenant_id,
CreateTenantFilesMode::Create,
)?;
let created_tenant =
schedule_local_tenant_processing(conf, &tenant_directory, broker_client, remote_storage, None, ctx)?;
// TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
// See https://github.com/neondatabase/neon/issues/4233
let guard_fs = scopeguard::guard((), |_| {
if let Err(e) = std::fs::remove_dir_all(&tenant_directory) {
// log the error but do not propagate it further
warn!("failed to cleanup when tenant {tenant_id} fails to start: {e}");
}
});
let crated_tenant_id = created_tenant.tenant_id();
anyhow::ensure!(
let created_tenant = schedule_local_tenant_processing(
conf,
&tenant_directory,
broker_client,
remote_storage,
None,
ctx,
)?;
// Put all code that might error into the check function.
let check = || {
fail::fail_point!("tenant-create-fail", |_| {
anyhow::bail!("failpoint: tenant-create-fail");
});
let crated_tenant_id = created_tenant.tenant_id();
anyhow::ensure!(
tenant_id == crated_tenant_id,
"loaded created tenant has unexpected tenant id (expect {tenant_id} != actual {crated_tenant_id})",
);
Ok(())
};
if let Err(e) = check() {
// `schedule_local_tenant_processing` eventually launches the tenant's background task
// We need to shut them down before bailing out. Shutdown the tenant in the background.
tokio::spawn(async move {
let (_guard, shutdown_progress) = completion::channel();
created_tenant.shutdown(shutdown_progress, false).await
});
return Err(e);
}
// Ok, we're good. Disarm the cleanup scopeguards and return the tenant.
ScopeGuard::into_inner(guard_fs);
Ok(created_tenant)
}).await
}
@@ -493,7 +532,7 @@ pub async fn load_tenant(
remote_storage: Option<GenericRemoteStorage>,
ctx: &RequestContext,
) -> Result<(), TenantMapInsertError> {
tenant_map_insert(tenant_id, || {
tenant_map_insert(tenant_id, async {
let tenant_path = conf.tenant_path(&tenant_id);
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_id);
if tenant_ignore_mark.exists() {
@@ -570,30 +609,61 @@ pub async fn attach_tenant(
remote_storage: GenericRemoteStorage,
ctx: &RequestContext,
) -> Result<(), TenantMapInsertError> {
tenant_map_insert(tenant_id, || {
let tenant_dir = create_tenant_files(conf, tenant_conf, &tenant_id, CreateTenantFilesMode::Attach)?;
// TODO: tenant directory remains on disk if we bail out from here on.
// See https://github.com/neondatabase/neon/issues/4233
tenant_map_insert(tenant_id, async {
let tenant_dir =
create_tenant_files(conf, tenant_conf, &tenant_id, CreateTenantFilesMode::Attach)?;
let guard_fs = scopeguard::guard((), |_| {
if let Err(e) = std::fs::remove_dir_all(&tenant_dir) {
// log the error but do not propagate it further
warn!("failed to cleanup when tenant {tenant_id} fails to start: {e}");
}
});
// Without the attach marker, schedule_local_tenant_processing will treat the attached tenant as fully attached
let marker_file_exists = conf
.tenant_attaching_mark_file_path(&tenant_id)
.try_exists()
.context("check for attach marker file existence")?;
anyhow::ensure!(marker_file_exists, "create_tenant_files should have created the attach marker file");
let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, broker_client, Some(remote_storage), None, ctx)?;
// TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
// See https://github.com/neondatabase/neon/issues/4233
let attached_tenant_id = attached_tenant.tenant_id();
anyhow::ensure!(
tenant_id == attached_tenant_id,
"loaded created tenant has unexpected tenant id (expect {tenant_id} != actual {attached_tenant_id})",
marker_file_exists,
"create_tenant_files should have created the attach marker file"
);
let attached_tenant = schedule_local_tenant_processing(
conf,
&tenant_dir,
broker_client,
Some(remote_storage),
None,
ctx,
)?;
// Put all code that might error in the check function.
let check = || {
let attached_tenant_id = attached_tenant.tenant_id();
anyhow::ensure!(
tenant_id == attached_tenant_id,
"loaded created tenant has unexpected tenant id (expect {tenant_id} != actual {attached_tenant_id})",
);
Ok(())
};
if let Err(e) = check() {
// `schedule_local_tenant_processing` eventually launches the tenant's background task
// We need to shut them down before bailing out. Shutdown the tenant in the background.
tokio::spawn(async move {
let (_guard, shutdown_progress) = completion::channel();
attached_tenant.shutdown(shutdown_progress, false).await
});
return Err(e);
}
// Ok, we're good. Disarm the cleanup scopeguards and return the tenant.
ScopeGuard::into_inner(guard_fs);
Ok(attached_tenant)
})
.await?;
}).await?;
Ok(())
}
@@ -615,12 +685,12 @@ pub enum TenantMapInsertError {
///
/// NB: the closure should return quickly because the current implementation of tenants map
/// serializes access through an `RwLock`.
async fn tenant_map_insert<F>(
async fn tenant_map_insert<Fut>(
tenant_id: TenantId,
insert_fn: F,
insert_fut: Fut,
) -> Result<Arc<Tenant>, TenantMapInsertError>
where
F: FnOnce() -> anyhow::Result<Arc<Tenant>>,
Fut: Future<Output = anyhow::Result<Arc<Tenant>>>,
{
let mut guard = TENANTS.write().await;
let m = match &mut *guard {
@@ -633,7 +703,7 @@ where
tenant_id,
e.get().current_state(),
)),
hash_map::Entry::Vacant(v) => match insert_fn() {
hash_map::Entry::Vacant(v) => match insert_fut.await {
Ok(tenant) => {
v.insert(tenant.clone());
Ok(tenant)

View File

@@ -21,6 +21,7 @@ from fixtures.neon_fixtures import (
RemoteStorageKind,
available_remote_storages,
)
from fixtures.pageserver.http import PageserverApiException
from fixtures.pageserver.utils import timeline_delete_wait_completed
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import wait_until
@@ -406,3 +407,33 @@ def test_pageserver_with_empty_tenants(
assert (
tenant_broken_count == 1
), f"Tenant {tenant_without_timelines_dir} should have metric as broken"
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
def test_failed_tenant_directory_is_removed(
neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
):
"""Tenants which fail to be created are cleaned up from disk and not created"""
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_pageserver_create_tenants_fail",
)
env = neon_env_builder.init_start()
env.pageserver.allowed_errors.append(".*tenant-create-fail.*")
env.pageserver.allowed_errors.append(".*tenant-attach-fail.*")
env.pageserver.allowed_errors.append(".*Tenant is already in Broken state.*")
env.pageserver.allowed_errors.append(".*could not load tenant.*")
env.pageserver.allowed_errors.append(".*InternalServerError.*")
client = env.pageserver.http_client()
client.configure_failpoints(("tenant-create-fail", "return"))
tenant_id = TenantId.generate()
with pytest.raises(PageserverApiException, match="failpoint: tenant-create-fail"):
client.tenant_create(tenant_id)
assert not (env.repo_dir / "tenants" / str(tenant_id)).exists()
with pytest.raises(PageserverApiException, match="Tenant .* not found"):
# the tenant creation is not successful and should not be found
client.tenant_status(tenant_id)