From 231dfbaed630963e709166677908fff0b558e35e Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Mon, 3 Oct 2022 22:13:26 +0300 Subject: [PATCH] Do not remove empty timelines/ directory for tenants --- pageserver/src/tenant_mgr.rs | 44 ++++++++++++++++++----------- test_runner/regress/test_tenants.py | 37 ++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 16 deletions(-) diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index 1efd3d4af4..0e8ee8c067 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -108,6 +108,10 @@ pub fn init_tenant_mgr( /// Attempts to load as many entites as possible: if a certain timeline fails during the load, the tenant is marked as "Broken", /// and the load continues. /// +/// For successful tenant attach, it first has to have a `timelines/` subdirectory and a tenant config file that's loaded into memory successfully. +/// If either of the conditions fails, the tenant will be added to memory with [`TenantState::Broken`] state, otherwise we start to load its timelines. +/// Alternatively, tenant is considered loaded successfully, if it's already in pageserver's memory (i.e. was loaded already before). +/// /// Attach happens on startup and sucessful timeline downloads /// (some subset of timeline files, always including its metadata, after which the new one needs to be registered). pub fn attach_local_tenants( @@ -173,16 +177,28 @@ fn load_local_tenant( remote_index.clone(), conf.remote_storage_config.is_some(), )); - match Tenant::load_tenant_config(conf, tenant_id) { - Ok(tenant_conf) => { - tenant.update_tenant_config(tenant_conf); - tenant.activate(false); - } - Err(e) => { - error!("Failed to read config for tenant {tenant_id}, disabling tenant: {e:?}"); - tenant.set_state(TenantState::Broken); + + let tenant_timelines_dir = conf.timelines_path(&tenant_id); + if !tenant_timelines_dir.is_dir() { + error!( + "Tenant {} has no timelines directory at {}", + tenant_id, + tenant_timelines_dir.display() + ); + tenant.set_state(TenantState::Broken); + } else { + match Tenant::load_tenant_config(conf, tenant_id) { + Ok(tenant_conf) => { + tenant.update_tenant_config(tenant_conf); + tenant.activate(false); + } + Err(e) => { + error!("Failed to read config for tenant {tenant_id}, disabling tenant: {e:?}"); + tenant.set_state(TenantState::Broken); + } } } + tenant } @@ -630,14 +646,10 @@ fn collect_timelines_for_tenant( } if tenant_timelines.is_empty() { - match remove_if_empty(&timelines_dir) { - Ok(true) => info!( - "Removed empty tenant timelines directory {}", - timelines_dir.display() - ), - Ok(false) => (), - Err(e) => error!("Failed to remove empty tenant timelines directory: {e:?}"), - } + // this is normal, we've removed all broken, empty and temporary timeline dirs + // but should allow the tenant to stay functional and allow creating new timelines + // on a restart, we require tenants to have the timelines dir, so leave it on disk + debug!("Tenant {tenant_id} has no timelines loaded"); } Ok((tenant_id, tenant_timelines)) diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 52b9e6369c..ba5109a16f 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -1,4 +1,5 @@ import os +import shutil from contextlib import closing from datetime import datetime from pathlib import Path @@ -201,3 +202,39 @@ def test_pageserver_metrics_removed_after_detach(neon_env_builder: NeonEnvBuilde post_detach_samples = set([x.name for x in get_ps_metric_samples_for_tenant(tenant)]) assert post_detach_samples == set() + + +def test_pageserver_with_empty_tenants(neon_simple_env: NeonEnv): + env = neon_simple_env + client = env.pageserver.http_client() + + tenant_without_timelines_dir = env.initial_tenant + shutil.rmtree(Path(env.repo_dir) / "tenants" / str(tenant_without_timelines_dir) / "timelines") + + tenant_with_empty_timelines_dir = client.tenant_create() + for timeline_dir_entry in Path.iterdir( + Path(env.repo_dir) / "tenants" / str(tenant_with_empty_timelines_dir) / "timelines" + ): + if timeline_dir_entry.is_dir(): + shutil.rmtree(timeline_dir_entry) + else: + timeline_dir_entry.unlink() + + env.postgres.stop_all() + for _ in range(0, 3): + env.pageserver.stop() + env.pageserver.start() + + client = env.pageserver.http_client() + tenants = client.tenant_list() + + assert ( + len(tenants) == 1 + ), "Pageserver should attach only tenants with empty timelines/ dir on restart" + loaded_tenant = tenants[0] + assert loaded_tenant["id"] == str( + tenant_with_empty_timelines_dir + ), f"Tenant {tenant_with_empty_timelines_dir} should be loaded as the only one with tenants/ directory" + assert loaded_tenant["state"] == { + "Active": {"background_jobs_running": False} + }, "Empty tenant should be loaded and ready for timeline creation"