From 2c49b72c79230eba6bc1360fb82384f737f1d311 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 15 Dec 2023 17:48:33 +0000 Subject: [PATCH] find a way to duplicate a tenant in local_fs Use the script like so, against the tenant to duplicate: poetry run python3 ./test_runner/duplicate_tenant.py 7ea51af32d42bfe7fb93bf5f28114d09 200 8 backup of pageserver.toml d =1 pg_distrib_dir ='/home/admin/neon-main/pg_install' http_auth_type ='Trust' pg_auth_type ='Trust' listen_http_addr ='127.0.0.1:9898' listen_pg_addr ='127.0.0.1:64000' broker_endpoint ='http://127.0.0.1:50051/' #control_plane_api ='http://127.0.0.1:1234/' # Initial configuration file created by 'pageserver --init' #listen_pg_addr = '127.0.0.1:64000' #listen_http_addr = '127.0.0.1:9898' #wait_lsn_timeout = '60 s' #wal_redo_timeout = '60 s' #max_file_descriptors = 10000 #page_cache_size = 160000 # initial superuser role name to use when creating a new tenant #initial_superuser_name = 'cloud_admin' #broker_endpoint = 'http://127.0.0.1:50051' #log_format = 'plain' #concurrent_tenant_size_logical_size_queries = '1' #metric_collection_interval = '10 min' #cached_metric_collection_interval = '0s' #synthetic_size_calculation_interval = '10 min' #disk_usage_based_eviction = { max_usage_pct = .., min_avail_bytes = .., period = "10s"} #background_task_maximum_delay = '10s' [tenant_config] #checkpoint_distance = 268435456 # in bytes #checkpoint_timeout = 10 m #compaction_target_size = 134217728 # in bytes #compaction_period = '20 s' #compaction_threshold = 10 #gc_period = '1 hr' #gc_horizon = 67108864 #image_creation_threshold = 3 #pitr_interval = '7 days' #min_resident_size_override = .. # in bytes #evictions_low_residence_duration_metric_threshold = '24 hour' #gc_feedback = false # make it determinsitic gc_period = '0s' checkpoint_timeout = '3650 day' compaction_period = '20 s' compaction_threshold = 10 compaction_target_size = 134217728 checkpoint_distance = 268435456 image_creation_threshold = 3 [remote_storage] local_path = '/home/admin/neon-main/bench_repo_dir/repo/remote_storage_local_fs' remove http handler switch to generalized rewrite_summary & impl page_ctl subcommand to use it WIP: change duplicate_tenant.py script to use the pagectl command The script works but at restart, we detach the created tenants because they're not known to the attachment service: Detaching tenant, control plane omitted it in re-attach response tenant_id=1e399d390e3aee6b11c701cbc716bb6c => figure out how to further integrate this --- libs/pageserver_api/src/models.rs | 4 ++ libs/remote_storage/src/lib.rs | 8 ++- pageserver/src/consumption_metrics.rs | 2 +- pageserver/src/consumption_metrics/metrics.rs | 2 +- pageserver/src/deletion_queue.rs | 2 +- pageserver/src/disk_usage_eviction_task.rs | 2 +- pageserver/src/http/routes.rs | 4 +- pageserver/src/tenant.rs | 4 ++ pageserver/src/tenant/mgr.rs | 8 ++- test_runner/duplicate_tenant.py | 69 +++++++++++++++++++ test_runner/fixtures/pageserver/http.py | 2 + 11 files changed, 98 insertions(+), 9 deletions(-) create mode 100644 test_runner/duplicate_tenant.py diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index c5d8ca489d..506660937f 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -368,6 +368,8 @@ pub struct TenantInfo { /// If a layer is present in both local FS and S3, it counts only once. pub current_physical_size: Option, // physical size is only included in `tenant_status` endpoint pub attachment_status: TenantAttachmentStatus, + #[serde(skip_serializing_if = "Option::is_none")] + pub generation: Option, } #[derive(Serialize, Deserialize, Clone)] @@ -885,6 +887,7 @@ mod tests { state: TenantState::Active, current_physical_size: Some(42), attachment_status: TenantAttachmentStatus::Attached, + generation: None, }; let expected_active = json!({ "id": original_active.id.to_string(), @@ -905,6 +908,7 @@ mod tests { }, current_physical_size: Some(42), attachment_status: TenantAttachmentStatus::Attached, + generation: None, }; let expected_broken = json!({ "id": original_broken.id.to_string(), diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index e77c54e1e7..f568e470cb 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -83,6 +83,12 @@ impl std::fmt::Display for RemotePath { } } +impl From for String { + fn from(val: RemotePath) -> Self { + val.0.into() + } +} + impl RemotePath { pub fn new(relative_path: &Utf8Path) -> anyhow::Result { anyhow::ensure!( @@ -104,7 +110,7 @@ impl RemotePath { self.0.file_name() } - pub fn join(&self, segment: &Utf8Path) -> Self { + pub fn join>(&self, segment: P) -> Self { Self(self.0.join(segment)) } diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index bde2cedca7..012a950b60 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -267,7 +267,7 @@ async fn calculate_synthetic_size_worker( } }; - for (tenant_shard_id, tenant_state) in tenants { + for (tenant_shard_id, tenant_state, _gen) in tenants { if tenant_state != TenantState::Active { continue; } diff --git a/pageserver/src/consumption_metrics/metrics.rs b/pageserver/src/consumption_metrics/metrics.rs index 0b827816bc..26b299a71d 100644 --- a/pageserver/src/consumption_metrics/metrics.rs +++ b/pageserver/src/consumption_metrics/metrics.rs @@ -196,7 +196,7 @@ pub(super) async fn collect_all_metrics( } }; - let tenants = futures::stream::iter(tenants).filter_map(|(id, state)| async move { + let tenants = futures::stream::iter(tenants).filter_map(|(id, state, _)| async move { if state != TenantState::Active || !id.is_zero() { None } else { diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index 7b05745483..e89ff5ff68 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -312,7 +312,7 @@ impl DeletionList { result.extend( timeline_layers .into_iter() - .map(|l| timeline_remote_path.join(&Utf8PathBuf::from(l))), + .map(|l| timeline_remote_path.join(Utf8PathBuf::from(l))), ); } } diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 76906cfaf7..d6d68f1657 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -515,7 +515,7 @@ async fn collect_eviction_candidates( let mut candidates = Vec::new(); - for (tenant_id, _state) in &tenants { + for (tenant_id, _state, _gen) in &tenants { if cancel.is_cancelled() { return Ok(EvictionCandidates::Cancelled); } diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index dad27759ee..c44b4d1742 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -845,11 +845,12 @@ async fn tenant_list_handler( ApiError::ResourceUnavailable("Tenant map is initializing or shutting down".into()) })? .iter() - .map(|(id, state)| TenantInfo { + .map(|(id, state, gen)| TenantInfo { id: *id, state: state.clone(), current_physical_size: None, attachment_status: state.attachment_status(), + generation: (*gen).into(), }) .collect::>(); @@ -879,6 +880,7 @@ async fn tenant_status( state: state.clone(), current_physical_size: Some(current_physical_size), attachment_status: state.attachment_status(), + generation: tenant.generation().into(), }, timelines: tenant.list_timeline_ids(), }) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 1d6f1001db..5a3896a9f7 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1915,6 +1915,10 @@ impl Tenant { self.current_state() == TenantState::Active } + pub fn generation(&self) -> Generation { + self.generation + } + /// Changes tenant status to active, unless shutdown was already requested. /// /// `background_jobs_can_start` is an optional barrier set to a value during pageserver startup diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index b2f14db9f7..e51188d16b 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -1511,8 +1511,8 @@ pub(crate) enum TenantMapListError { /// /// Get list of tenants, for the mgmt API /// -pub(crate) async fn list_tenants() -> Result, TenantMapListError> -{ +pub(crate) async fn list_tenants( +) -> Result, TenantMapListError> { let tenants = TENANTS.read().unwrap(); let m = match &*tenants { TenantsMap::Initializing => return Err(TenantMapListError::Initializing), @@ -1520,7 +1520,9 @@ pub(crate) async fn list_tenants() -> Result, }; Ok(m.iter() .filter_map(|(id, tenant)| match tenant { - TenantSlot::Attached(tenant) => Some((*id, tenant.current_state())), + TenantSlot::Attached(tenant) => { + Some((*id, tenant.current_state(), tenant.generation())) + } TenantSlot::Secondary => None, TenantSlot::InProgress(_) => None, }) diff --git a/test_runner/duplicate_tenant.py b/test_runner/duplicate_tenant.py new file mode 100644 index 0000000000..f4943f5bfe --- /dev/null +++ b/test_runner/duplicate_tenant.py @@ -0,0 +1,69 @@ +# Usage from top of repo: +# poetry run python3 ./test_runner/duplicate_tenant.py c66e2e233057f7f05563caff664ecb14 .neon/remote_storage_local_fs +import argparse +import shutil +import subprocess +import time +from pathlib import Path + +from fixtures.pageserver.http import PageserverHttpClient +from fixtures.types import TenantId + +parser = argparse.ArgumentParser(description="Duplicate tenant script.") +parser.add_argument("initial_tenant", type=str, help="Initial tenant") +parser.add_argument("remote_storage_local_fs_root", type=Path, help="Remote storage local fs root") +parser.add_argument("--ncopies", type=int, help="Number of copies") +parser.add_argument("--numthreads", type=int, default=1, help="Number of threads") +parser.add_argument("--port", type=int, default=9898, help="Pageserver management api port") + +args = parser.parse_args() + +initial_tenant = args.initial_tenant +remote_storage_local_fs_root: Path = args.remote_storage_local_fs_root +ncopies = args.ncopies +numthreads = args.numthreads + +new_tenant = TenantId.generate() +print(f"New tenant: {new_tenant}") + +client = PageserverHttpClient(args.port, lambda: None) + +src_tenant_gen = int(client.tenant_status(initial_tenant)["generation"]) + +assert remote_storage_local_fs_root.is_dir(), f"{remote_storage_local_fs_root} is not a directory" + +src_timelines_dir: Path = remote_storage_local_fs_root / "tenants" / initial_tenant / "timelines" +assert src_timelines_dir.is_dir(), f"{src_timelines_dir} is not a directory" + +dst_timelines_dir: Path = remote_storage_local_fs_root / "tenants" / str(new_tenant) / "timelines" +dst_timelines_dir.parent.mkdir(parents=False, exist_ok=False) +dst_timelines_dir.mkdir(parents=False, exist_ok=False) + +for tl in src_timelines_dir.iterdir(): + src_tl_dir = src_timelines_dir / tl.name + assert src_tl_dir.is_dir(), f"{src_tl_dir} is not a directory" + dst_tl_dir = dst_timelines_dir / tl.name + dst_tl_dir.mkdir(parents=False, exist_ok=False) + for file in tl.iterdir(): + shutil.copy2(file, dst_tl_dir) + if "__" in file.name: + cmd = [ + "./target/debug/pagectl", # TODO: abstract this like the other binaries + "layer", + "rewrite-summary", + str(dst_tl_dir / file.name), + "--new-tenant-id", + str(new_tenant), + ] + subprocess.run(cmd, check=True) + +client.tenant_attach(new_tenant, generation=src_tenant_gen) + +while True: + status = client.tenant_status(new_tenant) + if status["state"]["slug"] == "Active": + break + print("Waiting for tenant to be active..., is: " + status["state"]["slug"]) + time.sleep(1) + +print("Tenant is active: " + str(new_tenant)) diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index eda8813c36..bfd70888e6 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -58,6 +58,7 @@ class HistoricLayerInfo: lsn_start: str lsn_end: Optional[str] remote: bool + remote_path: Optional[str] = None @classmethod def from_json(cls, d: Dict[str, Any]) -> HistoricLayerInfo: @@ -68,6 +69,7 @@ class HistoricLayerInfo: lsn_start=d["lsn_start"], lsn_end=d.get("lsn_end"), remote=d["remote"], + remote_path=d.get("remote_path"), )