WIP: change duplicate_tenant.py script to use the pagectl command

The script works but at restart, we detach the created tenants because
they're not known to the attachment service:

  Detaching tenant, control plane omitted it in re-attach response tenant_id=1e399d390e3aee6b11c701cbc716bb6c

=> figure out how to further integrate this
This commit is contained in:
Christian Schwarz
2023-11-08 20:16:33 +00:00
committed by Christian Schwarz
parent c6d8c7cb0a
commit 9f72d063f3
13 changed files with 109 additions and 48 deletions

View File

@@ -1,17 +1,3 @@
# The binaries are really slow, if you compile them in 'dev' mode with the defaults.
# Enable some optimizations even in 'dev' mode, to make tests faster. The basic
# optimizations enabled by "opt-level=1" don't affect debuggability too much.
#
# See https://www.reddit.com/r/rust/comments/gvrgca/this_is_a_neat_trick_for_getting_good_runtime/
#
[profile.dev.package."*"]
# Set the default for dependencies in Development mode.
opt-level = 3
[profile.dev]
# Turn on a small amount of optimization in Development mode.
opt-level = 1
[build]
# This is only present for local builds, as it will be overridden
# by the RUSTDOCFLAGS env var in CI.

View File

@@ -371,6 +371,8 @@ pub struct TenantInfo {
/// If a layer is present in both local FS and S3, it counts only once.
pub current_physical_size: Option<u64>, // physical size is only included in `tenant_status` endpoint
pub attachment_status: TenantAttachmentStatus,
#[serde(skip_serializing_if = "Option::is_none")]
pub generation: Option<u32>,
}
/// This represents the output of the "timeline_detail" and "timeline_list" API calls.
@@ -515,6 +517,8 @@ pub enum HistoricLayerInfo {
lsn_end: Lsn,
remote: bool,
access_stats: LayerAccessStats,
remote_path: Option<String>,
},
Image {
layer_file_name: String,
@@ -523,6 +527,8 @@ pub enum HistoricLayerInfo {
lsn_start: Lsn,
remote: bool,
access_stats: LayerAccessStats,
remote_path: Option<String>,
},
}
@@ -862,6 +868,7 @@ mod tests {
state: TenantState::Active,
current_physical_size: Some(42),
attachment_status: TenantAttachmentStatus::Attached,
generation: None,
};
let expected_active = json!({
"id": original_active.id.to_string(),
@@ -882,6 +889,7 @@ mod tests {
},
current_physical_size: Some(42),
attachment_status: TenantAttachmentStatus::Attached,
generation: None,
};
let expected_broken = json!({
"id": original_broken.id.to_string(),

View File

@@ -81,6 +81,12 @@ impl std::fmt::Display for RemotePath {
}
}
impl From<RemotePath> for String {
fn from(val: RemotePath) -> Self {
val.0.into()
}
}
impl RemotePath {
pub fn new(relative_path: &Utf8Path) -> anyhow::Result<Self> {
anyhow::ensure!(

View File

@@ -50,7 +50,9 @@ pub(crate) enum LayerCmd {
},
RewriteSummary {
layer_file_path: Utf8PathBuf,
#[clap(long)]
new_tenant_id: Option<TenantId>,
#[clap(long)]
new_timeline_id: Option<TimelineId>,
},
}
@@ -185,6 +187,9 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
new_tenant_id,
new_timeline_id,
} => {
pageserver::virtual_file::init(10);
pageserver::page_cache::init(100);
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
macro_rules! rewrite_closure {

View File

@@ -261,7 +261,7 @@ async fn calculate_synthetic_size_worker(
}
};
for (tenant_id, tenant_state) in tenants {
for (tenant_id, tenant_state, _gen) in tenants {
if tenant_state != TenantState::Active {
continue;
}

View File

@@ -197,7 +197,7 @@ pub(super) async fn collect_all_metrics(
}
};
let tenants = futures::stream::iter(tenants).filter_map(|(id, state)| async move {
let tenants = futures::stream::iter(tenants).filter_map(|(id, state, _)| async move {
if state != TenantState::Active {
None
} else {

View File

@@ -541,7 +541,7 @@ async fn collect_eviction_candidates(
let mut candidates = Vec::new();
for (tenant_id, _state) in &tenants {
for (tenant_id, _state, _gen) in &tenants {
if cancel.is_cancelled() {
return Ok(EvictionCandidates::Cancelled);
}

View File

@@ -760,11 +760,12 @@ async fn tenant_list_handler(
ApiError::ResourceUnavailable("Tenant map is initializing or shutting down".into())
})?
.iter()
.map(|(id, state)| TenantInfo {
.map(|(id, state, gen)| TenantInfo {
id: *id,
state: state.clone(),
current_physical_size: None,
attachment_status: state.attachment_status(),
generation: (*gen).into(),
})
.collect::<Vec<TenantInfo>>();
@@ -793,6 +794,7 @@ async fn tenant_status(
state: state.clone(),
current_physical_size: Some(current_physical_size),
attachment_status: state.attachment_status(),
generation: tenant.generation().into(),
})
}
.instrument(info_span!("tenant_status_handler", %tenant_id))

View File

@@ -1711,6 +1711,10 @@ impl Tenant {
self.current_state() == TenantState::Active
}
pub fn generation(&self) -> Generation {
self.generation
}
/// Changes tenant status to active, unless shutdown was already requested.
///
/// `background_jobs_can_start` is an optional barrier set to a value during pageserver startup

View File

@@ -1294,7 +1294,8 @@ pub(crate) enum TenantMapListError {
///
/// Get list of tenants, for the mgmt API
///
pub(crate) async fn list_tenants() -> Result<Vec<(TenantId, TenantState)>, TenantMapListError> {
pub(crate) async fn list_tenants(
) -> Result<Vec<(TenantId, TenantState, Generation)>, TenantMapListError> {
let tenants = TENANTS.read().unwrap();
let m = match &*tenants {
TenantsMap::Initializing => return Err(TenantMapListError::Initializing),
@@ -1302,7 +1303,9 @@ pub(crate) async fn list_tenants() -> Result<Vec<(TenantId, TenantState)>, Tenan
};
Ok(m.iter()
.filter_map(|(id, tenant)| match tenant {
TenantSlot::Attached(tenant) => Some((*id, tenant.current_state())),
TenantSlot::Attached(tenant) => {
Some((*id, tenant.current_state(), tenant.generation()))
}
TenantSlot::Secondary => None,
TenantSlot::InProgress(_) => None,
})

View File

@@ -3,6 +3,7 @@ use camino::{Utf8Path, Utf8PathBuf};
use pageserver_api::models::{
HistoricLayerInfo, LayerAccessKind, LayerResidenceEventReason, LayerResidenceStatus,
};
use remote_storage::RemotePath;
use std::ops::Range;
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use std::sync::{Arc, Weak};
@@ -304,6 +305,11 @@ impl Layer {
&self.0.path
}
/// This can return None even though it should return Some in some edge cases.
pub(crate) fn remote_path(&self) -> Option<RemotePath> {
self.0.remote_path()
}
pub(crate) fn metadata(&self) -> LayerFileMetadata {
self.0.metadata()
}
@@ -914,6 +920,17 @@ impl LayerInner {
}
}
/// This can return None even though it should return Some in some edge cases.
fn remote_path(&self) -> Option<RemotePath> {
let tl = self.timeline.upgrade()?; // TODO: should distinguish this case, but, accuracy doesn't matter for this field.
Some(crate::tenant::remote_timeline_client::remote_layer_path(
&tl.tenant_id,
&tl.timeline_id,
&self.desc.filename(),
self.generation,
))
}
fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
let layer_file_name = self.desc.filename().file_name();
@@ -933,6 +950,7 @@ impl LayerInner {
lsn_end: lsn_range.end,
remote,
access_stats,
remote_path: self.remote_path().map(|p| p.into()),
}
} else {
let lsn = self.desc.image_layer_lsn();
@@ -943,6 +961,7 @@ impl LayerInner {
lsn_start: lsn,
remote,
access_stats,
remote_path: self.remote_path().map(|p| p.into()),
}
}
}

View File

@@ -1,43 +1,69 @@
# Usage from top of repo:
# poetry run python3 test_runner/duplicate_tenant.py b97965931096047b2d54958756baee7b 10
from queue import Queue
import sys
import threading
# poetry run python3 ./test_runner/duplicate_tenant.py c66e2e233057f7f05563caff664ecb14 .neon/remote_storage_local_fs
from pathlib import Path
import shutil
import subprocess
import time
import requests
from fixtures.pageserver.http import PageserverHttpClient
from fixtures.types import TenantId
import argparse
initial_tenant = sys.argv[1]
ncopies = int(sys.argv[2])
numthreads = int(sys.argv[3])
parser = argparse.ArgumentParser(description="Duplicate tenant script.")
parser.add_argument("initial_tenant", type=str, help="Initial tenant")
parser.add_argument("remote_storage_local_fs_root", type=Path, help="Remote storage local fs root")
parser.add_argument("--ncopies", type=int, help="Number of copies")
parser.add_argument("--numthreads", type=int, default=1, help="Number of threads")
parser.add_argument("--port", type=int, default=9898, help="Pageserver management api port")
args = parser.parse_args()
# class DuckTypedNeonEnv:
# pass
initial_tenant = args.initial_tenant
remote_storage_local_fs_root: Path = args.remote_storage_local_fs_root
ncopies = args.ncopies
numthreads = args.numthreads
new_tenant = TenantId.generate()
print(f"New tenant: {new_tenant}")
# cli = NeonCli(DuckTypedNeonEnv())
client = PageserverHttpClient(args.port, lambda: None)
q = Queue()
for i in range(0, ncopies):
q.put(i)
src_tenant_gen = int(client.tenant_status(initial_tenant)["generation"])
for i in range(0, numthreads):
q.put(None)
assert remote_storage_local_fs_root.is_dir(), f"{remote_storage_local_fs_root} is not a directory"
src_timelines_dir: Path = remote_storage_local_fs_root / "tenants" / initial_tenant / "timelines"
assert src_timelines_dir.is_dir(), f"{src_timelines_dir} is not a directory"
def create():
while True:
if q.get() == None:
break
new_tenant = TenantId.generate()
res = requests.post(
f"http://localhost:9898/v1/tenant/{initial_tenant}/duplicate",
json={"new_tenant_id": str(new_tenant)},
)
res.raise_for_status()
dst_timelines_dir: Path = remote_storage_local_fs_root / "tenants" / str(new_tenant) / "timelines"
dst_timelines_dir.parent.mkdir(parents=False, exist_ok=False)
dst_timelines_dir.mkdir(parents=False, exist_ok=False)
for tl in src_timelines_dir.iterdir():
src_tl_dir = src_timelines_dir / tl.name
assert src_tl_dir.is_dir(), f"{src_tl_dir} is not a directory"
dst_tl_dir = dst_timelines_dir / tl.name
dst_tl_dir.mkdir(parents=False, exist_ok=False)
for file in tl.iterdir():
shutil.copy2(file, dst_tl_dir)
if "__" in file.name:
cmd = [
"./target/debug/pagectl", # TODO: abstract this like the other binaries
"layer",
"rewrite-summary",
str(dst_tl_dir / file.name),
"--new-tenant-id",
str(new_tenant),
]
subprocess.run(cmd, check=True)
for i in range(0, numthreads):
threading.Thread(target=create).start()
client.tenant_attach(new_tenant, generation=src_tenant_gen)
while True:
status = client.tenant_status(new_tenant)
if status["state"]["slug"] == "Active":
break
print("Waiting for tenant to be active..., is: " + status["state"]["slug"])
time.sleep(1)
print("Tenant is active: " + str(new_tenant))

View File

@@ -58,6 +58,7 @@ class HistoricLayerInfo:
lsn_start: str
lsn_end: Optional[str]
remote: bool
remote_path: Optional[str] = None
@classmethod
def from_json(cls, d: Dict[str, Any]) -> HistoricLayerInfo:
@@ -68,6 +69,7 @@ class HistoricLayerInfo:
lsn_start=d["lsn_start"],
lsn_end=d.get("lsn_end"),
remote=d["remote"],
remote_path=d.get("remote_path"),
)