Merge pull request #6158 from neondatabase/releases/2023-12-18

Release 2023-12-18
This commit is contained in:
Shany Pozin
2023-12-18 14:24:34 +02:00
committed by GitHub
93 changed files with 3725 additions and 1445 deletions

78
Cargo.lock generated
View File

@@ -233,7 +233,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.28",
"syn 2.0.32",
]
[[package]]
@@ -244,7 +244,7 @@ checksum = "b9ccdd8f2a161be9bd5c023df56f1b2a0bd1d83872ae53b71a84a12c9bf6e842"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.28",
"syn 2.0.32",
]
[[package]]
@@ -881,7 +881,7 @@ dependencies = [
"regex",
"rustc-hash",
"shlex",
"syn 2.0.28",
"syn 2.0.32",
"which",
]
@@ -1095,7 +1095,7 @@ dependencies = [
"heck",
"proc-macro2",
"quote",
"syn 2.0.28",
"syn 2.0.32",
]
[[package]]
@@ -1245,16 +1245,19 @@ name = "control_plane"
version = "0.1.0"
dependencies = [
"anyhow",
"async-trait",
"camino",
"clap",
"comfy-table",
"compute_api",
"futures",
"git-version",
"hex",
"hyper",
"nix 0.26.2",
"once_cell",
"pageserver_api",
"pageserver_client",
"postgres",
"postgres_backend",
"postgres_connection",
@@ -1268,6 +1271,8 @@ dependencies = [
"tar",
"thiserror",
"tokio",
"tokio-postgres",
"tokio-util",
"toml",
"tracing",
"url",
@@ -1481,7 +1486,7 @@ dependencies = [
"proc-macro2",
"quote",
"strsim",
"syn 2.0.28",
"syn 2.0.32",
]
[[package]]
@@ -1492,7 +1497,7 @@ checksum = "29a358ff9f12ec09c3e61fef9b5a9902623a695a46a917b07f269bff1445611a"
dependencies = [
"darling_core",
"quote",
"syn 2.0.28",
"syn 2.0.32",
]
[[package]]
@@ -1567,7 +1572,7 @@ checksum = "487585f4d0c6655fe74905e2504d8ad6908e4db67f744eb140876906c2f3175d"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.28",
"syn 2.0.32",
]
[[package]]
@@ -1661,7 +1666,7 @@ dependencies = [
"darling",
"proc-macro2",
"quote",
"syn 2.0.28",
"syn 2.0.32",
]
[[package]]
@@ -1915,7 +1920,7 @@ checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.28",
"syn 2.0.32",
]
[[package]]
@@ -2901,7 +2906,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.28",
"syn 2.0.32",
]
[[package]]
@@ -3103,6 +3108,7 @@ dependencies = [
"humantime-serde",
"hyper",
"itertools",
"md5",
"metrics",
"nix 0.26.2",
"num-traits",
@@ -3161,6 +3167,7 @@ dependencies = [
"enum-map",
"hex",
"postgres_ffi",
"rand 0.8.5",
"serde",
"serde_json",
"serde_with",
@@ -3171,6 +3178,19 @@ dependencies = [
"workspace_hack",
]
[[package]]
name = "pageserver_client"
version = "0.1.0"
dependencies = [
"async-trait",
"pageserver_api",
"reqwest",
"serde",
"thiserror",
"utils",
"workspace_hack",
]
[[package]]
name = "parking"
version = "2.1.1"
@@ -3330,7 +3350,7 @@ checksum = "39407670928234ebc5e6e580247dd567ad73a3578460c5990f9503df207e8f07"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.28",
"syn 2.0.32",
]
[[package]]
@@ -3537,7 +3557,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3b69d39aab54d069e7f2fe8cb970493e7834601ca2d8c65fd7bbd183578080d1"
dependencies = [
"proc-macro2",
"syn 2.0.28",
"syn 2.0.32",
]
[[package]]
@@ -4145,7 +4165,7 @@ dependencies = [
"regex",
"relative-path",
"rustc_version",
"syn 2.0.28",
"syn 2.0.32",
"unicode-ident",
]
@@ -4291,6 +4311,7 @@ dependencies = [
"histogram",
"itertools",
"pageserver",
"pageserver_api",
"rand 0.8.5",
"remote_storage",
"reqwest",
@@ -4579,7 +4600,7 @@ checksum = "aafe972d60b0b9bee71a91b92fee2d4fb3c9d7e8f6b179aa99f27203d99a4816"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.28",
"syn 2.0.32",
]
[[package]]
@@ -4660,7 +4681,7 @@ dependencies = [
"darling",
"proc-macro2",
"quote",
"syn 2.0.28",
"syn 2.0.32",
]
[[package]]
@@ -4927,9 +4948,9 @@ dependencies = [
[[package]]
name = "syn"
version = "2.0.28"
version = "2.0.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "04361975b3f5e348b2189d8dc55bc942f278b2d482a6a0365de5bdd62d351567"
checksum = "239814284fd6f1a4ffe4ca893952cdd93c224b6a1571c9a9eadd670295c0c9e2"
dependencies = [
"proc-macro2",
"quote",
@@ -5059,7 +5080,7 @@ checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.28",
"syn 2.0.32",
]
[[package]]
@@ -5177,7 +5198,7 @@ checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.28",
"syn 2.0.32",
]
[[package]]
@@ -5478,7 +5499,7 @@ checksum = "0f57e3ca2a01450b1a921183a9c9cbfda207fd822cef4ccb00a65402cbba7a74"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.28",
"syn 2.0.32",
]
[[package]]
@@ -5764,6 +5785,7 @@ dependencies = [
"serde",
"serde_assert",
"serde_json",
"serde_path_to_error",
"serde_with",
"signal-hook",
"strum",
@@ -5922,7 +5944,7 @@ dependencies = [
"once_cell",
"proc-macro2",
"quote",
"syn 2.0.28",
"syn 2.0.32",
"wasm-bindgen-shared",
]
@@ -5956,7 +5978,7 @@ checksum = "e128beba882dd1eb6200e1dc92ae6c5dbaa4311aa7bb211ca035779e5efc39f8"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.28",
"syn 2.0.32",
"wasm-bindgen-backend",
"wasm-bindgen-shared",
]
@@ -6293,7 +6315,7 @@ dependencies = [
"smallvec",
"subtle",
"syn 1.0.109",
"syn 2.0.28",
"syn 2.0.32",
"time",
"time-macros",
"tokio",
@@ -6355,22 +6377,22 @@ dependencies = [
[[package]]
name = "zerocopy"
version = "0.7.3"
version = "0.7.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a7af71d8643341260a65f89fa60c0eeaa907f34544d8f6d9b0df72f069b5e74"
checksum = "1c4061bedbb353041c12f413700357bec76df2c7e2ca8e4df8bac24c6bf68e3d"
dependencies = [
"zerocopy-derive",
]
[[package]]
name = "zerocopy-derive"
version = "0.7.3"
version = "0.7.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9731702e2f0617ad526794ae28fbc6f6ca8849b5ba729666c2a5bc4b6ddee2cd"
checksum = "b3c129550b3e6de3fd0ba67ba5c81818f9805e58b8d7fee80a3a59d2c9fc601a"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.28",
"syn 2.0.32",
]
[[package]]

View File

@@ -5,6 +5,7 @@ members = [
"control_plane",
"pageserver",
"pageserver/ctl",
"pageserver/client",
"proxy",
"safekeeper",
"storage_broker",
@@ -182,6 +183,7 @@ compute_api = { version = "0.1", path = "./libs/compute_api/" }
consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" }
metrics = { version = "0.1", path = "./libs/metrics/" }
pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
pageserver_client = { path = "./pageserver/client" }
postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" }
postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" }
postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" }

View File

@@ -6,9 +6,11 @@ license.workspace = true
[dependencies]
anyhow.workspace = true
async-trait.workspace = true
camino.workspace = true
clap.workspace = true
comfy-table.workspace = true
futures.workspace = true
git-version.workspace = true
nix.workspace = true
once_cell.workspace = true
@@ -24,10 +26,11 @@ tar.workspace = true
thiserror.workspace = true
toml.workspace = true
tokio.workspace = true
tokio-postgres.workspace = true
tokio-util.workspace = true
url.workspace = true
# Note: Do not directly depend on pageserver or safekeeper; use pageserver_api or safekeeper_api
# instead, so that recompile times are better.
pageserver_api.workspace = true
pageserver_client.workspace = true
postgres_backend.workspace = true
safekeeper_api.workspace = true
postgres_connection.workspace = true

View File

@@ -9,7 +9,7 @@ pub struct AttachmentService {
env: LocalEnv,
listen: String,
path: PathBuf,
client: reqwest::blocking::Client,
client: reqwest::Client,
}
const COMMAND: &str = "attachment_service";
@@ -53,7 +53,7 @@ impl AttachmentService {
env: env.clone(),
path,
listen,
client: reqwest::blocking::ClientBuilder::new()
client: reqwest::ClientBuilder::new()
.build()
.expect("Failed to construct http client"),
}
@@ -64,7 +64,7 @@ impl AttachmentService {
.expect("non-Unicode path")
}
pub fn start(&self) -> anyhow::Result<Child> {
pub async fn start(&self) -> anyhow::Result<Child> {
let path_str = self.path.to_string_lossy();
background_process::start_process(
@@ -73,10 +73,11 @@ impl AttachmentService {
&self.env.attachment_service_bin(),
["-l", &self.listen, "-p", &path_str],
[],
background_process::InitialPidFile::Create(&self.pid_file()),
background_process::InitialPidFile::Create(self.pid_file()),
// TODO: a real status check
|| Ok(true),
|| async move { anyhow::Ok(true) },
)
.await
}
pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
@@ -84,7 +85,7 @@ impl AttachmentService {
}
/// Call into the attach_hook API, for use before handing out attachments to pageservers
pub fn attach_hook(
pub async fn attach_hook(
&self,
tenant_id: TenantId,
pageserver_id: NodeId,
@@ -104,16 +105,16 @@ impl AttachmentService {
node_id: Some(pageserver_id),
};
let response = self.client.post(url).json(&request).send()?;
let response = self.client.post(url).json(&request).send().await?;
if response.status() != StatusCode::OK {
return Err(anyhow!("Unexpected status {}", response.status()));
}
let response = response.json::<AttachHookResponse>()?;
let response = response.json::<AttachHookResponse>().await?;
Ok(response.gen)
}
pub fn inspect(&self, tenant_id: TenantId) -> anyhow::Result<Option<(u32, NodeId)>> {
pub async fn inspect(&self, tenant_id: TenantId) -> anyhow::Result<Option<(u32, NodeId)>> {
use hyper::StatusCode;
let url = self
@@ -126,12 +127,12 @@ impl AttachmentService {
let request = InspectRequest { tenant_id };
let response = self.client.post(url).json(&request).send()?;
let response = self.client.post(url).json(&request).send().await?;
if response.status() != StatusCode::OK {
return Err(anyhow!("Unexpected status {}", response.status()));
}
let response = response.json::<InspectResponse>()?;
let response = response.json::<InspectResponse>().await?;
Ok(response.attachment)
}
}

View File

@@ -44,15 +44,15 @@ const NOTICE_AFTER_RETRIES: u64 = 50;
/// Argument to `start_process`, to indicate whether it should create pidfile or if the process creates
/// it itself.
pub enum InitialPidFile<'t> {
pub enum InitialPidFile {
/// Create a pidfile, to allow future CLI invocations to manipulate the process.
Create(&'t Utf8Path),
Create(Utf8PathBuf),
/// The process will create the pidfile itself, need to wait for that event.
Expect(&'t Utf8Path),
Expect(Utf8PathBuf),
}
/// Start a background child process using the parameters given.
pub fn start_process<F, AI, A, EI>(
pub async fn start_process<F, Fut, AI, A, EI>(
process_name: &str,
datadir: &Path,
command: &Path,
@@ -62,7 +62,8 @@ pub fn start_process<F, AI, A, EI>(
process_status_check: F,
) -> anyhow::Result<Child>
where
F: Fn() -> anyhow::Result<bool>,
F: Fn() -> Fut,
Fut: std::future::Future<Output = anyhow::Result<bool>>,
AI: IntoIterator<Item = A>,
A: AsRef<OsStr>,
// Not generic AsRef<OsStr>, otherwise empty `envs` prevents type inference
@@ -89,7 +90,7 @@ where
let filled_cmd = fill_remote_storage_secrets_vars(fill_rust_env_vars(background_command));
filled_cmd.envs(envs);
let pid_file_to_check = match initial_pid_file {
let pid_file_to_check = match &initial_pid_file {
InitialPidFile::Create(path) => {
pre_exec_create_pidfile(filled_cmd, path);
path
@@ -107,7 +108,7 @@ where
);
for retries in 0..RETRIES {
match process_started(pid, Some(pid_file_to_check), &process_status_check) {
match process_started(pid, pid_file_to_check, &process_status_check).await {
Ok(true) => {
println!("\n{process_name} started, pid: {pid}");
return Ok(spawned_process);
@@ -316,22 +317,20 @@ where
cmd
}
fn process_started<F>(
async fn process_started<F, Fut>(
pid: Pid,
pid_file_to_check: Option<&Utf8Path>,
pid_file_to_check: &Utf8Path,
status_check: &F,
) -> anyhow::Result<bool>
where
F: Fn() -> anyhow::Result<bool>,
F: Fn() -> Fut,
Fut: std::future::Future<Output = anyhow::Result<bool>>,
{
match status_check() {
Ok(true) => match pid_file_to_check {
Some(pid_file_path) => match pid_file::read(pid_file_path)? {
PidFileRead::NotExist => Ok(false),
PidFileRead::LockedByOtherProcess(pid_in_file) => Ok(pid_in_file == pid),
PidFileRead::NotHeldByAnyProcess(_) => Ok(false),
},
None => Ok(true),
match status_check().await {
Ok(true) => match pid_file::read(pid_file_to_check)? {
PidFileRead::NotExist => Ok(false),
PidFileRead::LockedByOtherProcess(pid_in_file) => Ok(pid_in_file == pid),
PidFileRead::NotHeldByAnyProcess(_) => Ok(false),
},
Ok(false) => Ok(false),
Err(e) => anyhow::bail!("process failed to start: {e}"),

View File

@@ -120,15 +120,20 @@ fn main() -> Result<()> {
let mut env = LocalEnv::load_config().context("Error loading config")?;
let original_env = env.clone();
let rt = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()
.unwrap();
let subcommand_result = match sub_name {
"tenant" => handle_tenant(sub_args, &mut env),
"timeline" => handle_timeline(sub_args, &mut env),
"start" => handle_start_all(sub_args, &env),
"tenant" => rt.block_on(handle_tenant(sub_args, &mut env)),
"timeline" => rt.block_on(handle_timeline(sub_args, &mut env)),
"start" => rt.block_on(handle_start_all(sub_args, &env)),
"stop" => handle_stop_all(sub_args, &env),
"pageserver" => handle_pageserver(sub_args, &env),
"attachment_service" => handle_attachment_service(sub_args, &env),
"safekeeper" => handle_safekeeper(sub_args, &env),
"endpoint" => handle_endpoint(sub_args, &env),
"pageserver" => rt.block_on(handle_pageserver(sub_args, &env)),
"attachment_service" => rt.block_on(handle_attachment_service(sub_args, &env)),
"safekeeper" => rt.block_on(handle_safekeeper(sub_args, &env)),
"endpoint" => rt.block_on(handle_endpoint(sub_args, &env)),
"mappings" => handle_mappings(sub_args, &mut env),
"pg" => bail!("'pg' subcommand has been renamed to 'endpoint'"),
_ => bail!("unexpected subcommand {sub_name}"),
@@ -269,12 +274,13 @@ fn print_timeline(
/// Returns a map of timeline IDs to timeline_id@lsn strings.
/// Connects to the pageserver to query this information.
fn get_timeline_infos(
async fn get_timeline_infos(
env: &local_env::LocalEnv,
tenant_id: &TenantId,
) -> Result<HashMap<TimelineId, TimelineInfo>> {
Ok(get_default_pageserver(env)
.timeline_list(tenant_id)?
.timeline_list(tenant_id)
.await?
.into_iter()
.map(|timeline_info| (timeline_info.timeline_id, timeline_info))
.collect())
@@ -373,11 +379,14 @@ fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> {
.collect()
}
fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> anyhow::Result<()> {
async fn handle_tenant(
tenant_match: &ArgMatches,
env: &mut local_env::LocalEnv,
) -> anyhow::Result<()> {
let pageserver = get_default_pageserver(env);
match tenant_match.subcommand() {
Some(("list", _)) => {
for t in pageserver.tenant_list()? {
for t in pageserver.tenant_list().await? {
println!("{} {:?}", t.id, t.state);
}
}
@@ -394,12 +403,16 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
// We must register the tenant with the attachment service, so
// that when the pageserver restarts, it will be re-attached.
let attachment_service = AttachmentService::from_env(env);
attachment_service.attach_hook(tenant_id, pageserver.conf.id)?
attachment_service
.attach_hook(tenant_id, pageserver.conf.id)
.await?
} else {
None
};
pageserver.tenant_create(tenant_id, generation, tenant_conf)?;
pageserver
.tenant_create(tenant_id, generation, tenant_conf)
.await?;
println!("tenant {tenant_id} successfully created on the pageserver");
// Create an initial timeline for the new tenant
@@ -409,14 +422,16 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
.copied()
.context("Failed to parse postgres version from the argument string")?;
let timeline_info = pageserver.timeline_create(
tenant_id,
new_timeline_id,
None,
None,
Some(pg_version),
None,
)?;
let timeline_info = pageserver
.timeline_create(
tenant_id,
new_timeline_id,
None,
None,
Some(pg_version),
None,
)
.await?;
let new_timeline_id = timeline_info.timeline_id;
let last_record_lsn = timeline_info.last_record_lsn;
@@ -450,6 +465,7 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
pageserver
.tenant_config(tenant_id, tenant_conf)
.await
.with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?;
println!("tenant {tenant_id} successfully configured on the pageserver");
}
@@ -458,7 +474,7 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
let new_pageserver = get_pageserver(env, matches)?;
let new_pageserver_id = new_pageserver.conf.id;
migrate_tenant(env, tenant_id, new_pageserver)?;
migrate_tenant(env, tenant_id, new_pageserver).await?;
println!("tenant {tenant_id} migrated to {}", new_pageserver_id);
}
@@ -468,13 +484,13 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
Ok(())
}
fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> {
async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> {
let pageserver = get_default_pageserver(env);
match timeline_match.subcommand() {
Some(("list", list_match)) => {
let tenant_id = get_tenant_id(list_match, env)?;
let timelines = pageserver.timeline_list(&tenant_id)?;
let timelines = pageserver.timeline_list(&tenant_id).await?;
print_timelines_tree(timelines, env.timeline_name_mappings())?;
}
Some(("create", create_match)) => {
@@ -490,14 +506,16 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
let new_timeline_id_opt = parse_timeline_id(create_match)?;
let timeline_info = pageserver.timeline_create(
tenant_id,
new_timeline_id_opt,
None,
None,
Some(pg_version),
None,
)?;
let timeline_info = pageserver
.timeline_create(
tenant_id,
new_timeline_id_opt,
None,
None,
Some(pg_version),
None,
)
.await?;
let new_timeline_id = timeline_info.timeline_id;
let last_record_lsn = timeline_info.last_record_lsn;
@@ -542,7 +560,9 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
let mut cplane = ComputeControlPlane::load(env.clone())?;
println!("Importing timeline into pageserver ...");
pageserver.timeline_import(tenant_id, timeline_id, base, pg_wal, pg_version)?;
pageserver
.timeline_import(tenant_id, timeline_id, base, pg_wal, pg_version)
.await?;
env.register_branch_mapping(name.to_string(), tenant_id, timeline_id)?;
println!("Creating endpoint for imported timeline ...");
@@ -578,14 +598,16 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
.map(|lsn_str| Lsn::from_str(lsn_str))
.transpose()
.context("Failed to parse ancestor start Lsn from the request")?;
let timeline_info = pageserver.timeline_create(
tenant_id,
None,
start_lsn,
Some(ancestor_timeline_id),
None,
None,
)?;
let timeline_info = pageserver
.timeline_create(
tenant_id,
None,
start_lsn,
Some(ancestor_timeline_id),
None,
None,
)
.await?;
let new_timeline_id = timeline_info.timeline_id;
let last_record_lsn = timeline_info.last_record_lsn;
@@ -604,7 +626,7 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
Ok(())
}
fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
let (sub_name, sub_args) = match ep_match.subcommand() {
Some(ep_subcommand_data) => ep_subcommand_data,
None => bail!("no endpoint subcommand provided"),
@@ -614,10 +636,12 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
match sub_name {
"list" => {
let tenant_id = get_tenant_id(sub_args, env)?;
let timeline_infos = get_timeline_infos(env, &tenant_id).unwrap_or_else(|e| {
eprintln!("Failed to load timeline info: {}", e);
HashMap::new()
});
let timeline_infos = get_timeline_infos(env, &tenant_id)
.await
.unwrap_or_else(|e| {
eprintln!("Failed to load timeline info: {}", e);
HashMap::new()
});
let timeline_name_mappings = env.timeline_name_mappings();
@@ -791,7 +815,9 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
};
println!("Starting existing endpoint {endpoint_id}...");
endpoint.start(&auth_token, safekeepers, remote_ext_config)?;
endpoint
.start(&auth_token, safekeepers, remote_ext_config)
.await?;
}
"reconfigure" => {
let endpoint_id = sub_args
@@ -809,7 +835,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
} else {
None
};
endpoint.reconfigure(pageserver_id)?;
endpoint.reconfigure(pageserver_id).await?;
}
"stop" => {
let endpoint_id = sub_args
@@ -875,11 +901,12 @@ fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageSe
))
}
fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
match sub_match.subcommand() {
Some(("start", subcommand_args)) => {
if let Err(e) = get_pageserver(env, subcommand_args)?
.start(&pageserver_config_overrides(subcommand_args))
.await
{
eprintln!("pageserver start failed: {e}");
exit(1);
@@ -906,7 +933,10 @@ fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
exit(1);
}
if let Err(e) = pageserver.start(&pageserver_config_overrides(subcommand_args)) {
if let Err(e) = pageserver
.start(&pageserver_config_overrides(subcommand_args))
.await
{
eprintln!("pageserver start failed: {e}");
exit(1);
}
@@ -920,14 +950,17 @@ fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
exit(1);
}
if let Err(e) = pageserver.start(&pageserver_config_overrides(subcommand_args)) {
if let Err(e) = pageserver
.start(&pageserver_config_overrides(subcommand_args))
.await
{
eprintln!("pageserver start failed: {e}");
exit(1);
}
}
Some(("status", subcommand_args)) => {
match get_pageserver(env, subcommand_args)?.check_status() {
match get_pageserver(env, subcommand_args)?.check_status().await {
Ok(_) => println!("Page server is up and running"),
Err(err) => {
eprintln!("Page server is not available: {}", err);
@@ -942,11 +975,14 @@ fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
Ok(())
}
fn handle_attachment_service(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
async fn handle_attachment_service(
sub_match: &ArgMatches,
env: &local_env::LocalEnv,
) -> Result<()> {
let svc = AttachmentService::from_env(env);
match sub_match.subcommand() {
Some(("start", _start_match)) => {
if let Err(e) = svc.start() {
if let Err(e) = svc.start().await {
eprintln!("start failed: {e}");
exit(1);
}
@@ -987,7 +1023,7 @@ fn safekeeper_extra_opts(init_match: &ArgMatches) -> Vec<String> {
.collect()
}
fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
let (sub_name, sub_args) = match sub_match.subcommand() {
Some(safekeeper_command_data) => safekeeper_command_data,
None => bail!("no safekeeper subcommand provided"),
@@ -1005,7 +1041,7 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
"start" => {
let extra_opts = safekeeper_extra_opts(sub_args);
if let Err(e) = safekeeper.start(extra_opts) {
if let Err(e) = safekeeper.start(extra_opts).await {
eprintln!("safekeeper start failed: {}", e);
exit(1);
}
@@ -1031,7 +1067,7 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
}
let extra_opts = safekeeper_extra_opts(sub_args);
if let Err(e) = safekeeper.start(extra_opts) {
if let Err(e) = safekeeper.start(extra_opts).await {
eprintln!("safekeeper start failed: {}", e);
exit(1);
}
@@ -1044,15 +1080,15 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
Ok(())
}
fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<()> {
async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<()> {
// Endpoints are not started automatically
broker::start_broker_process(env)?;
broker::start_broker_process(env).await?;
// Only start the attachment service if the pageserver is configured to need it
if env.control_plane_api.is_some() {
let attachment_service = AttachmentService::from_env(env);
if let Err(e) = attachment_service.start() {
if let Err(e) = attachment_service.start().await {
eprintln!("attachment_service start failed: {:#}", e);
try_stop_all(env, true);
exit(1);
@@ -1061,7 +1097,10 @@ fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow
for ps_conf in &env.pageservers {
let pageserver = PageServerNode::from_env(env, ps_conf);
if let Err(e) = pageserver.start(&pageserver_config_overrides(sub_match)) {
if let Err(e) = pageserver
.start(&pageserver_config_overrides(sub_match))
.await
{
eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e);
try_stop_all(env, true);
exit(1);
@@ -1070,7 +1109,7 @@ fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow
for node in env.safekeepers.iter() {
let safekeeper = SafekeeperNode::from_env(env, node);
if let Err(e) = safekeeper.start(vec![]) {
if let Err(e) = safekeeper.start(vec![]).await {
eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e);
try_stop_all(env, false);
exit(1);

View File

@@ -11,7 +11,7 @@ use camino::Utf8PathBuf;
use crate::{background_process, local_env};
pub fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
pub async fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
let broker = &env.broker;
let listen_addr = &broker.listen_addr;
@@ -19,15 +19,15 @@ pub fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
let args = [format!("--listen-addr={listen_addr}")];
let client = reqwest::blocking::Client::new();
let client = reqwest::Client::new();
background_process::start_process(
"storage_broker",
&env.base_data_dir,
&env.storage_broker_bin(),
args,
[],
background_process::InitialPidFile::Create(&storage_broker_pid_file_path(env)),
|| {
background_process::InitialPidFile::Create(storage_broker_pid_file_path(env)),
|| async {
let url = broker.client_url();
let status_url = url.join("status").with_context(|| {
format!("Failed to append /status path to broker endpoint {url}")
@@ -36,12 +36,13 @@ pub fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
.get(status_url)
.build()
.with_context(|| format!("Failed to construct request to broker endpoint {url}"))?;
match client.execute(request) {
match client.execute(request).await {
Ok(resp) => Ok(resp.status().is_success()),
Err(_) => Ok(false),
}
},
)
.await
.context("Failed to spawn storage_broker subprocess")?;
Ok(())
}

View File

@@ -464,7 +464,7 @@ impl Endpoint {
}
}
pub fn start(
pub async fn start(
&self,
auth_token: &Option<String>,
safekeepers: Vec<NodeId>,
@@ -587,7 +587,7 @@ impl Endpoint {
const MAX_ATTEMPTS: u32 = 10 * 30; // Wait up to 30 s
loop {
attempt += 1;
match self.get_status() {
match self.get_status().await {
Ok(state) => {
match state.status {
ComputeStatus::Init => {
@@ -629,8 +629,8 @@ impl Endpoint {
}
// Call the /status HTTP API
pub fn get_status(&self) -> Result<ComputeState> {
let client = reqwest::blocking::Client::new();
pub async fn get_status(&self) -> Result<ComputeState> {
let client = reqwest::Client::new();
let response = client
.request(
@@ -641,16 +641,17 @@ impl Endpoint {
self.http_address.port()
),
)
.send()?;
.send()
.await?;
// Interpret the response
let status = response.status();
if !(status.is_client_error() || status.is_server_error()) {
Ok(response.json()?)
Ok(response.json().await?)
} else {
// reqwest does not export its error construction utility functions, so let's craft the message ourselves
let url = response.url().to_owned();
let msg = match response.text() {
let msg = match response.text().await {
Ok(err_body) => format!("Error: {}", err_body),
Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
};
@@ -658,7 +659,7 @@ impl Endpoint {
}
}
pub fn reconfigure(&self, pageserver_id: Option<NodeId>) -> Result<()> {
pub async fn reconfigure(&self, pageserver_id: Option<NodeId>) -> Result<()> {
let mut spec: ComputeSpec = {
let spec_path = self.endpoint_path().join("spec.json");
let file = std::fs::File::open(spec_path)?;
@@ -687,7 +688,7 @@ impl Endpoint {
spec.pageserver_connstring = Some(format!("postgresql://no_user@{host}:{port}"));
}
let client = reqwest::blocking::Client::new();
let client = reqwest::Client::new();
let response = client
.post(format!(
"http://{}:{}/configure",
@@ -698,14 +699,15 @@ impl Endpoint {
"{{\"spec\":{}}}",
serde_json::to_string_pretty(&spec)?
))
.send()?;
.send()
.await?;
let status = response.status();
if !(status.is_client_error() || status.is_server_error()) {
Ok(())
} else {
let url = response.url().to_owned();
let msg = match response.text() {
let msg = match response.text().await {
Ok(err_body) => format!("Error: {}", err_body),
Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
};

View File

@@ -6,28 +6,24 @@
//!
use std::borrow::Cow;
use std::collections::HashMap;
use std::fs::File;
use std::io::{BufReader, Write};
use std::io;
use std::io::Write;
use std::num::NonZeroU64;
use std::path::PathBuf;
use std::process::{Child, Command};
use std::time::Duration;
use std::{io, result};
use anyhow::{bail, Context};
use camino::Utf8PathBuf;
use pageserver_api::models::{
self, LocationConfig, TenantInfo, TenantLocationConfigRequest, TimelineInfo,
};
use futures::SinkExt;
use pageserver_api::models::{self, LocationConfig, TenantInfo, TimelineInfo};
use pageserver_api::shard::TenantShardId;
use pageserver_client::mgmt_api;
use postgres_backend::AuthType;
use postgres_connection::{parse_host_port, PgConnectionConfig};
use reqwest::blocking::{Client, RequestBuilder, Response};
use reqwest::{IntoUrl, Method};
use thiserror::Error;
use utils::auth::{Claims, Scope};
use utils::{
http::error::HttpErrorBody,
id::{TenantId, TimelineId},
lsn::Lsn,
};
@@ -38,45 +34,6 @@ use crate::{background_process, local_env::LocalEnv};
/// Directory within .neon which will be used by default for LocalFs remote storage.
pub const PAGESERVER_REMOTE_STORAGE_DIR: &str = "local_fs_remote_storage/pageserver";
#[derive(Error, Debug)]
pub enum PageserverHttpError {
#[error("Reqwest error: {0}")]
Transport(#[from] reqwest::Error),
#[error("Error: {0}")]
Response(String),
}
impl From<anyhow::Error> for PageserverHttpError {
fn from(e: anyhow::Error) -> Self {
Self::Response(e.to_string())
}
}
type Result<T> = result::Result<T, PageserverHttpError>;
pub trait ResponseErrorMessageExt: Sized {
fn error_from_body(self) -> Result<Self>;
}
impl ResponseErrorMessageExt for Response {
fn error_from_body(self) -> Result<Self> {
let status = self.status();
if !(status.is_client_error() || status.is_server_error()) {
return Ok(self);
}
// reqwest does not export its error construction utility functions, so let's craft the message ourselves
let url = self.url().to_owned();
Err(PageserverHttpError::Response(
match self.json::<HttpErrorBody>() {
Ok(err_body) => format!("Error: {}", err_body.msg),
Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
},
))
}
}
//
// Control routines for pageserver.
//
@@ -87,8 +44,7 @@ pub struct PageServerNode {
pub pg_connection_config: PgConnectionConfig,
pub conf: PageServerConf,
pub env: LocalEnv,
pub http_client: Client,
pub http_base_url: String,
pub http_client: mgmt_api::Client,
}
impl PageServerNode {
@@ -100,8 +56,19 @@ impl PageServerNode {
pg_connection_config: PgConnectionConfig::new_host_port(host, port),
conf: conf.clone(),
env: env.clone(),
http_client: Client::new(),
http_base_url: format!("http://{}/v1", conf.listen_http_addr),
http_client: mgmt_api::Client::new(
format!("http://{}", conf.listen_http_addr),
{
match conf.http_auth_type {
AuthType::Trust => None,
AuthType::NeonJWT => Some(
env.generate_auth_token(&Claims::new(None, Scope::PageServerApi))
.unwrap(),
),
}
}
.as_deref(),
),
}
}
@@ -182,8 +149,8 @@ impl PageServerNode {
.expect("non-Unicode path")
}
pub fn start(&self, config_overrides: &[&str]) -> anyhow::Result<Child> {
self.start_node(config_overrides, false)
pub async fn start(&self, config_overrides: &[&str]) -> anyhow::Result<Child> {
self.start_node(config_overrides, false).await
}
fn pageserver_init(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
@@ -224,7 +191,12 @@ impl PageServerNode {
Ok(())
}
fn start_node(&self, config_overrides: &[&str], update_config: bool) -> anyhow::Result<Child> {
async fn start_node(
&self,
config_overrides: &[&str],
update_config: bool,
) -> anyhow::Result<Child> {
// TODO: using a thread here because start_process() is not async but we need to call check_status()
let datadir = self.repo_path();
print!(
"Starting pageserver node {} at '{}' in {:?}",
@@ -232,7 +204,7 @@ impl PageServerNode {
self.pg_connection_config.raw_address(),
datadir
);
io::stdout().flush()?;
io::stdout().flush().context("flush stdout")?;
let datadir_path_str = datadir.to_str().with_context(|| {
format!(
@@ -244,20 +216,23 @@ impl PageServerNode {
if update_config {
args.push(Cow::Borrowed("--update-config"));
}
background_process::start_process(
"pageserver",
&datadir,
&self.env.pageserver_bin(),
args.iter().map(Cow::as_ref),
self.pageserver_env_variables()?,
background_process::InitialPidFile::Expect(&self.pid_file()),
|| match self.check_status() {
Ok(()) => Ok(true),
Err(PageserverHttpError::Transport(_)) => Ok(false),
Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")),
background_process::InitialPidFile::Expect(self.pid_file()),
|| async {
let st = self.check_status().await;
match st {
Ok(()) => Ok(true),
Err(mgmt_api::Error::ReceiveBody(_)) => Ok(false),
Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")),
}
},
)
.await
}
fn pageserver_basic_args<'a>(
@@ -303,7 +278,12 @@ impl PageServerNode {
background_process::stop_process(immediate, "pageserver", &self.pid_file())
}
pub fn page_server_psql_client(&self) -> anyhow::Result<postgres::Client> {
pub async fn page_server_psql_client(
&self,
) -> anyhow::Result<(
tokio_postgres::Client,
tokio_postgres::Connection<tokio_postgres::Socket, tokio_postgres::tls::NoTlsStream>,
)> {
let mut config = self.pg_connection_config.clone();
if self.conf.pg_auth_type == AuthType::NeonJWT {
let token = self
@@ -311,36 +291,18 @@ impl PageServerNode {
.generate_auth_token(&Claims::new(None, Scope::PageServerApi))?;
config = config.set_password(Some(token));
}
Ok(config.connect_no_tls()?)
Ok(config.connect_no_tls().await?)
}
fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> anyhow::Result<RequestBuilder> {
let mut builder = self.http_client.request(method, url);
if self.conf.http_auth_type == AuthType::NeonJWT {
let token = self
.env
.generate_auth_token(&Claims::new(None, Scope::PageServerApi))?;
builder = builder.bearer_auth(token)
}
Ok(builder)
pub async fn check_status(&self) -> mgmt_api::Result<()> {
self.http_client.status().await
}
pub fn check_status(&self) -> Result<()> {
self.http_request(Method::GET, format!("{}/status", self.http_base_url))?
.send()?
.error_from_body()?;
Ok(())
pub async fn tenant_list(&self) -> mgmt_api::Result<Vec<TenantInfo>> {
self.http_client.list_tenants().await
}
pub fn tenant_list(&self) -> Result<Vec<TenantInfo>> {
Ok(self
.http_request(Method::GET, format!("{}/tenant", self.http_base_url))?
.send()?
.error_from_body()?
.json()?)
}
pub fn tenant_create(
pub async fn tenant_create(
&self,
new_tenant_id: TenantId,
generation: Option<u32>,
@@ -407,6 +369,7 @@ impl PageServerNode {
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'gc_feedback' as bool")?,
heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
};
let request = models::TenantCreateRequest {
@@ -417,23 +380,10 @@ impl PageServerNode {
if !settings.is_empty() {
bail!("Unrecognized tenant settings: {settings:?}")
}
self.http_request(Method::POST, format!("{}/tenant", self.http_base_url))?
.json(&request)
.send()?
.error_from_body()?
.json::<Option<String>>()
.with_context(|| {
format!("Failed to parse tenant creation response for tenant id: {new_tenant_id:?}")
})?
.context("No tenant id was found in the tenant creation response")
.and_then(|tenant_id_string| {
tenant_id_string.parse().with_context(|| {
format!("Failed to parse response string as tenant id: '{tenant_id_string}'")
})
})
Ok(self.http_client.tenant_create(&request).await?)
}
pub fn tenant_config(
pub async fn tenant_config(
&self,
tenant_id: TenantId,
mut settings: HashMap<&str, &str>,
@@ -504,6 +454,7 @@ impl PageServerNode {
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'gc_feedback' as bool")?,
heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
}
};
@@ -511,54 +462,30 @@ impl PageServerNode {
bail!("Unrecognized tenant settings: {settings:?}")
}
self.http_request(Method::PUT, format!("{}/tenant/config", self.http_base_url))?
.json(&models::TenantConfigRequest { tenant_id, config })
.send()?
.error_from_body()?;
self.http_client
.tenant_config(&models::TenantConfigRequest { tenant_id, config })
.await?;
Ok(())
}
pub fn location_config(
pub async fn location_config(
&self,
tenant_id: TenantId,
config: LocationConfig,
flush_ms: Option<Duration>,
) -> anyhow::Result<()> {
let req_body = TenantLocationConfigRequest { tenant_id, config };
let path = format!(
"{}/tenant/{}/location_config",
self.http_base_url, tenant_id
);
let path = if let Some(flush_ms) = flush_ms {
format!("{}?flush_ms={}", path, flush_ms.as_millis())
} else {
path
};
self.http_request(Method::PUT, path)?
.json(&req_body)
.send()?
.error_from_body()?;
Ok(())
Ok(self
.http_client
.location_config(tenant_id, config, flush_ms)
.await?)
}
pub fn timeline_list(&self, tenant_id: &TenantId) -> anyhow::Result<Vec<TimelineInfo>> {
let timeline_infos: Vec<TimelineInfo> = self
.http_request(
Method::GET,
format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id),
)?
.send()?
.error_from_body()?
.json()?;
Ok(timeline_infos)
pub async fn timeline_list(&self, tenant_id: &TenantId) -> anyhow::Result<Vec<TimelineInfo>> {
Ok(self.http_client.list_timelines(*tenant_id).await?)
}
pub fn timeline_create(
pub async fn timeline_create(
&self,
tenant_id: TenantId,
new_timeline_id: Option<TimelineId>,
@@ -569,29 +496,14 @@ impl PageServerNode {
) -> anyhow::Result<TimelineInfo> {
// If timeline ID was not specified, generate one
let new_timeline_id = new_timeline_id.unwrap_or(TimelineId::generate());
self.http_request(
Method::POST,
format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id),
)?
.json(&models::TimelineCreateRequest {
let req = models::TimelineCreateRequest {
new_timeline_id,
ancestor_start_lsn,
ancestor_timeline_id,
pg_version,
existing_initdb_timeline_id,
})
.send()?
.error_from_body()?
.json::<Option<TimelineInfo>>()
.with_context(|| {
format!("Failed to parse timeline creation response for tenant id: {tenant_id}")
})?
.with_context(|| {
format!(
"No timeline id was found in the timeline creation response for tenant {tenant_id}"
)
})
};
Ok(self.http_client.timeline_create(tenant_id, &req).await?)
}
/// Import a basebackup prepared using either:
@@ -603,7 +515,7 @@ impl PageServerNode {
/// * `timeline_id` - id to assign to imported timeline
/// * `base` - (start lsn of basebackup, path to `base.tar` file)
/// * `pg_wal` - if there's any wal to import: (end lsn, path to `pg_wal.tar`)
pub fn timeline_import(
pub async fn timeline_import(
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
@@ -611,36 +523,60 @@ impl PageServerNode {
pg_wal: Option<(Lsn, PathBuf)>,
pg_version: u32,
) -> anyhow::Result<()> {
let mut client = self.page_server_psql_client()?;
let (client, conn) = self.page_server_psql_client().await?;
// The connection object performs the actual communication with the database,
// so spawn it off to run on its own.
tokio::spawn(async move {
if let Err(e) = conn.await {
eprintln!("connection error: {}", e);
}
});
tokio::pin!(client);
// Init base reader
let (start_lsn, base_tarfile_path) = base;
let base_tarfile = File::open(base_tarfile_path)?;
let mut base_reader = BufReader::new(base_tarfile);
let base_tarfile = tokio::fs::File::open(base_tarfile_path).await?;
let base_tarfile = tokio_util::io::ReaderStream::new(base_tarfile);
// Init wal reader if necessary
let (end_lsn, wal_reader) = if let Some((end_lsn, wal_tarfile_path)) = pg_wal {
let wal_tarfile = File::open(wal_tarfile_path)?;
let wal_reader = BufReader::new(wal_tarfile);
let wal_tarfile = tokio::fs::File::open(wal_tarfile_path).await?;
let wal_reader = tokio_util::io::ReaderStream::new(wal_tarfile);
(end_lsn, Some(wal_reader))
} else {
(start_lsn, None)
};
// Import base
let import_cmd = format!(
"import basebackup {tenant_id} {timeline_id} {start_lsn} {end_lsn} {pg_version}"
);
let mut writer = client.copy_in(&import_cmd)?;
io::copy(&mut base_reader, &mut writer)?;
writer.finish()?;
let copy_in = |reader, cmd| {
let client = &client;
async move {
let writer = client.copy_in(&cmd).await?;
let writer = std::pin::pin!(writer);
let mut writer = writer.sink_map_err(|e| {
std::io::Error::new(std::io::ErrorKind::Other, format!("{e}"))
});
let mut reader = std::pin::pin!(reader);
writer.send_all(&mut reader).await?;
writer.into_inner().finish().await?;
anyhow::Ok(())
}
};
// Import base
copy_in(
base_tarfile,
format!(
"import basebackup {tenant_id} {timeline_id} {start_lsn} {end_lsn} {pg_version}"
),
)
.await?;
// Import wal if necessary
if let Some(mut wal_reader) = wal_reader {
let import_cmd = format!("import wal {tenant_id} {timeline_id} {start_lsn} {end_lsn}");
let mut writer = client.copy_in(&import_cmd)?;
io::copy(&mut wal_reader, &mut writer)?;
writer.finish()?;
if let Some(wal_reader) = wal_reader {
copy_in(
wal_reader,
format!("import wal {tenant_id} {timeline_id} {start_lsn} {end_lsn}"),
)
.await?;
}
Ok(())

View File

@@ -13,7 +13,6 @@ use std::{io, result};
use anyhow::Context;
use camino::Utf8PathBuf;
use postgres_connection::PgConnectionConfig;
use reqwest::blocking::{Client, RequestBuilder, Response};
use reqwest::{IntoUrl, Method};
use thiserror::Error;
use utils::{http::error::HttpErrorBody, id::NodeId};
@@ -34,12 +33,14 @@ pub enum SafekeeperHttpError {
type Result<T> = result::Result<T, SafekeeperHttpError>;
#[async_trait::async_trait]
pub trait ResponseErrorMessageExt: Sized {
fn error_from_body(self) -> Result<Self>;
async fn error_from_body(self) -> Result<Self>;
}
impl ResponseErrorMessageExt for Response {
fn error_from_body(self) -> Result<Self> {
#[async_trait::async_trait]
impl ResponseErrorMessageExt for reqwest::Response {
async fn error_from_body(self) -> Result<Self> {
let status = self.status();
if !(status.is_client_error() || status.is_server_error()) {
return Ok(self);
@@ -48,7 +49,7 @@ impl ResponseErrorMessageExt for Response {
// reqwest does not export its error construction utility functions, so let's craft the message ourselves
let url = self.url().to_owned();
Err(SafekeeperHttpError::Response(
match self.json::<HttpErrorBody>() {
match self.json::<HttpErrorBody>().await {
Ok(err_body) => format!("Error: {}", err_body.msg),
Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
},
@@ -69,7 +70,7 @@ pub struct SafekeeperNode {
pub pg_connection_config: PgConnectionConfig,
pub env: LocalEnv,
pub http_client: Client,
pub http_client: reqwest::Client,
pub http_base_url: String,
}
@@ -80,7 +81,7 @@ impl SafekeeperNode {
conf: conf.clone(),
pg_connection_config: Self::safekeeper_connection_config(conf.pg_port),
env: env.clone(),
http_client: Client::new(),
http_client: reqwest::Client::new(),
http_base_url: format!("http://127.0.0.1:{}/v1", conf.http_port),
}
}
@@ -103,7 +104,7 @@ impl SafekeeperNode {
.expect("non-Unicode path")
}
pub fn start(&self, extra_opts: Vec<String>) -> anyhow::Result<Child> {
pub async fn start(&self, extra_opts: Vec<String>) -> anyhow::Result<Child> {
print!(
"Starting safekeeper at '{}' in '{}'",
self.pg_connection_config.raw_address(),
@@ -191,13 +192,16 @@ impl SafekeeperNode {
&self.env.safekeeper_bin(),
&args,
[],
background_process::InitialPidFile::Expect(&self.pid_file()),
|| match self.check_status() {
Ok(()) => Ok(true),
Err(SafekeeperHttpError::Transport(_)) => Ok(false),
Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")),
background_process::InitialPidFile::Expect(self.pid_file()),
|| async {
match self.check_status().await {
Ok(()) => Ok(true),
Err(SafekeeperHttpError::Transport(_)) => Ok(false),
Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")),
}
},
)
.await
}
///
@@ -216,7 +220,7 @@ impl SafekeeperNode {
)
}
fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> RequestBuilder {
fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> reqwest::RequestBuilder {
// TODO: authentication
//if self.env.auth_type == AuthType::NeonJWT {
// builder = builder.bearer_auth(&self.env.safekeeper_auth_token)
@@ -224,10 +228,12 @@ impl SafekeeperNode {
self.http_client.request(method, url)
}
pub fn check_status(&self) -> Result<()> {
pub async fn check_status(&self) -> Result<()> {
self.http_request(Method::GET, format!("{}/{}", self.http_base_url, "status"))
.send()?
.error_from_body()?;
.send()
.await?
.error_from_body()
.await?;
Ok(())
}
}

View File

@@ -19,11 +19,11 @@ use utils::{
};
/// Given an attached pageserver, retrieve the LSN for all timelines
fn get_lsns(
async fn get_lsns(
tenant_id: TenantId,
pageserver: &PageServerNode,
) -> anyhow::Result<HashMap<TimelineId, Lsn>> {
let timelines = pageserver.timeline_list(&tenant_id)?;
let timelines = pageserver.timeline_list(&tenant_id).await?;
Ok(timelines
.into_iter()
.map(|t| (t.timeline_id, t.last_record_lsn))
@@ -32,13 +32,13 @@ fn get_lsns(
/// Wait for the timeline LSNs on `pageserver` to catch up with or overtake
/// `baseline`.
fn await_lsn(
async fn await_lsn(
tenant_id: TenantId,
pageserver: &PageServerNode,
baseline: HashMap<TimelineId, Lsn>,
) -> anyhow::Result<()> {
loop {
let latest = match get_lsns(tenant_id, pageserver) {
let latest = match get_lsns(tenant_id, pageserver).await {
Ok(l) => l,
Err(e) => {
println!(
@@ -84,7 +84,7 @@ fn await_lsn(
/// - Coordinate attach/secondary/detach on pageservers
/// - call into attachment_service for generations
/// - reconfigure compute endpoints to point to new attached pageserver
pub fn migrate_tenant(
pub async fn migrate_tenant(
env: &LocalEnv,
tenant_id: TenantId,
dest_ps: PageServerNode,
@@ -108,16 +108,18 @@ pub fn migrate_tenant(
}
}
let previous = attachment_service.inspect(tenant_id)?;
let previous = attachment_service.inspect(tenant_id).await?;
let mut baseline_lsns = None;
if let Some((generation, origin_ps_id)) = &previous {
let origin_ps = PageServerNode::from_env(env, env.get_pageserver_conf(*origin_ps_id)?);
if origin_ps_id == &dest_ps.conf.id {
println!("🔁 Already attached to {origin_ps_id}, freshening...");
let gen = attachment_service.attach_hook(tenant_id, dest_ps.conf.id)?;
let gen = attachment_service
.attach_hook(tenant_id, dest_ps.conf.id)
.await?;
let dest_conf = build_location_config(LocationConfigMode::AttachedSingle, gen, None);
dest_ps.location_config(tenant_id, dest_conf, None)?;
dest_ps.location_config(tenant_id, dest_conf, None).await?;
println!("✅ Migration complete");
return Ok(());
}
@@ -126,20 +128,24 @@ pub fn migrate_tenant(
let stale_conf =
build_location_config(LocationConfigMode::AttachedStale, Some(*generation), None);
origin_ps.location_config(tenant_id, stale_conf, Some(Duration::from_secs(10)))?;
origin_ps
.location_config(tenant_id, stale_conf, Some(Duration::from_secs(10)))
.await?;
baseline_lsns = Some(get_lsns(tenant_id, &origin_ps)?);
baseline_lsns = Some(get_lsns(tenant_id, &origin_ps).await?);
}
let gen = attachment_service.attach_hook(tenant_id, dest_ps.conf.id)?;
let gen = attachment_service
.attach_hook(tenant_id, dest_ps.conf.id)
.await?;
let dest_conf = build_location_config(LocationConfigMode::AttachedMulti, gen, None);
println!("🔁 Attaching to pageserver {}", dest_ps.conf.id);
dest_ps.location_config(tenant_id, dest_conf, None)?;
dest_ps.location_config(tenant_id, dest_conf, None).await?;
if let Some(baseline) = baseline_lsns {
println!("🕑 Waiting for LSN to catch up...");
await_lsn(tenant_id, &dest_ps, baseline)?;
await_lsn(tenant_id, &dest_ps, baseline).await?;
}
let cplane = ComputeControlPlane::load(env.clone())?;
@@ -149,7 +155,7 @@ pub fn migrate_tenant(
"🔁 Reconfiguring endpoint {} to use pageserver {}",
endpoint_name, dest_ps.conf.id
);
endpoint.reconfigure(Some(dest_ps.conf.id))?;
endpoint.reconfigure(Some(dest_ps.conf.id)).await?;
}
}
@@ -159,7 +165,7 @@ pub fn migrate_tenant(
}
let other_ps = PageServerNode::from_env(env, other_ps_conf);
let other_ps_tenants = other_ps.tenant_list()?;
let other_ps_tenants = other_ps.tenant_list().await?;
// Check if this tenant is attached
let found = other_ps_tenants
@@ -181,7 +187,9 @@ pub fn migrate_tenant(
"💤 Switching to secondary mode on pageserver {}",
other_ps.conf.id
);
other_ps.location_config(tenant_id, secondary_conf, None)?;
other_ps
.location_config(tenant_id, secondary_conf, None)
.await?;
}
println!(
@@ -189,7 +197,7 @@ pub fn migrate_tenant(
dest_ps.conf.id
);
let dest_conf = build_location_config(LocationConfigMode::AttachedSingle, gen, None);
dest_ps.location_config(tenant_id, dest_conf, None)?;
dest_ps.location_config(tenant_id, dest_conf, None).await?;
println!("✅ Migration complete");

View File

@@ -3,8 +3,11 @@
//! Otherwise, we might not see all metrics registered via
//! a default registry.
#![deny(clippy::undocumented_unsafe_blocks)]
use once_cell::sync::Lazy;
use prometheus::core::{AtomicU64, Collector, GenericGauge, GenericGaugeVec};
use prometheus::core::{
Atomic, AtomicU64, Collector, GenericCounter, GenericCounterVec, GenericGauge, GenericGaugeVec,
};
pub use prometheus::opts;
pub use prometheus::register;
pub use prometheus::Error;
@@ -132,3 +135,137 @@ fn get_rusage_stats() -> libc::rusage {
rusage.assume_init()
}
}
/// Create an [`IntCounterPairVec`] and registers to default registry.
#[macro_export(local_inner_macros)]
macro_rules! register_int_counter_pair_vec {
($NAME1:expr, $HELP1:expr, $NAME2:expr, $HELP2:expr, $LABELS_NAMES:expr $(,)?) => {{
match (
$crate::register_int_counter_vec!($NAME1, $HELP1, $LABELS_NAMES),
$crate::register_int_counter_vec!($NAME2, $HELP2, $LABELS_NAMES),
) {
(Ok(inc), Ok(dec)) => Ok($crate::IntCounterPairVec::new(inc, dec)),
(Err(e), _) | (_, Err(e)) => Err(e),
}
}};
}
/// Create an [`IntCounterPair`] and registers to default registry.
#[macro_export(local_inner_macros)]
macro_rules! register_int_counter_pair {
($NAME1:expr, $HELP1:expr, $NAME2:expr, $HELP2:expr $(,)?) => {{
match (
$crate::register_int_counter!($NAME1, $HELP1),
$crate::register_int_counter!($NAME2, $HELP2),
) {
(Ok(inc), Ok(dec)) => Ok($crate::IntCounterPair::new(inc, dec)),
(Err(e), _) | (_, Err(e)) => Err(e),
}
}};
}
/// A Pair of [`GenericCounterVec`]s. Like an [`GenericGaugeVec`] but will always observe changes
pub struct GenericCounterPairVec<P: Atomic> {
inc: GenericCounterVec<P>,
dec: GenericCounterVec<P>,
}
/// A Pair of [`GenericCounter`]s. Like an [`GenericGauge`] but will always observe changes
pub struct GenericCounterPair<P: Atomic> {
inc: GenericCounter<P>,
dec: GenericCounter<P>,
}
impl<P: Atomic> GenericCounterPairVec<P> {
pub fn new(inc: GenericCounterVec<P>, dec: GenericCounterVec<P>) -> Self {
Self { inc, dec }
}
/// `get_metric_with_label_values` returns the [`GenericCounterPair<P>`] for the given slice
/// of label values (same order as the VariableLabels in Desc). If that combination of
/// label values is accessed for the first time, a new [`GenericCounterPair<P>`] is created.
///
/// An error is returned if the number of label values is not the same as the
/// number of VariableLabels in Desc.
pub fn get_metric_with_label_values(&self, vals: &[&str]) -> Result<GenericCounterPair<P>> {
Ok(GenericCounterPair {
inc: self.inc.get_metric_with_label_values(vals)?,
dec: self.dec.get_metric_with_label_values(vals)?,
})
}
/// `with_label_values` works as `get_metric_with_label_values`, but panics if an error
/// occurs.
pub fn with_label_values(&self, vals: &[&str]) -> GenericCounterPair<P> {
self.get_metric_with_label_values(vals).unwrap()
}
}
impl<P: Atomic> GenericCounterPair<P> {
pub fn new(inc: GenericCounter<P>, dec: GenericCounter<P>) -> Self {
Self { inc, dec }
}
/// Increment the gauge by 1, returning a guard that decrements by 1 on drop.
pub fn guard(&self) -> GenericCounterPairGuard<P> {
self.inc.inc();
GenericCounterPairGuard(self.dec.clone())
}
/// Increment the gauge by n, returning a guard that decrements by n on drop.
pub fn guard_by(&self, n: P::T) -> GenericCounterPairGuardBy<P> {
self.inc.inc_by(n);
GenericCounterPairGuardBy(self.dec.clone(), n)
}
/// Increase the gauge by 1.
#[inline]
pub fn inc(&self) {
self.inc.inc();
}
/// Decrease the gauge by 1.
#[inline]
pub fn dec(&self) {
self.dec.inc();
}
/// Add the given value to the gauge. (The value can be
/// negative, resulting in a decrement of the gauge.)
#[inline]
pub fn inc_by(&self, v: P::T) {
self.inc.inc_by(v);
}
/// Subtract the given value from the gauge. (The value can be
/// negative, resulting in an increment of the gauge.)
#[inline]
pub fn dec_by(&self, v: P::T) {
self.dec.inc_by(v);
}
}
/// Guard returned by [`GenericCounterPair::guard`]
pub struct GenericCounterPairGuard<P: Atomic>(GenericCounter<P>);
impl<P: Atomic> Drop for GenericCounterPairGuard<P> {
fn drop(&mut self) {
self.0.inc();
}
}
/// Guard returned by [`GenericCounterPair::guard_by`]
pub struct GenericCounterPairGuardBy<P: Atomic>(GenericCounter<P>, P::T);
impl<P: Atomic> Drop for GenericCounterPairGuardBy<P> {
fn drop(&mut self) {
self.0.inc_by(self.1);
}
}
/// A Pair of [`IntCounterVec`]s. Like an [`IntGaugeVec`] but will always observe changes
pub type IntCounterPairVec = GenericCounterPairVec<AtomicU64>;
/// A Pair of [`IntCounter`]s. Like an [`IntGauge`] but will always observe changes
pub type IntCounterPair = GenericCounterPair<AtomicU64>;
/// A guard for [`IntCounterPair`] that will decrement the gauge on drop
pub type IntCounterPairGuard = GenericCounterPairGuard<AtomicU64>;

View File

@@ -24,3 +24,4 @@ workspace_hack.workspace = true
[dev-dependencies]
bincode.workspace = true
rand.workspace = true

View File

@@ -144,3 +144,37 @@ impl Key {
pub fn is_rel_block_key(key: &Key) -> bool {
key.field1 == 0x00 && key.field4 != 0
}
impl std::str::FromStr for Key {
type Err = anyhow::Error;
fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
Self::from_hex(s)
}
}
#[cfg(test)]
mod tests {
use std::str::FromStr;
use crate::key::Key;
use rand::Rng;
use rand::SeedableRng;
#[test]
fn display_fromstr_bijection() {
let mut rng = rand::rngs::StdRng::seed_from_u64(42);
let key = Key {
field1: rng.gen(),
field2: rng.gen(),
field3: rng.gen(),
field4: rng.gen(),
field5: rng.gen(),
field6: rng.gen(),
};
assert_eq!(key, Key::from_str(&format!("{key}")).unwrap());
}
}

View File

@@ -1,11 +1,12 @@
use crate::repository::{key_range_size, singleton_range, Key};
use postgres_ffi::BLCKSZ;
use std::ops::Range;
use crate::key::Key;
///
/// Represents a set of Keys, in a compact form.
///
#[derive(Clone, Debug, Default)]
#[derive(Clone, Debug, Default, PartialEq, Eq)]
pub struct KeySpace {
/// Contiguous ranges of keys that belong to the key space. In key order,
/// and with no overlap.
@@ -186,6 +187,33 @@ impl KeySpaceRandomAccum {
}
}
pub fn key_range_size(key_range: &Range<Key>) -> u32 {
let start = key_range.start;
let end = key_range.end;
if end.field1 != start.field1
|| end.field2 != start.field2
|| end.field3 != start.field3
|| end.field4 != start.field4
{
return u32::MAX;
}
let start = (start.field5 as u64) << 32 | start.field6 as u64;
let end = (end.field5 as u64) << 32 | end.field6 as u64;
let diff = end - start;
if diff > u32::MAX as u64 {
u32::MAX
} else {
diff as u32
}
}
pub fn singleton_range(key: Key) -> Range<Key> {
key..key.next()
}
#[cfg(test)]
mod tests {
use super::*;

View File

@@ -5,6 +5,7 @@ use const_format::formatcp;
/// Public API types
pub mod control_api;
pub mod key;
pub mod keyspace;
pub mod models;
pub mod reltag;
pub mod shard;

View File

@@ -1,3 +1,5 @@
pub mod partitioning;
use std::{
collections::HashMap,
num::{NonZeroU64, NonZeroUsize},
@@ -237,6 +239,7 @@ pub struct TenantConfig {
pub min_resident_size_override: Option<u64>,
pub evictions_low_residence_duration_metric_threshold: Option<String>,
pub gc_feedback: Option<bool>,
pub heatmap_period: Option<String>,
}
/// A flattened analog of a `pagesever::tenant::LocationMode`, which

View File

@@ -0,0 +1,151 @@
use utils::lsn::Lsn;
#[derive(Debug, PartialEq, Eq)]
pub struct Partitioning {
pub keys: crate::keyspace::KeySpace,
pub at_lsn: Lsn,
}
impl serde::Serialize for Partitioning {
fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
pub struct KeySpace<'a>(&'a crate::keyspace::KeySpace);
impl<'a> serde::Serialize for KeySpace<'a> {
fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
use serde::ser::SerializeSeq;
let mut seq = serializer.serialize_seq(Some(self.0.ranges.len()))?;
for kr in &self.0.ranges {
seq.serialize_element(&KeyRange(kr))?;
}
seq.end()
}
}
use serde::ser::SerializeMap;
let mut map = serializer.serialize_map(Some(2))?;
map.serialize_key("keys")?;
map.serialize_value(&KeySpace(&self.keys))?;
map.serialize_key("at_lsn")?;
map.serialize_value(&WithDisplay(&self.at_lsn))?;
map.end()
}
}
pub struct WithDisplay<'a, T>(&'a T);
impl<'a, T: std::fmt::Display> serde::Serialize for WithDisplay<'a, T> {
fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
serializer.collect_str(&self.0)
}
}
pub struct KeyRange<'a>(&'a std::ops::Range<crate::key::Key>);
impl<'a> serde::Serialize for KeyRange<'a> {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
use serde::ser::SerializeTuple;
let mut t = serializer.serialize_tuple(2)?;
t.serialize_element(&WithDisplay(&self.0.start))?;
t.serialize_element(&WithDisplay(&self.0.end))?;
t.end()
}
}
impl<'a> serde::Deserialize<'a> for Partitioning {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'a>,
{
pub struct KeySpace(crate::keyspace::KeySpace);
impl<'de> serde::Deserialize<'de> for KeySpace {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
#[serde_with::serde_as]
#[derive(serde::Deserialize)]
#[serde(transparent)]
struct Key(#[serde_as(as = "serde_with::DisplayFromStr")] crate::key::Key);
#[serde_with::serde_as]
#[derive(serde::Deserialize)]
struct Range(Key, Key);
let ranges: Vec<Range> = serde::Deserialize::deserialize(deserializer)?;
Ok(Self(crate::keyspace::KeySpace {
ranges: ranges
.into_iter()
.map(|Range(start, end)| (start.0..end.0))
.collect(),
}))
}
}
#[serde_with::serde_as]
#[derive(serde::Deserialize)]
struct De {
keys: KeySpace,
#[serde_as(as = "serde_with::DisplayFromStr")]
at_lsn: Lsn,
}
let de: De = serde::Deserialize::deserialize(deserializer)?;
Ok(Self {
at_lsn: de.at_lsn,
keys: de.keys.0,
})
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_serialization_roundtrip() {
let reference = r#"
{
"keys": [
[
"000000000000000000000000000000000000",
"000000000000000000000000000000000001"
],
[
"000000067F00000001000000000000000000",
"000000067F00000001000000000000000002"
],
[
"030000000000000000000000000000000000",
"030000000000000000000000000000000003"
]
],
"at_lsn": "0/2240160"
}
"#;
let de: Partitioning = serde_json::from_str(reference).unwrap();
let ser = serde_json::to_string(&de).unwrap();
let ser_de: serde_json::Value = serde_json::from_str(&ser).unwrap();
assert_eq!(
ser_de,
serde_json::from_str::<'_, serde_json::Value>(reference).unwrap()
);
}
}

View File

@@ -163,8 +163,18 @@ impl PgConnectionConfig {
}
/// Connect using postgres protocol with TLS disabled.
pub fn connect_no_tls(&self) -> Result<postgres::Client, postgres::Error> {
postgres::Config::from(self.to_tokio_postgres_config()).connect(postgres::NoTls)
pub async fn connect_no_tls(
&self,
) -> Result<
(
tokio_postgres::Client,
tokio_postgres::Connection<tokio_postgres::Socket, tokio_postgres::tls::NoTlsStream>,
),
postgres::Error,
> {
self.to_tokio_postgres_config()
.connect(postgres::NoTls)
.await
}
}

View File

@@ -50,6 +50,8 @@ const_format.workspace = true
# why is it only here? no other crate should use it, streams are rarely needed.
tokio-stream = { version = "0.1.14" }
serde_path_to_error.workspace = true
[dev-dependencies]
byteorder.workspace = true
bytes.workspace = true

View File

@@ -25,8 +25,12 @@ pub async fn json_request_or_empty_body<T: for<'de> Deserialize<'de>>(
if body.remaining() == 0 {
return Ok(None);
}
serde_json::from_reader(body.reader())
.context("Failed to parse json request")
let mut deser = serde_json::de::Deserializer::from_reader(body.reader());
serde_path_to_error::deserialize(&mut deser)
// intentionally stringify because the debug version is not helpful in python logs
.map_err(|e| anyhow::anyhow!("Failed to parse json request: {e}"))
.map(Some)
.map_err(ApiError::BadRequest)
}

View File

@@ -1,6 +1,7 @@
use std::str::FromStr;
use anyhow::Context;
use metrics::{IntCounter, IntCounterVec};
use once_cell::sync::Lazy;
use strum_macros::{EnumString, EnumVariantNames};
@@ -24,16 +25,48 @@ impl LogFormat {
}
}
static TRACING_EVENT_COUNT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
metrics::register_int_counter_vec!(
struct TracingEventCountMetric {
error: IntCounter,
warn: IntCounter,
info: IntCounter,
debug: IntCounter,
trace: IntCounter,
}
static TRACING_EVENT_COUNT_METRIC: Lazy<TracingEventCountMetric> = Lazy::new(|| {
let vec = metrics::register_int_counter_vec!(
"libmetrics_tracing_event_count",
"Number of tracing events, by level",
&["level"]
)
.expect("failed to define metric")
.expect("failed to define metric");
TracingEventCountMetric::new(vec)
});
struct TracingEventCountLayer(&'static metrics::IntCounterVec);
impl TracingEventCountMetric {
fn new(vec: IntCounterVec) -> Self {
Self {
error: vec.with_label_values(&["error"]),
warn: vec.with_label_values(&["warn"]),
info: vec.with_label_values(&["info"]),
debug: vec.with_label_values(&["debug"]),
trace: vec.with_label_values(&["trace"]),
}
}
fn inc_for_level(&self, level: tracing::Level) {
let counter = match level {
tracing::Level::ERROR => &self.error,
tracing::Level::WARN => &self.warn,
tracing::Level::INFO => &self.info,
tracing::Level::DEBUG => &self.debug,
tracing::Level::TRACE => &self.trace,
};
counter.inc();
}
}
struct TracingEventCountLayer(&'static TracingEventCountMetric);
impl<S> tracing_subscriber::layer::Layer<S> for TracingEventCountLayer
where
@@ -44,15 +77,7 @@ where
event: &tracing::Event<'_>,
_ctx: tracing_subscriber::layer::Context<'_, S>,
) {
let level = event.metadata().level();
let level = match *level {
tracing::Level::ERROR => "error",
tracing::Level::WARN => "warn",
tracing::Level::INFO => "info",
tracing::Level::DEBUG => "debug",
tracing::Level::TRACE => "trace",
};
self.0.with_label_values(&[level]).inc();
self.0.inc_for_level(*event.metadata().level());
}
}
@@ -106,7 +131,9 @@ pub fn init(
};
log_layer.with_filter(rust_log_env_filter())
});
let r = r.with(TracingEventCountLayer(&TRACING_EVENT_COUNT).with_filter(rust_log_env_filter()));
let r = r.with(
TracingEventCountLayer(&TRACING_EVENT_COUNT_METRIC).with_filter(rust_log_env_filter()),
);
match tracing_error_layer_enablement {
TracingErrorLayerEnablement::EnableWithRustLogFilter => r
.with(tracing_error::ErrorLayer::default().with_filter(rust_log_env_filter()))
@@ -257,14 +284,14 @@ impl std::fmt::Debug for SecretString {
mod tests {
use metrics::{core::Opts, IntCounterVec};
use super::TracingEventCountLayer;
use crate::logging::{TracingEventCountLayer, TracingEventCountMetric};
#[test]
fn tracing_event_count_metric() {
let counter_vec =
IntCounterVec::new(Opts::new("testmetric", "testhelp"), &["level"]).unwrap();
let counter_vec = Box::leak(Box::new(counter_vec)); // make it 'static
let layer = TracingEventCountLayer(counter_vec);
let metric = Box::leak(Box::new(TracingEventCountMetric::new(counter_vec.clone())));
let layer = TracingEventCountLayer(metric);
use tracing_subscriber::prelude::*;
tracing::subscriber::with_default(tracing_subscriber::registry().with(layer), || {

View File

@@ -2,8 +2,11 @@ use std::time::Duration;
use tokio_util::sync::CancellationToken;
#[derive(thiserror::Error, Debug)]
pub enum TimeoutCancellableError {
#[error("Timed out")]
Timeout,
#[error("Cancelled")]
Cancelled,
}

View File

@@ -1,3 +1,6 @@
//! Links with walproposer, pgcommon, pgport and runs bindgen on walproposer.h
//! to generate Rust bindings for it.
use std::{env, path::PathBuf, process::Command};
use anyhow::{anyhow, Context};

View File

@@ -1,3 +1,6 @@
//! A C-Rust shim: defines implementation of C walproposer API, assuming wp
//! callback_data stores Box to some Rust implementation.
#![allow(dead_code)]
use std::ffi::CStr;

View File

@@ -436,9 +436,9 @@ mod tests {
event_mask: 0,
}),
expected_messages: vec![
// Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160000, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
// Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160001, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
vec![
103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158, 76, 143, 54, 6, 60, 108, 110,
147, 188, 32, 214, 90, 130, 15, 61, 158, 76, 143, 54, 6, 60, 108, 110, 147,
188, 32, 214, 90, 130, 15, 61, 1, 0, 0, 0, 0, 0, 0, 1,
@@ -478,7 +478,7 @@ mod tests {
// walproposer will panic when it finishes sync_safekeepers
std::panic::catch_unwind(|| wp.start()).unwrap_err();
// validate the resulting LSN
assert_eq!(receiver.recv()?, 1337);
assert_eq!(receiver.try_recv(), Ok(1337));
Ok(())
// drop() will free up resources here
}

View File

@@ -36,6 +36,7 @@ humantime.workspace = true
humantime-serde.workspace = true
hyper.workspace = true
itertools.workspace = true
md5.workspace = true
nix.workspace = true
# hack to get the number of worker threads tokio uses
num_cpus = { version = "1.15" }

View File

@@ -0,0 +1,14 @@
[package]
name = "pageserver_client"
version = "0.1.0"
edition.workspace = true
license.workspace = true
[dependencies]
pageserver_api.workspace = true
thiserror.workspace = true
async-trait.workspace = true
reqwest.workspace = true
utils.workspace = true
serde.workspace = true
workspace_hack = { version = "0.1", path = "../../workspace_hack" }

View File

@@ -0,0 +1 @@
pub mod mgmt_api;

View File

@@ -0,0 +1,188 @@
use pageserver_api::models::*;
use reqwest::{IntoUrl, Method};
use utils::{
http::error::HttpErrorBody,
id::{TenantId, TimelineId},
};
#[derive(Debug)]
pub struct Client {
mgmt_api_endpoint: String,
authorization_header: Option<String>,
client: reqwest::Client,
}
#[derive(thiserror::Error, Debug)]
pub enum Error {
#[error("receive body: {0}")]
ReceiveBody(reqwest::Error),
#[error("receive error body: {0}")]
ReceiveErrorBody(String),
#[error("pageserver API: {0}")]
ApiError(String),
}
pub type Result<T> = std::result::Result<T, Error>;
#[async_trait::async_trait]
pub trait ResponseErrorMessageExt: Sized {
async fn error_from_body(self) -> Result<Self>;
}
#[async_trait::async_trait]
impl ResponseErrorMessageExt for reqwest::Response {
async fn error_from_body(mut self) -> Result<Self> {
let status = self.status();
if !(status.is_client_error() || status.is_server_error()) {
return Ok(self);
}
let url = self.url().to_owned();
Err(match self.json::<HttpErrorBody>().await {
Ok(HttpErrorBody { msg }) => Error::ApiError(msg),
Err(_) => {
Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), url))
}
})
}
}
impl Client {
pub fn new(mgmt_api_endpoint: String, jwt: Option<&str>) -> Self {
Self {
mgmt_api_endpoint,
authorization_header: jwt.map(|jwt| format!("Bearer {jwt}")),
client: reqwest::Client::new(),
}
}
pub async fn list_tenants(&self) -> Result<Vec<pageserver_api::models::TenantInfo>> {
let uri = format!("{}/v1/tenant", self.mgmt_api_endpoint);
let resp = self.get(&uri).await?;
resp.json().await.map_err(Error::ReceiveBody)
}
pub async fn list_timelines(
&self,
tenant_id: TenantId,
) -> Result<Vec<pageserver_api::models::TimelineInfo>> {
let uri = format!("{}/v1/tenant/{tenant_id}/timeline", self.mgmt_api_endpoint);
self.get(&uri)
.await?
.json()
.await
.map_err(Error::ReceiveBody)
}
pub async fn timeline_info(
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> Result<pageserver_api::models::TimelineInfo> {
let uri = format!(
"{}/v1/tenant/{tenant_id}/timeline/{timeline_id}",
self.mgmt_api_endpoint
);
self.get(&uri)
.await?
.json()
.await
.map_err(Error::ReceiveBody)
}
pub async fn keyspace(
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> Result<pageserver_api::models::partitioning::Partitioning> {
let uri = format!(
"{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/keyspace",
self.mgmt_api_endpoint
);
self.get(&uri)
.await?
.json()
.await
.map_err(Error::ReceiveBody)
}
async fn get<U: IntoUrl>(&self, uri: U) -> Result<reqwest::Response> {
self.request(Method::GET, uri, ()).await
}
async fn request<B: serde::Serialize, U: reqwest::IntoUrl>(
&self,
method: Method,
uri: U,
body: B,
) -> Result<reqwest::Response> {
let req = self.client.request(method, uri);
let req = if let Some(value) = &self.authorization_header {
req.header(reqwest::header::AUTHORIZATION, value)
} else {
req
};
let res = req.json(&body).send().await.map_err(Error::ReceiveBody)?;
let response = res.error_from_body().await?;
Ok(response)
}
pub async fn status(&self) -> Result<()> {
let uri = format!("{}/v1/status", self.mgmt_api_endpoint);
self.get(&uri).await?;
Ok(())
}
pub async fn tenant_create(&self, req: &TenantCreateRequest) -> Result<TenantId> {
let uri = format!("{}/v1/tenant", self.mgmt_api_endpoint);
self.request(Method::POST, &uri, req)
.await?
.json()
.await
.map_err(Error::ReceiveBody)
}
pub async fn tenant_config(&self, req: &TenantConfigRequest) -> Result<()> {
let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint);
self.request(Method::PUT, &uri, req).await?;
Ok(())
}
pub async fn location_config(
&self,
tenant_id: TenantId,
config: LocationConfig,
flush_ms: Option<std::time::Duration>,
) -> Result<()> {
let req_body = TenantLocationConfigRequest { tenant_id, config };
let path = format!(
"{}/v1/tenant/{}/location_config",
self.mgmt_api_endpoint, tenant_id
);
let path = if let Some(flush_ms) = flush_ms {
format!("{}?flush_ms={}", path, flush_ms.as_millis())
} else {
path
};
self.request(Method::PUT, &path, &req_body).await?;
Ok(())
}
pub async fn timeline_create(
&self,
tenant_id: TenantId,
req: &TimelineCreateRequest,
) -> Result<TimelineInfo> {
let uri = format!(
"{}/v1/tenant/{}/timeline",
self.mgmt_api_endpoint, tenant_id
);
self.request(Method::POST, &uri, req)
.await?
.json()
.await
.map_err(Error::ReceiveBody)
}
}

View File

@@ -14,7 +14,7 @@ use pageserver::control_plane_client::ControlPlaneClient;
use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
use pageserver::task_mgr::WALRECEIVER_RUNTIME;
use pageserver::tenant::TenantSharedResources;
use pageserver::tenant::{secondary, TenantSharedResources};
use remote_storage::GenericRemoteStorage;
use tokio::time::Instant;
use tracing::*;
@@ -504,6 +504,17 @@ fn start_pageserver(
}
});
let secondary_controller = if let Some(remote_storage) = &remote_storage {
secondary::spawn_tasks(
tenant_manager.clone(),
remote_storage.clone(),
background_jobs_barrier.clone(),
shutdown_pageserver.clone(),
)
} else {
secondary::null_controller()
};
// shared state between the disk-usage backed eviction background task and the http endpoint
// that allows triggering disk-usage based eviction manually. note that the http endpoint
// is still accessible even if background task is not configured as long as remote storage has
@@ -533,6 +544,7 @@ fn start_pageserver(
broker_client.clone(),
disk_usage_eviction_state,
deletion_queue.new_client(),
secondary_controller,
)
.context("Failed to initialize router state")?,
);

View File

@@ -41,6 +41,8 @@ use crate::{
TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX,
};
use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP;
pub mod defaults {
use crate::tenant::config::defaults::*;
use const_format::formatcp;
@@ -61,6 +63,8 @@ pub mod defaults {
pub const DEFAULT_LOG_FORMAT: &str = "plain";
pub const DEFAULT_CONCURRENT_TENANT_WARMUP: usize = 8;
pub const DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES: usize =
super::ConfigurableSemaphore::DEFAULT_INITIAL.get();
@@ -70,6 +74,8 @@ pub mod defaults {
pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min";
pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s";
pub const DEFAULT_HEATMAP_UPLOAD_CONCURRENCY: usize = 8;
///
/// Default built-in configuration file.
///
@@ -92,6 +98,7 @@ pub mod defaults {
#log_format = '{DEFAULT_LOG_FORMAT}'
#concurrent_tenant_size_logical_size_queries = '{DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES}'
#concurrent_tenant_warmup = '{DEFAULT_CONCURRENT_TENANT_WARMUP}'
#metric_collection_interval = '{DEFAULT_METRIC_COLLECTION_INTERVAL}'
#cached_metric_collection_interval = '{DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL}'
@@ -117,6 +124,8 @@ pub mod defaults {
#evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}'
#gc_feedback = false
#heatmap_upload_concurrency = {DEFAULT_HEATMAP_UPLOAD_CONCURRENCY}
[remote_storage]
"#
@@ -176,6 +185,11 @@ pub struct PageServerConf {
pub log_format: LogFormat,
/// Number of tenants which will be concurrently loaded from remote storage proactively on startup,
/// does not limit tenants loaded in response to client I/O. A lower value implicitly deprioritizes
/// loading such tenants, vs. other work in the system.
pub concurrent_tenant_warmup: ConfigurableSemaphore,
/// Number of concurrent [`Tenant::gather_size_inputs`](crate::tenant::Tenant::gather_size_inputs) allowed.
pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore,
/// Limit of concurrent [`Tenant::gather_size_inputs`] issued by module `eviction_task`.
@@ -215,6 +229,10 @@ pub struct PageServerConf {
/// If true, pageserver will make best-effort to operate without a control plane: only
/// for use in major incidents.
pub control_plane_emergency_mode: bool,
/// How many heatmap uploads may be done concurrency: lower values implicitly deprioritize
/// heatmap uploads vs. other remote storage operations.
pub heatmap_upload_concurrency: usize,
}
/// We do not want to store this in a PageServerConf because the latter may be logged
@@ -275,6 +293,7 @@ struct PageServerConfigBuilder {
log_format: BuilderValue<LogFormat>,
concurrent_tenant_warmup: BuilderValue<NonZeroUsize>,
concurrent_tenant_size_logical_size_queries: BuilderValue<NonZeroUsize>,
metric_collection_interval: BuilderValue<Duration>,
@@ -293,6 +312,8 @@ struct PageServerConfigBuilder {
control_plane_api: BuilderValue<Option<Url>>,
control_plane_api_token: BuilderValue<Option<SecretString>>,
control_plane_emergency_mode: BuilderValue<bool>,
heatmap_upload_concurrency: BuilderValue<usize>,
}
impl Default for PageServerConfigBuilder {
@@ -330,6 +351,8 @@ impl Default for PageServerConfigBuilder {
.expect("cannot parse default keepalive interval")),
log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()),
concurrent_tenant_warmup: Set(NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP)
.expect("Invalid default constant")),
concurrent_tenant_size_logical_size_queries: Set(
ConfigurableSemaphore::DEFAULT_INITIAL,
),
@@ -361,6 +384,8 @@ impl Default for PageServerConfigBuilder {
control_plane_api: Set(None),
control_plane_api_token: Set(None),
control_plane_emergency_mode: Set(false),
heatmap_upload_concurrency: Set(DEFAULT_HEATMAP_UPLOAD_CONCURRENCY),
}
}
}
@@ -441,6 +466,10 @@ impl PageServerConfigBuilder {
self.log_format = BuilderValue::Set(log_format)
}
pub fn concurrent_tenant_warmup(&mut self, u: NonZeroUsize) {
self.concurrent_tenant_warmup = BuilderValue::Set(u);
}
pub fn concurrent_tenant_size_logical_size_queries(&mut self, u: NonZeroUsize) {
self.concurrent_tenant_size_logical_size_queries = BuilderValue::Set(u);
}
@@ -501,7 +530,14 @@ impl PageServerConfigBuilder {
self.control_plane_emergency_mode = BuilderValue::Set(enabled)
}
pub fn heatmap_upload_concurrency(&mut self, value: usize) {
self.heatmap_upload_concurrency = BuilderValue::Set(value)
}
pub fn build(self) -> anyhow::Result<PageServerConf> {
let concurrent_tenant_warmup = self
.concurrent_tenant_warmup
.ok_or(anyhow!("missing concurrent_tenant_warmup"))?;
let concurrent_tenant_size_logical_size_queries = self
.concurrent_tenant_size_logical_size_queries
.ok_or(anyhow!(
@@ -554,6 +590,7 @@ impl PageServerConfigBuilder {
.broker_keepalive_interval
.ok_or(anyhow!("No broker keepalive interval provided"))?,
log_format: self.log_format.ok_or(anyhow!("missing log_format"))?,
concurrent_tenant_warmup: ConfigurableSemaphore::new(concurrent_tenant_warmup),
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::new(
concurrent_tenant_size_logical_size_queries,
),
@@ -595,6 +632,10 @@ impl PageServerConfigBuilder {
control_plane_emergency_mode: self
.control_plane_emergency_mode
.ok_or(anyhow!("missing control_plane_emergency_mode"))?,
heatmap_upload_concurrency: self
.heatmap_upload_concurrency
.ok_or(anyhow!("missing heatmap_upload_concurrency"))?,
})
}
}
@@ -787,6 +828,11 @@ impl PageServerConf {
"log_format" => builder.log_format(
LogFormat::from_config(&parse_toml_string(key, item)?)?
),
"concurrent_tenant_warmup" => builder.concurrent_tenant_warmup({
let input = parse_toml_string(key, item)?;
let permits = input.parse::<usize>().context("expected a number of initial permits, not {s:?}")?;
NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")?
}),
"concurrent_tenant_size_logical_size_queries" => builder.concurrent_tenant_size_logical_size_queries({
let input = parse_toml_string(key, item)?;
let permits = input.parse::<usize>().context("expected a number of initial permits, not {s:?}")?;
@@ -828,7 +874,9 @@ impl PageServerConf {
},
"control_plane_emergency_mode" => {
builder.control_plane_emergency_mode(parse_toml_bool(key, item)?)
},
"heatmap_upload_concurrency" => {
builder.heatmap_upload_concurrency(parse_toml_u64(key, item)? as usize)
},
_ => bail!("unrecognized pageserver option '{key}'"),
}
@@ -882,6 +930,10 @@ impl PageServerConf {
broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(),
broker_keepalive_interval: Duration::from_secs(5000),
log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
concurrent_tenant_warmup: ConfigurableSemaphore::new(
NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP)
.expect("Invalid default constant"),
),
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::default(
),
@@ -896,6 +948,7 @@ impl PageServerConf {
control_plane_api: None,
control_plane_api_token: None,
control_plane_emergency_mode: false,
heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
}
}
}
@@ -1099,6 +1152,9 @@ background_task_maximum_delay = '334 s'
storage_broker::DEFAULT_KEEPALIVE_INTERVAL
)?,
log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
concurrent_tenant_warmup: ConfigurableSemaphore::new(
NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP).unwrap()
),
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
eviction_task_immitated_concurrent_logical_size_queries:
ConfigurableSemaphore::default(),
@@ -1120,7 +1176,8 @@ background_task_maximum_delay = '334 s'
)?,
control_plane_api: None,
control_plane_api_token: None,
control_plane_emergency_mode: false
control_plane_emergency_mode: false,
heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY
},
"Correct defaults should be used when no config values are provided"
);
@@ -1164,6 +1221,9 @@ background_task_maximum_delay = '334 s'
broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(),
broker_keepalive_interval: Duration::from_secs(5),
log_format: LogFormat::Json,
concurrent_tenant_warmup: ConfigurableSemaphore::new(
NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP).unwrap()
),
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
eviction_task_immitated_concurrent_logical_size_queries:
ConfigurableSemaphore::default(),
@@ -1177,7 +1237,8 @@ background_task_maximum_delay = '334 s'
background_task_maximum_delay: Duration::from_secs(334),
control_plane_api: None,
control_plane_api_token: None,
control_plane_emergency_mode: false
control_plane_emergency_mode: false,
heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY
},
"Should be able to parse all basic config values correctly"
);

View File

@@ -3,7 +3,7 @@
use crate::context::{DownloadBehavior, RequestContext};
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
use crate::tenant::tasks::BackgroundLoopKind;
use crate::tenant::{mgr, LogicalSizeCalculationCause, PageReconstructError};
use crate::tenant::{mgr, LogicalSizeCalculationCause, PageReconstructError, Tenant};
use camino::Utf8PathBuf;
use consumption_metrics::EventType;
use pageserver_api::models::TenantState;
@@ -256,8 +256,6 @@ async fn calculate_synthetic_size_worker(
info!("calculate_synthetic_size_worker stopped");
};
let cause = LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize;
loop {
let started_at = Instant::now();
@@ -280,29 +278,14 @@ async fn calculate_synthetic_size_worker(
continue;
}
if let Ok(tenant) = mgr::get_tenant(tenant_shard_id, true) {
// TODO should we use concurrent_background_tasks_rate_limit() here, like the other background tasks?
// We can put in some prioritization for consumption metrics.
// Same for the loop that fetches computed metrics.
// By using the same limiter, we centralize metrics collection for "start" and "finished" counters,
// which turns out is really handy to understand the system.
if let Err(e) = tenant.calculate_synthetic_size(cause, cancel, ctx).await {
// this error can be returned if timeline is shutting down, but it does not
// mean the synthetic size worker should terminate. we do not need any checks
// in this function because `mgr::get_tenant` will error out after shutdown has
// progressed to shutting down tenants.
let is_cancelled = matches!(
e.downcast_ref::<PageReconstructError>(),
Some(PageReconstructError::Cancelled)
);
let Ok(tenant) = mgr::get_tenant(tenant_shard_id, true) else {
continue;
};
if !is_cancelled {
error!(
"failed to calculate synthetic size for tenant {tenant_shard_id}: {e:#}"
);
}
}
}
// there is never any reason to exit calculate_synthetic_size_worker following any
// return value -- we don't need to care about shutdown because no tenant is found when
// pageserver is shut down.
calculate_and_log(&tenant, cancel, ctx).await;
}
crate::tenant::tasks::warn_when_period_overrun(
@@ -321,3 +304,31 @@ async fn calculate_synthetic_size_worker(
}
}
}
async fn calculate_and_log(tenant: &Tenant, cancel: &CancellationToken, ctx: &RequestContext) {
const CAUSE: LogicalSizeCalculationCause =
LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize;
// TODO should we use concurrent_background_tasks_rate_limit() here, like the other background tasks?
// We can put in some prioritization for consumption metrics.
// Same for the loop that fetches computed metrics.
// By using the same limiter, we centralize metrics collection for "start" and "finished" counters,
// which turns out is really handy to understand the system.
let Err(e) = tenant.calculate_synthetic_size(CAUSE, cancel, ctx).await else {
return;
};
// this error can be returned if timeline is shutting down, but it does not
// mean the synthetic size worker should terminate. we do not need any checks
// in this function because `mgr::get_tenant` will error out after shutdown has
// progressed to shutting down tenants.
let shutting_down = matches!(
e.downcast_ref::<PageReconstructError>(),
Some(PageReconstructError::Cancelled | PageReconstructError::AncestorStopping(_))
);
if !shutting_down {
let tenant_shard_id = tenant.tenant_shard_id();
error!("failed to calculate synthetic size for tenant {tenant_shard_id}: {e:#}");
}
}

View File

@@ -42,7 +42,6 @@
// reading these fields. We use the Debug impl for semi-structured logging, though.
use std::{
collections::HashMap,
sync::Arc,
time::{Duration, SystemTime},
};
@@ -125,7 +124,7 @@ pub fn launch_disk_usage_global_eviction_task(
async fn disk_usage_eviction_task(
state: &State,
task_config: &DiskUsageEvictionTaskConfig,
_storage: &GenericRemoteStorage,
storage: &GenericRemoteStorage,
tenants_dir: &Utf8Path,
cancel: CancellationToken,
) {
@@ -149,8 +148,14 @@ async fn disk_usage_eviction_task(
let start = Instant::now();
async {
let res =
disk_usage_eviction_task_iteration(state, task_config, tenants_dir, &cancel).await;
let res = disk_usage_eviction_task_iteration(
state,
task_config,
storage,
tenants_dir,
&cancel,
)
.await;
match res {
Ok(()) => {}
@@ -181,12 +186,13 @@ pub trait Usage: Clone + Copy + std::fmt::Debug {
async fn disk_usage_eviction_task_iteration(
state: &State,
task_config: &DiskUsageEvictionTaskConfig,
storage: &GenericRemoteStorage,
tenants_dir: &Utf8Path,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
let usage_pre = filesystem_level_usage::get(tenants_dir, task_config)
.context("get filesystem-level disk usage before evictions")?;
let res = disk_usage_eviction_task_iteration_impl(state, usage_pre, cancel).await;
let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await;
match res {
Ok(outcome) => {
debug!(?outcome, "disk_usage_eviction_iteration finished");
@@ -268,8 +274,9 @@ struct LayerCount {
count: usize,
}
pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
state: &State,
_storage: &GenericRemoteStorage,
usage_pre: U,
cancel: &CancellationToken,
) -> anyhow::Result<IterationOutcome<U>> {
@@ -321,16 +328,16 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
// Walk through the list of candidates, until we have accumulated enough layers to get
// us back under the pressure threshold. 'usage_planned' is updated so that it tracks
// how much disk space would be used after evicting all the layers up to the current
// point in the list. The layers are collected in 'batched', grouped per timeline.
// point in the list.
//
// If we get far enough in the list that we start to evict layers that are below
// the tenant's min-resident-size threshold, print a warning, and memorize the disk
// usage at that point, in 'usage_planned_min_resident_size_respecting'.
let mut batched: HashMap<_, Vec<_>> = HashMap::new();
let mut warned = None;
let mut usage_planned = usage_pre;
let mut max_batch_size = 0;
for (i, (partition, candidate)) in candidates.into_iter().enumerate() {
let mut evicted_amount = 0;
for (i, (partition, candidate)) in candidates.iter().enumerate() {
if !usage_planned.has_pressure() {
debug!(
no_candidates_evicted = i,
@@ -339,25 +346,13 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
break;
}
if partition == MinResidentSizePartition::Below && warned.is_none() {
if partition == &MinResidentSizePartition::Below && warned.is_none() {
warn!(?usage_pre, ?usage_planned, candidate_no=i, "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy");
warned = Some(usage_planned);
}
usage_planned.add_available_bytes(candidate.layer.layer_desc().file_size);
// FIXME: batching makes no sense anymore because of no layermap locking, should just spawn
// tasks to evict all seen layers until we have evicted enough
let batch = batched.entry(TimelineKey(candidate.timeline)).or_default();
// semaphore will later be used to limit eviction concurrency, and we can express at
// most u32 number of permits. unlikely we would have u32::MAX layers to be evicted,
// but fail gracefully by not making batches larger.
if batch.len() < u32::MAX as usize {
batch.push(candidate.layer);
max_batch_size = max_batch_size.max(batch.len());
}
evicted_amount += 1;
}
let usage_planned = match warned {
@@ -372,100 +367,79 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
};
debug!(?usage_planned, "usage planned");
// phase2: evict victims batched by timeline
// phase2: evict layers
let mut js = tokio::task::JoinSet::new();
let limit = 1000;
// ratelimit to 1k files or any higher max batch size
let limit = Arc::new(tokio::sync::Semaphore::new(1000.max(max_batch_size)));
let mut evicted = candidates.into_iter().take(evicted_amount).fuse();
let mut consumed_all = false;
for (timeline, batch) in batched {
let tenant_shard_id = timeline.tenant_shard_id;
let timeline_id = timeline.timeline_id;
let batch_size =
u32::try_from(batch.len()).expect("batch size limited to u32::MAX during partitioning");
// After the evictions, `usage_assumed` is the post-eviction usage,
// according to internal accounting.
let mut usage_assumed = usage_pre;
let mut evictions_failed = LayerCount::default();
// I dislike naming of `available_permits` but it means current total amount of permits
// because permits can be added
assert!(batch_size as usize <= limit.available_permits());
let evict_layers = async move {
loop {
let next = if js.len() >= limit || consumed_all {
js.join_next().await
} else if !js.is_empty() {
// opportunistically consume ready result, one per each new evicted
futures::future::FutureExt::now_or_never(js.join_next()).and_then(|x| x)
} else {
None
};
debug!(%timeline_id, "evicting batch for timeline");
let evict = {
let limit = limit.clone();
let cancel = cancel.clone();
async move {
let mut evicted_bytes = 0;
let mut evictions_failed = LayerCount::default();
let Ok(_permit) = limit.acquire_many_owned(batch_size).await else {
// semaphore closing means cancelled
return (evicted_bytes, evictions_failed);
};
let results = timeline.evict_layers(&batch).await;
match results {
Ok(results) => {
assert_eq!(results.len(), batch.len());
for (result, layer) in results.into_iter().zip(batch.iter()) {
let file_size = layer.layer_desc().file_size;
match result {
Some(Ok(())) => {
evicted_bytes += file_size;
}
Some(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
evictions_failed.file_sizes += file_size;
evictions_failed.count += 1;
}
None => {
assert!(cancel.is_cancelled());
}
}
}
if let Some(next) = next {
match next {
Ok(Ok(file_size)) => {
usage_assumed.add_available_bytes(file_size);
}
Err(e) => {
warn!("failed to evict batch: {:#}", e);
Ok(Err((file_size, EvictionError::NotFound | EvictionError::Downloaded))) => {
evictions_failed.file_sizes += file_size;
evictions_failed.count += 1;
}
Err(je) if je.is_cancelled() => unreachable!("not used"),
Err(je) if je.is_panic() => { /* already logged */ }
Err(je) => tracing::error!("unknown JoinError: {je:?}"),
}
(evicted_bytes, evictions_failed)
}
}
.instrument(tracing::info_span!("evict_batch", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id, batch_size));
js.spawn(evict);
// spwaning multiple thousands of these is essentially blocking, so give already spawned a
// chance of making progress
tokio::task::yield_now().await;
}
let join_all = async move {
// After the evictions, `usage_assumed` is the post-eviction usage,
// according to internal accounting.
let mut usage_assumed = usage_pre;
let mut evictions_failed = LayerCount::default();
while let Some(res) = js.join_next().await {
match res {
Ok((evicted_bytes, failed)) => {
usage_assumed.add_available_bytes(evicted_bytes);
evictions_failed.file_sizes += failed.file_sizes;
evictions_failed.count += failed.count;
}
Err(je) if je.is_cancelled() => unreachable!("not used"),
Err(je) if je.is_panic() => { /* already logged */ }
Err(je) => tracing::error!("unknown JoinError: {je:?}"),
if consumed_all && js.is_empty() {
break;
}
// calling again when consumed_all is fine as evicted is fused.
let Some((_partition, candidate)) = evicted.next() else {
consumed_all = true;
continue;
};
js.spawn(async move {
let rtc = candidate.timeline.remote_client.as_ref().expect(
"holding the witness, all timelines must have a remote timeline client",
);
let file_size = candidate.layer.layer_desc().file_size;
candidate
.layer
.evict_and_wait(rtc)
.await
.map(|()| file_size)
.map_err(|e| (file_size, e))
});
tokio::task::yield_now().await;
}
(usage_assumed, evictions_failed)
};
let (usage_assumed, evictions_failed) = tokio::select! {
tuple = join_all => { tuple },
tuple = evict_layers => { tuple },
_ = cancel.cancelled() => {
// close the semaphore to stop any pending acquires
limit.close();
// dropping joinset will abort all pending evict_and_waits and that is fine, our
// requests will still stand
return Ok(IterationOutcome::Cancelled);
}
};

View File

@@ -1,4 +1,2 @@
pub mod routes;
pub use routes::make_router;
pub use pageserver_api::models;

View File

@@ -992,8 +992,8 @@ paths:
type: string
post:
description: |
Create a timeline. Returns new timeline id on success.\
If no new timeline id is specified in parameters, it would be generated. It's an error to recreate the same timeline.
Create a timeline. Returns new timeline id on success.
Recreating the same timeline will succeed if the parameters match the existing timeline.
If no pg_version is specified, assume DEFAULT_PG_VERSION hardcoded in the pageserver.
requestBody:
content:
@@ -1405,6 +1405,8 @@ components:
type: integer
trace_read_requests:
type: boolean
heatmap_period:
type: integer
TenantConfigResponse:
type: object
properties:

View File

@@ -28,20 +28,18 @@ use utils::http::endpoint::request_span;
use utils::http::json::json_request_or_empty_body;
use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
use super::models::{
StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo,
TimelineCreateRequest, TimelineGcRequest, TimelineInfo,
};
use crate::context::{DownloadBehavior, RequestContext};
use crate::deletion_queue::DeletionQueueClient;
use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL};
use crate::pgdatadir_mapping::LsnForTimestamp;
use crate::task_mgr::TaskKind;
use crate::tenant::config::{LocationConf, TenantConfOpt};
use crate::tenant::mgr::GetActiveTenantError;
use crate::tenant::mgr::{
GetTenantError, SetNewTenantConfigError, TenantManager, TenantMapError, TenantMapInsertError,
TenantSlotError, TenantSlotUpsertError, TenantStateError,
};
use crate::tenant::secondary::SecondaryController;
use crate::tenant::size::ModelInputs;
use crate::tenant::storage_layer::LayerAccessStatsReset;
use crate::tenant::timeline::CompactFlags;
@@ -49,6 +47,10 @@ use crate::tenant::timeline::Timeline;
use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, TenantSharedResources};
use crate::{config::PageServerConf, tenant::mgr};
use crate::{disk_usage_eviction_task, tenant};
use pageserver_api::models::{
StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo,
TimelineCreateRequest, TimelineGcRequest, TimelineInfo,
};
use utils::{
auth::SwappableJwtAuth,
generation::Generation,
@@ -64,7 +66,12 @@ use utils::{
};
// Imports only used for testing APIs
use super::models::ConfigureFailpointsRequest;
use pageserver_api::models::ConfigureFailpointsRequest;
// For APIs that require an Active tenant, how long should we block waiting for that state?
// This is not functionally necessary (clients will retry), but avoids generating a lot of
// failed API calls while tenants are activating.
const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000);
pub struct State {
conf: &'static PageServerConf,
@@ -75,9 +82,11 @@ pub struct State {
broker_client: storage_broker::BrokerClientChannel,
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
deletion_queue_client: DeletionQueueClient,
secondary_controller: SecondaryController,
}
impl State {
#[allow(clippy::too_many_arguments)]
pub fn new(
conf: &'static PageServerConf,
tenant_manager: Arc<TenantManager>,
@@ -86,6 +95,7 @@ impl State {
broker_client: storage_broker::BrokerClientChannel,
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
deletion_queue_client: DeletionQueueClient,
secondary_controller: SecondaryController,
) -> anyhow::Result<Self> {
let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml", "/metrics"]
.iter()
@@ -100,6 +110,7 @@ impl State {
broker_client,
disk_usage_eviction_state,
deletion_queue_client,
secondary_controller,
})
}
@@ -136,11 +147,6 @@ impl From<PageReconstructError> for ApiError {
fn from(pre: PageReconstructError) -> ApiError {
match pre {
PageReconstructError::Other(pre) => ApiError::InternalServerError(pre),
PageReconstructError::NeedsDownload(_, _) => {
// This shouldn't happen, because we use a RequestContext that requests to
// download any missing layer files on-demand.
ApiError::InternalServerError(anyhow::anyhow!("need to download remote layer file"))
}
PageReconstructError::Cancelled => {
ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
}
@@ -233,6 +239,19 @@ impl From<GetTenantError> for ApiError {
}
}
impl From<GetActiveTenantError> for ApiError {
fn from(e: GetActiveTenantError) -> ApiError {
match e {
GetActiveTenantError::WillNotBecomeActive(_) => ApiError::Conflict(format!("{}", e)),
GetActiveTenantError::Cancelled => ApiError::ShuttingDown,
GetActiveTenantError::NotFound(gte) => gte.into(),
GetActiveTenantError::WaitForActiveTimeout { .. } => {
ApiError::ResourceUnavailable(format!("{}", e).into())
}
}
}
}
impl From<SetNewTenantConfigError> for ApiError {
fn from(e: SetNewTenantConfigError) -> ApiError {
match e {
@@ -435,7 +454,10 @@ async fn timeline_create_handler(
let state = get_state(&request);
async {
let tenant = state.tenant_manager.get_attached_tenant_shard(tenant_shard_id, true)?;
let tenant = state.tenant_manager.get_attached_tenant_shard(tenant_shard_id, false)?;
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
match tenant.create_timeline(
new_timeline_id,
request_data.ancestor_timeline_id.map(TimelineId::from),
@@ -453,7 +475,7 @@ async fn timeline_create_handler(
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::CREATED, timeline_info)
}
Err(tenant::CreateTimelineError::AlreadyExists) => {
Err(tenant::CreateTimelineError::Conflict | tenant::CreateTimelineError::AlreadyCreating) => {
json_response(StatusCode::CONFLICT, ())
}
Err(tenant::CreateTimelineError::AncestorLsn(err)) => {
@@ -694,11 +716,23 @@ async fn timeline_delete_handler(
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
let state = get_state(&request);
state.tenant_manager.delete_timeline(tenant_shard_id, timeline_id, &ctx)
.instrument(info_span!("timeline_delete", tenant_id=%tenant_shard_id.tenant_id, shard=%tenant_shard_id.shard_slug(), %timeline_id))
let tenant = state
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id, false)
.map_err(|e| {
match e {
// GetTenantError has a built-in conversion to ApiError, but in this context we don't
// want to treat missing tenants as 404, to avoid ambiguity with successful deletions.
GetTenantError::NotFound(_) => ApiError::PreconditionFailed(
"Requested tenant is missing".to_string().into_boxed_str(),
),
e => e.into(),
}
})?;
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
tenant.delete_timeline(timeline_id).instrument(info_span!("timeline_delete", tenant_id=%tenant_shard_id.tenant_id, shard=%tenant_shard_id.shard_slug(), %timeline_id))
.await?;
json_response(StatusCode::ACCEPTED, ())
@@ -1136,7 +1170,10 @@ async fn tenant_create_handler(
// We created the tenant. Existing API semantics are that the tenant
// is Active when this function returns.
if let res @ Err(_) = new_tenant.wait_to_become_active().await {
if let res @ Err(_) = new_tenant
.wait_to_become_active(ACTIVE_TENANT_TIMEOUT)
.await
{
// This shouldn't happen because we just created the tenant directory
// in tenant::mgr::create_tenant, and there aren't any remote timelines
// to load, so, nothing can really fail during load.
@@ -1487,69 +1524,6 @@ async fn timeline_collect_keyspace(
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
struct Partitioning {
keys: crate::keyspace::KeySpace,
at_lsn: Lsn,
}
impl serde::Serialize for Partitioning {
fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
use serde::ser::SerializeMap;
let mut map = serializer.serialize_map(Some(2))?;
map.serialize_key("keys")?;
map.serialize_value(&KeySpace(&self.keys))?;
map.serialize_key("at_lsn")?;
map.serialize_value(&WithDisplay(&self.at_lsn))?;
map.end()
}
}
struct WithDisplay<'a, T>(&'a T);
impl<'a, T: std::fmt::Display> serde::Serialize for WithDisplay<'a, T> {
fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
serializer.collect_str(&self.0)
}
}
struct KeySpace<'a>(&'a crate::keyspace::KeySpace);
impl<'a> serde::Serialize for KeySpace<'a> {
fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
use serde::ser::SerializeSeq;
let mut seq = serializer.serialize_seq(Some(self.0.ranges.len()))?;
for kr in &self.0.ranges {
seq.serialize_element(&KeyRange(kr))?;
}
seq.end()
}
}
struct KeyRange<'a>(&'a std::ops::Range<crate::repository::Key>);
impl<'a> serde::Serialize for KeyRange<'a> {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
use serde::ser::SerializeTuple;
let mut t = serializer.serialize_tuple(2)?;
t.serialize_element(&WithDisplay(&self.0.start))?;
t.serialize_element(&WithDisplay(&self.0.end))?;
t.end()
}
}
let at_lsn: Option<Lsn> = parse_query_param(&request, "at_lsn")?;
async {
@@ -1561,7 +1535,9 @@ async fn timeline_collect_keyspace(
.await
.map_err(|e| ApiError::InternalServerError(e.into()))?;
json_response(StatusCode::OK, Partitioning { keys, at_lsn })
let res = pageserver_api::models::partitioning::Partitioning { keys, at_lsn };
json_response(StatusCode::OK, res)
}
.instrument(info_span!("timeline_collect_keyspace", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
.await
@@ -1593,7 +1569,7 @@ async fn always_panic_handler(
async fn disk_usage_eviction_run(
mut r: Request<Body>,
_cancel: CancellationToken,
cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
check_permission(&r, None)?;
@@ -1621,57 +1597,48 @@ async fn disk_usage_eviction_run(
}
}
let config = json_request::<Config>(&mut r)
.await
.map_err(|_| ApiError::BadRequest(anyhow::anyhow!("invalid JSON body")))?;
let config = json_request::<Config>(&mut r).await?;
let usage = Usage {
config,
freed_bytes: 0,
};
let (tx, rx) = tokio::sync::oneshot::channel();
let state = get_state(&r);
if state.remote_storage.as_ref().is_none() {
let Some(storage) = state.remote_storage.as_ref() else {
return Err(ApiError::InternalServerError(anyhow::anyhow!(
"remote storage not configured, cannot run eviction iteration"
)));
}
};
let state = state.disk_usage_eviction_state.clone();
let cancel = CancellationToken::new();
let child_cancel = cancel.clone();
let _g = cancel.drop_guard();
let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
&state, storage, usage, &cancel,
)
.await;
crate::task_mgr::spawn(
crate::task_mgr::BACKGROUND_RUNTIME.handle(),
TaskKind::DiskUsageEviction,
None,
None,
"ondemand disk usage eviction",
false,
async move {
let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
&state,
usage,
&child_cancel,
)
.await;
info!(?res, "disk_usage_eviction_task_iteration_impl finished");
info!(?res, "disk_usage_eviction_task_iteration_impl finished");
let res = res.map_err(ApiError::InternalServerError)?;
let _ = tx.send(res);
Ok(())
}
.in_current_span(),
);
json_response(StatusCode::OK, res)
}
let response = rx.await.unwrap().map_err(ApiError::InternalServerError)?;
async fn secondary_upload_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let state = get_state(&request);
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
state
.secondary_controller
.upload_tenant(tenant_shard_id)
.await
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, response)
json_response(StatusCode::OK, ())
}
async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -1933,6 +1900,9 @@ pub fn make_router(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name",
|r| api_handler(r, evict_timeline_layer_handler),
)
.post("/v1/tenant/:tenant_shard_id/heatmap_upload", |r| {
api_handler(r, secondary_upload_handler)
})
.put("/v1/disk_usage_eviction/run", |r| {
api_handler(r, disk_usage_eviction_run)
})

View File

@@ -10,7 +10,7 @@ pub mod deletion_queue;
pub mod disk_usage_eviction_task;
pub mod http;
pub mod import_datadir;
pub mod keyspace;
pub use pageserver_api::keyspace;
pub mod metrics;
pub mod page_cache;
pub mod page_service;

View File

@@ -2,9 +2,10 @@ use enum_map::EnumMap;
use metrics::metric_vec_duration::DurationResultObserver;
use metrics::{
register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec,
register_int_counter, register_int_counter_vec, register_int_gauge, register_int_gauge_vec,
register_uint_gauge, register_uint_gauge_vec, Counter, CounterVec, GaugeVec, Histogram,
HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
register_int_counter, register_int_counter_pair_vec, register_int_counter_vec,
register_int_gauge, register_int_gauge_vec, register_uint_gauge, register_uint_gauge_vec,
Counter, CounterVec, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPairVec,
IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
};
use once_cell::sync::Lazy;
use pageserver_api::shard::TenantShardId;
@@ -683,14 +684,54 @@ pub static STARTUP_IS_LOADING: Lazy<UIntGauge> = Lazy::new(|| {
.expect("Failed to register pageserver_startup_is_loading")
});
/// How long did tenants take to go from construction to active state?
pub(crate) static TENANT_ACTIVATION: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
/// Metrics related to the lifecycle of a [`crate::tenant::Tenant`] object: things
/// like how long it took to load.
///
/// Note that these are process-global metrics, _not_ per-tenant metrics. Per-tenant
/// metrics are rather expensive, and usually fine grained stuff makes more sense
/// at a timeline level than tenant level.
pub(crate) struct TenantMetrics {
/// How long did tenants take to go from construction to active state?
pub(crate) activation: Histogram,
pub(crate) preload: Histogram,
pub(crate) attach: Histogram,
/// How many tenants are included in the initial startup of the pagesrever?
pub(crate) startup_scheduled: IntCounter,
pub(crate) startup_complete: IntCounter,
}
pub(crate) static TENANT: Lazy<TenantMetrics> = Lazy::new(|| {
TenantMetrics {
activation: register_histogram!(
"pageserver_tenant_activation_seconds",
"Time taken by tenants to activate, in seconds",
CRITICAL_OP_BUCKETS.into()
)
.expect("Failed to register pageserver_tenant_activation_seconds metric")
.expect("Failed to register metric"),
preload: register_histogram!(
"pageserver_tenant_preload_seconds",
"Time taken by tenants to load remote metadata on startup/attach, in seconds",
CRITICAL_OP_BUCKETS.into()
)
.expect("Failed to register metric"),
attach: register_histogram!(
"pageserver_tenant_attach_seconds",
"Time taken by tenants to intialize, after remote metadata is already loaded",
CRITICAL_OP_BUCKETS.into()
)
.expect("Failed to register metric"),
startup_scheduled: register_int_counter!(
"pageserver_tenant_startup_scheduled",
"Number of tenants included in pageserver startup (doesn't count tenants attached later)"
).expect("Failed to register metric"),
startup_complete: register_int_counter!(
"pageserver_tenant_startup_complete",
"Number of tenants that have completed warm-up, or activated on-demand during initial startup: \
should eventually reach `pageserver_tenant_startup_scheduled_total`. Does not include broken \
tenants: such cases will lead to this metric never reaching the scheduled count."
).expect("Failed to register metric"),
}
});
/// Each `Timeline`'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
@@ -1270,6 +1311,28 @@ pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMet
)
.expect("failed to define a metric"),
});
pub(crate) struct SecondaryModeMetrics {
pub(crate) upload_heatmap: IntCounter,
pub(crate) upload_heatmap_errors: IntCounter,
pub(crate) upload_heatmap_duration: Histogram,
}
pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| SecondaryModeMetrics {
upload_heatmap: register_int_counter!(
"pageserver_secondary_upload_heatmap",
"Number of heatmaps written to remote storage by attached tenants"
)
.expect("failed to define a metric"),
upload_heatmap_errors: register_int_counter!(
"pageserver_secondary_upload_heatmap_errors",
"Failures writing heatmap to remote storage"
)
.expect("failed to define a metric"),
upload_heatmap_duration: register_histogram!(
"pageserver_secondary_upload_heatmap_duration",
"Time to build and upload a heatmap, including any waiting inside the S3 client"
)
.expect("failed to define a metric"),
});
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum RemoteOpKind {
@@ -1321,25 +1384,16 @@ pub(crate) static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
.expect("Failed to register tenant_task_events metric")
});
pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_START_COUNT: Lazy<IntCounterVec> =
Lazy::new(|| {
register_int_counter_vec!(
"pageserver_background_loop_semaphore_wait_start_count",
"Counter for background loop concurrency-limiting semaphore acquire calls started",
&["task"],
)
.unwrap()
});
pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_FINISH_COUNT: Lazy<IntCounterVec> =
Lazy::new(|| {
register_int_counter_vec!(
"pageserver_background_loop_semaphore_wait_finish_count",
"Counter for background loop concurrency-limiting semaphore acquire calls finished",
&["task"],
)
.unwrap()
});
pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
register_int_counter_pair_vec!(
"pageserver_background_loop_semaphore_wait_start_count",
"Counter for background loop concurrency-limiting semaphore acquire calls started",
"pageserver_background_loop_semaphore_wait_finish_count",
"Counter for background loop concurrency-limiting semaphore acquire calls finished",
&["task"],
)
.unwrap()
});
pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
@@ -2199,6 +2253,9 @@ pub fn preinitialize_metrics() {
// Deletion queue stats
Lazy::force(&DELETION_QUEUE);
// Tenant stats
Lazy::force(&TENANT);
// Tenant manager stats
Lazy::force(&TENANT_MANAGER);

View File

@@ -2,38 +2,11 @@ use crate::walrecord::NeonWalRecord;
use anyhow::Result;
use bytes::Bytes;
use serde::{Deserialize, Serialize};
use std::ops::{AddAssign, Range};
use std::ops::AddAssign;
use std::time::Duration;
pub use pageserver_api::key::{Key, KEY_SIZE};
pub fn key_range_size(key_range: &Range<Key>) -> u32 {
let start = key_range.start;
let end = key_range.end;
if end.field1 != start.field1
|| end.field2 != start.field2
|| end.field3 != start.field3
|| end.field4 != start.field4
{
return u32::MAX;
}
let start = (start.field5 as u64) << 32 | start.field6 as u64;
let end = (end.field5 as u64) << 32 | end.field6 as u64;
let diff = end - start;
if diff > u32::MAX as u64 {
u32::MAX
} else {
diff as u32
}
}
pub fn singleton_range(key: Key) -> Range<Key> {
key..key.next()
}
/// A 'value' stored for a one Key.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(test, derive(PartialEq))]

View File

@@ -258,6 +258,9 @@ pub enum TaskKind {
/// See [`crate::disk_usage_eviction_task`].
DiskUsageEviction,
/// See [`crate::tenant::secondary`].
SecondaryUploads,
// Initial logical size calculation
InitialLogicalSizeCalculation,
@@ -558,9 +561,14 @@ pub async fn shutdown_watcher() {
/// cancelled. It can however be moved to other tasks, such as `tokio::task::spawn_blocking` or
/// `tokio::task::JoinSet::spawn`.
pub fn shutdown_token() -> CancellationToken {
SHUTDOWN_TOKEN
.try_with(|t| t.clone())
.expect("shutdown_token() called in an unexpected task or thread")
let res = SHUTDOWN_TOKEN.try_with(|t| t.clone());
if cfg!(test) {
// in tests this method is called from non-taskmgr spawned tasks, and that is all ok.
res.unwrap_or_default()
} else {
res.expect("shutdown_token() called in an unexpected task or thread")
}
}
/// Has the current task been requested to shut down?

View File

@@ -36,6 +36,8 @@ use utils::crashsafe::path_with_suffix_extension;
use utils::fs_ext;
use utils::sync::gate::Gate;
use utils::sync::gate::GateGuard;
use utils::timeout::timeout_cancellable;
use utils::timeout::TimeoutCancellableError;
use self::config::AttachedLocationConfig;
use self::config::AttachmentMode;
@@ -48,6 +50,7 @@ use self::mgr::GetActiveTenantError;
use self::mgr::GetTenantError;
use self::mgr::TenantsMap;
use self::remote_timeline_client::RemoteTimelineClient;
use self::timeline::uninit::TimelineExclusionError;
use self::timeline::uninit::TimelineUninitMark;
use self::timeline::uninit::UninitializedTimeline;
use self::timeline::EvictionTaskTenantState;
@@ -58,7 +61,7 @@ use crate::deletion_queue::DeletionQueueClient;
use crate::deletion_queue::DeletionQueueError;
use crate::import_datadir;
use crate::is_uninit_mark;
use crate::metrics::TENANT_ACTIVATION;
use crate::metrics::TENANT;
use crate::metrics::{remove_tenant_metrics, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC};
use crate::repository::GcResult;
use crate::task_mgr;
@@ -87,7 +90,6 @@ use std::process::Stdio;
use std::sync::atomic::AtomicU64;
use std::sync::atomic::Ordering;
use std::sync::Arc;
use std::sync::MutexGuard;
use std::sync::{Mutex, RwLock};
use std::time::{Duration, Instant};
@@ -144,6 +146,7 @@ pub mod storage_layer;
pub mod config;
pub mod delete;
pub mod mgr;
pub mod secondary;
pub mod tasks;
pub mod upload_queue;
@@ -225,7 +228,7 @@ pub struct Tenant {
/// The value creation timestamp, used to measure activation delay, see:
/// <https://github.com/neondatabase/neon/issues/4025>
loading_started_at: Instant,
constructed_at: Instant,
state: watch::Sender<TenantState>,
@@ -248,6 +251,12 @@ pub struct Tenant {
generation: Generation,
timelines: Mutex<HashMap<TimelineId, Arc<Timeline>>>,
/// During timeline creation, we first insert the TimelineId to the
/// creating map, then `timelines`, then remove it from the creating map.
/// **Lock order**: if acquring both, acquire`timelines` before `timelines_creating`
timelines_creating: std::sync::Mutex<HashSet<TimelineId>>,
// This mutex prevents creation of new timelines during GC.
// Adding yet another mutex (in addition to `timelines`) is needed because holding
// `timelines` mutex during all GC iteration
@@ -269,6 +278,11 @@ pub struct Tenant {
eviction_task_tenant_state: tokio::sync::Mutex<EvictionTaskTenantState>,
/// If the tenant is in Activating state, notify this to encourage it
/// to proceed to Active as soon as possible, rather than waiting for lazy
/// background warmup.
pub(crate) activate_now_sem: tokio::sync::Semaphore,
pub(crate) delete_progress: Arc<tokio::sync::Mutex<DeleteTenantFlow>>,
// Cancellation token fires when we have entered shutdown(). This is a parent of
@@ -406,8 +420,10 @@ impl Debug for SetStoppingError {
#[derive(thiserror::Error, Debug)]
pub enum CreateTimelineError {
#[error("a timeline with the given ID already exists")]
AlreadyExists,
#[error("creation of timeline with the given ID is in progress")]
AlreadyCreating,
#[error("timeline already exists with different parameters")]
Conflict,
#[error(transparent)]
AncestorLsn(anyhow::Error),
#[error("ancestor timeline is not active")]
@@ -613,6 +629,14 @@ impl Tenant {
"attach tenant",
false,
async move {
// Is this tenant being spawned as part of process startup?
let starting_up = init_order.is_some();
scopeguard::defer! {
if starting_up {
TENANT.startup_complete.inc();
}
}
// Ideally we should use Tenant::set_broken_no_wait, but it is not supposed to be used when tenant is in loading state.
let make_broken =
|t: &Tenant, err: anyhow::Error| {
@@ -639,8 +663,62 @@ impl Tenant {
.as_mut()
.and_then(|x| x.initial_tenant_load_remote.take());
enum AttachType<'a> {
// During pageserver startup, we are attaching this tenant lazily in the background
Warmup(tokio::sync::SemaphorePermit<'a>),
// During pageserver startup, we are attaching this tenant as soon as we can,
// because a client tried to access it.
OnDemand,
// During normal operations after startup, we are attaching a tenant.
Normal,
}
// Before doing any I/O, wait for either or:
// - A client to attempt to access to this tenant (on-demand loading)
// - A permit to become available in the warmup semaphore (background warmup)
//
// Some-ness of init_order is how we know if we're attaching during startup or later
// in process lifetime.
let attach_type = if init_order.is_some() {
tokio::select!(
_ = tenant_clone.activate_now_sem.acquire() => {
tracing::info!("Activating tenant (on-demand)");
AttachType::OnDemand
},
permit_result = conf.concurrent_tenant_warmup.inner().acquire() => {
match permit_result {
Ok(p) => {
tracing::info!("Activating tenant (warmup)");
AttachType::Warmup(p)
}
Err(_) => {
// This is unexpected: the warmup semaphore should stay alive
// for the lifetime of init_order. Log a warning and proceed.
tracing::warn!("warmup_limit semaphore unexpectedly closed");
AttachType::Normal
}
}
}
_ = tenant_clone.cancel.cancelled() => {
// This is safe, but should be pretty rare: it is interesting if a tenant
// stayed in Activating for such a long time that shutdown found it in
// that state.
tracing::info!(state=%tenant_clone.current_state(), "Tenant shut down before activation");
return Ok(());
},
)
} else {
AttachType::Normal
};
let preload_timer = TENANT.preload.start_timer();
let preload = match mode {
SpawnMode::Create => {None},
SpawnMode::Create => {
// Don't count the skipped preload into the histogram of preload durations
preload_timer.stop_and_discard();
None
},
SpawnMode::Normal => {
match &remote_storage {
Some(remote_storage) => Some(
@@ -650,7 +728,11 @@ impl Tenant {
tracing::info_span!(parent: None, "attach_preload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()),
)
.await {
Ok(p) => p,
Ok(p) => {
preload_timer.observe_duration();
p
}
,
Err(e) => {
make_broken(&tenant_clone, anyhow::anyhow!(e));
return Ok(());
@@ -712,15 +794,43 @@ impl Tenant {
}
}
// We will time the duration of the attach phase unless this is a creation (attach will do no work)
let attach_timer = match mode {
SpawnMode::Create => None,
SpawnMode::Normal => {Some(TENANT.attach.start_timer())}
};
match tenant_clone.attach(preload, &ctx).await {
Ok(()) => {
info!("attach finished, activating");
if let Some(t)= attach_timer {t.observe_duration();}
tenant_clone.activate(broker_client, None, &ctx);
}
Err(e) => {
if let Some(t)= attach_timer {t.observe_duration();}
make_broken(&tenant_clone, anyhow::anyhow!(e));
}
}
// If we are doing an opportunistic warmup attachment at startup, initialize
// logical size at the same time. This is better than starting a bunch of idle tenants
// with cold caches and then coming back later to initialize their logical sizes.
//
// It also prevents the warmup proccess competing with the concurrency limit on
// logical size calculations: if logical size calculation semaphore is saturated,
// then warmup will wait for that before proceeding to the next tenant.
if let AttachType::Warmup(_permit) = attach_type {
let mut futs = FuturesUnordered::new();
let timelines: Vec<_> = tenant_clone.timelines.lock().unwrap().values().cloned().collect();
for t in timelines {
futs.push(t.await_initial_logical_size())
}
tracing::info!("Waiting for initial logical sizes while warming up...");
while futs.next().await.is_some() {
}
tracing::info!("Warm-up complete");
}
Ok(())
}
.instrument({
@@ -1457,7 +1567,7 @@ impl Tenant {
/// For tests, use `DatadirModification::init_empty_test_timeline` + `commit` to setup the
/// minimum amount of keys required to get a writable timeline.
/// (Without it, `put` might fail due to `repartition` failing.)
pub async fn create_empty_timeline(
pub(crate) async fn create_empty_timeline(
&self,
new_timeline_id: TimelineId,
initdb_lsn: Lsn,
@@ -1469,10 +1579,7 @@ impl Tenant {
"Cannot create empty timelines on inactive tenant"
);
let timeline_uninit_mark = {
let timelines = self.timelines.lock().unwrap();
self.create_timeline_uninit_mark(new_timeline_id, &timelines)?
};
let timeline_uninit_mark = self.create_timeline_uninit_mark(new_timeline_id)?;
let new_metadata = TimelineMetadata::new(
// Initialize disk_consistent LSN to 0, The caller must import some data to
// make it valid, before calling finish_creation()
@@ -1549,7 +1656,7 @@ impl Tenant {
/// If the caller specified the timeline ID to use (`new_timeline_id`), and timeline with
/// the same timeline ID already exists, returns CreateTimelineError::AlreadyExists.
#[allow(clippy::too_many_arguments)]
pub async fn create_timeline(
pub(crate) async fn create_timeline(
&self,
new_timeline_id: TimelineId,
ancestor_timeline_id: Option<TimelineId>,
@@ -1570,26 +1677,51 @@ impl Tenant {
.enter()
.map_err(|_| CreateTimelineError::ShuttingDown)?;
if let Ok(existing) = self.get_timeline(new_timeline_id, false) {
debug!("timeline {new_timeline_id} already exists");
if let Some(remote_client) = existing.remote_client.as_ref() {
// Wait for uploads to complete, so that when we return Ok, the timeline
// is known to be durable on remote storage. Just like we do at the end of
// this function, after we have created the timeline ourselves.
//
// We only really care that the initial version of `index_part.json` has
// been uploaded. That's enough to remember that the timeline
// exists. However, there is no function to wait specifically for that so
// we just wait for all in-progress uploads to finish.
remote_client
.wait_completion()
.await
.context("wait for timeline uploads to complete")?;
// Get exclusive access to the timeline ID: this ensures that it does not already exist,
// and that no other creation attempts will be allowed in while we are working. The
// uninit_mark is a guard.
let uninit_mark = match self.create_timeline_uninit_mark(new_timeline_id) {
Ok(m) => m,
Err(TimelineExclusionError::AlreadyCreating) => {
// Creation is in progress, we cannot create it again, and we cannot
// check if this request matches the existing one, so caller must try
// again later.
return Err(CreateTimelineError::AlreadyCreating);
}
Err(TimelineExclusionError::Other(e)) => {
return Err(CreateTimelineError::Other(e));
}
Err(TimelineExclusionError::AlreadyExists(existing)) => {
debug!("timeline {new_timeline_id} already exists");
return Err(CreateTimelineError::AlreadyExists);
}
// Idempotency: creating the same timeline twice is not an error, unless
// the second creation has different parameters.
if existing.get_ancestor_timeline_id() != ancestor_timeline_id
|| existing.pg_version != pg_version
|| (ancestor_start_lsn.is_some()
&& ancestor_start_lsn != Some(existing.get_ancestor_lsn()))
{
return Err(CreateTimelineError::Conflict);
}
if let Some(remote_client) = existing.remote_client.as_ref() {
// Wait for uploads to complete, so that when we return Ok, the timeline
// is known to be durable on remote storage. Just like we do at the end of
// this function, after we have created the timeline ourselves.
//
// We only really care that the initial version of `index_part.json` has
// been uploaded. That's enough to remember that the timeline
// exists. However, there is no function to wait specifically for that so
// we just wait for all in-progress uploads to finish.
remote_client
.wait_completion()
.await
.context("wait for timeline uploads to complete")?;
}
return Ok(existing);
}
};
let loaded_timeline = match ancestor_timeline_id {
Some(ancestor_timeline_id) => {
@@ -1626,18 +1758,32 @@ impl Tenant {
ancestor_timeline.wait_lsn(*lsn, ctx).await?;
}
self.branch_timeline(&ancestor_timeline, new_timeline_id, ancestor_start_lsn, ctx)
.await?
self.branch_timeline(
&ancestor_timeline,
new_timeline_id,
ancestor_start_lsn,
uninit_mark,
ctx,
)
.await?
}
None => {
self.bootstrap_timeline(new_timeline_id, pg_version, load_existing_initdb, ctx)
.await?
self.bootstrap_timeline(
new_timeline_id,
pg_version,
load_existing_initdb,
uninit_mark,
ctx,
)
.await?
}
};
// At this point we have dropped our guard on [`Self::timelines_creating`], and
// the timeline is visible in [`Self::timelines`], but it is _not_ durable yet. We must
// not send a success to the caller until it is. The same applies to handling retries,
// see the handling of [`TimelineExclusionError::AlreadyExists`] above.
if let Some(remote_client) = loaded_timeline.remote_client.as_ref() {
// Wait for the upload of the 'index_part.json` file to finish, so that when we return
// Ok, the timeline is durable in remote storage.
let kind = ancestor_timeline_id
.map(|_| "branched")
.unwrap_or("bootstrapped");
@@ -1651,6 +1797,15 @@ impl Tenant {
Ok(loaded_timeline)
}
pub(crate) async fn delete_timeline(
self: Arc<Self>,
timeline_id: TimelineId,
) -> Result<(), DeleteTimelineError> {
DeleteTimelineFlow::run(&self, timeline_id, false).await?;
Ok(())
}
/// perform one garbage collection iteration, removing old data files from disk.
/// this function is periodically called by gc task.
/// also it can be explicitly requested through page server api 'do_gc' command.
@@ -1812,7 +1967,7 @@ impl Tenant {
);
*current_state = TenantState::Active;
let elapsed = self.loading_started_at.elapsed();
let elapsed = self.constructed_at.elapsed();
let total_timelines = timelines_accessor.len();
// log a lot of stuff, because some tenants sometimes suffer from user-visible
@@ -1827,7 +1982,7 @@ impl Tenant {
"activation attempt finished"
);
TENANT_ACTIVATION.observe(elapsed.as_secs_f64());
TENANT.activation.observe(elapsed.as_secs_f64());
});
}
}
@@ -2082,18 +2237,41 @@ impl Tenant {
self.state.subscribe()
}
pub(crate) async fn wait_to_become_active(&self) -> Result<(), GetActiveTenantError> {
/// The activate_now semaphore is initialized with zero units. As soon as
/// we add a unit, waiters will be able to acquire a unit and proceed.
pub(crate) fn activate_now(&self) {
self.activate_now_sem.add_permits(1);
}
pub(crate) async fn wait_to_become_active(
&self,
timeout: Duration,
) -> Result<(), GetActiveTenantError> {
let mut receiver = self.state.subscribe();
loop {
let current_state = receiver.borrow_and_update().clone();
match current_state {
TenantState::Loading | TenantState::Attaching | TenantState::Activating(_) => {
// in these states, there's a chance that we can reach ::Active
receiver.changed().await.map_err(
|_e: tokio::sync::watch::error::RecvError|
// Tenant existed but was dropped: report it as non-existent
GetActiveTenantError::NotFound(GetTenantError::NotFound(self.tenant_shard_id.tenant_id))
)?;
self.activate_now();
match timeout_cancellable(timeout, &self.cancel, receiver.changed()).await {
Ok(r) => {
r.map_err(
|_e: tokio::sync::watch::error::RecvError|
// Tenant existed but was dropped: report it as non-existent
GetActiveTenantError::NotFound(GetTenantError::NotFound(self.tenant_shard_id.tenant_id))
)?
}
Err(TimeoutCancellableError::Cancelled) => {
return Err(GetActiveTenantError::Cancelled);
}
Err(TimeoutCancellableError::Timeout) => {
return Err(GetActiveTenantError::WaitForActiveTimeout {
latest_state: Some(self.current_state()),
wait_time: timeout,
});
}
}
}
TenantState::Active { .. } => {
return Ok(());
@@ -2114,6 +2292,14 @@ impl Tenant {
.attach_mode
.clone()
}
pub(crate) fn get_tenant_shard_id(&self) -> &TenantShardId {
&self.tenant_shard_id
}
pub(crate) fn get_generation(&self) -> Generation {
self.generation
}
}
/// Given a Vec of timelines and their ancestors (timeline_id, ancestor_id),
@@ -2252,6 +2438,18 @@ impl Tenant {
.or(self.conf.default_tenant_conf.min_resident_size_override)
}
pub fn get_heatmap_period(&self) -> Option<Duration> {
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
let heatmap_period = tenant_conf
.heatmap_period
.unwrap_or(self.conf.default_tenant_conf.heatmap_period);
if heatmap_period.is_zero() {
None
} else {
Some(heatmap_period)
}
}
pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) {
self.tenant_conf.write().unwrap().tenant_conf = new_tenant_conf;
// Don't hold self.timelines.lock() during the notifies.
@@ -2398,9 +2596,10 @@ impl Tenant {
conf,
// using now here is good enough approximation to catch tenants with really long
// activation times.
loading_started_at: Instant::now(),
constructed_at: Instant::now(),
tenant_conf: Arc::new(RwLock::new(attached_conf)),
timelines: Mutex::new(HashMap::new()),
timelines_creating: Mutex::new(HashSet::new()),
gc_cs: tokio::sync::Mutex::new(()),
walredo_mgr,
remote_storage,
@@ -2409,6 +2608,7 @@ impl Tenant {
cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()),
cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()),
activate_now_sem: tokio::sync::Semaphore::new(0),
delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTenantFlow::default())),
cancel: CancellationToken::default(),
gate: Gate::new(format!("Tenant<{tenant_shard_id}>")),
@@ -2792,8 +2992,9 @@ impl Tenant {
start_lsn: Option<Lsn>,
ctx: &RequestContext,
) -> Result<Arc<Timeline>, CreateTimelineError> {
let uninit_mark = self.create_timeline_uninit_mark(dst_id).unwrap();
let tl = self
.branch_timeline_impl(src_timeline, dst_id, start_lsn, ctx)
.branch_timeline_impl(src_timeline, dst_id, start_lsn, uninit_mark, ctx)
.await?;
tl.set_state(TimelineState::Active);
Ok(tl)
@@ -2807,9 +3008,10 @@ impl Tenant {
src_timeline: &Arc<Timeline>,
dst_id: TimelineId,
start_lsn: Option<Lsn>,
timeline_uninit_mark: TimelineUninitMark<'_>,
ctx: &RequestContext,
) -> Result<Arc<Timeline>, CreateTimelineError> {
self.branch_timeline_impl(src_timeline, dst_id, start_lsn, ctx)
self.branch_timeline_impl(src_timeline, dst_id, start_lsn, timeline_uninit_mark, ctx)
.await
}
@@ -2818,13 +3020,14 @@ impl Tenant {
src_timeline: &Arc<Timeline>,
dst_id: TimelineId,
start_lsn: Option<Lsn>,
timeline_uninit_mark: TimelineUninitMark<'_>,
_ctx: &RequestContext,
) -> Result<Arc<Timeline>, CreateTimelineError> {
let src_id = src_timeline.timeline_id;
// First acquire the GC lock so that another task cannot advance the GC
// cutoff in 'gc_info', and make 'start_lsn' invalid, while we are
// creating the branch.
// We will validate our ancestor LSN in this function. Acquire the GC lock so that
// this check cannot race with GC, and the ancestor LSN is guaranteed to remain
// valid while we are creating the branch.
let _gc_cs = self.gc_cs.lock().await;
// If no start LSN is specified, we branch the new timeline from the source timeline's last record LSN
@@ -2834,13 +3037,6 @@ impl Tenant {
lsn
});
// Create a placeholder for the new branch. This will error
// out if the new timeline ID is already in use.
let timeline_uninit_mark = {
let timelines = self.timelines.lock().unwrap();
self.create_timeline_uninit_mark(dst_id, &timelines)?
};
// Ensure that `start_lsn` is valid, i.e. the LSN is within the PITR
// horizon on the source timeline
//
@@ -2932,21 +3128,38 @@ impl Tenant {
Ok(new_timeline)
}
/// - run initdb to init temporary instance and get bootstrap data
/// - after initialization completes, tar up the temp dir and upload it to S3.
///
/// The caller is responsible for activating the returned timeline.
pub(crate) async fn bootstrap_timeline(
/// For unit tests, make this visible so that other modules can directly create timelines
#[cfg(test)]
pub(crate) async fn bootstrap_timeline_test(
&self,
timeline_id: TimelineId,
pg_version: u32,
load_existing_initdb: Option<TimelineId>,
ctx: &RequestContext,
) -> anyhow::Result<Arc<Timeline>> {
let timeline_uninit_mark = {
let timelines = self.timelines.lock().unwrap();
self.create_timeline_uninit_mark(timeline_id, &timelines)?
};
let uninit_mark = self.create_timeline_uninit_mark(timeline_id).unwrap();
self.bootstrap_timeline(
timeline_id,
pg_version,
load_existing_initdb,
uninit_mark,
ctx,
)
.await
}
/// - run initdb to init temporary instance and get bootstrap data
/// - after initialization completes, tar up the temp dir and upload it to S3.
///
/// The caller is responsible for activating the returned timeline.
async fn bootstrap_timeline(
&self,
timeline_id: TimelineId,
pg_version: u32,
load_existing_initdb: Option<TimelineId>,
timeline_uninit_mark: TimelineUninitMark<'_>,
ctx: &RequestContext,
) -> anyhow::Result<Arc<Timeline>> {
// create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/`
// temporary directory for basebackup files for the given timeline.
@@ -2980,6 +3193,7 @@ impl Tenant {
storage,
&self.tenant_shard_id,
&existing_initdb_timeline_id,
&self.cancel,
)
.await
.context("download initdb tar")?;
@@ -3020,6 +3234,7 @@ impl Tenant {
&timeline_id,
pgdata_zstd.try_clone().await?,
tar_zst_size,
&self.cancel,
)
.await
},
@@ -3027,8 +3242,7 @@ impl Tenant {
3,
u32::MAX,
"persist_initdb_tar_zst",
// TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Cancelled")),
)
.await?;
@@ -3143,11 +3357,11 @@ impl Tenant {
/// at 'disk_consistent_lsn'. After any initial data has been imported, call
/// `finish_creation` to insert the Timeline into the timelines map and to remove the
/// uninit mark file.
async fn prepare_new_timeline(
&self,
async fn prepare_new_timeline<'a>(
&'a self,
new_timeline_id: TimelineId,
new_metadata: &TimelineMetadata,
uninit_mark: TimelineUninitMark,
uninit_mark: TimelineUninitMark<'a>,
start_lsn: Lsn,
ancestor: Option<Arc<Timeline>>,
) -> anyhow::Result<UninitializedTimeline> {
@@ -3220,23 +3434,38 @@ impl Tenant {
fn create_timeline_uninit_mark(
&self,
timeline_id: TimelineId,
timelines: &MutexGuard<HashMap<TimelineId, Arc<Timeline>>>,
) -> anyhow::Result<TimelineUninitMark> {
) -> Result<TimelineUninitMark, TimelineExclusionError> {
let tenant_shard_id = self.tenant_shard_id;
anyhow::ensure!(
timelines.get(&timeline_id).is_none(),
"Timeline {tenant_shard_id}/{timeline_id} already exists in pageserver's memory"
);
let timeline_path = self.conf.timeline_path(&tenant_shard_id, &timeline_id);
anyhow::ensure!(
!timeline_path.exists(),
"Timeline {timeline_path} already exists, cannot create its uninit mark file",
);
let uninit_mark_path = self
.conf
.timeline_uninit_mark_file_path(tenant_shard_id, timeline_id);
let timeline_path = self.conf.timeline_path(&tenant_shard_id, &timeline_id);
let uninit_mark = TimelineUninitMark::new(
self,
timeline_id,
uninit_mark_path.clone(),
timeline_path.clone(),
)?;
// At this stage, we have got exclusive access to in-memory state for this timeline ID
// for creation.
// A timeline directory should never exist on disk already:
// - a previous failed creation would have cleaned up after itself
// - a pageserver restart would clean up timeline directories that don't have valid remote state
//
// Therefore it is an unexpected internal error to encounter a timeline directory already existing here,
// this error may indicate a bug in cleanup on failed creations.
if timeline_path.exists() {
return Err(TimelineExclusionError::Other(anyhow::anyhow!(
"Timeline directory already exists! This is a bug."
)));
}
// Create the on-disk uninit mark _after_ the in-memory acquisition of the tenant ID: guarantees
// that during process runtime, colliding creations will be caught in-memory without getting
// as far as failing to write a file.
fs::OpenOptions::new()
.write(true)
.create_new(true)
@@ -3250,8 +3479,6 @@ impl Tenant {
format!("Failed to crate uninit mark for timeline {tenant_shard_id}/{timeline_id}")
})?;
let uninit_mark = TimelineUninitMark::new(uninit_mark_path, timeline_path);
Ok(uninit_mark)
}
@@ -3694,6 +3921,7 @@ pub(crate) mod harness {
tenant_conf.evictions_low_residence_duration_metric_threshold,
),
gc_feedback: Some(tenant_conf.gc_feedback),
heatmap_period: Some(tenant_conf.heatmap_period),
}
}
}
@@ -4000,13 +4228,7 @@ mod tests {
.await
{
Ok(_) => panic!("duplicate timeline creation should fail"),
Err(e) => assert_eq!(
e.to_string(),
format!(
"Timeline {}/{} already exists in pageserver's memory",
tenant.tenant_shard_id, TIMELINE_ID
)
),
Err(e) => assert_eq!(e.to_string(), "Already exists".to_string()),
}
Ok(())

View File

@@ -334,6 +334,11 @@ pub struct TenantConf {
#[serde(with = "humantime_serde")]
pub evictions_low_residence_duration_metric_threshold: Duration,
pub gc_feedback: bool,
/// If non-zero, the period between uploads of a heatmap from attached tenants. This
/// may be disabled if a Tenant will not have secondary locations: only secondary
/// locations will use the heatmap uploaded by attached locations.
pub heatmap_period: Duration,
}
/// Same as TenantConf, but this struct preserves the information about
@@ -414,6 +419,11 @@ pub struct TenantConfOpt {
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub gc_feedback: Option<bool>,
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(with = "humantime_serde")]
#[serde(default)]
pub heatmap_period: Option<Duration>,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
@@ -482,6 +492,7 @@ impl TenantConfOpt {
.evictions_low_residence_duration_metric_threshold
.unwrap_or(global_conf.evictions_low_residence_duration_metric_threshold),
gc_feedback: self.gc_feedback.unwrap_or(global_conf.gc_feedback),
heatmap_period: self.heatmap_period.unwrap_or(global_conf.heatmap_period),
}
}
}
@@ -519,6 +530,7 @@ impl Default for TenantConf {
)
.expect("cannot parse default evictions_low_residence_duration_metric_threshold"),
gc_feedback: false,
heatmap_period: Duration::ZERO,
}
}
}

View File

@@ -71,6 +71,7 @@ async fn create_remote_delete_mark(
conf: &PageServerConf,
remote_storage: &GenericRemoteStorage,
tenant_shard_id: &TenantShardId,
cancel: &CancellationToken,
) -> Result<(), DeleteTenantError> {
let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;
@@ -87,8 +88,7 @@ async fn create_remote_delete_mark(
FAILED_UPLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"mark_upload",
// TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled")),
)
.await
.context("mark_upload")?;
@@ -170,6 +170,7 @@ async fn remove_tenant_remote_delete_mark(
conf: &PageServerConf,
remote_storage: Option<&GenericRemoteStorage>,
tenant_shard_id: &TenantShardId,
cancel: &CancellationToken,
) -> Result<(), DeleteTenantError> {
if let Some(remote_storage) = remote_storage {
let path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;
@@ -179,8 +180,7 @@ async fn remove_tenant_remote_delete_mark(
FAILED_UPLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"remove_tenant_remote_delete_mark",
// TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled")),
)
.await
.context("remove_tenant_remote_delete_mark")?;
@@ -322,9 +322,15 @@ impl DeleteTenantFlow {
// Though sounds scary, different mark name?
// Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state.
if let Some(remote_storage) = &remote_storage {
create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id)
.await
.context("remote_mark")?
create_remote_delete_mark(
conf,
remote_storage,
&tenant.tenant_shard_id,
// Can't use tenant.cancel, it's already shut down. TODO: wire in an appropriate token
&CancellationToken::new(),
)
.await
.context("remote_mark")?
}
fail::fail_point!("tenant-delete-before-create-local-mark", |_| {
@@ -524,8 +530,14 @@ impl DeleteTenantFlow {
.context("timelines dir not empty")?;
}
remove_tenant_remote_delete_mark(conf, remote_storage.as_ref(), &tenant.tenant_shard_id)
.await?;
remove_tenant_remote_delete_mark(
conf,
remote_storage.as_ref(),
&tenant.tenant_shard_id,
// Can't use tenant.cancel, it's already shut down. TODO: wire in an appropriate token
&CancellationToken::new(),
)
.await?;
fail::fail_point!("tenant-delete-before-cleanup-remaining-fs-traces", |_| {
Err(anyhow::anyhow!(

View File

@@ -28,7 +28,7 @@ use crate::control_plane_client::{
ControlPlaneClient, ControlPlaneGenerationsApi, RetryForeverError,
};
use crate::deletion_queue::DeletionQueueClient;
use crate::metrics::TENANT_MANAGER as METRICS;
use crate::metrics::{TENANT, TENANT_MANAGER as METRICS};
use crate::task_mgr::{self, TaskKind};
use crate::tenant::config::{
AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, TenantConfOpt,
@@ -44,7 +44,6 @@ use utils::generation::Generation;
use utils::id::{TenantId, TimelineId};
use super::delete::DeleteTenantError;
use super::timeline::delete::DeleteTimelineFlow;
use super::TenantSharedResources;
/// For a tenant that appears in TenantsMap, it may either be
@@ -430,6 +429,13 @@ pub async fn init_tenant_mgr(
let tenant_generations =
init_load_generations(conf, &tenant_configs, &resources, &cancel).await?;
tracing::info!(
"Attaching {} tenants at startup, warming up {} at a time",
tenant_configs.len(),
conf.concurrent_tenant_warmup.initial_permits()
);
TENANT.startup_scheduled.inc_by(tenant_configs.len() as u64);
// Construct `Tenant` objects and start them running
for (tenant_shard_id, location_conf) in tenant_configs {
let tenant_dir_path = conf.tenant_path(&tenant_shard_id);
@@ -807,6 +813,12 @@ pub(crate) async fn set_new_tenant_config(
}
impl TenantManager {
/// Convenience function so that anyone with a TenantManager can get at the global configuration, without
/// having to pass it around everywhere as a separate object.
pub(crate) fn get_conf(&self) -> &'static PageServerConf {
self.conf
}
/// Gets the attached tenant from the in-memory data, erroring if it's absent, in secondary mode, or is not fitting to the query.
/// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants.
pub(crate) fn get_attached_tenant_shard(
@@ -842,17 +854,6 @@ impl TenantManager {
}
}
pub(crate) async fn delete_timeline(
&self,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
_ctx: &RequestContext,
) -> Result<(), DeleteTimelineError> {
let tenant = self.get_attached_tenant_shard(tenant_shard_id, true)?;
DeleteTimelineFlow::run(&tenant, timeline_id, false).await?;
Ok(())
}
#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
pub(crate) async fn upsert_location(
&self,
@@ -1087,6 +1088,20 @@ impl TenantManager {
Ok(())
}
pub(crate) fn get_attached_active_tenant_shards(&self) -> Vec<Arc<Tenant>> {
let locked = self.tenants.read().unwrap();
match &*locked {
TenantsMap::Initializing => Vec::new(),
TenantsMap::Open(map) | TenantsMap::ShuttingDown(map) => map
.values()
.filter_map(|slot| {
slot.get_attached()
.and_then(|t| if t.is_active() { Some(t.clone()) } else { None })
})
.collect(),
}
}
}
#[derive(Debug, thiserror::Error)]
@@ -1201,7 +1216,10 @@ pub(crate) async fn get_active_tenant_with_timeout(
// Fast path: we don't need to do any async waiting.
return Ok(tenant.clone());
}
_ => (WaitFor::Tenant(tenant.clone()), tenant_shard_id),
_ => {
tenant.activate_now();
(WaitFor::Tenant(tenant.clone()), tenant_shard_id)
}
}
}
Some(TenantSlot::Secondary) => {
@@ -1255,28 +1273,10 @@ pub(crate) async fn get_active_tenant_with_timeout(
};
tracing::debug!("Waiting for tenant to enter active state...");
match timeout_cancellable(
deadline.duration_since(Instant::now()),
cancel,
tenant.wait_to_become_active(),
)
.await
{
Ok(Ok(())) => Ok(tenant),
Ok(Err(e)) => Err(e),
Err(TimeoutCancellableError::Timeout) => {
let latest_state = tenant.current_state();
if latest_state == TenantState::Active {
Ok(tenant)
} else {
Err(GetActiveTenantError::WaitForActiveTimeout {
latest_state: Some(latest_state),
wait_time: timeout,
})
}
}
Err(TimeoutCancellableError::Cancelled) => Err(GetActiveTenantError::Cancelled),
}
tenant
.wait_to_become_active(deadline.duration_since(Instant::now()))
.await?;
Ok(tenant)
}
pub(crate) async fn delete_tenant(

View File

@@ -180,7 +180,7 @@
//! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync
//! [`Timeline::load_layer_map`]: super::Timeline::load_layer_map
mod download;
pub(crate) mod download;
pub mod index;
mod upload;
@@ -196,10 +196,12 @@ pub(crate) use upload::upload_initdb_dir;
use utils::backoff::{
self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
};
use utils::timeout::{timeout_cancellable, TimeoutCancellableError};
use std::collections::{HashMap, VecDeque};
use std::sync::atomic::{AtomicU32, Ordering};
use std::sync::{Arc, Mutex};
use std::time::Duration;
use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
use std::ops::DerefMut;
@@ -316,6 +318,47 @@ pub struct RemoteTimelineClient {
storage_impl: GenericRemoteStorage,
deletion_queue_client: DeletionQueueClient,
cancel: CancellationToken,
}
/// This timeout is intended to deal with hangs in lower layers, e.g. stuck TCP flows. It is not
/// intended to be snappy enough for prompt shutdown, as we have a CancellationToken for that.
const UPLOAD_TIMEOUT: Duration = Duration::from_secs(120);
const DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(120);
/// Wrapper for timeout_cancellable that flattens result and converts TimeoutCancellableError to anyhow.
///
/// This is a convenience for the various upload functions. In future
/// the anyhow::Error result should be replaced with a more structured type that
/// enables callers to avoid handling shutdown as an error.
async fn upload_cancellable<F>(cancel: &CancellationToken, future: F) -> anyhow::Result<()>
where
F: std::future::Future<Output = anyhow::Result<()>>,
{
match timeout_cancellable(UPLOAD_TIMEOUT, cancel, future).await {
Ok(Ok(())) => Ok(()),
Ok(Err(e)) => Err(e),
Err(TimeoutCancellableError::Timeout) => Err(anyhow::anyhow!("Timeout")),
Err(TimeoutCancellableError::Cancelled) => Err(anyhow::anyhow!("Shutting down")),
}
}
/// Wrapper for timeout_cancellable that flattens result and converts TimeoutCancellableError to DownloaDError.
async fn download_cancellable<F, R>(
cancel: &CancellationToken,
future: F,
) -> Result<R, DownloadError>
where
F: std::future::Future<Output = Result<R, DownloadError>>,
{
match timeout_cancellable(DOWNLOAD_TIMEOUT, cancel, future).await {
Ok(Ok(r)) => Ok(r),
Ok(Err(e)) => Err(e),
Err(TimeoutCancellableError::Timeout) => {
Err(DownloadError::Other(anyhow::anyhow!("Timed out")))
}
Err(TimeoutCancellableError::Cancelled) => Err(DownloadError::Cancelled),
}
}
impl RemoteTimelineClient {
@@ -351,6 +394,7 @@ impl RemoteTimelineClient {
&tenant_shard_id,
&timeline_id,
)),
cancel: CancellationToken::new(),
}
}
@@ -501,6 +545,7 @@ impl RemoteTimelineClient {
&self,
layer_file_name: &LayerFileName,
layer_metadata: &LayerFileMetadata,
cancel: &CancellationToken,
) -> anyhow::Result<u64> {
let downloaded_size = {
let _unfinished_gauge_guard = self.metrics.call_begin(
@@ -517,6 +562,7 @@ impl RemoteTimelineClient {
self.timeline_id,
layer_file_name,
layer_metadata,
cancel,
)
.measure_remote_op(
self.tenant_shard_id.tenant_id,
@@ -971,6 +1017,7 @@ impl RemoteTimelineClient {
&self.timeline_id,
self.generation,
&index_part_with_deleted_at,
&self.cancel,
)
},
|_e| false,
@@ -980,8 +1027,7 @@ impl RemoteTimelineClient {
// when executed as part of tenant deletion this happens in the background
2,
"persist_index_part_with_deleted_flag",
// TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Cancelled")),
)
.await?;
@@ -1281,6 +1327,7 @@ impl RemoteTimelineClient {
path,
layer_metadata,
self.generation,
&self.cancel,
)
.measure_remote_op(
self.tenant_shard_id.tenant_id,
@@ -1307,6 +1354,7 @@ impl RemoteTimelineClient {
&self.timeline_id,
self.generation,
index_part,
&self.cancel,
)
.measure_remote_op(
self.tenant_shard_id.tenant_id,
@@ -1604,6 +1652,23 @@ impl RemoteTimelineClient {
}
}
}
pub(crate) fn get_layers_metadata(
&self,
layers: Vec<LayerFileName>,
) -> anyhow::Result<Vec<Option<LayerFileMetadata>>> {
let q = self.upload_queue.lock().unwrap();
let q = match &*q {
UploadQueue::Stopped(_) | UploadQueue::Uninitialized => {
anyhow::bail!("queue is in state {}", q.as_str())
}
UploadQueue::Initialized(inner) => inner,
};
let decorated = layers.into_iter().map(|l| q.latest_files.get(&l).cloned());
Ok(decorated.collect())
}
}
pub fn remote_timelines_path(tenant_shard_id: &TenantShardId) -> RemotePath {
@@ -1659,6 +1724,13 @@ pub fn remote_index_path(
.expect("Failed to construct path")
}
pub const HEATMAP_BASENAME: &str = "heatmap-v1.json";
pub(crate) fn remote_heatmap_path(tenant_shard_id: &TenantShardId) -> RemotePath {
RemotePath::from_string(&format!("tenants/{tenant_shard_id}/{HEATMAP_BASENAME}"))
.expect("Failed to construct path")
}
/// Given the key of an index, parse out the generation part of the name
pub fn parse_remote_index_path(path: RemotePath) -> Option<Generation> {
let file_name = match path.get_path().file_name() {
@@ -1804,6 +1876,7 @@ mod tests {
&self.harness.tenant_shard_id,
&TIMELINE_ID,
)),
cancel: CancellationToken::new(),
})
}

View File

@@ -5,7 +5,6 @@
use std::collections::HashSet;
use std::future::Future;
use std::time::Duration;
use anyhow::{anyhow, Context};
use camino::{Utf8Path, Utf8PathBuf};
@@ -14,13 +13,17 @@ use tokio::fs::{self, File, OpenOptions};
use tokio::io::{AsyncSeekExt, AsyncWriteExt};
use tokio_util::sync::CancellationToken;
use tracing::warn;
use utils::timeout::timeout_cancellable;
use utils::{backoff, crashsafe};
use crate::config::PageServerConf;
use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
use crate::tenant::remote_timeline_client::{
download_cancellable, remote_layer_path, remote_timelines_path, DOWNLOAD_TIMEOUT,
};
use crate::tenant::storage_layer::LayerFileName;
use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::Generation;
use crate::virtual_file::on_fatal_io_error;
use crate::TEMP_FILE_SUFFIX;
use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode};
use utils::crashsafe::path_with_suffix_extension;
@@ -32,8 +35,6 @@ use super::{
FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, INITDB_PATH,
};
static MAX_DOWNLOAD_DURATION: Duration = Duration::from_secs(120);
///
/// If 'metadata' is given, we will validate that the downloaded file's size matches that
/// in the metadata. (In the future, we might do more cross-checks, like CRC validation)
@@ -46,6 +47,7 @@ pub async fn download_layer_file<'a>(
timeline_id: TimelineId,
layer_file_name: &'a LayerFileName,
layer_metadata: &'a LayerFileMetadata,
cancel: &CancellationToken,
) -> Result<u64, DownloadError> {
debug_assert_current_span_has_tenant_and_timeline_id();
@@ -73,14 +75,18 @@ pub async fn download_layer_file<'a>(
// If pageserver crashes the temp file will be deleted on startup and re-downloaded.
let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION);
let cancel_inner = cancel.clone();
let (mut destination_file, bytes_amount) = download_retry(
|| async {
let destination_file = tokio::fs::File::create(&temp_file_path)
.await
.with_context(|| format!("create a destination file for layer '{temp_file_path}'"))
.map_err(DownloadError::Other)?;
let download = storage
.download(&remote_path)
// Cancellation safety: it is safe to cancel this future, because it isn't writing to a local
// file: the write to local file doesn't start until after the request header is returned
// and we start draining the body stream below
let download = download_cancellable(&cancel_inner, storage.download(&remote_path))
.await
.with_context(|| {
format!(
@@ -94,12 +100,33 @@ pub async fn download_layer_file<'a>(
let mut reader = tokio_util::io::StreamReader::new(download.download_stream);
let bytes_amount = tokio::time::timeout(
MAX_DOWNLOAD_DURATION,
// Cancellation safety: it is safe to cancel this future because it is writing into a temporary file,
// and we will unlink the temporary file if there is an error. This unlink is important because we
// are in a retry loop, and we wouldn't want to leave behind a rogue write I/O to a file that
// we will imminiently try and write to again.
let bytes_amount: u64 = match timeout_cancellable(
DOWNLOAD_TIMEOUT,
&cancel_inner,
tokio::io::copy_buf(&mut reader, &mut destination_file),
)
.await
.map_err(|e| DownloadError::Other(anyhow::anyhow!("Timed out {:?}", e)))?
.with_context(|| {
format!(
"download layer at remote path '{remote_path:?}' into file {temp_file_path:?}"
)
})
.map_err(DownloadError::Other)?
{
Ok(b) => Ok(b),
Err(e) => {
// Remove incomplete files: on restart Timeline would do this anyway, but we must
// do it here for the retry case.
if let Err(e) = tokio::fs::remove_file(&temp_file_path).await {
on_fatal_io_error(&e, &format!("Removing temporary file {temp_file_path}"));
}
Err(e)
}
}
.with_context(|| {
format!(
"download layer at remote path '{remote_path:?}' into file {temp_file_path:?}"
@@ -112,6 +139,7 @@ pub async fn download_layer_file<'a>(
Ok((destination_file, bytes_amount))
},
&format!("download {remote_path:?}"),
cancel,
)
.await?;
@@ -188,8 +216,14 @@ pub async fn list_remote_timelines(
anyhow::bail!("storage-sync-list-remote-timelines");
});
let cancel_inner = cancel.clone();
let listing = download_retry_forever(
|| storage.list(Some(&remote_path), ListingMode::WithDelimiter),
|| {
download_cancellable(
&cancel_inner,
storage.list(Some(&remote_path), ListingMode::WithDelimiter),
)
},
&format!("list timelines for {tenant_shard_id}"),
cancel,
)
@@ -230,9 +264,13 @@ async fn do_download_index_part(
let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation);
let cancel_inner = cancel.clone();
let index_part_bytes = download_retry_forever(
|| async {
let index_part_download = storage.download(&remote_path).await?;
// Cancellation: if is safe to cancel this future because we're just downloading into
// a memory buffer, not touching local disk.
let index_part_download =
download_cancellable(&cancel_inner, storage.download(&remote_path)).await?;
let mut index_part_bytes = Vec::new();
let mut stream = std::pin::pin!(index_part_download.download_stream);
@@ -347,10 +385,7 @@ pub(super) async fn download_index_part(
FAILED_DOWNLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"listing index_part files",
// TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
backoff::Cancel::new(CancellationToken::new(), || -> anyhow::Error {
unreachable!()
}),
backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled")),
)
.await
.map_err(DownloadError::Other)?;
@@ -389,6 +424,7 @@ pub(crate) async fn download_initdb_tar_zst(
storage: &GenericRemoteStorage,
tenant_shard_id: &TenantShardId,
timeline_id: &TimelineId,
cancel: &CancellationToken,
) -> Result<(Utf8PathBuf, File), DownloadError> {
debug_assert_current_span_has_tenant_and_timeline_id();
@@ -406,6 +442,8 @@ pub(crate) async fn download_initdb_tar_zst(
"{INITDB_PATH}.download-{timeline_id}.{TEMP_FILE_SUFFIX}"
));
let cancel_inner = cancel.clone();
let file = download_retry(
|| async {
let file = OpenOptions::new()
@@ -418,10 +456,14 @@ pub(crate) async fn download_initdb_tar_zst(
.with_context(|| format!("tempfile creation {temp_path}"))
.map_err(DownloadError::Other)?;
let download = storage.download(&remote_path).await?;
let download =
download_cancellable(&cancel_inner, storage.download(&remote_path)).await?;
let mut download = tokio_util::io::StreamReader::new(download.download_stream);
let mut writer = tokio::io::BufWriter::with_capacity(8 * 1024, file);
// TODO: this consumption of the response body should be subject to timeout + cancellation, but
// not without thinking carefully about how to recover safely from cancelling a write to
// local storage (e.g. by writing into a temp file as we do in download_layer)
tokio::io::copy_buf(&mut download, &mut writer)
.await
.with_context(|| format!("download initdb.tar.zst at {remote_path:?}"))
@@ -437,6 +479,7 @@ pub(crate) async fn download_initdb_tar_zst(
Ok(file)
},
&format!("download {remote_path}"),
cancel,
)
.await
.map_err(|e| {
@@ -460,7 +503,11 @@ pub(crate) async fn download_initdb_tar_zst(
/// with backoff.
///
/// (See similar logic for uploads in `perform_upload_task`)
async fn download_retry<T, O, F>(op: O, description: &str) -> Result<T, DownloadError>
async fn download_retry<T, O, F>(
op: O,
description: &str,
cancel: &CancellationToken,
) -> Result<T, DownloadError>
where
O: FnMut() -> F,
F: Future<Output = Result<T, DownloadError>>,
@@ -471,10 +518,7 @@ where
FAILED_DOWNLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
description,
// TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
backoff::Cancel::new(CancellationToken::new(), || -> DownloadError {
unreachable!()
}),
backoff::Cancel::new(cancel.clone(), || DownloadError::Cancelled),
)
.await
}

View File

@@ -4,14 +4,17 @@ use anyhow::{bail, Context};
use camino::Utf8Path;
use fail::fail_point;
use pageserver_api::shard::TenantShardId;
use std::io::ErrorKind;
use std::io::{ErrorKind, SeekFrom};
use tokio::fs::{self, File};
use tokio::io::AsyncSeekExt;
use tokio_util::sync::CancellationToken;
use super::Generation;
use crate::{
config::PageServerConf,
tenant::remote_timeline_client::{
index::IndexPart, remote_index_path, remote_initdb_archive_path, remote_path,
upload_cancellable,
},
};
use remote_storage::GenericRemoteStorage;
@@ -28,6 +31,7 @@ pub(super) async fn upload_index_part<'a>(
timeline_id: &TimelineId,
generation: Generation,
index_part: &'a IndexPart,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
tracing::trace!("uploading new index part");
@@ -43,14 +47,16 @@ pub(super) async fn upload_index_part<'a>(
let index_part_bytes = bytes::Bytes::from(index_part_bytes);
let remote_path = remote_index_path(tenant_shard_id, timeline_id, generation);
storage
.upload_storage_object(
upload_cancellable(
cancel,
storage.upload_storage_object(
futures::stream::once(futures::future::ready(Ok(index_part_bytes))),
index_part_size,
&remote_path,
)
.await
.with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'"))
),
)
.await
.with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'"))
}
/// Attempts to upload given layer files.
@@ -63,6 +69,7 @@ pub(super) async fn upload_timeline_layer<'a>(
source_path: &'a Utf8Path,
known_metadata: &'a LayerFileMetadata,
generation: Generation,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
fail_point!("before-upload-layer", |_| {
bail!("failpoint before-upload-layer")
@@ -106,8 +113,7 @@ pub(super) async fn upload_timeline_layer<'a>(
let reader = tokio_util::io::ReaderStream::with_capacity(source_file, super::BUFFER_SIZE);
storage
.upload(reader, fs_size, &storage_path, None)
upload_cancellable(cancel, storage.upload(reader, fs_size, &storage_path, None))
.await
.with_context(|| format!("upload layer from local path '{source_path}'"))?;
@@ -119,16 +125,22 @@ pub(crate) async fn upload_initdb_dir(
storage: &GenericRemoteStorage,
tenant_id: &TenantId,
timeline_id: &TimelineId,
initdb_tar_zst: File,
mut initdb_tar_zst: File,
size: u64,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
tracing::trace!("uploading initdb dir");
// We might have read somewhat into the file already in the prior retry attempt
initdb_tar_zst.seek(SeekFrom::Start(0)).await?;
let file = tokio_util::io::ReaderStream::with_capacity(initdb_tar_zst, super::BUFFER_SIZE);
let remote_path = remote_initdb_archive_path(tenant_id, timeline_id);
storage
.upload_storage_object(file, size as usize, &remote_path)
.await
.with_context(|| format!("upload initdb dir for '{tenant_id} / {timeline_id}'"))
upload_cancellable(
cancel,
storage.upload_storage_object(file, size as usize, &remote_path),
)
.await
.with_context(|| format!("upload initdb dir for '{tenant_id} / {timeline_id}'"))
}

View File

@@ -0,0 +1,104 @@
pub mod heatmap;
mod heatmap_uploader;
use std::sync::Arc;
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
use self::heatmap_uploader::heatmap_uploader_task;
use super::mgr::TenantManager;
use pageserver_api::shard::TenantShardId;
use remote_storage::GenericRemoteStorage;
use tokio_util::sync::CancellationToken;
use utils::completion::Barrier;
enum UploadCommand {
Upload(TenantShardId),
}
struct CommandRequest<T> {
payload: T,
response_tx: tokio::sync::oneshot::Sender<CommandResponse>,
}
struct CommandResponse {
result: anyhow::Result<()>,
}
/// The SecondaryController is a pseudo-rpc client for administrative control of secondary mode downloads,
/// and heatmap uploads. This is not a hot data path: it's primarily a hook for tests,
/// where we want to immediately upload/download for a particular tenant. In normal operation
/// uploads & downloads are autonomous and not driven by this interface.
pub struct SecondaryController {
upload_req_tx: tokio::sync::mpsc::Sender<CommandRequest<UploadCommand>>,
}
impl SecondaryController {
async fn dispatch<T>(
&self,
queue: &tokio::sync::mpsc::Sender<CommandRequest<T>>,
payload: T,
) -> anyhow::Result<()> {
let (response_tx, response_rx) = tokio::sync::oneshot::channel();
queue
.send(CommandRequest {
payload,
response_tx,
})
.await
.map_err(|_| anyhow::anyhow!("Receiver shut down"))?;
let response = response_rx
.await
.map_err(|_| anyhow::anyhow!("Request dropped"))?;
response.result
}
pub async fn upload_tenant(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
self.dispatch(&self.upload_req_tx, UploadCommand::Upload(tenant_shard_id))
.await
}
}
pub fn spawn_tasks(
tenant_manager: Arc<TenantManager>,
remote_storage: GenericRemoteStorage,
background_jobs_can_start: Barrier,
cancel: CancellationToken,
) -> SecondaryController {
let (upload_req_tx, upload_req_rx) =
tokio::sync::mpsc::channel::<CommandRequest<UploadCommand>>(16);
task_mgr::spawn(
BACKGROUND_RUNTIME.handle(),
TaskKind::SecondaryUploads,
None,
None,
"heatmap uploads",
false,
async move {
heatmap_uploader_task(
tenant_manager,
remote_storage,
upload_req_rx,
background_jobs_can_start,
cancel,
)
.await
},
);
SecondaryController { upload_req_tx }
}
/// For running with remote storage disabled: a SecondaryController that is connected to nothing.
pub fn null_controller() -> SecondaryController {
let (upload_req_tx, _upload_req_rx) =
tokio::sync::mpsc::channel::<CommandRequest<UploadCommand>>(16);
SecondaryController { upload_req_tx }
}

View File

@@ -0,0 +1,64 @@
use std::time::SystemTime;
use crate::tenant::{
remote_timeline_client::index::IndexLayerMetadata, storage_layer::LayerFileName,
};
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr, TimestampSeconds};
use utils::{generation::Generation, id::TimelineId};
#[derive(Serialize, Deserialize)]
pub(super) struct HeatMapTenant {
/// Generation of the attached location that uploaded the heatmap: this is not required
/// for correctness, but acts as a hint to secondary locations in order to detect thrashing
/// in the unlikely event that two attached locations are both uploading conflicting heatmaps.
pub(super) generation: Generation,
pub(super) timelines: Vec<HeatMapTimeline>,
}
#[serde_as]
#[derive(Serialize, Deserialize)]
pub(crate) struct HeatMapTimeline {
#[serde_as(as = "DisplayFromStr")]
pub(super) timeline_id: TimelineId,
pub(super) layers: Vec<HeatMapLayer>,
}
#[serde_as]
#[derive(Serialize, Deserialize)]
pub(crate) struct HeatMapLayer {
pub(super) name: LayerFileName,
pub(super) metadata: IndexLayerMetadata,
#[serde_as(as = "TimestampSeconds<i64>")]
pub(super) access_time: SystemTime,
// TODO: an actual 'heat' score that would let secondary locations prioritize downloading
// the hottest layers, rather than trying to simply mirror whatever layers are on-disk on the primary.
}
impl HeatMapLayer {
pub(crate) fn new(
name: LayerFileName,
metadata: IndexLayerMetadata,
access_time: SystemTime,
) -> Self {
Self {
name,
metadata,
access_time,
}
}
}
impl HeatMapTimeline {
pub(crate) fn new(timeline_id: TimelineId, layers: Vec<HeatMapLayer>) -> Self {
Self {
timeline_id,
layers,
}
}
}

View File

@@ -0,0 +1,582 @@
use std::{
collections::HashMap,
sync::{Arc, Weak},
time::{Duration, Instant},
};
use crate::{
metrics::SECONDARY_MODE,
tenant::{
config::AttachmentMode, mgr::TenantManager, remote_timeline_client::remote_heatmap_path,
secondary::CommandResponse, span::debug_assert_current_span_has_tenant_id, Tenant,
},
};
use md5;
use pageserver_api::shard::TenantShardId;
use remote_storage::GenericRemoteStorage;
use tokio::task::JoinSet;
use tokio_util::sync::CancellationToken;
use tracing::instrument;
use utils::{backoff, completion::Barrier};
use super::{heatmap::HeatMapTenant, CommandRequest, UploadCommand};
/// Period between heatmap uploader walking Tenants to look for work to do.
/// If any tenants have a heatmap upload period lower than this, it will be adjusted
/// downward to match.
const DEFAULT_SCHEDULING_INTERVAL: Duration = Duration::from_millis(60000);
const MIN_SCHEDULING_INTERVAL: Duration = Duration::from_millis(1000);
struct WriteInProgress {
barrier: Barrier,
}
struct UploadPending {
tenant: Arc<Tenant>,
last_digest: Option<md5::Digest>,
}
struct WriteComplete {
tenant_shard_id: TenantShardId,
completed_at: Instant,
digest: Option<md5::Digest>,
next_upload: Option<Instant>,
}
/// The heatmap uploader keeps a little bit of per-tenant state, mainly to remember
/// when we last did a write. We only populate this after doing at least one
/// write for a tenant -- this avoids holding state for tenants that have
/// uploads disabled.
struct UploaderTenantState {
// This Weak only exists to enable culling idle instances of this type
// when the Tenant has been deallocated.
tenant: Weak<Tenant>,
/// Digest of the serialized heatmap that we last successfully uploaded
///
/// md5 is generally a bad hash. We use it because it's convenient for interop with AWS S3's ETag,
/// which is also an md5sum.
last_digest: Option<md5::Digest>,
/// When the last upload attempt completed (may have been successful or failed)
last_upload: Option<Instant>,
/// When should we next do an upload? None means never.
next_upload: Option<Instant>,
}
/// This type is owned by a single task ([`heatmap_uploader_task`]) which runs an event
/// handling loop and mutates it as needed: there are no locks here, because that event loop
/// can hold &mut references to this type throughout.
struct HeatmapUploader {
tenant_manager: Arc<TenantManager>,
remote_storage: GenericRemoteStorage,
cancel: CancellationToken,
tenants: HashMap<TenantShardId, UploaderTenantState>,
/// Tenants with work to do, for which tasks should be spawned as soon as concurrency
/// limits permit it.
tenants_pending: std::collections::VecDeque<UploadPending>,
/// Tenants for which a task in `tasks` has been spawned.
tenants_uploading: HashMap<TenantShardId, WriteInProgress>,
tasks: JoinSet<()>,
/// Channel for our child tasks to send results to: we use a channel for results rather than
/// just getting task results via JoinSet because we need the channel's recv() "sleep until something
/// is available" semantic, rather than JoinSet::join_next()'s "sleep until next thing is available _or_ I'm empty"
/// behavior.
task_result_tx: tokio::sync::mpsc::UnboundedSender<WriteComplete>,
task_result_rx: tokio::sync::mpsc::UnboundedReceiver<WriteComplete>,
concurrent_uploads: usize,
scheduling_interval: Duration,
}
/// The uploader task runs a loop that periodically wakes up and schedules tasks for
/// tenants that require an upload, or handles any commands that have been sent into
/// `command_queue`. No I/O is done in this loop: that all happens in the tasks we
/// spawn.
///
/// Scheduling iterations are somewhat infrequent. However, each one will enqueue
/// all tenants that require an upload, and in between scheduling iterations we will
/// continue to spawn new tasks for pending tenants, as our concurrency limit permits.
///
/// While we take a CancellationToken here, it is subordinate to the CancellationTokens
/// of tenants: i.e. we expect all Tenants to have been shut down before we are shut down, otherwise
/// we might block waiting on a Tenant.
pub(super) async fn heatmap_uploader_task(
tenant_manager: Arc<TenantManager>,
remote_storage: GenericRemoteStorage,
mut command_queue: tokio::sync::mpsc::Receiver<CommandRequest<UploadCommand>>,
background_jobs_can_start: Barrier,
cancel: CancellationToken,
) -> anyhow::Result<()> {
let concurrent_uploads = tenant_manager.get_conf().heatmap_upload_concurrency;
let (result_tx, result_rx) = tokio::sync::mpsc::unbounded_channel();
let mut uploader = HeatmapUploader {
tenant_manager,
remote_storage,
cancel: cancel.clone(),
tasks: JoinSet::new(),
tenants: HashMap::new(),
tenants_pending: std::collections::VecDeque::new(),
tenants_uploading: HashMap::new(),
task_result_tx: result_tx,
task_result_rx: result_rx,
concurrent_uploads,
scheduling_interval: DEFAULT_SCHEDULING_INTERVAL,
};
tracing::info!("Waiting for background_jobs_can start...");
background_jobs_can_start.wait().await;
tracing::info!("background_jobs_can is ready, proceeding.");
while !cancel.is_cancelled() {
// Look for new work: this is relatively expensive because we have to go acquire the lock on
// the tenant manager to retrieve tenants, and then iterate over them to figure out which ones
// require an upload.
uploader.schedule_iteration().await?;
// Between scheduling iterations, we will:
// - Drain any complete tasks and spawn pending tasks
// - Handle incoming administrative commands
// - Check our cancellation token
let next_scheduling_iteration = Instant::now()
.checked_add(uploader.scheduling_interval)
.unwrap_or_else(|| {
tracing::warn!(
"Scheduling interval invalid ({}s), running immediately!",
uploader.scheduling_interval.as_secs_f64()
);
Instant::now()
});
loop {
tokio::select! {
_ = cancel.cancelled() => {
// We do not simply drop the JoinSet, in order to have an orderly shutdown without cancellation.
tracing::info!("Heatmap uploader joining tasks");
while let Some(_r) = uploader.tasks.join_next().await {};
tracing::info!("Heatmap uploader terminating");
break;
},
_ = tokio::time::sleep(next_scheduling_iteration.duration_since(Instant::now())) => {
tracing::debug!("heatmap_uploader_task: woke for scheduling interval");
break;},
cmd = command_queue.recv() => {
tracing::debug!("heatmap_uploader_task: woke for command queue");
let cmd = match cmd {
Some(c) =>c,
None => {
// SecondaryController was destroyed, and this has raced with
// our CancellationToken
tracing::info!("Heatmap uploader terminating");
cancel.cancel();
break;
}
};
let CommandRequest{
response_tx,
payload
} = cmd;
uploader.handle_command(payload, response_tx);
},
_ = uploader.process_next_completion() => {
if !cancel.is_cancelled() {
uploader.spawn_pending();
}
}
}
}
}
Ok(())
}
impl HeatmapUploader {
/// Periodic execution phase: inspect all attached tenants and schedule any work they require.
async fn schedule_iteration(&mut self) -> anyhow::Result<()> {
// Cull any entries in self.tenants whose Arc<Tenant> is gone
self.tenants
.retain(|_k, v| v.tenant.upgrade().is_some() && v.next_upload.is_some());
// The priority order of previously scheduled work may be invalidated by current state: drop
// all pending work (it will be re-scheduled if still needed)
self.tenants_pending.clear();
// Used a fixed 'now' through the following loop, for efficiency and fairness.
let now = Instant::now();
// While iterating over the potentially-long list of tenants, we will periodically yield
// to avoid blocking executor.
const YIELD_ITERATIONS: usize = 1000;
// Iterate over tenants looking for work to do.
let tenants = self.tenant_manager.get_attached_active_tenant_shards();
for (i, tenant) in tenants.into_iter().enumerate() {
// Process is shutting down, drop out
if self.cancel.is_cancelled() {
return Ok(());
}
// Skip tenants that already have a write in flight
if self
.tenants_uploading
.contains_key(tenant.get_tenant_shard_id())
{
continue;
}
self.maybe_schedule_upload(&now, tenant);
if i + 1 % YIELD_ITERATIONS == 0 {
tokio::task::yield_now().await;
}
}
// Spawn tasks for as many of our pending tenants as we can.
self.spawn_pending();
Ok(())
}
///
/// Cancellation: this method is cancel-safe.
async fn process_next_completion(&mut self) {
match self.task_result_rx.recv().await {
Some(r) => {
self.on_completion(r);
}
None => {
unreachable!("Result sender is stored on Self");
}
}
}
/// The 'maybe' refers to the tenant's state: whether it is configured
/// for heatmap uploads at all, and whether sufficient time has passed
/// since the last upload.
fn maybe_schedule_upload(&mut self, now: &Instant, tenant: Arc<Tenant>) {
match tenant.get_heatmap_period() {
None => {
// Heatmaps are disabled for this tenant
return;
}
Some(period) => {
// If any tenant has asked for uploads more frequent than our scheduling interval,
// reduce it to match so that we can keep up. This is mainly useful in testing, where
// we may set rather short intervals.
if period < self.scheduling_interval {
self.scheduling_interval = std::cmp::max(period, MIN_SCHEDULING_INTERVAL);
}
}
}
// Stale attachments do not upload anything: if we are in this state, there is probably some
// other attachment in mode Single or Multi running on another pageserver, and we don't
// want to thrash and overwrite their heatmap uploads.
if tenant.get_attach_mode() == AttachmentMode::Stale {
return;
}
// Create an entry in self.tenants if one doesn't already exist: this will later be updated
// with the completion time in on_completion.
let state = self
.tenants
.entry(*tenant.get_tenant_shard_id())
.or_insert_with(|| UploaderTenantState {
tenant: Arc::downgrade(&tenant),
last_upload: None,
next_upload: Some(Instant::now()),
last_digest: None,
});
// Decline to do the upload if insufficient time has passed
if state.next_upload.map(|nu| &nu > now).unwrap_or(false) {
return;
}
let last_digest = state.last_digest;
self.tenants_pending.push_back(UploadPending {
tenant,
last_digest,
})
}
fn spawn_pending(&mut self) {
while !self.tenants_pending.is_empty()
&& self.tenants_uploading.len() < self.concurrent_uploads
{
// unwrap: loop condition includes !is_empty()
let pending = self.tenants_pending.pop_front().unwrap();
self.spawn_upload(pending.tenant, pending.last_digest);
}
}
fn spawn_upload(&mut self, tenant: Arc<Tenant>, last_digest: Option<md5::Digest>) {
let remote_storage = self.remote_storage.clone();
let tenant_shard_id = *tenant.get_tenant_shard_id();
let (completion, barrier) = utils::completion::channel();
let result_tx = self.task_result_tx.clone();
self.tasks.spawn(async move {
// Guard for the barrier in [`WriteInProgress`]
let _completion = completion;
let started_at = Instant::now();
let digest = match upload_tenant_heatmap(remote_storage, &tenant, last_digest).await {
Ok(UploadHeatmapOutcome::Uploaded(digest)) => {
let duration = Instant::now().duration_since(started_at);
SECONDARY_MODE
.upload_heatmap_duration
.observe(duration.as_secs_f64());
SECONDARY_MODE.upload_heatmap.inc();
Some(digest)
}
Ok(UploadHeatmapOutcome::NoChange | UploadHeatmapOutcome::Skipped) => last_digest,
Err(UploadHeatmapError::Upload(e)) => {
tracing::warn!(
"Failed to upload heatmap for tenant {}: {e:#}",
tenant.get_tenant_shard_id(),
);
let duration = Instant::now().duration_since(started_at);
SECONDARY_MODE
.upload_heatmap_duration
.observe(duration.as_secs_f64());
SECONDARY_MODE.upload_heatmap_errors.inc();
last_digest
}
Err(UploadHeatmapError::Cancelled) => {
tracing::info!("Cancelled heatmap upload, shutting down");
last_digest
}
};
let now = Instant::now();
let next_upload = tenant
.get_heatmap_period()
.and_then(|period| now.checked_add(period));
result_tx
.send(WriteComplete {
tenant_shard_id: *tenant.get_tenant_shard_id(),
completed_at: now,
digest,
next_upload,
})
.ok();
});
self.tenants_uploading
.insert(tenant_shard_id, WriteInProgress { barrier });
}
#[instrument(skip_all, fields(tenant_id=%completion.tenant_shard_id.tenant_id, shard_id=%completion.tenant_shard_id.shard_slug()))]
fn on_completion(&mut self, completion: WriteComplete) {
tracing::debug!("Heatmap upload completed");
let WriteComplete {
tenant_shard_id,
completed_at,
digest,
next_upload,
} = completion;
self.tenants_uploading.remove(&tenant_shard_id);
use std::collections::hash_map::Entry;
match self.tenants.entry(tenant_shard_id) {
Entry::Vacant(_) => {
// Tenant state was dropped, nothing to update.
}
Entry::Occupied(mut entry) => {
entry.get_mut().last_upload = Some(completed_at);
entry.get_mut().last_digest = digest;
entry.get_mut().next_upload = next_upload
}
}
}
fn handle_command(
&mut self,
command: UploadCommand,
response_tx: tokio::sync::oneshot::Sender<CommandResponse>,
) {
match command {
UploadCommand::Upload(tenant_shard_id) => {
// If an upload was ongoing for this tenant, let it finish first.
let barrier = if let Some(writing_state) =
self.tenants_uploading.get(&tenant_shard_id)
{
tracing::info!(
tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
"Waiting for heatmap write to complete");
writing_state.barrier.clone()
} else {
// Spawn the upload then immediately wait for it. This will block processing of other commands and
// starting of other background work.
tracing::info!(
tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
"Starting heatmap write on command");
let tenant = match self
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id, true)
{
Ok(t) => t,
Err(e) => {
// Drop result of send: we don't care if caller dropped their receiver
drop(response_tx.send(CommandResponse {
result: Err(e.into()),
}));
return;
}
};
self.spawn_upload(tenant, None);
let writing_state = self
.tenants_uploading
.get(&tenant_shard_id)
.expect("We just inserted this");
tracing::info!(
tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
"Waiting for heatmap upload to complete");
writing_state.barrier.clone()
};
// This task does no I/O: it only listens for a barrier's completion and then
// sends to the command response channel. It is therefore safe to spawn this without
// any gates/task_mgr hooks.
tokio::task::spawn(async move {
barrier.wait().await;
tracing::info!(
tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
"Heatmap upload complete");
// Drop result of send: we don't care if caller dropped their receiver
drop(response_tx.send(CommandResponse { result: Ok(()) }))
});
}
}
}
}
enum UploadHeatmapOutcome {
/// We successfully wrote to remote storage, with this digest.
Uploaded(md5::Digest),
/// We did not upload because the heatmap digest was unchanged since the last upload
NoChange,
/// We skipped the upload for some reason, such as tenant/timeline not ready
Skipped,
}
#[derive(thiserror::Error, Debug)]
enum UploadHeatmapError {
#[error("Cancelled")]
Cancelled,
#[error(transparent)]
Upload(#[from] anyhow::Error),
}
/// The inner upload operation. This will skip if `last_digest` is Some and matches the digest
/// of the object we would have uploaded.
#[instrument(skip_all, fields(tenant_id = %tenant.get_tenant_shard_id().tenant_id, shard_id = %tenant.get_tenant_shard_id().shard_slug()))]
async fn upload_tenant_heatmap(
remote_storage: GenericRemoteStorage,
tenant: &Arc<Tenant>,
last_digest: Option<md5::Digest>,
) -> Result<UploadHeatmapOutcome, UploadHeatmapError> {
debug_assert_current_span_has_tenant_id();
let generation = tenant.get_generation();
if generation.is_none() {
// We do not expect this: generations were implemented before heatmap uploads. However,
// handle it so that we don't have to make the generation in the heatmap an Option<>
// (Generation::none is not serializable)
tracing::warn!("Skipping heatmap upload for tenant with generation==None");
return Ok(UploadHeatmapOutcome::Skipped);
}
let mut heatmap = HeatMapTenant {
timelines: Vec::new(),
generation,
};
let timelines = tenant.timelines.lock().unwrap().clone();
let tenant_cancel = tenant.cancel.clone();
// Ensure that Tenant::shutdown waits for any upload in flight: this is needed because otherwise
// when we delete a tenant, we might race with an upload in flight and end up leaving a heatmap behind
// in remote storage.
let _guard = match tenant.gate.enter() {
Ok(g) => g,
Err(_) => {
tracing::info!("Skipping heatmap upload for tenant which is shutting down");
return Err(UploadHeatmapError::Cancelled);
}
};
for (timeline_id, timeline) in timelines {
let heatmap_timeline = timeline.generate_heatmap().await;
match heatmap_timeline {
None => {
tracing::debug!(
"Skipping heatmap upload because timeline {timeline_id} is not ready"
);
return Ok(UploadHeatmapOutcome::Skipped);
}
Some(heatmap_timeline) => {
heatmap.timelines.push(heatmap_timeline);
}
}
}
// Serialize the heatmap
let bytes = serde_json::to_vec(&heatmap).map_err(|e| anyhow::anyhow!(e))?;
let size = bytes.len();
// Drop out early if nothing changed since our last upload
let digest = md5::compute(&bytes);
if Some(digest) == last_digest {
return Ok(UploadHeatmapOutcome::NoChange);
}
let path = remote_heatmap_path(tenant.get_tenant_shard_id());
// Write the heatmap.
tracing::debug!("Uploading {size} byte heatmap to {path}");
if let Err(e) = backoff::retry(
|| async {
let bytes = futures::stream::once(futures::future::ready(Ok(bytes::Bytes::from(
bytes.clone(),
))));
remote_storage
.upload_storage_object(bytes, size, &path)
.await
},
|_| false,
3,
u32::MAX,
"Uploading heatmap",
backoff::Cancel::new(tenant_cancel.clone(), || anyhow::anyhow!("Shutting down")),
)
.await
{
if tenant_cancel.is_cancelled() {
return Err(UploadHeatmapError::Cancelled);
} else {
return Err(e.into());
}
}
tracing::info!("Successfully uploaded {size} byte heatmap to {path}");
Ok(UploadHeatmapOutcome::Uploaded(digest))
}

View File

@@ -457,6 +457,8 @@ struct LayerInner {
/// For loaded layers, this may be some other value if the tenant has undergone
/// a shard split since the layer was originally written.
shard: ShardIndex,
last_evicted_at: std::sync::Mutex<Option<std::time::Instant>>,
}
impl std::fmt::Display for LayerInner {
@@ -587,6 +589,7 @@ impl LayerInner {
consecutive_failures: AtomicUsize::new(0),
generation,
shard,
last_evicted_at: std::sync::Mutex::default(),
}
}
@@ -722,6 +725,14 @@ impl LayerInner {
permit
};
let since_last_eviction =
self.last_evicted_at.lock().unwrap().map(|ts| ts.elapsed());
if let Some(since_last_eviction) = since_last_eviction {
// FIXME: this will not always be recorded correctly until #6028 (the no
// download needed branch above)
LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction);
}
let res = Arc::new(DownloadedLayer {
owner: Arc::downgrade(self),
kind: tokio::sync::OnceCell::default(),
@@ -851,6 +862,7 @@ impl LayerInner {
let result = client.download_layer_file(
&this.desc.filename(),
&this.metadata(),
&crate::task_mgr::shutdown_token()
)
.await;
@@ -1117,6 +1129,8 @@ impl LayerInner {
// we are still holding the permit, so no new spawn_download_and_wait can happen
drop(self.status.send(Status::Evicted));
*self.last_evicted_at.lock().unwrap() = Some(std::time::Instant::now());
res
}
@@ -1421,6 +1435,7 @@ pub(crate) struct LayerImplMetrics {
rare_counters: enum_map::EnumMap<RareEvent, IntCounter>,
inits_cancelled: metrics::core::GenericCounter<metrics::core::AtomicU64>,
redownload_after: metrics::Histogram,
}
impl Default for LayerImplMetrics {
@@ -1496,6 +1511,26 @@ impl Default for LayerImplMetrics {
)
.unwrap();
let redownload_after = {
let minute = 60.0;
let hour = 60.0 * minute;
metrics::register_histogram!(
"pageserver_layer_redownloaded_after",
"Time between evicting and re-downloading.",
vec![
10.0,
30.0,
minute,
5.0 * minute,
15.0 * minute,
30.0 * minute,
hour,
12.0 * hour,
]
)
.unwrap()
};
Self {
started_evictions,
completed_evictions,
@@ -1507,6 +1542,7 @@ impl Default for LayerImplMetrics {
rare_counters,
inits_cancelled,
redownload_after,
}
}
}
@@ -1574,6 +1610,10 @@ impl LayerImplMetrics {
fn inc_init_cancelled(&self) {
self.inits_cancelled.inc()
}
fn record_redownloaded_after(&self, duration: std::time::Duration) {
self.redownload_after.observe(duration.as_secs_f64())
}
}
#[derive(enum_map::Enum)]

View File

@@ -54,31 +54,18 @@ impl BackgroundLoopKind {
}
}
pub(crate) enum RateLimitError {
Cancelled,
}
pub(crate) async fn concurrent_background_tasks_rate_limit(
/// Cancellation safe.
pub(crate) async fn concurrent_background_tasks_rate_limit_permit(
loop_kind: BackgroundLoopKind,
_ctx: &RequestContext,
cancel: &CancellationToken,
) -> Result<impl Drop, RateLimitError> {
crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_START_COUNT
) -> impl Drop {
let _guard = crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE
.with_label_values(&[loop_kind.as_static_str()])
.inc();
scopeguard::defer!(
crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_FINISH_COUNT.with_label_values(&[loop_kind.as_static_str()]).inc();
);
tokio::select! {
permit = CONCURRENT_BACKGROUND_TASKS.acquire() => {
match permit {
Ok(permit) => Ok(permit),
Err(_closed) => unreachable!("we never close the semaphore"),
}
},
_ = cancel.cancelled() => {
Err(RateLimitError::Cancelled)
}
.guard();
match CONCURRENT_BACKGROUND_TASKS.acquire().await {
Ok(permit) => permit,
Err(_closed) => unreachable!("we never close the semaphore"),
}
}

View File

@@ -29,7 +29,7 @@ use tokio::{
};
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::{id::TenantTimelineId, sync::gate::Gate};
use utils::sync::gate::Gate;
use std::collections::{BinaryHeap, HashMap, HashSet};
use std::ops::{Deref, Range};
@@ -51,7 +51,7 @@ use crate::tenant::storage_layer::{
LayerAccessStatsReset, LayerFileName, ResidentLayer, ValueReconstructResult,
ValueReconstructState,
};
use crate::tenant::tasks::{BackgroundLoopKind, RateLimitError};
use crate::tenant::tasks::BackgroundLoopKind;
use crate::tenant::timeline::logical_size::CurrentLogicalSize;
use crate::tenant::{
layer_map::{LayerMap, SearchResult},
@@ -98,8 +98,9 @@ use self::logical_size::LogicalSize;
use self::walreceiver::{WalReceiver, WalReceiverConf};
use super::config::TenantConf;
use super::remote_timeline_client::index::IndexPart;
use super::remote_timeline_client::index::{IndexLayerMetadata, IndexPart};
use super::remote_timeline_client::RemoteTimelineClient;
use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline};
use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
@@ -377,9 +378,6 @@ pub enum PageReconstructError {
#[error(transparent)]
Other(#[from] anyhow::Error),
/// The operation would require downloading a layer that is missing locally.
NeedsDownload(TenantTimelineId, LayerFileName),
/// The operation was cancelled
Cancelled,
@@ -408,14 +406,6 @@ impl std::fmt::Debug for PageReconstructError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
match self {
Self::Other(err) => err.fmt(f),
Self::NeedsDownload(tenant_timeline_id, layer_file_name) => {
write!(
f,
"layer {}/{} needs download",
tenant_timeline_id,
layer_file_name.file_name()
)
}
Self::Cancelled => write!(f, "cancelled"),
Self::AncestorStopping(timeline_id) => {
write!(f, "ancestor timeline {timeline_id} is being stopped")
@@ -429,14 +419,6 @@ impl std::fmt::Display for PageReconstructError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
match self {
Self::Other(err) => err.fmt(f),
Self::NeedsDownload(tenant_timeline_id, layer_file_name) => {
write!(
f,
"layer {}/{} needs download",
tenant_timeline_id,
layer_file_name.file_name()
)
}
Self::Cancelled => write!(f, "cancelled"),
Self::AncestorStopping(timeline_id) => {
write!(f, "ancestor timeline {timeline_id} is being stopped")
@@ -464,6 +446,12 @@ pub(crate) enum CompactFlags {
ForceRepartition,
}
impl std::fmt::Debug for Timeline {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
write!(f, "Timeline<{}>", self.timeline_id)
}
}
/// Public interface functions
impl Timeline {
/// Get the LSN where this branch was created
@@ -727,19 +715,27 @@ impl Timeline {
flags: EnumSet<CompactFlags>,
ctx: &RequestContext,
) -> Result<(), CompactionError> {
let _g = self.compaction_lock.lock().await;
// most likely the cancellation token is from background task, but in tests it could be the
// request task as well.
let prepare = async move {
let guard = self.compaction_lock.lock().await;
let permit = super::tasks::concurrent_background_tasks_rate_limit_permit(
BackgroundLoopKind::Compaction,
ctx,
)
.await;
(guard, permit)
};
// this wait probably never needs any "long time spent" logging, because we already nag if
// compaction task goes over it's period (20s) which is quite often in production.
let _permit = match super::tasks::concurrent_background_tasks_rate_limit(
BackgroundLoopKind::Compaction,
ctx,
cancel,
)
.await
{
Ok(permit) => permit,
Err(RateLimitError::Cancelled) => return Ok(()),
let (_guard, _permit) = tokio::select! {
tuple = prepare => { tuple },
_ = self.cancel.cancelled() => return Ok(()),
_ = cancel.cancelled() => return Ok(()),
};
let last_record_lsn = self.get_last_record_lsn();
@@ -1118,8 +1114,9 @@ impl Timeline {
Ok(Some(true))
}
/// Like [`evict_layer_batch`](Self::evict_layer_batch), but for just one layer.
/// Additional case `Ok(None)` covers the case where the layer could not be found by its `layer_file_name`.
/// Evict just one layer.
///
/// Returns `Ok(None)` in the case where the layer could not be found by its `layer_file_name`.
pub async fn evict_layer(&self, layer_file_name: &str) -> anyhow::Result<Option<bool>> {
let _gate = self
.gate
@@ -1130,109 +1127,17 @@ impl Timeline {
return Ok(None);
};
let Some(local_layer) = local_layer.keep_resident().await? else {
return Ok(Some(false));
};
let local_layer: Layer = local_layer.into();
let remote_client = self
let rtc = self
.remote_client
.as_ref()
.ok_or_else(|| anyhow::anyhow!("remote storage not configured; cannot evict"))?;
let results = self
.evict_layer_batch(remote_client, &[local_layer])
.await?;
assert_eq!(results.len(), 1);
let result: Option<Result<(), EvictionError>> = results.into_iter().next().unwrap();
match result {
None => anyhow::bail!("task_mgr shutdown requested"),
Some(Ok(())) => Ok(Some(true)),
Some(Err(e)) => Err(anyhow::Error::new(e)),
match local_layer.evict_and_wait(rtc).await {
Ok(()) => Ok(Some(true)),
Err(EvictionError::NotFound) => Ok(Some(false)),
Err(EvictionError::Downloaded) => Ok(Some(false)),
}
}
/// Evict a batch of layers.
pub(crate) async fn evict_layers(
&self,
layers_to_evict: &[Layer],
) -> anyhow::Result<Vec<Option<Result<(), EvictionError>>>> {
let _gate = self
.gate
.enter()
.map_err(|_| anyhow::anyhow!("Shutting down"))?;
let remote_client = self
.remote_client
.as_ref()
.context("timeline must have RemoteTimelineClient")?;
self.evict_layer_batch(remote_client, layers_to_evict).await
}
/// Evict multiple layers at once, continuing through errors.
///
/// The `remote_client` should be this timeline's `self.remote_client`.
/// We make the caller provide it so that they are responsible for handling the case
/// where someone wants to evict the layer but no remote storage is configured.
///
/// Returns either `Err()` or `Ok(results)` where `results.len() == layers_to_evict.len()`.
/// If `Err()` is returned, no eviction was attempted.
/// Each position of `Ok(results)` corresponds to the layer in `layers_to_evict`.
/// Meaning of each `result[i]`:
/// - `Some(Err(...))` if layer replacement failed for some reason
/// - replacement failed for an expectable reason (e.g., layer removed by GC before we grabbed all locks)
/// - `Some(Ok(()))` if everything went well.
/// - `None` if no eviction attempt was made for the layer because `cancel.is_cancelled() == true`.
async fn evict_layer_batch(
&self,
remote_client: &Arc<RemoteTimelineClient>,
layers_to_evict: &[Layer],
) -> anyhow::Result<Vec<Option<Result<(), EvictionError>>>> {
{
// to avoid racing with detach and delete_timeline
let state = self.current_state();
anyhow::ensure!(
state == TimelineState::Active,
"timeline is not active but {state:?}"
);
}
let mut results = Vec::with_capacity(layers_to_evict.len());
for _ in 0..layers_to_evict.len() {
results.push(None);
}
let mut js = tokio::task::JoinSet::new();
for (i, l) in layers_to_evict.iter().enumerate() {
js.spawn({
let l = l.to_owned();
let remote_client = remote_client.clone();
async move { (i, l.evict_and_wait(&remote_client).await) }
});
}
let join = async {
while let Some(next) = js.join_next().await {
match next {
Ok((i, res)) => results[i] = Some(res),
Err(je) if je.is_cancelled() => unreachable!("not used"),
Err(je) if je.is_panic() => { /* already logged */ }
Err(je) => tracing::error!("unknown JoinError: {je:?}"),
}
}
};
tokio::select! {
_ = self.cancel.cancelled() => {},
_ = join => {}
}
assert_eq!(results.len(), layers_to_evict.len());
Ok(results)
}
}
/// Number of times we will compute partition within a checkpoint distance.
@@ -1829,6 +1734,7 @@ impl Timeline {
self.current_logical_size.current_size().accuracy(),
logical_size::Accuracy::Exact,
);
self.current_logical_size.initialized.add_permits(1);
return;
};
@@ -1874,6 +1780,11 @@ impl Timeline {
cancel: CancellationToken,
background_ctx: RequestContext,
) {
scopeguard::defer! {
// Irrespective of the outcome of this operation, we should unblock anyone waiting for it.
self.current_logical_size.initialized.add_permits(1);
}
enum BackgroundCalculationError {
Cancelled,
Other(anyhow::Error),
@@ -1885,22 +1796,22 @@ impl Timeline {
let skip_concurrency_limiter = &skip_concurrency_limiter;
async move {
let cancel = task_mgr::shutdown_token();
let wait_for_permit = super::tasks::concurrent_background_tasks_rate_limit(
let wait_for_permit = super::tasks::concurrent_background_tasks_rate_limit_permit(
BackgroundLoopKind::InitialLogicalSizeCalculation,
background_ctx,
&cancel,
);
use crate::metrics::initial_logical_size::StartCircumstances;
let (_maybe_permit, circumstances) = tokio::select! {
res = wait_for_permit => {
match res {
Ok(permit) => (Some(permit), StartCircumstances::AfterBackgroundTasksRateLimit),
Err(RateLimitError::Cancelled) => {
return Err(BackgroundCalculationError::Cancelled);
}
}
permit = wait_for_permit => {
(Some(permit), StartCircumstances::AfterBackgroundTasksRateLimit)
}
_ = self_ref.cancel.cancelled() => {
return Err(BackgroundCalculationError::Cancelled);
}
_ = cancel.cancelled() => {
return Err(BackgroundCalculationError::Cancelled);
},
() = skip_concurrency_limiter.cancelled() => {
// Some action that is part of a end user interaction requested logical size
// => break out of the rate limit
@@ -2165,6 +2076,55 @@ impl Timeline {
None
}
/// The timeline heatmap is a hint to secondary locations from the primary location,
/// indicating which layers are currently on-disk on the primary.
///
/// None is returned if the Timeline is in a state where uploading a heatmap
/// doesn't make sense, such as shutting down or initializing. The caller
/// should treat this as a cue to simply skip doing any heatmap uploading
/// for this timeline.
pub(crate) async fn generate_heatmap(&self) -> Option<HeatMapTimeline> {
let eviction_info = self.get_local_layers_for_disk_usage_eviction().await;
let remote_client = match &self.remote_client {
Some(c) => c,
None => return None,
};
let layer_file_names = eviction_info
.resident_layers
.iter()
.map(|l| l.layer.layer_desc().filename())
.collect::<Vec<_>>();
let decorated = match remote_client.get_layers_metadata(layer_file_names) {
Ok(d) => d,
Err(_) => {
// Getting metadata only fails on Timeline in bad state.
return None;
}
};
let heatmap_layers = std::iter::zip(
eviction_info.resident_layers.into_iter(),
decorated.into_iter(),
)
.filter_map(|(layer, remote_info)| {
remote_info.map(|remote_info| {
HeatMapLayer::new(
layer.layer.layer_desc().filename(),
IndexLayerMetadata::from(remote_info),
layer.last_activity_ts,
)
})
});
Some(HeatMapTimeline::new(
self.timeline_id,
heatmap_layers.collect(),
))
}
}
type TraversalId = String;
@@ -3150,6 +3110,32 @@ impl Timeline {
Ok(image_layers)
}
/// Wait until the background initial logical size calculation is complete, or
/// this Timeline is shut down. Calling this function will cause the initial
/// logical size calculation to skip waiting for the background jobs barrier.
pub(crate) async fn await_initial_logical_size(self: Arc<Self>) {
if let Some(await_bg_cancel) = self
.current_logical_size
.cancel_wait_for_background_loop_concurrency_limit_semaphore
.get()
{
await_bg_cancel.cancel();
} else {
// We should not wait if we were not able to explicitly instruct
// the logical size cancellation to skip the concurrency limit semaphore.
// TODO: this is an unexpected case. We should restructure so that it
// can't happen.
tracing::info!(
"await_initial_logical_size: can't get semaphore cancel token, skipping"
);
}
tokio::select!(
_ = self.current_logical_size.initialized.acquire() => {},
_ = self.cancel.cancelled() => {}
)
}
}
#[derive(Default)]
@@ -3906,7 +3892,14 @@ impl Timeline {
/// within a layer file. We can only remove the whole file if it's fully
/// obsolete.
pub(super) async fn gc(&self) -> anyhow::Result<GcResult> {
let _g = self.gc_lock.lock().await;
// this is most likely the background tasks, but it might be the spawned task from
// immediate_gc
let cancel = crate::task_mgr::shutdown_token();
let _g = tokio::select! {
guard = self.gc_lock.lock() => guard,
_ = self.cancel.cancelled() => return Ok(GcResult::default()),
_ = cancel.cancelled() => return Ok(GcResult::default()),
};
let timer = self.metrics.garbage_collect_histo.start_timer();
fail_point!("before-timeline-gc");
@@ -4605,7 +4598,7 @@ mod tests {
.await
.unwrap();
let rc = timeline
let rtc = timeline
.remote_client
.clone()
.expect("just configured this");
@@ -4618,16 +4611,12 @@ mod tests {
.expect("should had been resident")
.drop_eviction_guard();
let batch = [layer];
let first = async { timeline.evict_layer_batch(&rc, &batch).await.unwrap() };
let second = async { timeline.evict_layer_batch(&rc, &batch).await.unwrap() };
let first = async { layer.evict_and_wait(&rtc).await };
let second = async { layer.evict_and_wait(&rtc).await };
let (first, second) = tokio::join!(first, second);
let (first, second) = (only_one(first), only_one(second));
let res = batch[0].keep_resident().await;
let res = layer.keep_resident().await;
assert!(matches!(res, Ok(None)), "{res:?}");
match (first, second) {
@@ -4648,14 +4637,6 @@ mod tests {
RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error)
}
fn only_one<T>(mut input: Vec<Option<T>>) -> T {
assert_eq!(1, input.len());
input
.pop()
.expect("length just checked")
.expect("no cancellation")
}
async fn find_some_layer(timeline: &Timeline) -> Layer {
let layers = timeline.layers.read().await;
let desc = layers

View File

@@ -30,7 +30,7 @@ use crate::{
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
tenant::{
config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold},
tasks::{BackgroundLoopKind, RateLimitError},
tasks::BackgroundLoopKind,
timeline::EvictionError,
LogicalSizeCalculationCause, Tenant,
},
@@ -158,15 +158,15 @@ impl Timeline {
) -> ControlFlow<()> {
let now = SystemTime::now();
let _permit = match crate::tenant::tasks::concurrent_background_tasks_rate_limit(
let acquire_permit = crate::tenant::tasks::concurrent_background_tasks_rate_limit_permit(
BackgroundLoopKind::Eviction,
ctx,
cancel,
)
.await
{
Ok(permit) => permit,
Err(RateLimitError::Cancelled) => return ControlFlow::Break(()),
);
let _permit = tokio::select! {
permit = acquire_permit => permit,
_ = cancel.cancelled() => return ControlFlow::Break(()),
_ = self.cancel.cancelled() => return ControlFlow::Break(()),
};
// If we evict layers but keep cached values derived from those layers, then
@@ -212,11 +212,21 @@ impl Timeline {
// Gather layers for eviction.
// NB: all the checks can be invalidated as soon as we release the layer map lock.
// We don't want to hold the layer map lock during eviction.
// So, we just need to deal with this.
let candidates: Vec<_> = {
let remote_client = match self.remote_client.as_ref() {
Some(c) => c,
None => {
error!("no remote storage configured, cannot evict layers");
return ControlFlow::Continue(());
}
};
let mut js = tokio::task::JoinSet::new();
{
let guard = self.layers.read().await;
let layers = guard.layer_map();
let mut candidates = Vec::new();
for hist_layer in layers.iter_historic_layers() {
let hist_layer = guard.get_from_desc(&hist_layer);
@@ -262,54 +272,49 @@ impl Timeline {
continue;
}
};
let layer = guard.drop_eviction_guard();
if no_activity_for > p.threshold {
candidates.push(guard.drop_eviction_guard())
let remote_client = remote_client.clone();
// this could cause a lot of allocations in some cases
js.spawn(async move { layer.evict_and_wait(&remote_client).await });
stats.candidates += 1;
}
}
candidates
};
stats.candidates = candidates.len();
let remote_client = match self.remote_client.as_ref() {
None => {
error!(
num_candidates = candidates.len(),
"no remote storage configured, cannot evict layers"
);
return ControlFlow::Continue(());
}
Some(c) => c,
};
let results = match self.evict_layer_batch(remote_client, &candidates).await {
Err(pre_err) => {
stats.errors += candidates.len();
error!("could not do any evictions: {pre_err:#}");
return ControlFlow::Continue(());
let join_all = async move {
while let Some(next) = js.join_next().await {
match next {
Ok(Ok(())) => stats.evicted += 1,
Ok(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
stats.not_evictable += 1;
}
Err(je) if je.is_cancelled() => unreachable!("not used"),
Err(je) if je.is_panic() => {
/* already logged */
stats.errors += 1;
}
Err(je) => tracing::error!("unknown JoinError: {je:?}"),
}
}
Ok(results) => results,
stats
};
assert_eq!(results.len(), candidates.len());
for result in results {
match result {
None => {
stats.skipped_for_shutdown += 1;
}
Some(Ok(())) => {
stats.evicted += 1;
}
Some(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
stats.not_evictable += 1;
tokio::select! {
stats = join_all => {
if stats.candidates == stats.not_evictable {
debug!(stats=?stats, "eviction iteration complete");
} else if stats.errors > 0 || stats.not_evictable > 0 {
warn!(stats=?stats, "eviction iteration complete");
} else {
info!(stats=?stats, "eviction iteration complete");
}
}
_ = cancel.cancelled() => {
// just drop the joinset to "abort"
}
}
if stats.candidates == stats.not_evictable {
debug!(stats=?stats, "eviction iteration complete");
} else if stats.errors > 0 || stats.not_evictable > 0 {
warn!(stats=?stats, "eviction iteration complete");
} else {
info!(stats=?stats, "eviction iteration complete");
}
ControlFlow::Continue(())
}

View File

@@ -34,6 +34,9 @@ pub(super) struct LogicalSize {
pub(crate) cancel_wait_for_background_loop_concurrency_limit_semaphore:
OnceCell<CancellationToken>,
/// Once the initial logical size is initialized, this is notified.
pub(crate) initialized: tokio::sync::Semaphore,
/// Latest Lsn that has its size uncalculated, could be absent for freshly created timelines.
pub initial_part_end: Option<Lsn>,
@@ -125,6 +128,7 @@ impl LogicalSize {
initial_part_end: None,
size_added_after_initial: AtomicI64::new(0),
did_return_approximate_to_walreceiver: AtomicBool::new(false),
initialized: tokio::sync::Semaphore::new(0),
}
}
@@ -135,6 +139,7 @@ impl LogicalSize {
initial_part_end: Some(compute_to),
size_added_after_initial: AtomicI64::new(0),
did_return_approximate_to_walreceiver: AtomicBool::new(false),
initialized: tokio::sync::Semaphore::new(0),
}
}

View File

@@ -19,14 +19,14 @@ use super::Timeline;
pub struct UninitializedTimeline<'t> {
pub(crate) owning_tenant: &'t Tenant,
timeline_id: TimelineId,
raw_timeline: Option<(Arc<Timeline>, TimelineUninitMark)>,
raw_timeline: Option<(Arc<Timeline>, TimelineUninitMark<'t>)>,
}
impl<'t> UninitializedTimeline<'t> {
pub(crate) fn new(
owning_tenant: &'t Tenant,
timeline_id: TimelineId,
raw_timeline: Option<(Arc<Timeline>, TimelineUninitMark)>,
raw_timeline: Option<(Arc<Timeline>, TimelineUninitMark<'t>)>,
) -> Self {
Self {
owning_tenant,
@@ -169,18 +169,55 @@ pub(crate) fn cleanup_timeline_directory(uninit_mark: TimelineUninitMark) {
///
/// XXX: it's important to create it near the timeline dir, not inside it to ensure timeline dir gets removed first.
#[must_use]
pub(crate) struct TimelineUninitMark {
pub(crate) struct TimelineUninitMark<'t> {
owning_tenant: &'t Tenant,
timeline_id: TimelineId,
uninit_mark_deleted: bool,
uninit_mark_path: Utf8PathBuf,
pub(crate) timeline_path: Utf8PathBuf,
}
impl TimelineUninitMark {
pub(crate) fn new(uninit_mark_path: Utf8PathBuf, timeline_path: Utf8PathBuf) -> Self {
Self {
uninit_mark_deleted: false,
uninit_mark_path,
timeline_path,
/// Errors when acquiring exclusive access to a timeline ID for creation
#[derive(thiserror::Error, Debug)]
pub(crate) enum TimelineExclusionError {
#[error("Already exists")]
AlreadyExists(Arc<Timeline>),
#[error("Already creating")]
AlreadyCreating,
// e.g. I/O errors, or some failure deep in postgres initdb
#[error(transparent)]
Other(#[from] anyhow::Error),
}
impl<'t> TimelineUninitMark<'t> {
pub(crate) fn new(
owning_tenant: &'t Tenant,
timeline_id: TimelineId,
uninit_mark_path: Utf8PathBuf,
timeline_path: Utf8PathBuf,
) -> Result<Self, TimelineExclusionError> {
// Lock order: this is the only place we take both locks. During drop() we only
// lock creating_timelines
let timelines = owning_tenant.timelines.lock().unwrap();
let mut creating_timelines: std::sync::MutexGuard<
'_,
std::collections::HashSet<TimelineId>,
> = owning_tenant.timelines_creating.lock().unwrap();
if let Some(existing) = timelines.get(&timeline_id) {
Err(TimelineExclusionError::AlreadyExists(existing.clone()))
} else if creating_timelines.contains(&timeline_id) {
Err(TimelineExclusionError::AlreadyCreating)
} else {
creating_timelines.insert(timeline_id);
Ok(Self {
owning_tenant,
timeline_id,
uninit_mark_deleted: false,
uninit_mark_path,
timeline_path,
})
}
}
@@ -207,7 +244,7 @@ impl TimelineUninitMark {
}
}
impl Drop for TimelineUninitMark {
impl Drop for TimelineUninitMark<'_> {
fn drop(&mut self) {
if !self.uninit_mark_deleted {
if self.timeline_path.exists() {
@@ -226,5 +263,11 @@ impl Drop for TimelineUninitMark {
}
}
}
self.owning_tenant
.timelines_creating
.lock()
.unwrap()
.remove(&self.timeline_id);
}
}

View File

@@ -2191,7 +2191,7 @@ mod tests {
.load()
.await;
let tline = tenant
.bootstrap_timeline(TIMELINE_ID, pg_version, None, &ctx)
.bootstrap_timeline_test(TIMELINE_ID, pg_version, None, &ctx)
.await
.unwrap();

View File

@@ -7,6 +7,8 @@ use proxy::console;
use proxy::console::provider::AllowedIpsCache;
use proxy::console::provider::NodeInfoCache;
use proxy::http;
use proxy::rate_limiter::EndpointRateLimiter;
use proxy::rate_limiter::RateBucketInfo;
use proxy::rate_limiter::RateLimiterConfig;
use proxy::usage_metrics;
@@ -14,6 +16,7 @@ use anyhow::bail;
use proxy::config::{self, ProxyConfig};
use proxy::serverless;
use std::pin::pin;
use std::sync::Arc;
use std::{borrow::Cow, net::SocketAddr};
use tokio::net::TcpListener;
use tokio::task::JoinSet;
@@ -113,8 +116,11 @@ struct ProxyCliArgs {
#[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
rate_limiter_timeout: tokio::time::Duration,
/// Endpoint rate limiter max number of requests per second.
#[clap(long, default_value_t = 300)]
endpoint_rps_limit: u32,
///
/// Provided in the form '<Requests Per Second>@<Bucket Duration Size>'.
/// Can be given multiple times for different bucket sizes.
#[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)]
endpoint_rps_limit: Vec<RateBucketInfo>,
/// Initial limit for dynamic rate limiter. Makes sense only if `rate_limit_algorithm` is *not* `None`.
#[clap(long, default_value_t = 100)]
initial_limit: usize,
@@ -157,6 +163,8 @@ async fn main() -> anyhow::Result<()> {
let proxy_listener = TcpListener::bind(proxy_address).await?;
let cancellation_token = CancellationToken::new();
let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(&config.endpoint_rps_limit));
// client facing tasks. these will exit on error or on cancellation
// cancellation returns Ok(())
let mut client_tasks = JoinSet::new();
@@ -164,6 +172,7 @@ async fn main() -> anyhow::Result<()> {
config,
proxy_listener,
cancellation_token.clone(),
endpoint_rate_limiter.clone(),
));
// TODO: rename the argument to something like serverless.
@@ -177,6 +186,7 @@ async fn main() -> anyhow::Result<()> {
config,
serverless_listener,
cancellation_token.clone(),
endpoint_rate_limiter.clone(),
));
}
@@ -311,6 +321,10 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
let authentication_config = AuthenticationConfig {
scram_protocol_timeout: args.scram_protocol_timeout,
};
let mut endpoint_rps_limit = args.endpoint_rps_limit.clone();
RateBucketInfo::validate(&mut endpoint_rps_limit)?;
let config = Box::leak(Box::new(ProxyConfig {
tls_config,
auth_backend,
@@ -320,8 +334,35 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
authentication_config,
require_client_ip: args.require_client_ip,
disable_ip_check_for_http: args.disable_ip_check_for_http,
endpoint_rps_limit: args.endpoint_rps_limit,
endpoint_rps_limit,
}));
Ok(config)
}
#[cfg(test)]
mod tests {
use std::time::Duration;
use clap::Parser;
use proxy::rate_limiter::RateBucketInfo;
#[test]
fn parse_endpoint_rps_limit() {
let config = super::ProxyCliArgs::parse_from([
"proxy",
"--endpoint-rps-limit",
"100@1s",
"--endpoint-rps-limit",
"20@30s",
]);
assert_eq!(
config.endpoint_rps_limit,
vec![
RateBucketInfo::new(100, Duration::from_secs(1)),
RateBucketInfo::new(20, Duration::from_secs(30)),
]
);
}
}

View File

@@ -1,9 +1,13 @@
use crate::{
auth::parse_endpoint_param, cancellation::CancelClosure, console::errors::WakeComputeError,
error::UserFacingError, proxy::neon_option,
auth::parse_endpoint_param,
cancellation::CancelClosure,
console::errors::WakeComputeError,
error::UserFacingError,
proxy::{neon_option, NUM_DB_CONNECTIONS_GAUGE},
};
use futures::{FutureExt, TryFutureExt};
use itertools::Itertools;
use metrics::IntCounterPairGuard;
use pq_proto::StartupMessageParams;
use std::{io, net::SocketAddr, time::Duration};
use thiserror::Error;
@@ -223,6 +227,8 @@ pub struct PostgresConnection {
pub params: std::collections::HashMap<String, String>,
/// Query cancellation token.
pub cancel_closure: CancelClosure,
_guage: IntCounterPairGuard,
}
impl ConnCfg {
@@ -231,6 +237,7 @@ impl ConnCfg {
&self,
allow_self_signed_compute: bool,
timeout: Duration,
proto: &'static str,
) -> Result<PostgresConnection, ConnectionError> {
let (socket_addr, stream, host) = self.connect_raw(timeout).await?;
@@ -264,6 +271,7 @@ impl ConnCfg {
stream,
params,
cancel_closure,
_guage: NUM_DB_CONNECTIONS_GAUGE.with_label_values(&[proto]).guard(),
};
Ok(connection)

View File

@@ -1,4 +1,4 @@
use crate::auth;
use crate::{auth, rate_limiter::RateBucketInfo};
use anyhow::{bail, ensure, Context, Ok};
use rustls::{sign, Certificate, PrivateKey};
use sha2::{Digest, Sha256};
@@ -20,7 +20,7 @@ pub struct ProxyConfig {
pub authentication_config: AuthenticationConfig,
pub require_client_ip: bool,
pub disable_ip_check_for_http: bool,
pub endpoint_rps_limit: u32,
pub endpoint_rps_limit: Vec<RateBucketInfo>,
}
#[derive(Debug)]

View File

@@ -17,7 +17,10 @@ use anyhow::{bail, Context};
use async_trait::async_trait;
use futures::TryFutureExt;
use itertools::Itertools;
use metrics::{exponential_buckets, register_int_counter_vec, IntCounterVec};
use metrics::{
exponential_buckets, register_int_counter_pair_vec, register_int_counter_vec,
IntCounterPairVec, IntCounterVec,
};
use once_cell::sync::{Lazy, OnceCell};
use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
use prometheus::{
@@ -44,17 +47,10 @@ const RETRY_WAIT_EXPONENT_BASE: f64 = std::f64::consts::SQRT_2;
const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)";
const ERR_PROTO_VIOLATION: &str = "protocol violation";
pub static NUM_DB_CONNECTIONS_OPENED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
pub static NUM_DB_CONNECTIONS_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
register_int_counter_pair_vec!(
"proxy_opened_db_connections_total",
"Number of opened connections to a database.",
&["protocol"],
)
.unwrap()
});
pub static NUM_DB_CONNECTIONS_CLOSED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"proxy_closed_db_connections_total",
"Number of closed connections to a database.",
&["protocol"],
@@ -62,17 +58,10 @@ pub static NUM_DB_CONNECTIONS_CLOSED_COUNTER: Lazy<IntCounterVec> = Lazy::new(||
.unwrap()
});
pub static NUM_CLIENT_CONNECTION_OPENED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
pub static NUM_CLIENT_CONNECTION_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
register_int_counter_pair_vec!(
"proxy_opened_client_connections_total",
"Number of opened connections from a client.",
&["protocol"],
)
.unwrap()
});
pub static NUM_CLIENT_CONNECTION_CLOSED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"proxy_closed_client_connections_total",
"Number of closed connections from a client.",
&["protocol"],
@@ -80,17 +69,10 @@ pub static NUM_CLIENT_CONNECTION_CLOSED_COUNTER: Lazy<IntCounterVec> = Lazy::new
.unwrap()
});
pub static NUM_CONNECTIONS_ACCEPTED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
pub static NUM_CONNECTION_REQUESTS_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
register_int_counter_pair_vec!(
"proxy_accepted_connections_total",
"Number of client connections accepted.",
&["protocol"],
)
.unwrap()
});
pub static NUM_CONNECTIONS_CLOSED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"proxy_closed_connections_total",
"Number of client connections closed.",
&["protocol"],
@@ -297,6 +279,7 @@ pub async fn task_main(
config: &'static ProxyConfig,
listener: tokio::net::TcpListener,
cancellation_token: CancellationToken,
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
) -> anyhow::Result<()> {
scopeguard::defer! {
info!("proxy has shut down");
@@ -308,7 +291,6 @@ pub async fn task_main(
let connections = tokio_util::task::task_tracker::TaskTracker::new();
let cancel_map = Arc::new(CancelMap::default());
let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(config.endpoint_rps_limit));
while let Some(accept_result) =
run_until_cancelled(listener.accept(), &cancellation_token).await
@@ -428,16 +410,12 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
);
let proto = mode.protocol_label();
NUM_CLIENT_CONNECTION_OPENED_COUNTER
let _client_gauge = NUM_CLIENT_CONNECTION_GAUGE
.with_label_values(&[proto])
.inc();
NUM_CONNECTIONS_ACCEPTED_COUNTER
.guard();
let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE
.with_label_values(&[proto])
.inc();
scopeguard::defer! {
NUM_CLIENT_CONNECTION_CLOSED_COUNTER.with_label_values(&[proto]).inc();
NUM_CONNECTIONS_CLOSED_COUNTER.with_label_values(&[proto]).inc();
}
.guard();
let tls = config.tls_config.as_ref();
@@ -584,12 +562,13 @@ pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> compute::ConnCfg
async fn connect_to_compute_once(
node_info: &console::CachedNodeInfo,
timeout: time::Duration,
proto: &'static str,
) -> Result<PostgresConnection, compute::ConnectionError> {
let allow_self_signed_compute = node_info.allow_self_signed_compute;
node_info
.config
.connect(allow_self_signed_compute, timeout)
.connect(allow_self_signed_compute, timeout, proto)
.await
}
@@ -610,6 +589,7 @@ pub trait ConnectMechanism {
pub struct TcpMechanism<'a> {
/// KV-dictionary with PostgreSQL connection params.
pub params: &'a StartupMessageParams,
pub proto: &'static str,
}
#[async_trait]
@@ -623,7 +603,7 @@ impl ConnectMechanism for TcpMechanism<'_> {
node_info: &console::CachedNodeInfo,
timeout: time::Duration,
) -> Result<PostgresConnection, Self::Error> {
connect_to_compute_once(node_info, timeout).await
connect_to_compute_once(node_info, timeout, self.proto).await
}
fn update_connect_config(&self, config: &mut compute::ConnCfg) {
@@ -1028,7 +1008,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
let aux = node_info.aux.clone();
let mut node = connect_to_compute(
&TcpMechanism { params },
&TcpMechanism { params, proto },
node_info,
&extra,
&creds,
@@ -1037,13 +1017,6 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
.or_else(|e| stream.throw_error(e))
.await?;
NUM_DB_CONNECTIONS_OPENED_COUNTER
.with_label_values(&[proto])
.inc();
scopeguard::defer! {
NUM_DB_CONNECTIONS_CLOSED_COUNTER.with_label_values(&[proto]).inc();
}
prepare_client_connection(&node, session, &mut stream).await?;
// Before proxy passing, forward to compute whatever data is left in the
// PqStream input buffer. Normally there is none, but our serverless npm

View File

@@ -3,5 +3,5 @@ mod limit_algorithm;
mod limiter;
pub use aimd::Aimd;
pub use limit_algorithm::{AimdConfig, Fixed, RateLimitAlgorithm, RateLimiterConfig};
pub use limiter::EndpointRateLimiter;
pub use limiter::Limiter;
pub use limiter::{EndpointRateLimiter, RateBucketInfo};

View File

@@ -33,39 +33,6 @@ impl Aimd {
min_utilisation_threshold: config.aimd_min_utilisation_threshold,
}
}
pub fn decrease_factor(self, factor: f32) -> Self {
assert!((0.5..1.0).contains(&factor));
Self {
decrease_factor: factor,
..self
}
}
pub fn increase_by(self, increase: usize) -> Self {
assert!(increase > 0);
Self {
increase_by: increase,
..self
}
}
pub fn with_max_limit(self, max: usize) -> Self {
assert!(max > 0);
Self {
max_limit: max,
..self
}
}
/// A threshold below which the limit won't be increased. 0.5 = 50%.
pub fn with_min_utilisation_threshold(self, min_util: f32) -> Self {
assert!(min_util > 0. && min_util < 1.);
Self {
min_utilisation_threshold: min_util,
..self
}
}
}
#[async_trait]

View File

@@ -1,16 +1,19 @@
use std::{
collections::hash_map::RandomState,
hash::BuildHasher,
sync::{
atomic::{AtomicUsize, Ordering},
Arc,
Arc, Mutex,
},
time::Duration,
};
use anyhow::bail;
use dashmap::DashMap;
use parking_lot::Mutex;
use itertools::Itertools;
use rand::{rngs::StdRng, Rng, SeedableRng};
use smol_str::SmolStr;
use tokio::sync::{Mutex as AsyncMutex, Semaphore, SemaphorePermit};
use tokio::time::{timeout, Instant};
use tokio::time::{timeout, Duration, Instant};
use tracing::info;
use super::{
@@ -29,60 +32,166 @@ use super::{
// saw SNI, before doing TLS handshake. User-side error messages in that case
// does not look very nice (`SSL SYSCALL error: Undefined error: 0`), so for now
// I went with a more expensive way that yields user-friendlier error messages.
//
// TODO: add a better bucketing here, e.g. not more than 300 requests per second,
// and not more than 1000 requests per 10 seconds, etc. Short bursts of reconnects
// are noramal during redeployments, so we should not block them.
pub struct EndpointRateLimiter {
map: DashMap<SmolStr, Arc<Mutex<(chrono::NaiveTime, u32)>>>,
max_rps: u32,
pub struct EndpointRateLimiter<Rand = StdRng, Hasher = RandomState> {
map: DashMap<SmolStr, Vec<RateBucket>, Hasher>,
info: &'static [RateBucketInfo],
access_count: AtomicUsize,
rand: Mutex<Rand>,
}
#[derive(Clone, Copy)]
struct RateBucket {
start: Instant,
count: u32,
}
impl RateBucket {
fn should_allow_request(&mut self, info: &RateBucketInfo, now: Instant) -> bool {
if now - self.start < info.interval {
self.count < info.max_rpi
} else {
// bucket expired, reset
self.count = 0;
self.start = now;
true
}
}
fn inc(&mut self) {
self.count += 1;
}
}
#[derive(Clone, Copy, PartialEq)]
pub struct RateBucketInfo {
pub interval: Duration,
// requests per interval
pub max_rpi: u32,
}
impl std::fmt::Display for RateBucketInfo {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let rps = self.max_rpi * 1000 / self.interval.as_millis() as u32;
write!(f, "{rps}@{}", humantime::format_duration(self.interval))
}
}
impl std::fmt::Debug for RateBucketInfo {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{self}")
}
}
impl std::str::FromStr for RateBucketInfo {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
let Some((max_rps, interval)) = s.split_once('@') else {
bail!("invalid rate info")
};
let max_rps = max_rps.parse()?;
let interval = humantime::parse_duration(interval)?;
Ok(Self::new(max_rps, interval))
}
}
impl RateBucketInfo {
pub const DEFAULT_SET: [Self; 3] = [
Self::new(300, Duration::from_secs(1)),
Self::new(200, Duration::from_secs(60)),
Self::new(100, Duration::from_secs(600)),
];
pub fn validate(info: &mut [Self]) -> anyhow::Result<()> {
info.sort_unstable_by_key(|info| info.interval);
let invalid = info
.iter()
.tuple_windows()
.find(|(a, b)| a.max_rpi > b.max_rpi);
if let Some((a, b)) = invalid {
bail!(
"invalid endpoint RPS limits. {b} allows fewer requests per bucket than {a} ({} vs {})",
b.max_rpi,
a.max_rpi,
);
}
Ok(())
}
pub const fn new(max_rps: u32, interval: Duration) -> Self {
Self {
interval,
max_rpi: max_rps * interval.as_millis() as u32 / 1000,
}
}
}
impl EndpointRateLimiter {
pub fn new(max_rps: u32) -> Self {
pub fn new(info: &'static [RateBucketInfo]) -> Self {
Self::new_with_rand_and_hasher(info, StdRng::from_entropy(), RandomState::new())
}
}
impl<R: Rng, S: BuildHasher + Clone> EndpointRateLimiter<R, S> {
fn new_with_rand_and_hasher(info: &'static [RateBucketInfo], rand: R, hasher: S) -> Self {
info!(buckets = ?info, "endpoint rate limiter");
Self {
map: DashMap::new(),
max_rps,
info,
map: DashMap::with_hasher_and_shard_amount(hasher, 64),
access_count: AtomicUsize::new(1), // start from 1 to avoid GC on the first request
rand: Mutex::new(rand),
}
}
/// Check that number of connections to the endpoint is below `max_rps` rps.
pub fn check(&self, endpoint: SmolStr) -> bool {
// do GC every 100k requests (worst case memory usage is about 10MB)
if self.access_count.fetch_add(1, Ordering::AcqRel) % 100_000 == 0 {
// do a partial GC every 2k requests. This cleans up ~ 1/64th of the map.
// worst case memory usage is about:
// = 2 * 2048 * 64 * (48B + 72B)
// = 30MB
if self.access_count.fetch_add(1, Ordering::AcqRel) % 2048 == 0 {
self.do_gc();
}
let now = chrono::Utc::now().naive_utc().time();
let entry = self
.map
.entry(endpoint)
.or_insert_with(|| Arc::new(Mutex::new((now, 0))));
let mut entry = entry.lock();
let (last_time, count) = *entry;
let now = Instant::now();
let mut entry = self.map.entry(endpoint).or_insert_with(|| {
vec![
RateBucket {
start: now,
count: 0,
};
self.info.len()
]
});
if now - last_time < chrono::Duration::seconds(1) {
if count >= self.max_rps {
return false;
}
*entry = (last_time, count + 1);
} else {
*entry = (now, 1);
let should_allow_request = entry
.iter_mut()
.zip(self.info)
.all(|(bucket, info)| bucket.should_allow_request(info, now));
if should_allow_request {
// only increment the bucket counts if the request will actually be accepted
entry.iter_mut().for_each(RateBucket::inc);
}
true
should_allow_request
}
/// Clean the map. Simple strategy: remove all entries. At worst, we'll
/// double the effective max_rps during the cleanup. But that way deletion
/// does not aquire mutex on each entry access.
/// Clean the map. Simple strategy: remove all entries in a random shard.
/// At worst, we'll double the effective max_rps during the cleanup.
/// But that way deletion does not aquire mutex on each entry access.
pub fn do_gc(&self) {
info!(
"cleaning up endpoint rate limiter, current size = {}",
self.map.len()
);
self.map.clear();
let n = self.map.shards().len();
// this lock is ok as the periodic cycle of do_gc makes this very unlikely to collide
// (impossible, infact, unless we have 2048 threads)
let shard = self.rand.lock().unwrap().gen_range(0..n);
self.map.shards()[shard].write().clear();
}
}
@@ -124,7 +233,6 @@ pub struct Token<'t> {
#[derive(Debug, Clone, Copy)]
pub struct LimiterState {
limit: usize,
available: usize,
in_flight: usize,
}
@@ -302,11 +410,7 @@ impl Limiter {
pub fn state(&self) -> LimiterState {
let limit = self.limits.load(Ordering::Relaxed);
let in_flight = self.in_flight.load(Ordering::Relaxed);
LimiterState {
limit,
available: limit.saturating_sub(in_flight),
in_flight,
}
LimiterState { limit, in_flight }
}
}
@@ -319,13 +423,6 @@ impl<'t> Token<'t> {
}
}
#[cfg(test)]
pub fn set_latency(&mut self, latency: Duration) {
use std::ops::Sub;
self.start = Instant::now().sub(latency);
}
pub fn forget(&mut self) {
if let Some(permit) = self.permit.take() {
permit.forget();
@@ -344,10 +441,6 @@ impl LimiterState {
pub fn limit(&self) -> usize {
self.limit
}
/// The amount of concurrency available to use.
pub fn available(&self) -> usize {
self.available
}
/// The number of jobs in flight.
pub fn in_flight(&self) -> usize {
self.in_flight
@@ -395,12 +488,16 @@ impl reqwest_middleware::Middleware for Limiter {
#[cfg(test)]
mod tests {
use std::{pin::pin, task::Context, time::Duration};
use std::{hash::BuildHasherDefault, pin::pin, task::Context, time::Duration};
use futures::{task::noop_waker_ref, Future};
use rand::SeedableRng;
use rustc_hash::FxHasher;
use smol_str::SmolStr;
use tokio::time;
use super::{Limiter, Outcome};
use crate::rate_limiter::RateLimitAlgorithm;
use super::{EndpointRateLimiter, Limiter, Outcome};
use crate::rate_limiter::{RateBucketInfo, RateLimitAlgorithm};
#[tokio::test]
async fn it_works() {
@@ -509,4 +606,105 @@ mod tests {
limiter.release(token1, None).await;
limiter.release(token2, None).await;
}
#[test]
fn rate_bucket_rpi() {
let rate_bucket = RateBucketInfo::new(50, Duration::from_secs(5));
assert_eq!(rate_bucket.max_rpi, 50 * 5);
let rate_bucket = RateBucketInfo::new(50, Duration::from_millis(500));
assert_eq!(rate_bucket.max_rpi, 50 / 2);
}
#[test]
fn rate_bucket_parse() {
let rate_bucket: RateBucketInfo = "100@10s".parse().unwrap();
assert_eq!(rate_bucket.interval, Duration::from_secs(10));
assert_eq!(rate_bucket.max_rpi, 100 * 10);
assert_eq!(rate_bucket.to_string(), "100@10s");
let rate_bucket: RateBucketInfo = "100@1m".parse().unwrap();
assert_eq!(rate_bucket.interval, Duration::from_secs(60));
assert_eq!(rate_bucket.max_rpi, 100 * 60);
assert_eq!(rate_bucket.to_string(), "100@1m");
}
#[test]
fn default_rate_buckets() {
let mut defaults = RateBucketInfo::DEFAULT_SET;
RateBucketInfo::validate(&mut defaults[..]).unwrap();
}
#[test]
#[should_panic = "invalid endpoint RPS limits. 10@10s allows fewer requests per bucket than 300@1s (100 vs 300)"]
fn rate_buckets_validate() {
let mut rates: Vec<RateBucketInfo> = ["300@1s", "10@10s"]
.into_iter()
.map(|s| s.parse().unwrap())
.collect();
RateBucketInfo::validate(&mut rates).unwrap();
}
#[tokio::test]
async fn test_rate_limits() {
let mut rates: Vec<RateBucketInfo> = ["100@1s", "20@30s"]
.into_iter()
.map(|s| s.parse().unwrap())
.collect();
RateBucketInfo::validate(&mut rates).unwrap();
let limiter = EndpointRateLimiter::new(Vec::leak(rates));
let endpoint = SmolStr::from("ep-my-endpoint-1234");
time::pause();
for _ in 0..100 {
assert!(limiter.check(endpoint.clone()));
}
// more connections fail
assert!(!limiter.check(endpoint.clone()));
// fail even after 500ms as it's in the same bucket
time::advance(time::Duration::from_millis(500)).await;
assert!(!limiter.check(endpoint.clone()));
// after a full 1s, 100 requests are allowed again
time::advance(time::Duration::from_millis(500)).await;
for _ in 1..6 {
for _ in 0..100 {
assert!(limiter.check(endpoint.clone()));
}
time::advance(time::Duration::from_millis(1000)).await;
}
// more connections after 600 will exceed the 20rps@30s limit
assert!(!limiter.check(endpoint.clone()));
// will still fail before the 30 second limit
time::advance(time::Duration::from_millis(30_000 - 6_000 - 1)).await;
assert!(!limiter.check(endpoint.clone()));
// after the full 30 seconds, 100 requests are allowed again
time::advance(time::Duration::from_millis(1)).await;
for _ in 0..100 {
assert!(limiter.check(endpoint.clone()));
}
}
#[tokio::test]
async fn test_rate_limits_gc() {
// fixed seeded random/hasher to ensure that the test is not flaky
let rand = rand::rngs::StdRng::from_seed([1; 32]);
let hasher = BuildHasherDefault::<FxHasher>::default();
let limiter = EndpointRateLimiter::new_with_rand_and_hasher(
&RateBucketInfo::DEFAULT_SET,
rand,
hasher,
);
for i in 0..1_000_000 {
limiter.check(format!("{i}").into());
}
assert!(limiter.map.len() < 150_000);
}
}

View File

@@ -8,12 +8,13 @@ mod websocket;
use anyhow::bail;
use hyper::StatusCode;
use metrics::IntCounterPairGuard;
pub use reqwest_middleware::{ClientWithMiddleware, Error};
pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
use tokio_util::task::TaskTracker;
use crate::protocol2::{ProxyProtocolAccept, WithClientIp};
use crate::proxy::{NUM_CLIENT_CONNECTION_CLOSED_COUNTER, NUM_CLIENT_CONNECTION_OPENED_COUNTER};
use crate::proxy::NUM_CLIENT_CONNECTION_GAUGE;
use crate::rate_limiter::EndpointRateLimiter;
use crate::{cancellation::CancelMap, config::ProxyConfig};
use futures::StreamExt;
@@ -38,13 +39,13 @@ pub async fn task_main(
config: &'static ProxyConfig,
ws_listener: TcpListener,
cancellation_token: CancellationToken,
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
) -> anyhow::Result<()> {
scopeguard::defer! {
info!("websocket server has shut down");
}
let conn_pool = conn_pool::GlobalConnPool::new(config);
let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(config.endpoint_rps_limit));
// shutdown the connection pool
tokio::spawn({
@@ -149,22 +150,17 @@ pub async fn task_main(
struct MetricService<S> {
inner: S,
_gauge: IntCounterPairGuard,
}
impl<S> MetricService<S> {
fn new(inner: S) -> MetricService<S> {
NUM_CLIENT_CONNECTION_OPENED_COUNTER
.with_label_values(&["http"])
.inc();
MetricService { inner }
}
}
impl<S> Drop for MetricService<S> {
fn drop(&mut self) {
NUM_CLIENT_CONNECTION_CLOSED_COUNTER
.with_label_values(&["http"])
.inc();
MetricService {
inner,
_gauge: NUM_CLIENT_CONNECTION_GAUGE
.with_label_values(&["http"])
.guard(),
}
}
}
@@ -248,7 +244,7 @@ async fn request_handler(
.header("Access-Control-Allow-Origin", "*")
.header(
"Access-Control-Allow-Headers",
"Neon-Connection-String, Neon-Raw-Text-Output, Neon-Array-Mode, Neon-Pool-Opt-In",
"Neon-Connection-String, Neon-Raw-Text-Output, Neon-Array-Mode, Neon-Pool-Opt-In, Neon-Batch-Read-Only, Neon-Batch-Isolation-Level",
)
.header("Access-Control-Max-Age", "86400" /* 24 hours */)
.status(StatusCode::OK) // 204 is also valid, but see: https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods/OPTIONS#status_code

View File

@@ -24,10 +24,7 @@ use tokio_postgres::{AsyncMessage, ReadyForQueryStatus};
use crate::{
auth::{self, backend::ComputeUserInfo, check_peer_addr_is_in_list},
console,
proxy::{
neon_options, LatencyTimer, NUM_DB_CONNECTIONS_CLOSED_COUNTER,
NUM_DB_CONNECTIONS_OPENED_COUNTER,
},
proxy::{neon_options, LatencyTimer, NUM_DB_CONNECTIONS_GAUGE},
usage_metrics::{Ids, MetricCounter, USAGE_METRICS},
};
use crate::{compute, config};
@@ -477,6 +474,11 @@ async fn connect_to_compute_once(
.connect_timeout(timeout)
.connect(tokio_postgres::NoTls)
.await?;
let conn_gauge = NUM_DB_CONNECTIONS_GAUGE
.with_label_values(&["http"])
.guard();
tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id()));
let (tx, mut rx) = tokio::sync::watch::channel(session);
@@ -492,10 +494,7 @@ async fn connect_to_compute_once(
tokio::spawn(
async move {
NUM_DB_CONNECTIONS_OPENED_COUNTER.with_label_values(&["http"]).inc();
scopeguard::defer! {
NUM_DB_CONNECTIONS_CLOSED_COUNTER.with_label_values(&["http"]).inc();
}
let _conn_gauge = conn_gauge;
poll_fn(move |cx| {
if matches!(rx.has_changed(), Ok(true)) {
session = *rx.borrow_and_update();

View File

@@ -29,7 +29,7 @@ use utils::http::error::ApiError;
use utils::http::json::json_response;
use crate::config::HttpConfig;
use crate::proxy::{NUM_CONNECTIONS_ACCEPTED_COUNTER, NUM_CONNECTIONS_CLOSED_COUNTER};
use crate::proxy::NUM_CONNECTION_REQUESTS_GAUGE;
use super::conn_pool::ConnInfo;
use super::conn_pool::GlobalConnPool;
@@ -303,12 +303,9 @@ async fn handle_inner(
session_id: uuid::Uuid,
peer_addr: IpAddr,
) -> anyhow::Result<Response<Body>> {
NUM_CONNECTIONS_ACCEPTED_COUNTER
let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE
.with_label_values(&["http"])
.inc();
scopeguard::defer! {
NUM_CONNECTIONS_CLOSED_COUNTER.with_label_values(&["http"]).inc();
}
.guard();
//
// Determine the destination and connection params

View File

@@ -27,15 +27,15 @@ use sync_wrapper::SyncWrapper;
pin_project! {
/// This is a wrapper around a [`WebSocketStream`] that
/// implements [`AsyncRead`] and [`AsyncWrite`].
pub struct WebSocketRw {
pub struct WebSocketRw<S = Upgraded> {
#[pin]
stream: SyncWrapper<WebSocketStream<Upgraded>>,
stream: SyncWrapper<WebSocketStream<S>>,
bytes: Bytes,
}
}
impl WebSocketRw {
pub fn new(stream: WebSocketStream<Upgraded>) -> Self {
impl<S> WebSocketRw<S> {
pub fn new(stream: WebSocketStream<S>) -> Self {
Self {
stream: stream.into(),
bytes: Bytes::new(),
@@ -43,7 +43,7 @@ impl WebSocketRw {
}
}
impl AsyncWrite for WebSocketRw {
impl<S: AsyncRead + AsyncWrite + Unpin> AsyncWrite for WebSocketRw<S> {
fn poll_write(
self: Pin<&mut Self>,
cx: &mut Context<'_>,
@@ -69,7 +69,7 @@ impl AsyncWrite for WebSocketRw {
}
}
impl AsyncRead for WebSocketRw {
impl<S: AsyncRead + AsyncWrite + Unpin> AsyncRead for WebSocketRw<S> {
fn poll_read(
mut self: Pin<&mut Self>,
cx: &mut Context<'_>,
@@ -86,7 +86,7 @@ impl AsyncRead for WebSocketRw {
}
}
impl AsyncBufRead for WebSocketRw {
impl<S: AsyncRead + AsyncWrite + Unpin> AsyncBufRead for WebSocketRw<S> {
fn poll_fill_buf(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<&[u8]>> {
// Please refer to poll_fill_buf's documentation.
const EOF: Poll<io::Result<&[u8]>> = Poll::Ready(Ok(&[]));
@@ -151,3 +151,60 @@ pub async fn serve_websocket(
.await?;
Ok(())
}
#[cfg(test)]
mod tests {
use std::pin::pin;
use futures::{SinkExt, StreamExt};
use hyper_tungstenite::{
tungstenite::{protocol::Role, Message},
WebSocketStream,
};
use tokio::{
io::{duplex, AsyncReadExt, AsyncWriteExt},
task::JoinSet,
};
use super::WebSocketRw;
#[tokio::test]
async fn websocket_stream_wrapper_happy_path() {
let (stream1, stream2) = duplex(1024);
let mut js = JoinSet::new();
js.spawn(async move {
let mut client = WebSocketStream::from_raw_socket(stream1, Role::Client, None).await;
client
.send(Message::Binary(b"hello world".to_vec()))
.await
.unwrap();
let message = client.next().await.unwrap().unwrap();
assert_eq!(message, Message::Binary(b"websockets are cool".to_vec()));
client.close(None).await.unwrap();
});
js.spawn(async move {
let mut rw = pin!(WebSocketRw::new(
WebSocketStream::from_raw_socket(stream2, Role::Server, None).await
));
let mut buf = vec![0; 1024];
let n = rw.read(&mut buf).await.unwrap();
assert_eq!(&buf[..n], b"hello world");
rw.write_all(b"websockets are cool").await.unwrap();
rw.flush().await.unwrap();
let n = rw.read_to_end(&mut buf).await.unwrap();
assert_eq!(n, 0);
});
js.join_next().await.unwrap().unwrap();
js.join_next().await.unwrap().unwrap();
}
}

View File

@@ -31,6 +31,7 @@ reqwest = { workspace = true, default-features = false, features = ["rustls-tls"
aws-config = { workspace = true, default-features = false, features = ["rustls", "sso"] }
pageserver = { path = "../pageserver" }
pageserver_api = { path = "../libs/pageserver_api" }
remote_storage = { path = "../libs/remote_storage" }
tracing.workspace = true

View File

@@ -7,13 +7,12 @@ use utils::generation::Generation;
use crate::cloud_admin_api::BranchData;
use crate::metadata_stream::stream_listing;
use crate::{download_object_with_retries, RootTarget};
use crate::{download_object_with_retries, RootTarget, TenantShardTimelineId};
use futures_util::{pin_mut, StreamExt};
use pageserver::tenant::remote_timeline_client::parse_remote_index_path;
use pageserver::tenant::storage_layer::LayerFileName;
use pageserver::tenant::IndexPart;
use remote_storage::RemotePath;
use utils::id::TenantTimelineId;
pub(crate) struct TimelineAnalysis {
/// Anomalies detected
@@ -39,8 +38,8 @@ impl TimelineAnalysis {
}
}
pub(crate) async fn branch_cleanup_and_check_errors(
id: &TenantTimelineId,
pub(crate) fn branch_cleanup_and_check_errors(
id: &TenantShardTimelineId,
s3_root: &RootTarget,
s3_active_branch: Option<&BranchData>,
console_branch: Option<BranchData>,
@@ -238,7 +237,7 @@ fn parse_layer_object_name(name: &str) -> Result<(LayerFileName, Generation), St
pub(crate) async fn list_timeline_blobs(
s3_client: &Client,
id: TenantTimelineId,
id: TenantShardTimelineId,
s3_root: &RootTarget,
) -> anyhow::Result<S3TimelineBlobData> {
let mut s3_layers = HashSet::new();

View File

@@ -10,15 +10,16 @@ use aws_sdk_s3::{
Client,
};
use futures_util::{pin_mut, TryStreamExt};
use pageserver_api::shard::TenantShardId;
use serde::{Deserialize, Serialize};
use tokio_stream::StreamExt;
use utils::id::{TenantId, TenantTimelineId};
use utils::id::TenantId;
use crate::{
cloud_admin_api::{CloudAdminApiClient, MaybeDeleted, ProjectData},
init_remote,
metadata_stream::{stream_listing, stream_tenant_timelines, stream_tenants},
BucketConfig, ConsoleConfig, NodeKind, RootTarget, TraversingDepth,
BucketConfig, ConsoleConfig, NodeKind, RootTarget, TenantShardTimelineId, TraversingDepth,
};
#[derive(Serialize, Deserialize, Debug)]
@@ -29,8 +30,8 @@ enum GarbageReason {
#[derive(Serialize, Deserialize, Debug)]
enum GarbageEntity {
Tenant(TenantId),
Timeline(TenantTimelineId),
Tenant(TenantShardId),
Timeline(TenantShardTimelineId),
}
#[derive(Serialize, Deserialize, Debug)]
@@ -142,6 +143,9 @@ async fn find_garbage_inner(
console_projects.len()
);
// TODO(sharding): batch calls into Console so that we only call once for each TenantId,
// rather than checking the same TenantId for multiple TenantShardId
// Enumerate Tenants in S3, and check if each one exists in Console
tracing::info!("Finding all tenants in bucket {}...", bucket_config.bucket);
let tenants = stream_tenants(&s3_client, &target);
@@ -149,10 +153,10 @@ async fn find_garbage_inner(
let api_client = cloud_admin_api_client.clone();
let console_projects = &console_projects;
async move {
match console_projects.get(&t) {
match console_projects.get(&t.tenant_id) {
Some(project_data) => Ok((t, Some(project_data.clone()))),
None => api_client
.find_tenant_project(t)
.find_tenant_project(t.tenant_id)
.await
.map_err(|e| anyhow::anyhow!(e))
.map(|r| (t, r)),
@@ -166,21 +170,21 @@ async fn find_garbage_inner(
// checks if they are enabled by the `depth` parameter.
pin_mut!(tenants_checked);
let mut garbage = GarbageList::new(node_kind, bucket_config);
let mut active_tenants: Vec<TenantId> = vec![];
let mut active_tenants: Vec<TenantShardId> = vec![];
let mut counter = 0;
while let Some(result) = tenants_checked.next().await {
let (tenant_id, console_result) = result?;
let (tenant_shard_id, console_result) = result?;
// Paranoia check
if let Some(project) = &console_result {
assert!(project.tenant == tenant_id);
assert!(project.tenant == tenant_shard_id.tenant_id);
}
if garbage.maybe_append(GarbageEntity::Tenant(tenant_id), console_result) {
tracing::debug!("Tenant {tenant_id} is garbage");
if garbage.maybe_append(GarbageEntity::Tenant(tenant_shard_id), console_result) {
tracing::debug!("Tenant {tenant_shard_id} is garbage");
} else {
tracing::debug!("Tenant {tenant_id} is active");
active_tenants.push(tenant_id);
tracing::debug!("Tenant {tenant_shard_id} is active");
active_tenants.push(tenant_shard_id);
}
counter += 1;
@@ -266,13 +270,13 @@ impl std::fmt::Display for PurgeMode {
pub async fn get_tenant_objects(
s3_client: &Arc<Client>,
target: RootTarget,
tenant_id: TenantId,
tenant_shard_id: TenantShardId,
) -> anyhow::Result<Vec<ObjectIdentifier>> {
tracing::debug!("Listing objects in tenant {tenant_id}");
tracing::debug!("Listing objects in tenant {tenant_shard_id}");
// TODO: apply extra validation based on object modification time. Don't purge
// tenants where any timeline's index_part.json has been touched recently.
let mut tenant_root = target.tenant_root(&tenant_id);
let mut tenant_root = target.tenant_root(&tenant_shard_id);
// Remove delimiter, so that object listing lists all keys in the prefix and not just
// common prefixes.
@@ -285,7 +289,7 @@ pub async fn get_tenant_objects(
pub async fn get_timeline_objects(
s3_client: &Arc<Client>,
target: RootTarget,
ttid: TenantTimelineId,
ttid: TenantShardTimelineId,
) -> anyhow::Result<Vec<ObjectIdentifier>> {
tracing::debug!("Listing objects in timeline {ttid}");
let mut timeline_root = target.timeline_root(&ttid);

View File

@@ -22,6 +22,7 @@ use aws_sdk_s3::{Client, Config};
use clap::ValueEnum;
use pageserver::tenant::TENANTS_SEGMENT_NAME;
use pageserver_api::shard::TenantShardId;
use reqwest::Url;
use serde::{Deserialize, Serialize};
use std::io::IsTerminal;
@@ -29,7 +30,7 @@ use tokio::io::AsyncReadExt;
use tracing::error;
use tracing_appender::non_blocking::WorkerGuard;
use tracing_subscriber::{fmt, prelude::*, EnvFilter};
use utils::id::{TenantId, TenantTimelineId};
use utils::id::TimelineId;
const MAX_RETRIES: usize = 20;
const CLOUD_ADMIN_API_TOKEN_ENV_VAR: &str = "CLOUD_ADMIN_API_TOKEN";
@@ -44,6 +45,35 @@ pub struct S3Target {
pub delimiter: String,
}
/// Convenience for referring to timelines within a particular shard: more ergonomic
/// than using a 2-tuple.
///
/// This is the shard-aware equivalent of TenantTimelineId. It's defined here rather
/// than somewhere more broadly exposed, because this kind of thing is rarely needed
/// in the pageserver, as all timeline objects existing in the scope of a particular
/// tenant: the scrubber is different in that it handles collections of data referring to many
/// TenantShardTimelineIds in on place.
#[derive(Serialize, Deserialize, Debug, Clone, Copy, Hash, PartialEq, Eq)]
pub struct TenantShardTimelineId {
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
}
impl TenantShardTimelineId {
fn new(tenant_shard_id: TenantShardId, timeline_id: TimelineId) -> Self {
Self {
tenant_shard_id,
timeline_id,
}
}
}
impl Display for TenantShardTimelineId {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}/{}", self.tenant_shard_id, self.timeline_id)
}
}
#[derive(clap::ValueEnum, Debug, Clone, Copy, PartialEq, Eq)]
pub enum TraversingDepth {
Tenant,
@@ -110,19 +140,19 @@ impl RootTarget {
}
}
pub fn tenant_root(&self, tenant_id: &TenantId) -> S3Target {
pub fn tenant_root(&self, tenant_id: &TenantShardId) -> S3Target {
self.tenants_root().with_sub_segment(&tenant_id.to_string())
}
pub fn timelines_root(&self, tenant_id: &TenantId) -> S3Target {
pub fn timelines_root(&self, tenant_id: &TenantShardId) -> S3Target {
match self {
Self::Pageserver(_) => self.tenant_root(tenant_id).with_sub_segment("timelines"),
Self::Safekeeper(_) => self.tenant_root(tenant_id),
}
}
pub fn timeline_root(&self, id: &TenantTimelineId) -> S3Target {
self.timelines_root(&id.tenant_id)
pub fn timeline_root(&self, id: &TenantShardTimelineId) -> S3Target {
self.timelines_root(&id.tenant_shard_id)
.with_sub_segment(&id.timeline_id.to_string())
}

View File

@@ -3,14 +3,15 @@ use async_stream::{stream, try_stream};
use aws_sdk_s3::{types::ObjectIdentifier, Client};
use tokio_stream::Stream;
use crate::{list_objects_with_retries, RootTarget, S3Target, TenantId};
use utils::id::{TenantTimelineId, TimelineId};
use crate::{list_objects_with_retries, RootTarget, S3Target, TenantShardTimelineId};
use pageserver_api::shard::TenantShardId;
use utils::id::TimelineId;
/// Given an S3 bucket, output a stream of TenantIds discovered via ListObjectsv2
pub fn stream_tenants<'a>(
s3_client: &'a Client,
target: &'a RootTarget,
) -> impl Stream<Item = anyhow::Result<TenantId>> + 'a {
) -> impl Stream<Item = anyhow::Result<TenantShardId>> + 'a {
try_stream! {
let mut continuation_token = None;
let tenants_target = target.tenants_root();
@@ -44,14 +45,14 @@ pub fn stream_tenants<'a>(
}
}
/// Given a TenantId, output a stream of the timelines within that tenant, discovered
/// Given a TenantShardId, output a stream of the timelines within that tenant, discovered
/// using ListObjectsv2. The listing is done before the stream is built, so that this
/// function can be used to generate concurrency on a stream using buffer_unordered.
pub async fn stream_tenant_timelines<'a>(
s3_client: &'a Client,
target: &'a RootTarget,
tenant: TenantId,
) -> anyhow::Result<impl Stream<Item = Result<TenantTimelineId, anyhow::Error>> + 'a> {
tenant: TenantShardId,
) -> anyhow::Result<impl Stream<Item = Result<TenantShardTimelineId, anyhow::Error>> + 'a> {
let mut timeline_ids: Vec<Result<TimelineId, anyhow::Error>> = Vec::new();
let mut continuation_token = None;
let timelines_target = target.timelines_root(&tenant);
@@ -98,7 +99,7 @@ pub async fn stream_tenant_timelines<'a>(
Ok(stream! {
for i in timeline_ids {
let id = i?;
yield Ok(TenantTimelineId::new(tenant, id));
yield Ok(TenantShardTimelineId::new(tenant, id));
}
})
}

View File

@@ -5,20 +5,19 @@ use crate::checks::{
TimelineAnalysis,
};
use crate::metadata_stream::{stream_tenant_timelines, stream_tenants};
use crate::{init_remote, BucketConfig, NodeKind, RootTarget};
use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId};
use aws_sdk_s3::Client;
use futures_util::{pin_mut, StreamExt, TryStreamExt};
use histogram::Histogram;
use pageserver::tenant::IndexPart;
use serde::Serialize;
use utils::id::TenantTimelineId;
#[derive(Serialize)]
pub struct MetadataSummary {
count: usize,
with_errors: HashSet<TenantTimelineId>,
with_warnings: HashSet<TenantTimelineId>,
with_garbage: HashSet<TenantTimelineId>,
with_errors: HashSet<TenantShardTimelineId>,
with_warnings: HashSet<TenantShardTimelineId>,
with_garbage: HashSet<TenantShardTimelineId>,
indices_by_version: HashMap<usize, usize>,
layer_count: MinMaxHisto,
@@ -132,7 +131,7 @@ impl MetadataSummary {
}
}
fn update_analysis(&mut self, id: &TenantTimelineId, analysis: &TimelineAnalysis) {
fn update_analysis(&mut self, id: &TenantShardTimelineId, analysis: &TimelineAnalysis) {
if !analysis.errors.is_empty() {
self.with_errors.insert(*id);
}
@@ -199,8 +198,8 @@ pub async fn scan_metadata(bucket_config: BucketConfig) -> anyhow::Result<Metada
async fn report_on_timeline(
s3_client: &Client,
target: &RootTarget,
ttid: TenantTimelineId,
) -> anyhow::Result<(TenantTimelineId, S3TimelineBlobData)> {
ttid: TenantShardTimelineId,
) -> anyhow::Result<(TenantShardTimelineId, S3TimelineBlobData)> {
let data = list_timeline_blobs(s3_client, ttid, target).await?;
Ok((ttid, data))
}
@@ -213,8 +212,7 @@ pub async fn scan_metadata(bucket_config: BucketConfig) -> anyhow::Result<Metada
let (ttid, data) = i?;
summary.update_data(&data);
let analysis =
branch_cleanup_and_check_errors(&ttid, &target, None, None, Some(data)).await;
let analysis = branch_cleanup_and_check_errors(&ttid, &target, None, None, Some(data));
summary.update_analysis(&ttid, &analysis);
}

View File

@@ -11,7 +11,7 @@ use tracing::{debug, info, info_span, Instrument};
use crate::auth::check_permission;
use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage};
use crate::metrics::{TrafficMetrics, PG_QUERIES_FINISHED, PG_QUERIES_RECEIVED};
use crate::metrics::{TrafficMetrics, PG_QUERIES_GAUGE};
use crate::safekeeper::Term;
use crate::timeline::TimelineError;
use crate::wal_service::ConnectionId;
@@ -210,10 +210,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
let cmd = parse_cmd(query_string)?;
let cmd_str = cmd_to_string(&cmd);
PG_QUERIES_RECEIVED.with_label_values(&[cmd_str]).inc();
scopeguard::defer! {
PG_QUERIES_FINISHED.with_label_values(&[cmd_str]).inc();
}
let _guard = PG_QUERIES_GAUGE.with_label_values(&[cmd_str]).guard();
info!("got query {:?}", query_string);

View File

@@ -11,7 +11,8 @@ use futures::Future;
use metrics::{
core::{AtomicU64, Collector, Desc, GenericCounter, GenericGaugeVec, Opts},
proto::MetricFamily,
register_int_counter, register_int_counter_vec, Gauge, IntCounter, IntCounterVec, IntGaugeVec,
register_int_counter, register_int_counter_pair_vec, register_int_counter_vec, Gauge,
IntCounter, IntCounterPairVec, IntCounterVec, IntGaugeVec,
};
use once_cell::sync::Lazy;
@@ -89,16 +90,10 @@ pub static BROKER_PULLED_UPDATES: Lazy<IntCounterVec> = Lazy::new(|| {
)
.expect("Failed to register safekeeper_broker_pulled_updates_total counter")
});
pub static PG_QUERIES_RECEIVED: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
pub static PG_QUERIES_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
register_int_counter_pair_vec!(
"safekeeper_pg_queries_received_total",
"Number of queries received through pg protocol",
&["query"]
)
.expect("Failed to register safekeeper_pg_queries_received_total counter")
});
pub static PG_QUERIES_FINISHED: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"safekeeper_pg_queries_finished_total",
"Number of queries finished through pg protocol",
&["query"]

View File

@@ -1870,11 +1870,12 @@ class NeonPageserver(PgProtocol):
tenant_id: TenantId,
conf: Optional[Dict[str, Any]] = None,
auth_token: Optional[str] = None,
generation: Optional[int] = None,
) -> TenantId:
if generation is None:
generation = self.maybe_get_generation(tenant_id)
client = self.http_client(auth_token=auth_token)
return client.tenant_create(
tenant_id, conf, generation=self.maybe_get_generation(tenant_id)
)
return client.tenant_create(tenant_id, conf, generation=generation)
def tenant_load(self, tenant_id: TenantId):
client = self.http_client()
@@ -2944,7 +2945,7 @@ class Safekeeper:
tli_dir = self.timeline_dir(tenant_id, timeline_id)
segments = []
for _, _, filenames in os.walk(tli_dir):
segments.extend([f for f in filenames if f != "safekeeper.control"])
segments.extend([f for f in filenames if not f.startswith("safekeeper.control")])
segments.sort()
return segments

View File

@@ -79,6 +79,9 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
# AWS S3 may emit 500 errors for keys in a DeleteObjects response: we retry these
# and it is not a failure of our code when it happens.
".*DeleteObjects.*We encountered an internal error. Please try again.*",
# During shutdown, DownloadError::Cancelled may be logged as an error. Cleaning this
# up is tracked in https://github.com/neondatabase/neon/issues/6096
".*Cancelled, shutting down.*",
)

View File

@@ -322,6 +322,10 @@ class PageserverHttpClient(requests.Session):
self.verbose_error(res)
return TenantConfig.from_json(res.json())
def tenant_heatmap_upload(self, tenant_id: TenantId):
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/heatmap_upload")
self.verbose_error(res)
def set_tenant_config(self, tenant_id: TenantId, config: dict[str, Any]):
assert "tenant_id" not in config.keys()
res = self.put(

View File

@@ -16,6 +16,7 @@ from fixtures.log_helper import log
from fixtures.types import TenantId, TimelineId
TIMELINE_INDEX_PART_FILE_NAME = "index_part.json"
TENANT_HEATMAP_FILE_NAME = "heatmap-v1.json"
@enum.unique
@@ -133,6 +134,13 @@ class LocalFsStorage:
with self.index_path(tenant_id, timeline_id).open("r") as f:
return json.load(f)
def heatmap_path(self, tenant_id: TenantId) -> Path:
return self.tenant_path(tenant_id) / TENANT_HEATMAP_FILE_NAME
def heatmap_content(self, tenant_id):
with self.heatmap_path(tenant_id).open("r") as f:
return json.load(f)
def to_toml_inline_table(self) -> str:
rv = {
"local_path": str(self.root),

View File

@@ -55,9 +55,20 @@ def measure_recovery_time(env: NeonCompare):
# Delete the Tenant in the pageserver: this will drop local and remote layers, such that
# when we "create" the Tenant again, we will replay the WAL from the beginning.
#
# This is a "weird" thing to do, and can confuse the attachment service as we're re-using
# the same tenant ID for a tenant that is logically different from the pageserver's point
# of view, but the same as far as the safekeeper/WAL is concerned. To work around that,
# we will explicitly create the tenant in the same generation that it was previously
# attached in.
assert env.env.attachment_service is not None
attach_status = env.env.attachment_service.inspect(tenant_id=env.tenant)
assert attach_status is not None
(attach_gen, _) = attach_status
client.tenant_delete(env.tenant)
wait_tenant_status_404(client, env.tenant, iterations=60, interval=0.5)
env.env.pageserver.tenant_create(tenant_id=env.tenant)
env.env.pageserver.tenant_create(tenant_id=env.tenant, generation=attach_gen)
# Measure recovery time
with env.record_duration("wal_recovery"):

View File

@@ -52,7 +52,16 @@ def negative_env(neon_env_builder: NeonEnvBuilder) -> Generator[NegativeTests, N
TenantId(t["id"]) for t in ps_http.tenant_list()
], "tenant should not be attached after negative test"
env.pageserver.allowed_errors.append(".*Error processing HTTP request: Bad request")
env.pageserver.allowed_errors.extend(
[
# This fixture detaches the tenant, and tests using it will tend to re-attach it
# shortly after. There may be un-processed deletion_queue validations from the
# initial attachment
".*Dropped remote consistent LSN updates.*",
# This fixture is for tests that will intentionally generate 400 responses
".*Error processing HTTP request: Bad request",
]
)
def log_contains_bad_request():
env.pageserver.log_contains(".*Error processing HTTP request: Bad request")
@@ -163,6 +172,7 @@ def test_fully_custom_config(positive_env: NeonEnv):
"gc_feedback": True,
"gc_horizon": 23 * (1024 * 1024),
"gc_period": "2h 13m",
"heatmap_period": "10m",
"image_creation_threshold": 7,
"pitr_interval": "1m",
"lagging_wal_timeout": "23m",

View File

@@ -1,8 +1,7 @@
import random
import threading
import time
from queue import SimpleQueue
from typing import Any, Dict, List, Union
from typing import List
import pytest
from fixtures.log_helper import log
@@ -239,92 +238,6 @@ def test_cannot_branch_from_non_uploaded_branch(neon_env_builder: NeonEnvBuilder
t.join()
def test_competing_branchings_from_loading_race_to_ok_or_err(neon_env_builder: NeonEnvBuilder):
"""
If the activate only after upload is used, then retries could become competing.
"""
env = neon_env_builder.init_configs()
env.start()
env.pageserver.allowed_errors.extend(
[
".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*",
".*Error processing HTTP request: InternalServerError\\(Timeline .*/.* already exists in pageserver's memory",
]
)
ps_http = env.pageserver.http_client()
# pause all uploads
ps_http.configure_failpoints(("before-upload-index-pausable", "pause"))
env.pageserver.tenant_create(env.initial_tenant)
def start_creating_timeline():
ps_http.timeline_create(
env.pg_version, env.initial_tenant, env.initial_timeline, timeout=60
)
create_root = threading.Thread(target=start_creating_timeline)
branch_id = TimelineId.generate()
queue: SimpleQueue[Union[Dict[Any, Any], Exception]] = SimpleQueue()
barrier = threading.Barrier(3)
def try_branch():
barrier.wait()
barrier.wait()
try:
ret = ps_http.timeline_create(
env.pg_version,
env.initial_tenant,
branch_id,
ancestor_timeline_id=env.initial_timeline,
timeout=5,
)
queue.put(ret)
except Exception as e:
queue.put(e)
threads = [threading.Thread(target=try_branch) for _ in range(2)]
try:
create_root.start()
for t in threads:
t.start()
wait_until_paused(env, "before-upload-index-pausable")
barrier.wait()
ps_http.configure_failpoints(("before-upload-index-pausable", "off"))
barrier.wait()
# now both requests race to branch, only one can win because they take gc_cs, Tenant::timelines or marker files
first = queue.get()
second = queue.get()
log.info(first)
log.info(second)
(succeeded, failed) = (first, second) if isinstance(second, Exception) else (second, first)
assert isinstance(failed, Exception)
assert isinstance(succeeded, Dict)
# there's multiple valid status codes:
# - Timeline x/y already exists
# - whatever 409 response says, but that is a subclass of PageserverApiException
assert isinstance(failed, PageserverApiException)
assert succeeded["state"] == "Active"
finally:
# we might still have the failpoint active
env.pageserver.stop(immediate=True)
for t in threads:
t.join()
create_root.join()
def test_non_uploaded_root_timeline_is_deleted_after_restart(neon_env_builder: NeonEnvBuilder):
"""
Check that a timeline is deleted locally on subsequent restart if it never successfully uploaded during creation.

View File

@@ -273,9 +273,24 @@ def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, r
timeline_id = env.initial_timeline
pg_version = env.pg_version
shutil.rmtree(repo_dir / "local_fs_remote_storage")
# Delete all files from local_fs_remote_storage except initdb.tar.zst,
# the file is required for `timeline_create` with `existing_initdb_timeline_id`.
#
# TODO: switch to Path.walk() in Python 3.12
# for dirpath, _dirnames, filenames in (repo_dir / "local_fs_remote_storage").walk():
for dirpath, _dirnames, filenames in os.walk(repo_dir / "local_fs_remote_storage"):
for filename in filenames:
if filename != "initdb.tar.zst":
(Path(dirpath) / filename).unlink()
timeline_delete_wait_completed(pageserver_http, tenant_id, timeline_id)
pageserver_http.timeline_create(pg_version, tenant_id, timeline_id)
pageserver_http.timeline_create(
pg_version=pg_version,
tenant_id=tenant_id,
new_timeline_id=timeline_id,
existing_initdb_timeline_id=timeline_id,
)
pg_bin.run_capture(
["pg_dumpall", f"--dbname={connstr}", f"--file={test_output_dir / 'dump-from-wal.sql'}"]
)

View File

@@ -4,7 +4,7 @@ from typing import Any, Dict, Optional
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver
from fixtures.remote_storage import RemoteStorageKind
from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
from fixtures.types import TenantId, TimelineId
from fixtures.utils import wait_until
from fixtures.workload import Workload
@@ -330,3 +330,46 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder):
workload.churn_rows(64, pageserver_b.id)
workload.validate(pageserver_b.id)
def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder):
"""
Test the sequence of location states that are used in a live migration.
"""
env = neon_env_builder.init_start() # initial_tenant_conf=TENANT_CONF)
assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
# Write some data so that we have some layers
workload = Workload(env, tenant_id, timeline_id)
workload.init(env.pageservers[0].id)
# Write some layers and upload a heatmap
workload.write_rows(256, env.pageservers[0].id)
env.pageserver.http_client().tenant_heatmap_upload(tenant_id)
def validate_heatmap(heatmap):
assert len(heatmap["timelines"]) == 1
assert heatmap["timelines"][0]["timeline_id"] == str(timeline_id)
assert len(heatmap["timelines"][0]["layers"]) > 0
layers = heatmap["timelines"][0]["layers"]
# Each layer appears at most once
assert len(set(layer["name"] for layer in layers)) == len(layers)
# Download and inspect the heatmap that the pageserver uploaded
heatmap_first = env.pageserver_remote_storage.heatmap_content(tenant_id)
log.info(f"Read back heatmap: {heatmap_first}")
validate_heatmap(heatmap_first)
# Do some more I/O to generate more layers
workload.churn_rows(64, env.pageservers[0].id)
env.pageserver.http_client().tenant_heatmap_upload(tenant_id)
# Ensure that another heatmap upload includes the new layers
heatmap_second = env.pageserver_remote_storage.heatmap_content(tenant_id)
log.info(f"Read back heatmap: {heatmap_second}")
assert heatmap_second != heatmap_first
validate_heatmap(heatmap_second)

View File

@@ -300,7 +300,8 @@ def test_timeline_initial_logical_size_calculation_cancellation(
env = neon_env_builder.init_start()
client = env.pageserver.http_client()
tenant_id, timeline_id = env.neon_cli.create_tenant()
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
# load in some data
endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
@@ -732,3 +733,142 @@ def wait_for_timeline_size_init(
raise Exception(
f"timed out while waiting for current_logical_size of a timeline to reach its non-incremental value, details: {timeline_details}"
)
def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
"""
Tenants warmuping up opportunistically will wait for one another's logical size calculations to complete
before proceeding. However, they skip this if a client is actively trying to access them.
This test is not purely about logical sizes, but logical size calculation is the phase that we
use as a proxy for "warming up" in this test: it happens within the semaphore guard used
to limit concurrent tenant warm-up.
"""
# We will run with the limit set to 1, so that once we have one tenant stuck
# in a pausable failpoint, the rest are prevented from proceeding through warmup.
neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = '1'"
env = neon_env_builder.init_start()
pageserver_http = env.pageserver.http_client()
# Create some tenants
n_tenants = 10
tenant_ids = {env.initial_tenant}
for _i in range(0, n_tenants - 1):
tenant_id = TenantId.generate()
env.pageserver.tenant_create(tenant_id)
# Empty tenants are not subject to waiting for logical size calculations, because
# those hapen on timeline level
timeline_id = TimelineId.generate()
env.neon_cli.create_timeline(
new_branch_name="main", tenant_id=tenant_id, timeline_id=timeline_id
)
tenant_ids.add(tenant_id)
# Restart pageserver with logical size calculations paused
env.pageserver.stop()
env.pageserver.start(
extra_env_vars={"FAILPOINTS": "timeline-calculate-logical-size-pause=pause"}
)
def get_tenant_states():
states = {}
for tenant_id in tenant_ids:
tenant = pageserver_http.tenant_status(tenant_id=tenant_id)
states[tenant_id] = tenant["state"]["slug"]
log.info(f"Tenant states: {states}")
return states
def at_least_one_active():
assert "Active" in set(get_tenant_states().values())
# One tenant should activate, then get stuck in their logical size calculation
wait_until(10, 1, at_least_one_active)
# Wait some walltime to gain confidence that other tenants really are stuck and not proceeding to activate
time.sleep(5)
# We should see one tenant win the activation race, and enter logical size calculation. The rest
# will stay in Attaching state, waiting for the "warmup_limit" semaphore
expect_activated = 1
states = get_tenant_states()
assert len([s for s in states.values() if s == "Active"]) == expect_activated
assert len([s for s in states.values() if s == "Attaching"]) == n_tenants - expect_activated
assert (
pageserver_http.get_metric_value("pageserver_tenant_startup_scheduled_total") == n_tenants
)
# This is zero, and subsequent checks are expect_activated - 1, because this counter does not
# count how may tenants are Active, it counts how many have finished warmup. The first tenant
# that reached Active is still stuck in its local size calculation, and has therefore not finished warmup.
assert pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total") == 0
# If a client accesses one of the blocked tenants, it should skip waiting for warmup and
# go active as fast as it can.
stuck_tenant_id = list(
[(tid, s) for (tid, s) in get_tenant_states().items() if s == "Attaching"]
)[0][0]
endpoint = env.endpoints.create_start(branch_name="main", tenant_id=stuck_tenant_id)
endpoint.safe_psql_many(
[
"CREATE TABLE foo (x INTEGER)",
"INSERT INTO foo SELECT g FROM generate_series(1, 10) g",
]
)
endpoint.stop()
# That one that we successfully accessed is now Active
expect_activated += 1
assert pageserver_http.tenant_status(tenant_id=stuck_tenant_id)["state"]["slug"] == "Active"
assert (
pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total")
== expect_activated - 1
)
# The ones we didn't touch are still in Attaching
assert (
len([s for s in get_tenant_states().values() if s == "Attaching"])
== n_tenants - expect_activated
)
# Timeline creation operations also wake up Attaching tenants
stuck_tenant_id = list(
[(tid, s) for (tid, s) in get_tenant_states().items() if s == "Attaching"]
)[0][0]
pageserver_http.timeline_create(env.pg_version, stuck_tenant_id, TimelineId.generate())
expect_activated += 1
assert pageserver_http.tenant_status(tenant_id=stuck_tenant_id)["state"]["slug"] == "Active"
assert (
len([s for s in get_tenant_states().values() if s == "Attaching"])
== n_tenants - expect_activated
)
assert (
pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total")
== expect_activated - 1
)
# When we unblock logical size calculation, all tenants should proceed to active state via
# the warmup route.
pageserver_http.configure_failpoints(("timeline-calculate-logical-size-pause", "off"))
def all_active():
assert all(s == "Active" for s in get_tenant_states().values())
wait_until(10, 1, all_active)
# Final control check: restarting with no failpoints at all results in all tenants coming active
# without being prompted by client I/O
env.pageserver.stop()
env.pageserver.start()
wait_until(10, 1, all_active)
assert (
pageserver_http.get_metric_value("pageserver_tenant_startup_scheduled_total") == n_tenants
)
assert pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total") == n_tenants

View File

@@ -1,7 +1,6 @@
import sys
import tarfile
import tempfile
import time
from pathlib import Path
import pytest
@@ -12,6 +11,7 @@ from fixtures.neon_fixtures import (
PgBin,
VanillaPostgres,
)
from fixtures.pageserver.utils import timeline_delete_wait_completed
from fixtures.port_distributor import PortDistributor
from fixtures.remote_storage import LocalFsStorage
from fixtures.types import Lsn, TenantId, TimelineId
@@ -128,10 +128,7 @@ def test_wal_restore_initdb(
assert restored.safe_psql("select count(*) from t", user="cloud_admin") == [(300000,)]
def test_wal_restore_http(
neon_env_builder: NeonEnvBuilder,
test_output_dir: Path,
):
def test_wal_restore_http(neon_env_builder: NeonEnvBuilder):
env = neon_env_builder.init_start()
endpoint = env.endpoints.create_start("main")
endpoint.safe_psql("create table t as select generate_series(1,300000)")
@@ -145,15 +142,7 @@ def test_wal_restore_http(
assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
test_output_dir / "initdb.tar.zst"
(env.pageserver_remote_storage.timeline_path(tenant_id, timeline_id) / "initdb.tar.zst")
ps_client.timeline_delete(tenant_id, timeline_id)
time.sleep(2)
# verify that it is indeed deleted
# TODO
timeline_delete_wait_completed(ps_client, tenant_id, timeline_id)
# issue the restoration command
ps_client.timeline_create(

View File

@@ -1,5 +1,5 @@
{
"postgres-v16": "e3a22b72922055f9212eca12700190f118578362",
"postgres-v15": "bc88f539312fcc4bb292ce94ae9db09ab6656e8a",
"postgres-v14": "dd067cf656f6810a25aca6025633d32d02c5085a"
"postgres-v16": "863b71572bc441581efb3bbee2ad18af037be1bb",
"postgres-v15": "24333abb81a9ecae4541019478f0bf7d0b289df7",
"postgres-v14": "0bb356aa0cd1582112926fbcf0b5370222c2db6d"
}

View File

@@ -34,7 +34,7 @@ files:
server_tls_sslmode=disable
pool_mode=transaction
max_client_conn=10000
default_pool_size=16
default_pool_size=64
max_prepared_statements=0
- filename: cgconfig.conf
content: |