diff --git a/Cargo.lock b/Cargo.lock index 1b6b423444..f931fd6c29 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -233,7 +233,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] @@ -244,7 +244,7 @@ checksum = "b9ccdd8f2a161be9bd5c023df56f1b2a0bd1d83872ae53b71a84a12c9bf6e842" dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] @@ -881,7 +881,7 @@ dependencies = [ "regex", "rustc-hash", "shlex", - "syn 2.0.28", + "syn 2.0.32", "which", ] @@ -1095,7 +1095,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] @@ -1245,16 +1245,19 @@ name = "control_plane" version = "0.1.0" dependencies = [ "anyhow", + "async-trait", "camino", "clap", "comfy-table", "compute_api", + "futures", "git-version", "hex", "hyper", "nix 0.26.2", "once_cell", "pageserver_api", + "pageserver_client", "postgres", "postgres_backend", "postgres_connection", @@ -1268,6 +1271,8 @@ dependencies = [ "tar", "thiserror", "tokio", + "tokio-postgres", + "tokio-util", "toml", "tracing", "url", @@ -1481,7 +1486,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] @@ -1492,7 +1497,7 @@ checksum = "29a358ff9f12ec09c3e61fef9b5a9902623a695a46a917b07f269bff1445611a" dependencies = [ "darling_core", "quote", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] @@ -1567,7 +1572,7 @@ checksum = "487585f4d0c6655fe74905e2504d8ad6908e4db67f744eb140876906c2f3175d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] @@ -1661,7 +1666,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] @@ -1915,7 +1920,7 @@ checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] @@ -2901,7 +2906,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] @@ -3103,6 +3108,7 @@ dependencies = [ "humantime-serde", "hyper", "itertools", + "md5", "metrics", "nix 0.26.2", "num-traits", @@ -3161,6 +3167,7 @@ dependencies = [ "enum-map", "hex", "postgres_ffi", + "rand 0.8.5", "serde", "serde_json", "serde_with", @@ -3171,6 +3178,19 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "pageserver_client" +version = "0.1.0" +dependencies = [ + "async-trait", + "pageserver_api", + "reqwest", + "serde", + "thiserror", + "utils", + "workspace_hack", +] + [[package]] name = "parking" version = "2.1.1" @@ -3330,7 +3350,7 @@ checksum = "39407670928234ebc5e6e580247dd567ad73a3578460c5990f9503df207e8f07" dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] @@ -3537,7 +3557,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3b69d39aab54d069e7f2fe8cb970493e7834601ca2d8c65fd7bbd183578080d1" dependencies = [ "proc-macro2", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] @@ -4145,7 +4165,7 @@ dependencies = [ "regex", "relative-path", "rustc_version", - "syn 2.0.28", + "syn 2.0.32", "unicode-ident", ] @@ -4291,6 +4311,7 @@ dependencies = [ "histogram", "itertools", "pageserver", + "pageserver_api", "rand 0.8.5", "remote_storage", "reqwest", @@ -4579,7 +4600,7 @@ checksum = "aafe972d60b0b9bee71a91b92fee2d4fb3c9d7e8f6b179aa99f27203d99a4816" dependencies = [ "proc-macro2", "quote", - "syn 
2.0.28", + "syn 2.0.32", ] [[package]] @@ -4660,7 +4681,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] @@ -4927,9 +4948,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.28" +version = "2.0.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04361975b3f5e348b2189d8dc55bc942f278b2d482a6a0365de5bdd62d351567" +checksum = "239814284fd6f1a4ffe4ca893952cdd93c224b6a1571c9a9eadd670295c0c9e2" dependencies = [ "proc-macro2", "quote", @@ -5059,7 +5080,7 @@ checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] @@ -5177,7 +5198,7 @@ checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] @@ -5478,7 +5499,7 @@ checksum = "0f57e3ca2a01450b1a921183a9c9cbfda207fd822cef4ccb00a65402cbba7a74" dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] @@ -5764,6 +5785,7 @@ dependencies = [ "serde", "serde_assert", "serde_json", + "serde_path_to_error", "serde_with", "signal-hook", "strum", @@ -5922,7 +5944,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", "wasm-bindgen-shared", ] @@ -5956,7 +5978,7 @@ checksum = "e128beba882dd1eb6200e1dc92ae6c5dbaa4311aa7bb211ca035779e5efc39f8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -6293,7 +6315,7 @@ dependencies = [ "smallvec", "subtle", "syn 1.0.109", - "syn 2.0.28", + "syn 2.0.32", "time", "time-macros", "tokio", @@ -6355,22 +6377,22 @@ dependencies = [ [[package]] name = "zerocopy" -version = "0.7.3" +version = "0.7.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a7af71d8643341260a65f89fa60c0eeaa907f34544d8f6d9b0df72f069b5e74" +checksum = "1c4061bedbb353041c12f413700357bec76df2c7e2ca8e4df8bac24c6bf68e3d" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.7.3" +version = "0.7.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9731702e2f0617ad526794ae28fbc6f6ca8849b5ba729666c2a5bc4b6ddee2cd" +checksum = "b3c129550b3e6de3fd0ba67ba5c81818f9805e58b8d7fee80a3a59d2c9fc601a" dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 496a9d7839..b44544d626 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,6 +5,7 @@ members = [ "control_plane", "pageserver", "pageserver/ctl", + "pageserver/client", "proxy", "safekeeper", "storage_broker", @@ -182,6 +183,7 @@ compute_api = { version = "0.1", path = "./libs/compute_api/" } consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" } metrics = { version = "0.1", path = "./libs/metrics/" } pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" } +pageserver_client = { path = "./pageserver/client" } postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" } postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" } postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" } diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 7ccddc161e..898ad05add 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -6,9 +6,11 @@ license.workspace = true [dependencies] anyhow.workspace = true 
+async-trait.workspace = true camino.workspace = true clap.workspace = true comfy-table.workspace = true +futures.workspace = true git-version.workspace = true nix.workspace = true once_cell.workspace = true @@ -24,10 +26,11 @@ tar.workspace = true thiserror.workspace = true toml.workspace = true tokio.workspace = true +tokio-postgres.workspace = true +tokio-util.workspace = true url.workspace = true -# Note: Do not directly depend on pageserver or safekeeper; use pageserver_api or safekeeper_api -# instead, so that recompile times are better. pageserver_api.workspace = true +pageserver_client.workspace = true postgres_backend.workspace = true safekeeper_api.workspace = true postgres_connection.workspace = true diff --git a/control_plane/src/attachment_service.rs b/control_plane/src/attachment_service.rs index 822ac7d8a6..731c05809e 100644 --- a/control_plane/src/attachment_service.rs +++ b/control_plane/src/attachment_service.rs @@ -9,7 +9,7 @@ pub struct AttachmentService { env: LocalEnv, listen: String, path: PathBuf, - client: reqwest::blocking::Client, + client: reqwest::Client, } const COMMAND: &str = "attachment_service"; @@ -53,7 +53,7 @@ impl AttachmentService { env: env.clone(), path, listen, - client: reqwest::blocking::ClientBuilder::new() + client: reqwest::ClientBuilder::new() .build() .expect("Failed to construct http client"), } @@ -64,7 +64,7 @@ impl AttachmentService { .expect("non-Unicode path") } - pub fn start(&self) -> anyhow::Result { + pub async fn start(&self) -> anyhow::Result { let path_str = self.path.to_string_lossy(); background_process::start_process( @@ -73,10 +73,11 @@ impl AttachmentService { &self.env.attachment_service_bin(), ["-l", &self.listen, "-p", &path_str], [], - background_process::InitialPidFile::Create(&self.pid_file()), + background_process::InitialPidFile::Create(self.pid_file()), // TODO: a real status check - || Ok(true), + || async move { anyhow::Ok(true) }, ) + .await } pub fn stop(&self, immediate: bool) -> anyhow::Result<()> { @@ -84,7 +85,7 @@ impl AttachmentService { } /// Call into the attach_hook API, for use before handing out attachments to pageservers - pub fn attach_hook( + pub async fn attach_hook( &self, tenant_id: TenantId, pageserver_id: NodeId, @@ -104,16 +105,16 @@ impl AttachmentService { node_id: Some(pageserver_id), }; - let response = self.client.post(url).json(&request).send()?; + let response = self.client.post(url).json(&request).send().await?; if response.status() != StatusCode::OK { return Err(anyhow!("Unexpected status {}", response.status())); } - let response = response.json::()?; + let response = response.json::().await?; Ok(response.gen) } - pub fn inspect(&self, tenant_id: TenantId) -> anyhow::Result> { + pub async fn inspect(&self, tenant_id: TenantId) -> anyhow::Result> { use hyper::StatusCode; let url = self @@ -126,12 +127,12 @@ impl AttachmentService { let request = InspectRequest { tenant_id }; - let response = self.client.post(url).json(&request).send()?; + let response = self.client.post(url).json(&request).send().await?; if response.status() != StatusCode::OK { return Err(anyhow!("Unexpected status {}", response.status())); } - let response = response.json::()?; + let response = response.json::().await?; Ok(response.attachment) } } diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs index 26fc08fc8f..20fa3af9b8 100644 --- a/control_plane/src/background_process.rs +++ b/control_plane/src/background_process.rs @@ -44,15 +44,15 @@ const NOTICE_AFTER_RETRIES: 
u64 = 50; /// Argument to `start_process`, to indicate whether it should create pidfile or if the process creates /// it itself. -pub enum InitialPidFile<'t> { +pub enum InitialPidFile { /// Create a pidfile, to allow future CLI invocations to manipulate the process. - Create(&'t Utf8Path), + Create(Utf8PathBuf), /// The process will create the pidfile itself, need to wait for that event. - Expect(&'t Utf8Path), + Expect(Utf8PathBuf), } /// Start a background child process using the parameters given. -pub fn start_process( +pub async fn start_process( process_name: &str, datadir: &Path, command: &Path, @@ -62,7 +62,8 @@ pub fn start_process( process_status_check: F, ) -> anyhow::Result where - F: Fn() -> anyhow::Result, + F: Fn() -> Fut, + Fut: std::future::Future>, AI: IntoIterator, A: AsRef, // Not generic AsRef, otherwise empty `envs` prevents type inference @@ -89,7 +90,7 @@ where let filled_cmd = fill_remote_storage_secrets_vars(fill_rust_env_vars(background_command)); filled_cmd.envs(envs); - let pid_file_to_check = match initial_pid_file { + let pid_file_to_check = match &initial_pid_file { InitialPidFile::Create(path) => { pre_exec_create_pidfile(filled_cmd, path); path @@ -107,7 +108,7 @@ where ); for retries in 0..RETRIES { - match process_started(pid, Some(pid_file_to_check), &process_status_check) { + match process_started(pid, pid_file_to_check, &process_status_check).await { Ok(true) => { println!("\n{process_name} started, pid: {pid}"); return Ok(spawned_process); @@ -316,22 +317,20 @@ where cmd } -fn process_started( +async fn process_started( pid: Pid, - pid_file_to_check: Option<&Utf8Path>, + pid_file_to_check: &Utf8Path, status_check: &F, ) -> anyhow::Result where - F: Fn() -> anyhow::Result, + F: Fn() -> Fut, + Fut: std::future::Future>, { - match status_check() { - Ok(true) => match pid_file_to_check { - Some(pid_file_path) => match pid_file::read(pid_file_path)? { - PidFileRead::NotExist => Ok(false), - PidFileRead::LockedByOtherProcess(pid_in_file) => Ok(pid_in_file == pid), - PidFileRead::NotHeldByAnyProcess(_) => Ok(false), - }, - None => Ok(true), + match status_check().await { + Ok(true) => match pid_file::read(pid_file_to_check)? 
{ + PidFileRead::NotExist => Ok(false), + PidFileRead::LockedByOtherProcess(pid_in_file) => Ok(pid_in_file == pid), + PidFileRead::NotHeldByAnyProcess(_) => Ok(false), }, Ok(false) => Ok(false), Err(e) => anyhow::bail!("process failed to start: {e}"), diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 6f0b929ac6..03e69010f7 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -120,15 +120,20 @@ fn main() -> Result<()> { let mut env = LocalEnv::load_config().context("Error loading config")?; let original_env = env.clone(); + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap(); + let subcommand_result = match sub_name { - "tenant" => handle_tenant(sub_args, &mut env), - "timeline" => handle_timeline(sub_args, &mut env), - "start" => handle_start_all(sub_args, &env), + "tenant" => rt.block_on(handle_tenant(sub_args, &mut env)), + "timeline" => rt.block_on(handle_timeline(sub_args, &mut env)), + "start" => rt.block_on(handle_start_all(sub_args, &env)), "stop" => handle_stop_all(sub_args, &env), - "pageserver" => handle_pageserver(sub_args, &env), - "attachment_service" => handle_attachment_service(sub_args, &env), - "safekeeper" => handle_safekeeper(sub_args, &env), - "endpoint" => handle_endpoint(sub_args, &env), + "pageserver" => rt.block_on(handle_pageserver(sub_args, &env)), + "attachment_service" => rt.block_on(handle_attachment_service(sub_args, &env)), + "safekeeper" => rt.block_on(handle_safekeeper(sub_args, &env)), + "endpoint" => rt.block_on(handle_endpoint(sub_args, &env)), "mappings" => handle_mappings(sub_args, &mut env), "pg" => bail!("'pg' subcommand has been renamed to 'endpoint'"), _ => bail!("unexpected subcommand {sub_name}"), @@ -269,12 +274,13 @@ fn print_timeline( /// Returns a map of timeline IDs to timeline_id@lsn strings. /// Connects to the pageserver to query this information. -fn get_timeline_infos( +async fn get_timeline_infos( env: &local_env::LocalEnv, tenant_id: &TenantId, ) -> Result> { Ok(get_default_pageserver(env) - .timeline_list(tenant_id)? + .timeline_list(tenant_id) + .await? .into_iter() .map(|timeline_info| (timeline_info.timeline_id, timeline_info)) .collect()) @@ -373,11 +379,14 @@ fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> { .collect() } -fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> anyhow::Result<()> { +async fn handle_tenant( + tenant_match: &ArgMatches, + env: &mut local_env::LocalEnv, +) -> anyhow::Result<()> { let pageserver = get_default_pageserver(env); match tenant_match.subcommand() { Some(("list", _)) => { - for t in pageserver.tenant_list()? { + for t in pageserver.tenant_list().await? { println!("{} {:?}", t.id, t.state); } } @@ -394,12 +403,16 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an // We must register the tenant with the attachment service, so // that when the pageserver restarts, it will be re-attached. let attachment_service = AttachmentService::from_env(env); - attachment_service.attach_hook(tenant_id, pageserver.conf.id)? + attachment_service + .attach_hook(tenant_id, pageserver.conf.id) + .await? 
} else { None }; - pageserver.tenant_create(tenant_id, generation, tenant_conf)?; + pageserver + .tenant_create(tenant_id, generation, tenant_conf) + .await?; println!("tenant {tenant_id} successfully created on the pageserver"); // Create an initial timeline for the new tenant @@ -409,14 +422,16 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an .copied() .context("Failed to parse postgres version from the argument string")?; - let timeline_info = pageserver.timeline_create( - tenant_id, - new_timeline_id, - None, - None, - Some(pg_version), - None, - )?; + let timeline_info = pageserver + .timeline_create( + tenant_id, + new_timeline_id, + None, + None, + Some(pg_version), + None, + ) + .await?; let new_timeline_id = timeline_info.timeline_id; let last_record_lsn = timeline_info.last_record_lsn; @@ -450,6 +465,7 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an pageserver .tenant_config(tenant_id, tenant_conf) + .await .with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?; println!("tenant {tenant_id} successfully configured on the pageserver"); } @@ -458,7 +474,7 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an let new_pageserver = get_pageserver(env, matches)?; let new_pageserver_id = new_pageserver.conf.id; - migrate_tenant(env, tenant_id, new_pageserver)?; + migrate_tenant(env, tenant_id, new_pageserver).await?; println!("tenant {tenant_id} migrated to {}", new_pageserver_id); } @@ -468,13 +484,13 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an Ok(()) } -fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> { +async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> { let pageserver = get_default_pageserver(env); match timeline_match.subcommand() { Some(("list", list_match)) => { let tenant_id = get_tenant_id(list_match, env)?; - let timelines = pageserver.timeline_list(&tenant_id)?; + let timelines = pageserver.timeline_list(&tenant_id).await?; print_timelines_tree(timelines, env.timeline_name_mappings())?; } Some(("create", create_match)) => { @@ -490,14 +506,16 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - let new_timeline_id_opt = parse_timeline_id(create_match)?; - let timeline_info = pageserver.timeline_create( - tenant_id, - new_timeline_id_opt, - None, - None, - Some(pg_version), - None, - )?; + let timeline_info = pageserver + .timeline_create( + tenant_id, + new_timeline_id_opt, + None, + None, + Some(pg_version), + None, + ) + .await?; let new_timeline_id = timeline_info.timeline_id; let last_record_lsn = timeline_info.last_record_lsn; @@ -542,7 +560,9 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - let mut cplane = ComputeControlPlane::load(env.clone())?; println!("Importing timeline into pageserver ..."); - pageserver.timeline_import(tenant_id, timeline_id, base, pg_wal, pg_version)?; + pageserver + .timeline_import(tenant_id, timeline_id, base, pg_wal, pg_version) + .await?; env.register_branch_mapping(name.to_string(), tenant_id, timeline_id)?; println!("Creating endpoint for imported timeline ..."); @@ -578,14 +598,16 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - .map(|lsn_str| Lsn::from_str(lsn_str)) .transpose() .context("Failed to parse ancestor start Lsn from the request")?; - let timeline_info = 
pageserver.timeline_create( - tenant_id, - None, - start_lsn, - Some(ancestor_timeline_id), - None, - None, - )?; + let timeline_info = pageserver + .timeline_create( + tenant_id, + None, + start_lsn, + Some(ancestor_timeline_id), + None, + None, + ) + .await?; let new_timeline_id = timeline_info.timeline_id; let last_record_lsn = timeline_info.last_record_lsn; @@ -604,7 +626,7 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - Ok(()) } -fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { +async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { let (sub_name, sub_args) = match ep_match.subcommand() { Some(ep_subcommand_data) => ep_subcommand_data, None => bail!("no endpoint subcommand provided"), @@ -614,10 +636,12 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( match sub_name { "list" => { let tenant_id = get_tenant_id(sub_args, env)?; - let timeline_infos = get_timeline_infos(env, &tenant_id).unwrap_or_else(|e| { - eprintln!("Failed to load timeline info: {}", e); - HashMap::new() - }); + let timeline_infos = get_timeline_infos(env, &tenant_id) + .await + .unwrap_or_else(|e| { + eprintln!("Failed to load timeline info: {}", e); + HashMap::new() + }); let timeline_name_mappings = env.timeline_name_mappings(); @@ -791,7 +815,9 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( }; println!("Starting existing endpoint {endpoint_id}..."); - endpoint.start(&auth_token, safekeepers, remote_ext_config)?; + endpoint + .start(&auth_token, safekeepers, remote_ext_config) + .await?; } "reconfigure" => { let endpoint_id = sub_args @@ -809,7 +835,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( } else { None }; - endpoint.reconfigure(pageserver_id)?; + endpoint.reconfigure(pageserver_id).await?; } "stop" => { let endpoint_id = sub_args @@ -875,11 +901,12 @@ fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result Result<()> { +async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { match sub_match.subcommand() { Some(("start", subcommand_args)) => { if let Err(e) = get_pageserver(env, subcommand_args)? 
.start(&pageserver_config_overrides(subcommand_args)) + .await { eprintln!("pageserver start failed: {e}"); exit(1); @@ -906,7 +933,10 @@ fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul exit(1); } - if let Err(e) = pageserver.start(&pageserver_config_overrides(subcommand_args)) { + if let Err(e) = pageserver + .start(&pageserver_config_overrides(subcommand_args)) + .await + { eprintln!("pageserver start failed: {e}"); exit(1); } @@ -920,14 +950,17 @@ fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul exit(1); } - if let Err(e) = pageserver.start(&pageserver_config_overrides(subcommand_args)) { + if let Err(e) = pageserver + .start(&pageserver_config_overrides(subcommand_args)) + .await + { eprintln!("pageserver start failed: {e}"); exit(1); } } Some(("status", subcommand_args)) => { - match get_pageserver(env, subcommand_args)?.check_status() { + match get_pageserver(env, subcommand_args)?.check_status().await { Ok(_) => println!("Page server is up and running"), Err(err) => { eprintln!("Page server is not available: {}", err); @@ -942,11 +975,14 @@ fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul Ok(()) } -fn handle_attachment_service(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { +async fn handle_attachment_service( + sub_match: &ArgMatches, + env: &local_env::LocalEnv, +) -> Result<()> { let svc = AttachmentService::from_env(env); match sub_match.subcommand() { Some(("start", _start_match)) => { - if let Err(e) = svc.start() { + if let Err(e) = svc.start().await { eprintln!("start failed: {e}"); exit(1); } @@ -987,7 +1023,7 @@ fn safekeeper_extra_opts(init_match: &ArgMatches) -> Vec { .collect() } -fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { +async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { let (sub_name, sub_args) = match sub_match.subcommand() { Some(safekeeper_command_data) => safekeeper_command_data, None => bail!("no safekeeper subcommand provided"), @@ -1005,7 +1041,7 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul "start" => { let extra_opts = safekeeper_extra_opts(sub_args); - if let Err(e) = safekeeper.start(extra_opts) { + if let Err(e) = safekeeper.start(extra_opts).await { eprintln!("safekeeper start failed: {}", e); exit(1); } @@ -1031,7 +1067,7 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul } let extra_opts = safekeeper_extra_opts(sub_args); - if let Err(e) = safekeeper.start(extra_opts) { + if let Err(e) = safekeeper.start(extra_opts).await { eprintln!("safekeeper start failed: {}", e); exit(1); } @@ -1044,15 +1080,15 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul Ok(()) } -fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<()> { +async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<()> { // Endpoints are not started automatically - broker::start_broker_process(env)?; + broker::start_broker_process(env).await?; // Only start the attachment service if the pageserver is configured to need it if env.control_plane_api.is_some() { let attachment_service = AttachmentService::from_env(env); - if let Err(e) = attachment_service.start() { + if let Err(e) = attachment_service.start().await { eprintln!("attachment_service start failed: {:#}", e); try_stop_all(env, true); exit(1); @@ -1061,7 
+1097,10 @@ fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow for ps_conf in &env.pageservers { let pageserver = PageServerNode::from_env(env, ps_conf); - if let Err(e) = pageserver.start(&pageserver_config_overrides(sub_match)) { + if let Err(e) = pageserver + .start(&pageserver_config_overrides(sub_match)) + .await + { eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e); try_stop_all(env, true); exit(1); @@ -1070,7 +1109,7 @@ fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow for node in env.safekeepers.iter() { let safekeeper = SafekeeperNode::from_env(env, node); - if let Err(e) = safekeeper.start(vec![]) { + if let Err(e) = safekeeper.start(vec![]).await { eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e); try_stop_all(env, false); exit(1); diff --git a/control_plane/src/broker.rs b/control_plane/src/broker.rs index 6be865cc2e..f40705863b 100644 --- a/control_plane/src/broker.rs +++ b/control_plane/src/broker.rs @@ -11,7 +11,7 @@ use camino::Utf8PathBuf; use crate::{background_process, local_env}; -pub fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> { +pub async fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> { let broker = &env.broker; let listen_addr = &broker.listen_addr; @@ -19,15 +19,15 @@ pub fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> { let args = [format!("--listen-addr={listen_addr}")]; - let client = reqwest::blocking::Client::new(); + let client = reqwest::Client::new(); background_process::start_process( "storage_broker", &env.base_data_dir, &env.storage_broker_bin(), args, [], - background_process::InitialPidFile::Create(&storage_broker_pid_file_path(env)), - || { + background_process::InitialPidFile::Create(storage_broker_pid_file_path(env)), + || async { let url = broker.client_url(); let status_url = url.join("status").with_context(|| { format!("Failed to append /status path to broker endpoint {url}") @@ -36,12 +36,13 @@ pub fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> { .get(status_url) .build() .with_context(|| format!("Failed to construct request to broker endpoint {url}"))?; - match client.execute(request) { + match client.execute(request).await { Ok(resp) => Ok(resp.status().is_success()), Err(_) => Ok(false), } }, ) + .await .context("Failed to spawn storage_broker subprocess")?; Ok(()) } diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index a566f03db9..071f22dc2b 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -464,7 +464,7 @@ impl Endpoint { } } - pub fn start( + pub async fn start( &self, auth_token: &Option, safekeepers: Vec, @@ -587,7 +587,7 @@ impl Endpoint { const MAX_ATTEMPTS: u32 = 10 * 30; // Wait up to 30 s loop { attempt += 1; - match self.get_status() { + match self.get_status().await { Ok(state) => { match state.status { ComputeStatus::Init => { @@ -629,8 +629,8 @@ impl Endpoint { } // Call the /status HTTP API - pub fn get_status(&self) -> Result { - let client = reqwest::blocking::Client::new(); + pub async fn get_status(&self) -> Result { + let client = reqwest::Client::new(); let response = client .request( @@ -641,16 +641,17 @@ impl Endpoint { self.http_address.port() ), ) - .send()?; + .send() + .await?; // Interpret the response let status = response.status(); if !(status.is_client_error() || status.is_server_error()) { - Ok(response.json()?) + Ok(response.json().await?) 
} else { // reqwest does not export its error construction utility functions, so let's craft the message ourselves let url = response.url().to_owned(); - let msg = match response.text() { + let msg = match response.text().await { Ok(err_body) => format!("Error: {}", err_body), Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url), }; @@ -658,7 +659,7 @@ impl Endpoint { } } - pub fn reconfigure(&self, pageserver_id: Option) -> Result<()> { + pub async fn reconfigure(&self, pageserver_id: Option) -> Result<()> { let mut spec: ComputeSpec = { let spec_path = self.endpoint_path().join("spec.json"); let file = std::fs::File::open(spec_path)?; @@ -687,7 +688,7 @@ impl Endpoint { spec.pageserver_connstring = Some(format!("postgresql://no_user@{host}:{port}")); } - let client = reqwest::blocking::Client::new(); + let client = reqwest::Client::new(); let response = client .post(format!( "http://{}:{}/configure", @@ -698,14 +699,15 @@ impl Endpoint { "{{\"spec\":{}}}", serde_json::to_string_pretty(&spec)? )) - .send()?; + .send() + .await?; let status = response.status(); if !(status.is_client_error() || status.is_server_error()) { Ok(()) } else { let url = response.url().to_owned(); - let msg = match response.text() { + let msg = match response.text().await { Ok(err_body) => format!("Error: {}", err_body), Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url), }; diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 96a41874fd..7d490016bf 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -6,28 +6,24 @@ //! use std::borrow::Cow; use std::collections::HashMap; -use std::fs::File; -use std::io::{BufReader, Write}; + +use std::io; +use std::io::Write; use std::num::NonZeroU64; use std::path::PathBuf; use std::process::{Child, Command}; use std::time::Duration; -use std::{io, result}; use anyhow::{bail, Context}; use camino::Utf8PathBuf; -use pageserver_api::models::{ - self, LocationConfig, TenantInfo, TenantLocationConfigRequest, TimelineInfo, -}; +use futures::SinkExt; +use pageserver_api::models::{self, LocationConfig, TenantInfo, TimelineInfo}; use pageserver_api::shard::TenantShardId; +use pageserver_client::mgmt_api; use postgres_backend::AuthType; use postgres_connection::{parse_host_port, PgConnectionConfig}; -use reqwest::blocking::{Client, RequestBuilder, Response}; -use reqwest::{IntoUrl, Method}; -use thiserror::Error; use utils::auth::{Claims, Scope}; use utils::{ - http::error::HttpErrorBody, id::{TenantId, TimelineId}, lsn::Lsn, }; @@ -38,45 +34,6 @@ use crate::{background_process, local_env::LocalEnv}; /// Directory within .neon which will be used by default for LocalFs remote storage. 
pub const PAGESERVER_REMOTE_STORAGE_DIR: &str = "local_fs_remote_storage/pageserver"; -#[derive(Error, Debug)] -pub enum PageserverHttpError { - #[error("Reqwest error: {0}")] - Transport(#[from] reqwest::Error), - - #[error("Error: {0}")] - Response(String), -} - -impl From for PageserverHttpError { - fn from(e: anyhow::Error) -> Self { - Self::Response(e.to_string()) - } -} - -type Result = result::Result; - -pub trait ResponseErrorMessageExt: Sized { - fn error_from_body(self) -> Result; -} - -impl ResponseErrorMessageExt for Response { - fn error_from_body(self) -> Result { - let status = self.status(); - if !(status.is_client_error() || status.is_server_error()) { - return Ok(self); - } - - // reqwest does not export its error construction utility functions, so let's craft the message ourselves - let url = self.url().to_owned(); - Err(PageserverHttpError::Response( - match self.json::() { - Ok(err_body) => format!("Error: {}", err_body.msg), - Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url), - }, - )) - } -} - // // Control routines for pageserver. // @@ -87,8 +44,7 @@ pub struct PageServerNode { pub pg_connection_config: PgConnectionConfig, pub conf: PageServerConf, pub env: LocalEnv, - pub http_client: Client, - pub http_base_url: String, + pub http_client: mgmt_api::Client, } impl PageServerNode { @@ -100,8 +56,19 @@ impl PageServerNode { pg_connection_config: PgConnectionConfig::new_host_port(host, port), conf: conf.clone(), env: env.clone(), - http_client: Client::new(), - http_base_url: format!("http://{}/v1", conf.listen_http_addr), + http_client: mgmt_api::Client::new( + format!("http://{}", conf.listen_http_addr), + { + match conf.http_auth_type { + AuthType::Trust => None, + AuthType::NeonJWT => Some( + env.generate_auth_token(&Claims::new(None, Scope::PageServerApi)) + .unwrap(), + ), + } + } + .as_deref(), + ), } } @@ -182,8 +149,8 @@ impl PageServerNode { .expect("non-Unicode path") } - pub fn start(&self, config_overrides: &[&str]) -> anyhow::Result { - self.start_node(config_overrides, false) + pub async fn start(&self, config_overrides: &[&str]) -> anyhow::Result { + self.start_node(config_overrides, false).await } fn pageserver_init(&self, config_overrides: &[&str]) -> anyhow::Result<()> { @@ -224,7 +191,12 @@ impl PageServerNode { Ok(()) } - fn start_node(&self, config_overrides: &[&str], update_config: bool) -> anyhow::Result { + async fn start_node( + &self, + config_overrides: &[&str], + update_config: bool, + ) -> anyhow::Result { + // TODO: using a thread here because start_process() is not async but we need to call check_status() let datadir = self.repo_path(); print!( "Starting pageserver node {} at '{}' in {:?}", @@ -232,7 +204,7 @@ impl PageServerNode { self.pg_connection_config.raw_address(), datadir ); - io::stdout().flush()?; + io::stdout().flush().context("flush stdout")?; let datadir_path_str = datadir.to_str().with_context(|| { format!( @@ -244,20 +216,23 @@ impl PageServerNode { if update_config { args.push(Cow::Borrowed("--update-config")); } - background_process::start_process( "pageserver", &datadir, &self.env.pageserver_bin(), args.iter().map(Cow::as_ref), self.pageserver_env_variables()?, - background_process::InitialPidFile::Expect(&self.pid_file()), - || match self.check_status() { - Ok(()) => Ok(true), - Err(PageserverHttpError::Transport(_)) => Ok(false), - Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")), + background_process::InitialPidFile::Expect(self.pid_file()), + || async { + let st = 
self.check_status().await; + match st { + Ok(()) => Ok(true), + Err(mgmt_api::Error::ReceiveBody(_)) => Ok(false), + Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")), + } }, ) + .await } fn pageserver_basic_args<'a>( @@ -303,7 +278,12 @@ impl PageServerNode { background_process::stop_process(immediate, "pageserver", &self.pid_file()) } - pub fn page_server_psql_client(&self) -> anyhow::Result { + pub async fn page_server_psql_client( + &self, + ) -> anyhow::Result<( + tokio_postgres::Client, + tokio_postgres::Connection, + )> { let mut config = self.pg_connection_config.clone(); if self.conf.pg_auth_type == AuthType::NeonJWT { let token = self @@ -311,36 +291,18 @@ impl PageServerNode { .generate_auth_token(&Claims::new(None, Scope::PageServerApi))?; config = config.set_password(Some(token)); } - Ok(config.connect_no_tls()?) + Ok(config.connect_no_tls().await?) } - fn http_request(&self, method: Method, url: U) -> anyhow::Result { - let mut builder = self.http_client.request(method, url); - if self.conf.http_auth_type == AuthType::NeonJWT { - let token = self - .env - .generate_auth_token(&Claims::new(None, Scope::PageServerApi))?; - builder = builder.bearer_auth(token) - } - Ok(builder) + pub async fn check_status(&self) -> mgmt_api::Result<()> { + self.http_client.status().await } - pub fn check_status(&self) -> Result<()> { - self.http_request(Method::GET, format!("{}/status", self.http_base_url))? - .send()? - .error_from_body()?; - Ok(()) + pub async fn tenant_list(&self) -> mgmt_api::Result> { + self.http_client.list_tenants().await } - pub fn tenant_list(&self) -> Result> { - Ok(self - .http_request(Method::GET, format!("{}/tenant", self.http_base_url))? - .send()? - .error_from_body()? - .json()?) - } - - pub fn tenant_create( + pub async fn tenant_create( &self, new_tenant_id: TenantId, generation: Option, @@ -407,6 +369,7 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'gc_feedback' as bool")?, + heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()), }; let request = models::TenantCreateRequest { @@ -417,23 +380,10 @@ impl PageServerNode { if !settings.is_empty() { bail!("Unrecognized tenant settings: {settings:?}") } - self.http_request(Method::POST, format!("{}/tenant", self.http_base_url))? - .json(&request) - .send()? - .error_from_body()? - .json::>() - .with_context(|| { - format!("Failed to parse tenant creation response for tenant id: {new_tenant_id:?}") - })? - .context("No tenant id was found in the tenant creation response") - .and_then(|tenant_id_string| { - tenant_id_string.parse().with_context(|| { - format!("Failed to parse response string as tenant id: '{tenant_id_string}'") - }) - }) + Ok(self.http_client.tenant_create(&request).await?) } - pub fn tenant_config( + pub async fn tenant_config( &self, tenant_id: TenantId, mut settings: HashMap<&str, &str>, @@ -504,6 +454,7 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'gc_feedback' as bool")?, + heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()), } }; @@ -511,54 +462,30 @@ impl PageServerNode { bail!("Unrecognized tenant settings: {settings:?}") } - self.http_request(Method::PUT, format!("{}/tenant/config", self.http_base_url))? - .json(&models::TenantConfigRequest { tenant_id, config }) - .send()? 
- .error_from_body()?; + self.http_client + .tenant_config(&models::TenantConfigRequest { tenant_id, config }) + .await?; Ok(()) } - pub fn location_config( + pub async fn location_config( &self, tenant_id: TenantId, config: LocationConfig, flush_ms: Option, ) -> anyhow::Result<()> { - let req_body = TenantLocationConfigRequest { tenant_id, config }; - - let path = format!( - "{}/tenant/{}/location_config", - self.http_base_url, tenant_id - ); - let path = if let Some(flush_ms) = flush_ms { - format!("{}?flush_ms={}", path, flush_ms.as_millis()) - } else { - path - }; - - self.http_request(Method::PUT, path)? - .json(&req_body) - .send()? - .error_from_body()?; - - Ok(()) + Ok(self + .http_client + .location_config(tenant_id, config, flush_ms) + .await?) } - pub fn timeline_list(&self, tenant_id: &TenantId) -> anyhow::Result> { - let timeline_infos: Vec = self - .http_request( - Method::GET, - format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id), - )? - .send()? - .error_from_body()? - .json()?; - - Ok(timeline_infos) + pub async fn timeline_list(&self, tenant_id: &TenantId) -> anyhow::Result> { + Ok(self.http_client.list_timelines(*tenant_id).await?) } - pub fn timeline_create( + pub async fn timeline_create( &self, tenant_id: TenantId, new_timeline_id: Option, @@ -569,29 +496,14 @@ impl PageServerNode { ) -> anyhow::Result { // If timeline ID was not specified, generate one let new_timeline_id = new_timeline_id.unwrap_or(TimelineId::generate()); - - self.http_request( - Method::POST, - format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id), - )? - .json(&models::TimelineCreateRequest { + let req = models::TimelineCreateRequest { new_timeline_id, ancestor_start_lsn, ancestor_timeline_id, pg_version, existing_initdb_timeline_id, - }) - .send()? - .error_from_body()? - .json::>() - .with_context(|| { - format!("Failed to parse timeline creation response for tenant id: {tenant_id}") - })? - .with_context(|| { - format!( - "No timeline id was found in the timeline creation response for tenant {tenant_id}" - ) - }) + }; + Ok(self.http_client.timeline_create(tenant_id, &req).await?) } /// Import a basebackup prepared using either: @@ -603,7 +515,7 @@ impl PageServerNode { /// * `timeline_id` - id to assign to imported timeline /// * `base` - (start lsn of basebackup, path to `base.tar` file) /// * `pg_wal` - if there's any wal to import: (end lsn, path to `pg_wal.tar`) - pub fn timeline_import( + pub async fn timeline_import( &self, tenant_id: TenantId, timeline_id: TimelineId, @@ -611,36 +523,60 @@ impl PageServerNode { pg_wal: Option<(Lsn, PathBuf)>, pg_version: u32, ) -> anyhow::Result<()> { - let mut client = self.page_server_psql_client()?; + let (client, conn) = self.page_server_psql_client().await?; + // The connection object performs the actual communication with the database, + // so spawn it off to run on its own. 
+ tokio::spawn(async move { + if let Err(e) = conn.await { + eprintln!("connection error: {}", e); + } + }); + tokio::pin!(client); // Init base reader let (start_lsn, base_tarfile_path) = base; - let base_tarfile = File::open(base_tarfile_path)?; - let mut base_reader = BufReader::new(base_tarfile); + let base_tarfile = tokio::fs::File::open(base_tarfile_path).await?; + let base_tarfile = tokio_util::io::ReaderStream::new(base_tarfile); // Init wal reader if necessary let (end_lsn, wal_reader) = if let Some((end_lsn, wal_tarfile_path)) = pg_wal { - let wal_tarfile = File::open(wal_tarfile_path)?; - let wal_reader = BufReader::new(wal_tarfile); + let wal_tarfile = tokio::fs::File::open(wal_tarfile_path).await?; + let wal_reader = tokio_util::io::ReaderStream::new(wal_tarfile); (end_lsn, Some(wal_reader)) } else { (start_lsn, None) }; - // Import base - let import_cmd = format!( - "import basebackup {tenant_id} {timeline_id} {start_lsn} {end_lsn} {pg_version}" - ); - let mut writer = client.copy_in(&import_cmd)?; - io::copy(&mut base_reader, &mut writer)?; - writer.finish()?; + let copy_in = |reader, cmd| { + let client = &client; + async move { + let writer = client.copy_in(&cmd).await?; + let writer = std::pin::pin!(writer); + let mut writer = writer.sink_map_err(|e| { + std::io::Error::new(std::io::ErrorKind::Other, format!("{e}")) + }); + let mut reader = std::pin::pin!(reader); + writer.send_all(&mut reader).await?; + writer.into_inner().finish().await?; + anyhow::Ok(()) + } + }; + // Import base + copy_in( + base_tarfile, + format!( + "import basebackup {tenant_id} {timeline_id} {start_lsn} {end_lsn} {pg_version}" + ), + ) + .await?; // Import wal if necessary - if let Some(mut wal_reader) = wal_reader { - let import_cmd = format!("import wal {tenant_id} {timeline_id} {start_lsn} {end_lsn}"); - let mut writer = client.copy_in(&import_cmd)?; - io::copy(&mut wal_reader, &mut writer)?; - writer.finish()?; + if let Some(wal_reader) = wal_reader { + copy_in( + wal_reader, + format!("import wal {tenant_id} {timeline_id} {start_lsn} {end_lsn}"), + ) + .await?; } Ok(()) diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index a8baa0ac53..4026ef0eb9 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -13,7 +13,6 @@ use std::{io, result}; use anyhow::Context; use camino::Utf8PathBuf; use postgres_connection::PgConnectionConfig; -use reqwest::blocking::{Client, RequestBuilder, Response}; use reqwest::{IntoUrl, Method}; use thiserror::Error; use utils::{http::error::HttpErrorBody, id::NodeId}; @@ -34,12 +33,14 @@ pub enum SafekeeperHttpError { type Result = result::Result; +#[async_trait::async_trait] pub trait ResponseErrorMessageExt: Sized { - fn error_from_body(self) -> Result; + async fn error_from_body(self) -> Result; } -impl ResponseErrorMessageExt for Response { - fn error_from_body(self) -> Result { +#[async_trait::async_trait] +impl ResponseErrorMessageExt for reqwest::Response { + async fn error_from_body(self) -> Result { let status = self.status(); if !(status.is_client_error() || status.is_server_error()) { return Ok(self); @@ -48,7 +49,7 @@ impl ResponseErrorMessageExt for Response { // reqwest does not export its error construction utility functions, so let's craft the message ourselves let url = self.url().to_owned(); Err(SafekeeperHttpError::Response( - match self.json::() { + match self.json::().await { Ok(err_body) => format!("Error: {}", err_body.msg), Err(_) => format!("Http error ({}) at {}.", status.as_u16(), 
url), }, @@ -69,7 +70,7 @@ pub struct SafekeeperNode { pub pg_connection_config: PgConnectionConfig, pub env: LocalEnv, - pub http_client: Client, + pub http_client: reqwest::Client, pub http_base_url: String, } @@ -80,7 +81,7 @@ impl SafekeeperNode { conf: conf.clone(), pg_connection_config: Self::safekeeper_connection_config(conf.pg_port), env: env.clone(), - http_client: Client::new(), + http_client: reqwest::Client::new(), http_base_url: format!("http://127.0.0.1:{}/v1", conf.http_port), } } @@ -103,7 +104,7 @@ impl SafekeeperNode { .expect("non-Unicode path") } - pub fn start(&self, extra_opts: Vec) -> anyhow::Result { + pub async fn start(&self, extra_opts: Vec) -> anyhow::Result { print!( "Starting safekeeper at '{}' in '{}'", self.pg_connection_config.raw_address(), @@ -191,13 +192,16 @@ impl SafekeeperNode { &self.env.safekeeper_bin(), &args, [], - background_process::InitialPidFile::Expect(&self.pid_file()), - || match self.check_status() { - Ok(()) => Ok(true), - Err(SafekeeperHttpError::Transport(_)) => Ok(false), - Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")), + background_process::InitialPidFile::Expect(self.pid_file()), + || async { + match self.check_status().await { + Ok(()) => Ok(true), + Err(SafekeeperHttpError::Transport(_)) => Ok(false), + Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")), + } }, ) + .await } /// @@ -216,7 +220,7 @@ impl SafekeeperNode { ) } - fn http_request(&self, method: Method, url: U) -> RequestBuilder { + fn http_request(&self, method: Method, url: U) -> reqwest::RequestBuilder { // TODO: authentication //if self.env.auth_type == AuthType::NeonJWT { // builder = builder.bearer_auth(&self.env.safekeeper_auth_token) @@ -224,10 +228,12 @@ impl SafekeeperNode { self.http_client.request(method, url) } - pub fn check_status(&self) -> Result<()> { + pub async fn check_status(&self) -> Result<()> { self.http_request(Method::GET, format!("{}/{}", self.http_base_url, "status")) - .send()? - .error_from_body()?; + .send() + .await? + .error_from_body() + .await?; Ok(()) } } diff --git a/control_plane/src/tenant_migration.rs b/control_plane/src/tenant_migration.rs index fbb0358158..79df108896 100644 --- a/control_plane/src/tenant_migration.rs +++ b/control_plane/src/tenant_migration.rs @@ -19,11 +19,11 @@ use utils::{ }; /// Given an attached pageserver, retrieve the LSN for all timelines -fn get_lsns( +async fn get_lsns( tenant_id: TenantId, pageserver: &PageServerNode, ) -> anyhow::Result> { - let timelines = pageserver.timeline_list(&tenant_id)?; + let timelines = pageserver.timeline_list(&tenant_id).await?; Ok(timelines .into_iter() .map(|t| (t.timeline_id, t.last_record_lsn)) @@ -32,13 +32,13 @@ fn get_lsns( /// Wait for the timeline LSNs on `pageserver` to catch up with or overtake /// `baseline`. 
-fn await_lsn( +async fn await_lsn( tenant_id: TenantId, pageserver: &PageServerNode, baseline: HashMap, ) -> anyhow::Result<()> { loop { - let latest = match get_lsns(tenant_id, pageserver) { + let latest = match get_lsns(tenant_id, pageserver).await { Ok(l) => l, Err(e) => { println!( @@ -84,7 +84,7 @@ fn await_lsn( /// - Coordinate attach/secondary/detach on pageservers /// - call into attachment_service for generations /// - reconfigure compute endpoints to point to new attached pageserver -pub fn migrate_tenant( +pub async fn migrate_tenant( env: &LocalEnv, tenant_id: TenantId, dest_ps: PageServerNode, @@ -108,16 +108,18 @@ pub fn migrate_tenant( } } - let previous = attachment_service.inspect(tenant_id)?; + let previous = attachment_service.inspect(tenant_id).await?; let mut baseline_lsns = None; if let Some((generation, origin_ps_id)) = &previous { let origin_ps = PageServerNode::from_env(env, env.get_pageserver_conf(*origin_ps_id)?); if origin_ps_id == &dest_ps.conf.id { println!("🔁 Already attached to {origin_ps_id}, freshening..."); - let gen = attachment_service.attach_hook(tenant_id, dest_ps.conf.id)?; + let gen = attachment_service + .attach_hook(tenant_id, dest_ps.conf.id) + .await?; let dest_conf = build_location_config(LocationConfigMode::AttachedSingle, gen, None); - dest_ps.location_config(tenant_id, dest_conf, None)?; + dest_ps.location_config(tenant_id, dest_conf, None).await?; println!("✅ Migration complete"); return Ok(()); } @@ -126,20 +128,24 @@ pub fn migrate_tenant( let stale_conf = build_location_config(LocationConfigMode::AttachedStale, Some(*generation), None); - origin_ps.location_config(tenant_id, stale_conf, Some(Duration::from_secs(10)))?; + origin_ps + .location_config(tenant_id, stale_conf, Some(Duration::from_secs(10))) + .await?; - baseline_lsns = Some(get_lsns(tenant_id, &origin_ps)?); + baseline_lsns = Some(get_lsns(tenant_id, &origin_ps).await?); } - let gen = attachment_service.attach_hook(tenant_id, dest_ps.conf.id)?; + let gen = attachment_service + .attach_hook(tenant_id, dest_ps.conf.id) + .await?; let dest_conf = build_location_config(LocationConfigMode::AttachedMulti, gen, None); println!("🔁 Attaching to pageserver {}", dest_ps.conf.id); - dest_ps.location_config(tenant_id, dest_conf, None)?; + dest_ps.location_config(tenant_id, dest_conf, None).await?; if let Some(baseline) = baseline_lsns { println!("🕑 Waiting for LSN to catch up..."); - await_lsn(tenant_id, &dest_ps, baseline)?; + await_lsn(tenant_id, &dest_ps, baseline).await?; } let cplane = ComputeControlPlane::load(env.clone())?; @@ -149,7 +155,7 @@ pub fn migrate_tenant( "🔁 Reconfiguring endpoint {} to use pageserver {}", endpoint_name, dest_ps.conf.id ); - endpoint.reconfigure(Some(dest_ps.conf.id))?; + endpoint.reconfigure(Some(dest_ps.conf.id)).await?; } } @@ -159,7 +165,7 @@ pub fn migrate_tenant( } let other_ps = PageServerNode::from_env(env, other_ps_conf); - let other_ps_tenants = other_ps.tenant_list()?; + let other_ps_tenants = other_ps.tenant_list().await?; // Check if this tenant is attached let found = other_ps_tenants @@ -181,7 +187,9 @@ pub fn migrate_tenant( "💤 Switching to secondary mode on pageserver {}", other_ps.conf.id ); - other_ps.location_config(tenant_id, secondary_conf, None)?; + other_ps + .location_config(tenant_id, secondary_conf, None) + .await?; } println!( @@ -189,7 +197,7 @@ pub fn migrate_tenant( dest_ps.conf.id ); let dest_conf = build_location_config(LocationConfigMode::AttachedSingle, gen, None); - dest_ps.location_config(tenant_id, dest_conf, 
None)?; + dest_ps.location_config(tenant_id, dest_conf, None).await?; println!("✅ Migration complete"); diff --git a/libs/metrics/src/lib.rs index ed375a152f..d09ba11344 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -3,8 +3,11 @@ //! Otherwise, we might not see all metrics registered via //! a default registry. #![deny(clippy::undocumented_unsafe_blocks)] + use once_cell::sync::Lazy; -use prometheus::core::{AtomicU64, Collector, GenericGauge, GenericGaugeVec}; +use prometheus::core::{ + Atomic, AtomicU64, Collector, GenericCounter, GenericCounterVec, GenericGauge, GenericGaugeVec, +}; pub use prometheus::opts; pub use prometheus::register; pub use prometheus::Error; @@ -132,3 +135,137 @@ fn get_rusage_stats() -> libc::rusage { rusage.assume_init() } }
+ +/// Create an [`IntCounterPairVec`] and registers to default registry. +#[macro_export(local_inner_macros)] +macro_rules! register_int_counter_pair_vec { + ($NAME1:expr, $HELP1:expr, $NAME2:expr, $HELP2:expr, $LABELS_NAMES:expr $(,)?) => {{ + match ( + $crate::register_int_counter_vec!($NAME1, $HELP1, $LABELS_NAMES), + $crate::register_int_counter_vec!($NAME2, $HELP2, $LABELS_NAMES), + ) { + (Ok(inc), Ok(dec)) => Ok($crate::IntCounterPairVec::new(inc, dec)), + (Err(e), _) | (_, Err(e)) => Err(e), + } + }}; +} +/// Create an [`IntCounterPair`] and registers to default registry. +#[macro_export(local_inner_macros)] +macro_rules! register_int_counter_pair { + ($NAME1:expr, $HELP1:expr, $NAME2:expr, $HELP2:expr $(,)?) => {{ + match ( + $crate::register_int_counter!($NAME1, $HELP1), + $crate::register_int_counter!($NAME2, $HELP2), + ) { + (Ok(inc), Ok(dec)) => Ok($crate::IntCounterPair::new(inc, dec)), + (Err(e), _) | (_, Err(e)) => Err(e), + } + }}; +}
+ +/// A Pair of [`GenericCounterVec`]s. Like an [`GenericGaugeVec`] but will always observe changes +pub struct GenericCounterPairVec<P: Atomic> { + inc: GenericCounterVec<P>, + dec: GenericCounterVec<P>, +} + +/// A Pair of [`GenericCounter`]s. Like an [`GenericGauge`] but will always observe changes +pub struct GenericCounterPair<P: Atomic> { + inc: GenericCounter<P>, + dec: GenericCounter<P>, +}
+ +impl<P: Atomic> GenericCounterPairVec<P> { + pub fn new(inc: GenericCounterVec<P>, dec: GenericCounterVec<P>) -> Self { + Self { inc, dec } + } + + /// `get_metric_with_label_values` returns the [`GenericCounterPair<P>`] for the given slice + /// of label values (same order as the VariableLabels in Desc). If that combination of + /// label values is accessed for the first time, a new [`GenericCounterPair<P>`] is created. + /// + /// An error is returned if the number of label values is not the same as the + /// number of VariableLabels in Desc. + pub fn get_metric_with_label_values(&self, vals: &[&str]) -> Result<GenericCounterPair<P>> { + Ok(GenericCounterPair { + inc: self.inc.get_metric_with_label_values(vals)?, + dec: self.dec.get_metric_with_label_values(vals)?, + }) + } + + /// `with_label_values` works as `get_metric_with_label_values`, but panics if an error + /// occurs. + pub fn with_label_values(&self, vals: &[&str]) -> GenericCounterPair<P> { + self.get_metric_with_label_values(vals).unwrap() + } +}
+ +impl<P: Atomic> GenericCounterPair<P> { + pub fn new(inc: GenericCounter<P>, dec: GenericCounter<P>) -> Self { + Self { inc, dec } + } + + /// Increment the gauge by 1, returning a guard that decrements by 1 on drop. + pub fn guard(&self) -> GenericCounterPairGuard<P> { + self.inc.inc(); + GenericCounterPairGuard(self.dec.clone()) + } + + /// Increment the gauge by n, returning a guard that decrements by n on drop. + pub fn guard_by(&self, n: P::T) -> GenericCounterPairGuardBy<P> { + self.inc.inc_by(n); + GenericCounterPairGuardBy(self.dec.clone(), n) + } + + /// Increase the gauge by 1. + #[inline] + pub fn inc(&self) { + self.inc.inc(); + } + + /// Decrease the gauge by 1. + #[inline] + pub fn dec(&self) { + self.dec.inc(); + } + + /// Add the given value to the gauge. (The value can be + /// negative, resulting in a decrement of the gauge.) + #[inline] + pub fn inc_by(&self, v: P::T) { + self.inc.inc_by(v); + } + + /// Subtract the given value from the gauge. (The value can be + /// negative, resulting in an increment of the gauge.) + #[inline] + pub fn dec_by(&self, v: P::T) { + self.dec.inc_by(v); + } +}
+ +/// Guard returned by [`GenericCounterPair::guard`] +pub struct GenericCounterPairGuard<P: Atomic>(GenericCounter<P>); + +impl<P: Atomic> Drop for GenericCounterPairGuard<P> { + fn drop(&mut self) { + self.0.inc(); + } +} +/// Guard returned by [`GenericCounterPair::guard_by`] +pub struct GenericCounterPairGuardBy<P: Atomic>(GenericCounter<P>, P::T); + +impl<P: Atomic> Drop for GenericCounterPairGuardBy<P> { + fn drop(&mut self) { + self.0.inc_by(self.1); + } +} + +/// A Pair of [`IntCounterVec`]s. Like an [`IntGaugeVec`] but will always observe changes +pub type IntCounterPairVec = GenericCounterPairVec<AtomicU64>; + +/// A Pair of [`IntCounter`]s. Like an [`IntGauge`] but will always observe changes +pub type IntCounterPair = GenericCounterPair<AtomicU64>; + +/// A guard for [`IntCounterPair`] that will decrement the gauge on drop +pub type IntCounterPairGuard = GenericCounterPairGuard<AtomicU64>;
diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml index 4d08d78e87..4146597d8d 100644 --- a/libs/pageserver_api/Cargo.toml +++ b/libs/pageserver_api/Cargo.toml @@ -24,3 +24,4 @@ workspace_hack.workspace = true [dev-dependencies] bincode.workspace = true +rand.workspace = true
diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 16715bc667..d680a5600e 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -144,3 +144,37 @@ impl Key { pub fn is_rel_block_key(key: &Key) -> bool { key.field1 == 0x00 && key.field4 != 0 } + +impl std::str::FromStr for Key { + type Err = anyhow::Error; + + fn from_str(s: &str) -> std::result::Result<Self, Self::Err> { + Self::from_hex(s) + } +} + +#[cfg(test)] +mod tests { + use std::str::FromStr; + + use crate::key::Key; + + use rand::Rng; + use rand::SeedableRng; + + #[test] + fn display_fromstr_bijection() { + let mut rng = rand::rngs::StdRng::seed_from_u64(42); + + let key = Key { + field1: rng.gen(), + field2: rng.gen(), + field3: rng.gen(), + field4: rng.gen(), + field5: rng.gen(), + field6: rng.gen(), + }; + + assert_eq!(key, Key::from_str(&format!("{key}")).unwrap()); + } +}
diff --git a/pageserver/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs similarity index 93% rename from pageserver/src/keyspace.rs rename to libs/pageserver_api/src/keyspace.rs index 20e6df9c7b..16651c322e 100644 --- a/pageserver/src/keyspace.rs +++ b/libs/pageserver_api/src/keyspace.rs @@ -1,11 +1,12 @@ -use crate::repository::{key_range_size, singleton_range, Key}; use postgres_ffi::BLCKSZ; use std::ops::Range; +use crate::key::Key; + /// /// Represents a set of Keys, in a compact form. /// -#[derive(Clone, Debug, Default)] +#[derive(Clone, Debug, Default, PartialEq, Eq)] pub struct KeySpace { /// Contiguous ranges of keys that belong to the key space. In key order, /// and with no overlap.
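(For orientation, a minimal usage sketch of the counter-pair metrics introduced in libs/metrics/src/lib.rs above. The metric names, the "endpoint" label, and handle_request are illustrative assumptions, not part of this diff; it also assumes the metrics crate re-exports register_int_counter_vec!, which the macro expansion relies on.)

use metrics::{register_int_counter_pair_vec, IntCounterPairVec};
use once_cell::sync::Lazy;

// Two monotonic counters ("started"/"finished") stand in for a gauge, so a
// scrape that lands between inc() and dec() never loses an observation.
static DEMO_REQUESTS: Lazy<IntCounterPairVec> = Lazy::new(|| {
    register_int_counter_pair_vec!(
        "demo_requests_started_total",
        "Requests started",
        "demo_requests_finished_total",
        "Requests finished",
        &["endpoint"]
    )
    .expect("failed to register counter pair")
});

fn handle_request() {
    // Increments the "started" counter now; the guard increments "finished" on drop.
    let _guard = DEMO_REQUESTS.with_label_values(&["/v1/status"]).guard();
    // ... request handling ...
}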
@@ -186,6 +187,33 @@ impl KeySpaceRandomAccum { } } +pub fn key_range_size(key_range: &Range) -> u32 { + let start = key_range.start; + let end = key_range.end; + + if end.field1 != start.field1 + || end.field2 != start.field2 + || end.field3 != start.field3 + || end.field4 != start.field4 + { + return u32::MAX; + } + + let start = (start.field5 as u64) << 32 | start.field6 as u64; + let end = (end.field5 as u64) << 32 | end.field6 as u64; + + let diff = end - start; + if diff > u32::MAX as u64 { + u32::MAX + } else { + diff as u32 + } +} + +pub fn singleton_range(key: Key) -> Range { + key..key.next() +} + #[cfg(test)] mod tests { use super::*; diff --git a/libs/pageserver_api/src/lib.rs b/libs/pageserver_api/src/lib.rs index 511c5ed208..b236b93428 100644 --- a/libs/pageserver_api/src/lib.rs +++ b/libs/pageserver_api/src/lib.rs @@ -5,6 +5,7 @@ use const_format::formatcp; /// Public API types pub mod control_api; pub mod key; +pub mod keyspace; pub mod models; pub mod reltag; pub mod shard; diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index a3029e67a5..a78ba8ad94 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -1,3 +1,5 @@ +pub mod partitioning; + use std::{ collections::HashMap, num::{NonZeroU64, NonZeroUsize}, @@ -237,6 +239,7 @@ pub struct TenantConfig { pub min_resident_size_override: Option, pub evictions_low_residence_duration_metric_threshold: Option, pub gc_feedback: Option, + pub heatmap_period: Option, } /// A flattened analog of a `pagesever::tenant::LocationMode`, which diff --git a/libs/pageserver_api/src/models/partitioning.rs b/libs/pageserver_api/src/models/partitioning.rs new file mode 100644 index 0000000000..0d287f7be0 --- /dev/null +++ b/libs/pageserver_api/src/models/partitioning.rs @@ -0,0 +1,151 @@ +use utils::lsn::Lsn; + +#[derive(Debug, PartialEq, Eq)] +pub struct Partitioning { + pub keys: crate::keyspace::KeySpace, + + pub at_lsn: Lsn, +} + +impl serde::Serialize for Partitioning { + fn serialize(&self, serializer: S) -> std::result::Result + where + S: serde::Serializer, + { + pub struct KeySpace<'a>(&'a crate::keyspace::KeySpace); + + impl<'a> serde::Serialize for KeySpace<'a> { + fn serialize(&self, serializer: S) -> std::result::Result + where + S: serde::Serializer, + { + use serde::ser::SerializeSeq; + let mut seq = serializer.serialize_seq(Some(self.0.ranges.len()))?; + for kr in &self.0.ranges { + seq.serialize_element(&KeyRange(kr))?; + } + seq.end() + } + } + + use serde::ser::SerializeMap; + let mut map = serializer.serialize_map(Some(2))?; + map.serialize_key("keys")?; + map.serialize_value(&KeySpace(&self.keys))?; + map.serialize_key("at_lsn")?; + map.serialize_value(&WithDisplay(&self.at_lsn))?; + map.end() + } +} + +pub struct WithDisplay<'a, T>(&'a T); + +impl<'a, T: std::fmt::Display> serde::Serialize for WithDisplay<'a, T> { + fn serialize(&self, serializer: S) -> std::result::Result + where + S: serde::Serializer, + { + serializer.collect_str(&self.0) + } +} + +pub struct KeyRange<'a>(&'a std::ops::Range); + +impl<'a> serde::Serialize for KeyRange<'a> { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + use serde::ser::SerializeTuple; + let mut t = serializer.serialize_tuple(2)?; + t.serialize_element(&WithDisplay(&self.0.start))?; + t.serialize_element(&WithDisplay(&self.0.end))?; + t.end() + } +} + +impl<'a> serde::Deserialize<'a> for Partitioning { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'a>, 
+ { + pub struct KeySpace(crate::keyspace::KeySpace); + + impl<'de> serde::Deserialize<'de> for KeySpace { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + #[serde_with::serde_as] + #[derive(serde::Deserialize)] + #[serde(transparent)] + struct Key(#[serde_as(as = "serde_with::DisplayFromStr")] crate::key::Key); + + #[serde_with::serde_as] + #[derive(serde::Deserialize)] + struct Range(Key, Key); + + let ranges: Vec = serde::Deserialize::deserialize(deserializer)?; + Ok(Self(crate::keyspace::KeySpace { + ranges: ranges + .into_iter() + .map(|Range(start, end)| (start.0..end.0)) + .collect(), + })) + } + } + + #[serde_with::serde_as] + #[derive(serde::Deserialize)] + struct De { + keys: KeySpace, + #[serde_as(as = "serde_with::DisplayFromStr")] + at_lsn: Lsn, + } + + let de: De = serde::Deserialize::deserialize(deserializer)?; + Ok(Self { + at_lsn: de.at_lsn, + keys: de.keys.0, + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_serialization_roundtrip() { + let reference = r#" + { + "keys": [ + [ + "000000000000000000000000000000000000", + "000000000000000000000000000000000001" + ], + [ + "000000067F00000001000000000000000000", + "000000067F00000001000000000000000002" + ], + [ + "030000000000000000000000000000000000", + "030000000000000000000000000000000003" + ] + ], + "at_lsn": "0/2240160" + } + "#; + + let de: Partitioning = serde_json::from_str(reference).unwrap(); + + let ser = serde_json::to_string(&de).unwrap(); + + let ser_de: serde_json::Value = serde_json::from_str(&ser).unwrap(); + + assert_eq!( + ser_de, + serde_json::from_str::<'_, serde_json::Value>(reference).unwrap() + ); + } +} diff --git a/libs/postgres_connection/src/lib.rs b/libs/postgres_connection/src/lib.rs index 35cb1a2691..ccf9108895 100644 --- a/libs/postgres_connection/src/lib.rs +++ b/libs/postgres_connection/src/lib.rs @@ -163,8 +163,18 @@ impl PgConnectionConfig { } /// Connect using postgres protocol with TLS disabled. - pub fn connect_no_tls(&self) -> Result { - postgres::Config::from(self.to_tokio_postgres_config()).connect(postgres::NoTls) + pub async fn connect_no_tls( + &self, + ) -> Result< + ( + tokio_postgres::Client, + tokio_postgres::Connection, + ), + postgres::Error, + > { + self.to_tokio_postgres_config() + .connect(postgres::NoTls) + .await } } diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index ccf6f4f2d7..af0414daa2 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -50,6 +50,8 @@ const_format.workspace = true # why is it only here? no other crate should use it, streams are rarely needed. 
tokio-stream = { version = "0.1.14" } +serde_path_to_error.workspace = true + [dev-dependencies] byteorder.workspace = true bytes.workspace = true diff --git a/libs/utils/src/http/json.rs b/libs/utils/src/http/json.rs index 70e682cb76..7ca62561fe 100644 --- a/libs/utils/src/http/json.rs +++ b/libs/utils/src/http/json.rs @@ -25,8 +25,12 @@ pub async fn json_request_or_empty_body Deserialize<'de>>( if body.remaining() == 0 { return Ok(None); } - serde_json::from_reader(body.reader()) - .context("Failed to parse json request") + + let mut deser = serde_json::de::Deserializer::from_reader(body.reader()); + + serde_path_to_error::deserialize(&mut deser) + // intentionally stringify because the debug version is not helpful in python logs + .map_err(|e| anyhow::anyhow!("Failed to parse json request: {e}")) .map(Some) .map_err(ApiError::BadRequest) } diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index 2f09c2f3ea..f7b73dc984 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -1,6 +1,7 @@ use std::str::FromStr; use anyhow::Context; +use metrics::{IntCounter, IntCounterVec}; use once_cell::sync::Lazy; use strum_macros::{EnumString, EnumVariantNames}; @@ -24,16 +25,48 @@ impl LogFormat { } } -static TRACING_EVENT_COUNT: Lazy = Lazy::new(|| { - metrics::register_int_counter_vec!( +struct TracingEventCountMetric { + error: IntCounter, + warn: IntCounter, + info: IntCounter, + debug: IntCounter, + trace: IntCounter, +} + +static TRACING_EVENT_COUNT_METRIC: Lazy = Lazy::new(|| { + let vec = metrics::register_int_counter_vec!( "libmetrics_tracing_event_count", "Number of tracing events, by level", &["level"] ) - .expect("failed to define metric") + .expect("failed to define metric"); + TracingEventCountMetric::new(vec) }); -struct TracingEventCountLayer(&'static metrics::IntCounterVec); +impl TracingEventCountMetric { + fn new(vec: IntCounterVec) -> Self { + Self { + error: vec.with_label_values(&["error"]), + warn: vec.with_label_values(&["warn"]), + info: vec.with_label_values(&["info"]), + debug: vec.with_label_values(&["debug"]), + trace: vec.with_label_values(&["trace"]), + } + } + + fn inc_for_level(&self, level: tracing::Level) { + let counter = match level { + tracing::Level::ERROR => &self.error, + tracing::Level::WARN => &self.warn, + tracing::Level::INFO => &self.info, + tracing::Level::DEBUG => &self.debug, + tracing::Level::TRACE => &self.trace, + }; + counter.inc(); + } +} + +struct TracingEventCountLayer(&'static TracingEventCountMetric); impl tracing_subscriber::layer::Layer for TracingEventCountLayer where @@ -44,15 +77,7 @@ where event: &tracing::Event<'_>, _ctx: tracing_subscriber::layer::Context<'_, S>, ) { - let level = event.metadata().level(); - let level = match *level { - tracing::Level::ERROR => "error", - tracing::Level::WARN => "warn", - tracing::Level::INFO => "info", - tracing::Level::DEBUG => "debug", - tracing::Level::TRACE => "trace", - }; - self.0.with_label_values(&[level]).inc(); + self.0.inc_for_level(*event.metadata().level()); } } @@ -106,7 +131,9 @@ pub fn init( }; log_layer.with_filter(rust_log_env_filter()) }); - let r = r.with(TracingEventCountLayer(&TRACING_EVENT_COUNT).with_filter(rust_log_env_filter())); + let r = r.with( + TracingEventCountLayer(&TRACING_EVENT_COUNT_METRIC).with_filter(rust_log_env_filter()), + ); match tracing_error_layer_enablement { TracingErrorLayerEnablement::EnableWithRustLogFilter => r .with(tracing_error::ErrorLayer::default().with_filter(rust_log_env_filter())) @@ -257,14 +284,14 @@ impl 
std::fmt::Debug for SecretString { mod tests { use metrics::{core::Opts, IntCounterVec}; - use super::TracingEventCountLayer; + use crate::logging::{TracingEventCountLayer, TracingEventCountMetric}; #[test] fn tracing_event_count_metric() { let counter_vec = IntCounterVec::new(Opts::new("testmetric", "testhelp"), &["level"]).unwrap(); - let counter_vec = Box::leak(Box::new(counter_vec)); // make it 'static - let layer = TracingEventCountLayer(counter_vec); + let metric = Box::leak(Box::new(TracingEventCountMetric::new(counter_vec.clone()))); + let layer = TracingEventCountLayer(metric); use tracing_subscriber::prelude::*; tracing::subscriber::with_default(tracing_subscriber::registry().with(layer), || { diff --git a/libs/utils/src/timeout.rs b/libs/utils/src/timeout.rs index 11fa417242..56bf57a900 100644 --- a/libs/utils/src/timeout.rs +++ b/libs/utils/src/timeout.rs @@ -2,8 +2,11 @@ use std::time::Duration; use tokio_util::sync::CancellationToken; +#[derive(thiserror::Error, Debug)] pub enum TimeoutCancellableError { + #[error("Timed out")] Timeout, + #[error("Cancelled")] Cancelled, } diff --git a/libs/walproposer/build.rs b/libs/walproposer/build.rs index d32c8ab299..fd09030dbd 100644 --- a/libs/walproposer/build.rs +++ b/libs/walproposer/build.rs @@ -1,3 +1,6 @@ +//! Links with walproposer, pgcommon, pgport and runs bindgen on walproposer.h +//! to generate Rust bindings for it. + use std::{env, path::PathBuf, process::Command}; use anyhow::{anyhow, Context}; diff --git a/libs/walproposer/src/api_bindings.rs b/libs/walproposer/src/api_bindings.rs index 7f1bbc3b80..77afe1e686 100644 --- a/libs/walproposer/src/api_bindings.rs +++ b/libs/walproposer/src/api_bindings.rs @@ -1,3 +1,6 @@ +//! A C-Rust shim: defines implementation of C walproposer API, assuming wp +//! callback_data stores Box to some Rust implementation. 
+ #![allow(dead_code)] use std::ffi::CStr; diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs index 0661d3a969..f5723018d7 100644 --- a/libs/walproposer/src/walproposer.rs +++ b/libs/walproposer/src/walproposer.rs @@ -436,9 +436,9 @@ mod tests { event_mask: 0, }), expected_messages: vec![ - // Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160000, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 }) + // Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160001, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 }) vec![ - 103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158, 76, 143, 54, 6, 60, 108, 110, 147, 188, 32, 214, 90, 130, 15, 61, 158, 76, 143, 54, 6, 60, 108, 110, 147, 188, 32, 214, 90, 130, 15, 61, 1, 0, 0, 0, 0, 0, 0, 1, @@ -478,7 +478,7 @@ mod tests { // walproposer will panic when it finishes sync_safekeepers std::panic::catch_unwind(|| wp.start()).unwrap_err(); // validate the resulting LSN - assert_eq!(receiver.recv()?, 1337); + assert_eq!(receiver.try_recv(), Ok(1337)); Ok(()) // drop() will free up resources here } diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 35c260740c..9e8172c6a1 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -36,6 +36,7 @@ humantime.workspace = true humantime-serde.workspace = true hyper.workspace = true itertools.workspace = true +md5.workspace = true nix.workspace = true # hack to get the number of worker threads tokio uses num_cpus = { version = "1.15" } diff --git a/pageserver/client/Cargo.toml b/pageserver/client/Cargo.toml new file mode 100644 index 0000000000..4bd36185a6 --- /dev/null +++ b/pageserver/client/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "pageserver_client" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +[dependencies] +pageserver_api.workspace = true +thiserror.workspace = true +async-trait.workspace = true +reqwest.workspace = true +utils.workspace = true +serde.workspace = true +workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/pageserver/client/src/lib.rs b/pageserver/client/src/lib.rs new file mode 100644 index 0000000000..3963fd466c --- /dev/null +++ b/pageserver/client/src/lib.rs @@ -0,0 +1 @@ +pub mod mgmt_api; diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs new file mode 100644 index 0000000000..77eb1bb8e2 --- /dev/null +++ b/pageserver/client/src/mgmt_api.rs @@ -0,0 +1,188 @@ +use pageserver_api::models::*; +use reqwest::{IntoUrl, Method}; +use utils::{ + http::error::HttpErrorBody, + id::{TenantId, TimelineId}, +}; + +#[derive(Debug)] +pub struct Client { + mgmt_api_endpoint: String, + authorization_header: Option, + client: reqwest::Client, +} + +#[derive(thiserror::Error, Debug)] +pub enum Error { + #[error("receive body: {0}")] + ReceiveBody(reqwest::Error), + + #[error("receive error body: {0}")] + ReceiveErrorBody(String), + + #[error("pageserver API: {0}")] + ApiError(String), +} + +pub type Result = std::result::Result; + +#[async_trait::async_trait] +pub trait 
ResponseErrorMessageExt: Sized { + async fn error_from_body(self) -> Result; +} + +#[async_trait::async_trait] +impl ResponseErrorMessageExt for reqwest::Response { + async fn error_from_body(mut self) -> Result { + let status = self.status(); + if !(status.is_client_error() || status.is_server_error()) { + return Ok(self); + } + + let url = self.url().to_owned(); + Err(match self.json::().await { + Ok(HttpErrorBody { msg }) => Error::ApiError(msg), + Err(_) => { + Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), url)) + } + }) + } +} + +impl Client { + pub fn new(mgmt_api_endpoint: String, jwt: Option<&str>) -> Self { + Self { + mgmt_api_endpoint, + authorization_header: jwt.map(|jwt| format!("Bearer {jwt}")), + client: reqwest::Client::new(), + } + } + + pub async fn list_tenants(&self) -> Result> { + let uri = format!("{}/v1/tenant", self.mgmt_api_endpoint); + let resp = self.get(&uri).await?; + resp.json().await.map_err(Error::ReceiveBody) + } + + pub async fn list_timelines( + &self, + tenant_id: TenantId, + ) -> Result> { + let uri = format!("{}/v1/tenant/{tenant_id}/timeline", self.mgmt_api_endpoint); + self.get(&uri) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } + + pub async fn timeline_info( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}", + self.mgmt_api_endpoint + ); + self.get(&uri) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } + + pub async fn keyspace( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/keyspace", + self.mgmt_api_endpoint + ); + self.get(&uri) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } + + async fn get(&self, uri: U) -> Result { + self.request(Method::GET, uri, ()).await + } + + async fn request( + &self, + method: Method, + uri: U, + body: B, + ) -> Result { + let req = self.client.request(method, uri); + let req = if let Some(value) = &self.authorization_header { + req.header(reqwest::header::AUTHORIZATION, value) + } else { + req + }; + let res = req.json(&body).send().await.map_err(Error::ReceiveBody)?; + let response = res.error_from_body().await?; + Ok(response) + } + + pub async fn status(&self) -> Result<()> { + let uri = format!("{}/v1/status", self.mgmt_api_endpoint); + self.get(&uri).await?; + Ok(()) + } + + pub async fn tenant_create(&self, req: &TenantCreateRequest) -> Result { + let uri = format!("{}/v1/tenant", self.mgmt_api_endpoint); + self.request(Method::POST, &uri, req) + .await? 
+ .json() + .await + .map_err(Error::ReceiveBody) + } + + pub async fn tenant_config(&self, req: &TenantConfigRequest) -> Result<()> { + let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint); + self.request(Method::PUT, &uri, req).await?; + Ok(()) + } + + pub async fn location_config( + &self, + tenant_id: TenantId, + config: LocationConfig, + flush_ms: Option, + ) -> Result<()> { + let req_body = TenantLocationConfigRequest { tenant_id, config }; + let path = format!( + "{}/v1/tenant/{}/location_config", + self.mgmt_api_endpoint, tenant_id + ); + let path = if let Some(flush_ms) = flush_ms { + format!("{}?flush_ms={}", path, flush_ms.as_millis()) + } else { + path + }; + self.request(Method::PUT, &path, &req_body).await?; + Ok(()) + } + + pub async fn timeline_create( + &self, + tenant_id: TenantId, + req: &TimelineCreateRequest, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{}/timeline", + self.mgmt_api_endpoint, tenant_id + ); + self.request(Method::POST, &uri, req) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } +} diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 7607119dda..f65c4f4580 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -14,7 +14,7 @@ use pageserver::control_plane_client::ControlPlaneClient; use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task}; use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING}; use pageserver::task_mgr::WALRECEIVER_RUNTIME; -use pageserver::tenant::TenantSharedResources; +use pageserver::tenant::{secondary, TenantSharedResources}; use remote_storage::GenericRemoteStorage; use tokio::time::Instant; use tracing::*; @@ -504,6 +504,17 @@ fn start_pageserver( } }); + let secondary_controller = if let Some(remote_storage) = &remote_storage { + secondary::spawn_tasks( + tenant_manager.clone(), + remote_storage.clone(), + background_jobs_barrier.clone(), + shutdown_pageserver.clone(), + ) + } else { + secondary::null_controller() + }; + // shared state between the disk-usage backed eviction background task and the http endpoint // that allows triggering disk-usage based eviction manually. note that the http endpoint // is still accessible even if background task is not configured as long as remote storage has @@ -533,6 +544,7 @@ fn start_pageserver( broker_client.clone(), disk_usage_eviction_state, deletion_queue.new_client(), + secondary_controller, ) .context("Failed to initialize router state")?, ); diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 13d1fc775b..bd63c4d860 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -41,6 +41,8 @@ use crate::{ TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX, }; +use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP; + pub mod defaults { use crate::tenant::config::defaults::*; use const_format::formatcp; @@ -61,6 +63,8 @@ pub mod defaults { pub const DEFAULT_LOG_FORMAT: &str = "plain"; + pub const DEFAULT_CONCURRENT_TENANT_WARMUP: usize = 8; + pub const DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES: usize = super::ConfigurableSemaphore::DEFAULT_INITIAL.get(); @@ -70,6 +74,8 @@ pub mod defaults { pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min"; pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s"; + pub const DEFAULT_HEATMAP_UPLOAD_CONCURRENCY: usize = 8; + /// /// Default built-in configuration file. 
/// @@ -92,6 +98,7 @@ pub mod defaults { #log_format = '{DEFAULT_LOG_FORMAT}' #concurrent_tenant_size_logical_size_queries = '{DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES}' +#concurrent_tenant_warmup = '{DEFAULT_CONCURRENT_TENANT_WARMUP}' #metric_collection_interval = '{DEFAULT_METRIC_COLLECTION_INTERVAL}' #cached_metric_collection_interval = '{DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL}' @@ -117,6 +124,8 @@ pub mod defaults { #evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}' #gc_feedback = false +#heatmap_upload_concurrency = {DEFAULT_HEATMAP_UPLOAD_CONCURRENCY} + [remote_storage] "# @@ -176,6 +185,11 @@ pub struct PageServerConf { pub log_format: LogFormat, + /// Number of tenants which will be concurrently loaded from remote storage proactively on startup, + /// does not limit tenants loaded in response to client I/O. A lower value implicitly deprioritizes + /// loading such tenants, vs. other work in the system. + pub concurrent_tenant_warmup: ConfigurableSemaphore, + /// Number of concurrent [`Tenant::gather_size_inputs`](crate::tenant::Tenant::gather_size_inputs) allowed. pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore, /// Limit of concurrent [`Tenant::gather_size_inputs`] issued by module `eviction_task`. @@ -215,6 +229,10 @@ pub struct PageServerConf { /// If true, pageserver will make best-effort to operate without a control plane: only /// for use in major incidents. pub control_plane_emergency_mode: bool, + + /// How many heatmap uploads may be done concurrency: lower values implicitly deprioritize + /// heatmap uploads vs. other remote storage operations. + pub heatmap_upload_concurrency: usize, } /// We do not want to store this in a PageServerConf because the latter may be logged @@ -275,6 +293,7 @@ struct PageServerConfigBuilder { log_format: BuilderValue, + concurrent_tenant_warmup: BuilderValue, concurrent_tenant_size_logical_size_queries: BuilderValue, metric_collection_interval: BuilderValue, @@ -293,6 +312,8 @@ struct PageServerConfigBuilder { control_plane_api: BuilderValue>, control_plane_api_token: BuilderValue>, control_plane_emergency_mode: BuilderValue, + + heatmap_upload_concurrency: BuilderValue, } impl Default for PageServerConfigBuilder { @@ -330,6 +351,8 @@ impl Default for PageServerConfigBuilder { .expect("cannot parse default keepalive interval")), log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()), + concurrent_tenant_warmup: Set(NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP) + .expect("Invalid default constant")), concurrent_tenant_size_logical_size_queries: Set( ConfigurableSemaphore::DEFAULT_INITIAL, ), @@ -361,6 +384,8 @@ impl Default for PageServerConfigBuilder { control_plane_api: Set(None), control_plane_api_token: Set(None), control_plane_emergency_mode: Set(false), + + heatmap_upload_concurrency: Set(DEFAULT_HEATMAP_UPLOAD_CONCURRENCY), } } } @@ -441,6 +466,10 @@ impl PageServerConfigBuilder { self.log_format = BuilderValue::Set(log_format) } + pub fn concurrent_tenant_warmup(&mut self, u: NonZeroUsize) { + self.concurrent_tenant_warmup = BuilderValue::Set(u); + } + pub fn concurrent_tenant_size_logical_size_queries(&mut self, u: NonZeroUsize) { self.concurrent_tenant_size_logical_size_queries = BuilderValue::Set(u); } @@ -501,7 +530,14 @@ impl PageServerConfigBuilder { self.control_plane_emergency_mode = BuilderValue::Set(enabled) } + pub fn heatmap_upload_concurrency(&mut self, value: usize) { + self.heatmap_upload_concurrency = 
BuilderValue::Set(value) + } + pub fn build(self) -> anyhow::Result { + let concurrent_tenant_warmup = self + .concurrent_tenant_warmup + .ok_or(anyhow!("missing concurrent_tenant_warmup"))?; let concurrent_tenant_size_logical_size_queries = self .concurrent_tenant_size_logical_size_queries .ok_or(anyhow!( @@ -554,6 +590,7 @@ impl PageServerConfigBuilder { .broker_keepalive_interval .ok_or(anyhow!("No broker keepalive interval provided"))?, log_format: self.log_format.ok_or(anyhow!("missing log_format"))?, + concurrent_tenant_warmup: ConfigurableSemaphore::new(concurrent_tenant_warmup), concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::new( concurrent_tenant_size_logical_size_queries, ), @@ -595,6 +632,10 @@ impl PageServerConfigBuilder { control_plane_emergency_mode: self .control_plane_emergency_mode .ok_or(anyhow!("missing control_plane_emergency_mode"))?, + + heatmap_upload_concurrency: self + .heatmap_upload_concurrency + .ok_or(anyhow!("missing heatmap_upload_concurrency"))?, }) } } @@ -787,6 +828,11 @@ impl PageServerConf { "log_format" => builder.log_format( LogFormat::from_config(&parse_toml_string(key, item)?)? ), + "concurrent_tenant_warmup" => builder.concurrent_tenant_warmup({ + let input = parse_toml_string(key, item)?; + let permits = input.parse::().context("expected a number of initial permits, not {s:?}")?; + NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")? + }), "concurrent_tenant_size_logical_size_queries" => builder.concurrent_tenant_size_logical_size_queries({ let input = parse_toml_string(key, item)?; let permits = input.parse::().context("expected a number of initial permits, not {s:?}")?; @@ -828,7 +874,9 @@ impl PageServerConf { }, "control_plane_emergency_mode" => { builder.control_plane_emergency_mode(parse_toml_bool(key, item)?) - + }, + "heatmap_upload_concurrency" => { + builder.heatmap_upload_concurrency(parse_toml_u64(key, item)? 
as usize) }, _ => bail!("unrecognized pageserver option '{key}'"), } @@ -882,6 +930,10 @@ impl PageServerConf { broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(), broker_keepalive_interval: Duration::from_secs(5000), log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(), + concurrent_tenant_warmup: ConfigurableSemaphore::new( + NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP) + .expect("Invalid default constant"), + ), concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::default( ), @@ -896,6 +948,7 @@ impl PageServerConf { control_plane_api: None, control_plane_api_token: None, control_plane_emergency_mode: false, + heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY, } } } @@ -1099,6 +1152,9 @@ background_task_maximum_delay = '334 s' storage_broker::DEFAULT_KEEPALIVE_INTERVAL )?, log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(), + concurrent_tenant_warmup: ConfigurableSemaphore::new( + NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP).unwrap() + ), concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::default(), @@ -1120,7 +1176,8 @@ background_task_maximum_delay = '334 s' )?, control_plane_api: None, control_plane_api_token: None, - control_plane_emergency_mode: false + control_plane_emergency_mode: false, + heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY }, "Correct defaults should be used when no config values are provided" ); @@ -1164,6 +1221,9 @@ background_task_maximum_delay = '334 s' broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(), broker_keepalive_interval: Duration::from_secs(5), log_format: LogFormat::Json, + concurrent_tenant_warmup: ConfigurableSemaphore::new( + NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP).unwrap() + ), concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::default(), @@ -1177,7 +1237,8 @@ background_task_maximum_delay = '334 s' background_task_maximum_delay: Duration::from_secs(334), control_plane_api: None, control_plane_api_token: None, - control_plane_emergency_mode: false + control_plane_emergency_mode: false, + heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY }, "Should be able to parse all basic config values correctly" ); diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index 8f2b88d191..bde2cedca7 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -3,7 +3,7 @@ use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}; use crate::tenant::tasks::BackgroundLoopKind; -use crate::tenant::{mgr, LogicalSizeCalculationCause, PageReconstructError}; +use crate::tenant::{mgr, LogicalSizeCalculationCause, PageReconstructError, Tenant}; use camino::Utf8PathBuf; use consumption_metrics::EventType; use pageserver_api::models::TenantState; @@ -256,8 +256,6 @@ async fn calculate_synthetic_size_worker( info!("calculate_synthetic_size_worker stopped"); }; - let cause = LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize; - loop { let started_at = Instant::now(); @@ -280,29 +278,14 @@ async fn calculate_synthetic_size_worker( continue; } - if let 
Ok(tenant) = mgr::get_tenant(tenant_shard_id, true) { - // TODO should we use concurrent_background_tasks_rate_limit() here, like the other background tasks? - // We can put in some prioritization for consumption metrics. - // Same for the loop that fetches computed metrics. - // By using the same limiter, we centralize metrics collection for "start" and "finished" counters, - // which turns out is really handy to understand the system. - if let Err(e) = tenant.calculate_synthetic_size(cause, cancel, ctx).await { - // this error can be returned if timeline is shutting down, but it does not - // mean the synthetic size worker should terminate. we do not need any checks - // in this function because `mgr::get_tenant` will error out after shutdown has - // progressed to shutting down tenants. - let is_cancelled = matches!( - e.downcast_ref::(), - Some(PageReconstructError::Cancelled) - ); + let Ok(tenant) = mgr::get_tenant(tenant_shard_id, true) else { + continue; + }; - if !is_cancelled { - error!( - "failed to calculate synthetic size for tenant {tenant_shard_id}: {e:#}" - ); - } - } - } + // there is never any reason to exit calculate_synthetic_size_worker following any + // return value -- we don't need to care about shutdown because no tenant is found when + // pageserver is shut down. + calculate_and_log(&tenant, cancel, ctx).await; } crate::tenant::tasks::warn_when_period_overrun( @@ -321,3 +304,31 @@ async fn calculate_synthetic_size_worker( } } } + +async fn calculate_and_log(tenant: &Tenant, cancel: &CancellationToken, ctx: &RequestContext) { + const CAUSE: LogicalSizeCalculationCause = + LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize; + + // TODO should we use concurrent_background_tasks_rate_limit() here, like the other background tasks? + // We can put in some prioritization for consumption metrics. + // Same for the loop that fetches computed metrics. + // By using the same limiter, we centralize metrics collection for "start" and "finished" counters, + // which turns out is really handy to understand the system. + let Err(e) = tenant.calculate_synthetic_size(CAUSE, cancel, ctx).await else { + return; + }; + + // this error can be returned if timeline is shutting down, but it does not + // mean the synthetic size worker should terminate. we do not need any checks + // in this function because `mgr::get_tenant` will error out after shutdown has + // progressed to shutting down tenants. + let shutting_down = matches!( + e.downcast_ref::(), + Some(PageReconstructError::Cancelled | PageReconstructError::AncestorStopping(_)) + ); + + if !shutting_down { + let tenant_shard_id = tenant.tenant_shard_id(); + error!("failed to calculate synthetic size for tenant {tenant_shard_id}: {e:#}"); + } +} diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index f01cd1cf8c..76906cfaf7 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -42,7 +42,6 @@ // reading these fields. We use the Debug impl for semi-structured logging, though. 
use std::{ - collections::HashMap, sync::Arc, time::{Duration, SystemTime}, }; @@ -125,7 +124,7 @@ pub fn launch_disk_usage_global_eviction_task( async fn disk_usage_eviction_task( state: &State, task_config: &DiskUsageEvictionTaskConfig, - _storage: &GenericRemoteStorage, + storage: &GenericRemoteStorage, tenants_dir: &Utf8Path, cancel: CancellationToken, ) { @@ -149,8 +148,14 @@ async fn disk_usage_eviction_task( let start = Instant::now(); async { - let res = - disk_usage_eviction_task_iteration(state, task_config, tenants_dir, &cancel).await; + let res = disk_usage_eviction_task_iteration( + state, + task_config, + storage, + tenants_dir, + &cancel, + ) + .await; match res { Ok(()) => {} @@ -181,12 +186,13 @@ pub trait Usage: Clone + Copy + std::fmt::Debug { async fn disk_usage_eviction_task_iteration( state: &State, task_config: &DiskUsageEvictionTaskConfig, + storage: &GenericRemoteStorage, tenants_dir: &Utf8Path, cancel: &CancellationToken, ) -> anyhow::Result<()> { let usage_pre = filesystem_level_usage::get(tenants_dir, task_config) .context("get filesystem-level disk usage before evictions")?; - let res = disk_usage_eviction_task_iteration_impl(state, usage_pre, cancel).await; + let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await; match res { Ok(outcome) => { debug!(?outcome, "disk_usage_eviction_iteration finished"); @@ -268,8 +274,9 @@ struct LayerCount { count: usize, } -pub async fn disk_usage_eviction_task_iteration_impl( +pub(crate) async fn disk_usage_eviction_task_iteration_impl( state: &State, + _storage: &GenericRemoteStorage, usage_pre: U, cancel: &CancellationToken, ) -> anyhow::Result> { @@ -321,16 +328,16 @@ pub async fn disk_usage_eviction_task_iteration_impl( // Walk through the list of candidates, until we have accumulated enough layers to get // us back under the pressure threshold. 'usage_planned' is updated so that it tracks // how much disk space would be used after evicting all the layers up to the current - // point in the list. The layers are collected in 'batched', grouped per timeline. + // point in the list. // // If we get far enough in the list that we start to evict layers that are below // the tenant's min-resident-size threshold, print a warning, and memorize the disk // usage at that point, in 'usage_planned_min_resident_size_respecting'. 
- let mut batched: HashMap<_, Vec<_>> = HashMap::new(); let mut warned = None; let mut usage_planned = usage_pre; - let mut max_batch_size = 0; - for (i, (partition, candidate)) in candidates.into_iter().enumerate() { + let mut evicted_amount = 0; + + for (i, (partition, candidate)) in candidates.iter().enumerate() { if !usage_planned.has_pressure() { debug!( no_candidates_evicted = i, @@ -339,25 +346,13 @@ pub async fn disk_usage_eviction_task_iteration_impl( break; } - if partition == MinResidentSizePartition::Below && warned.is_none() { + if partition == &MinResidentSizePartition::Below && warned.is_none() { warn!(?usage_pre, ?usage_planned, candidate_no=i, "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy"); warned = Some(usage_planned); } usage_planned.add_available_bytes(candidate.layer.layer_desc().file_size); - - // FIXME: batching makes no sense anymore because of no layermap locking, should just spawn - // tasks to evict all seen layers until we have evicted enough - - let batch = batched.entry(TimelineKey(candidate.timeline)).or_default(); - - // semaphore will later be used to limit eviction concurrency, and we can express at - // most u32 number of permits. unlikely we would have u32::MAX layers to be evicted, - // but fail gracefully by not making batches larger. - if batch.len() < u32::MAX as usize { - batch.push(candidate.layer); - max_batch_size = max_batch_size.max(batch.len()); - } + evicted_amount += 1; } let usage_planned = match warned { @@ -372,100 +367,79 @@ pub async fn disk_usage_eviction_task_iteration_impl( }; debug!(?usage_planned, "usage planned"); - // phase2: evict victims batched by timeline + // phase2: evict layers let mut js = tokio::task::JoinSet::new(); + let limit = 1000; - // ratelimit to 1k files or any higher max batch size - let limit = Arc::new(tokio::sync::Semaphore::new(1000.max(max_batch_size))); + let mut evicted = candidates.into_iter().take(evicted_amount).fuse(); + let mut consumed_all = false; - for (timeline, batch) in batched { - let tenant_shard_id = timeline.tenant_shard_id; - let timeline_id = timeline.timeline_id; - let batch_size = - u32::try_from(batch.len()).expect("batch size limited to u32::MAX during partitioning"); + // After the evictions, `usage_assumed` is the post-eviction usage, + // according to internal accounting. 
+ let mut usage_assumed = usage_pre; + let mut evictions_failed = LayerCount::default(); - // I dislike naming of `available_permits` but it means current total amount of permits - // because permits can be added - assert!(batch_size as usize <= limit.available_permits()); + let evict_layers = async move { + loop { + let next = if js.len() >= limit || consumed_all { + js.join_next().await + } else if !js.is_empty() { + // opportunistically consume ready result, one per each new evicted + futures::future::FutureExt::now_or_never(js.join_next()).and_then(|x| x) + } else { + None + }; - debug!(%timeline_id, "evicting batch for timeline"); - - let evict = { - let limit = limit.clone(); - let cancel = cancel.clone(); - async move { - let mut evicted_bytes = 0; - let mut evictions_failed = LayerCount::default(); - - let Ok(_permit) = limit.acquire_many_owned(batch_size).await else { - // semaphore closing means cancelled - return (evicted_bytes, evictions_failed); - }; - - let results = timeline.evict_layers(&batch).await; - - match results { - Ok(results) => { - assert_eq!(results.len(), batch.len()); - for (result, layer) in results.into_iter().zip(batch.iter()) { - let file_size = layer.layer_desc().file_size; - match result { - Some(Ok(())) => { - evicted_bytes += file_size; - } - Some(Err(EvictionError::NotFound | EvictionError::Downloaded)) => { - evictions_failed.file_sizes += file_size; - evictions_failed.count += 1; - } - None => { - assert!(cancel.is_cancelled()); - } - } - } + if let Some(next) = next { + match next { + Ok(Ok(file_size)) => { + usage_assumed.add_available_bytes(file_size); } - Err(e) => { - warn!("failed to evict batch: {:#}", e); + Ok(Err((file_size, EvictionError::NotFound | EvictionError::Downloaded))) => { + evictions_failed.file_sizes += file_size; + evictions_failed.count += 1; } + Err(je) if je.is_cancelled() => unreachable!("not used"), + Err(je) if je.is_panic() => { /* already logged */ } + Err(je) => tracing::error!("unknown JoinError: {je:?}"), } - (evicted_bytes, evictions_failed) } - } - .instrument(tracing::info_span!("evict_batch", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id, batch_size)); - js.spawn(evict); - - // spwaning multiple thousands of these is essentially blocking, so give already spawned a - // chance of making progress - tokio::task::yield_now().await; - } - - let join_all = async move { - // After the evictions, `usage_assumed` is the post-eviction usage, - // according to internal accounting. - let mut usage_assumed = usage_pre; - let mut evictions_failed = LayerCount::default(); - - while let Some(res) = js.join_next().await { - match res { - Ok((evicted_bytes, failed)) => { - usage_assumed.add_available_bytes(evicted_bytes); - evictions_failed.file_sizes += failed.file_sizes; - evictions_failed.count += failed.count; - } - Err(je) if je.is_cancelled() => unreachable!("not used"), - Err(je) if je.is_panic() => { /* already logged */ } - Err(je) => tracing::error!("unknown JoinError: {je:?}"), + if consumed_all && js.is_empty() { + break; } + + // calling again when consumed_all is fine as evicted is fused. 
+ let Some((_partition, candidate)) = evicted.next() else { + consumed_all = true; + continue; + }; + + js.spawn(async move { + let rtc = candidate.timeline.remote_client.as_ref().expect( + "holding the witness, all timelines must have a remote timeline client", + ); + let file_size = candidate.layer.layer_desc().file_size; + candidate + .layer + .evict_and_wait(rtc) + .await + .map(|()| file_size) + .map_err(|e| (file_size, e)) + }); + + tokio::task::yield_now().await; } + (usage_assumed, evictions_failed) }; let (usage_assumed, evictions_failed) = tokio::select! { - tuple = join_all => { tuple }, + tuple = evict_layers => { tuple }, _ = cancel.cancelled() => { - // close the semaphore to stop any pending acquires - limit.close(); + // dropping joinset will abort all pending evict_and_waits and that is fine, our + // requests will still stand return Ok(IterationOutcome::Cancelled); } }; diff --git a/pageserver/src/http/mod.rs b/pageserver/src/http/mod.rs index 1c083bd382..c82d1c0362 100644 --- a/pageserver/src/http/mod.rs +++ b/pageserver/src/http/mod.rs @@ -1,4 +1,2 @@ pub mod routes; pub use routes::make_router; - -pub use pageserver_api::models; diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 9422ccb2fd..b79c5ada9a 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -992,8 +992,8 @@ paths: type: string post: description: | - Create a timeline. Returns new timeline id on success.\ - If no new timeline id is specified in parameters, it would be generated. It's an error to recreate the same timeline. + Create a timeline. Returns new timeline id on success. + Recreating the same timeline will succeed if the parameters match the existing timeline. If no pg_version is specified, assume DEFAULT_PG_VERSION hardcoded in the pageserver. 
requestBody: content: @@ -1405,6 +1405,8 @@ components: type: integer trace_read_requests: type: boolean + heatmap_period: + type: integer TenantConfigResponse: type: object properties: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index fee50460a5..601fad5bde 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -28,20 +28,18 @@ use utils::http::endpoint::request_span; use utils::http::json::json_request_or_empty_body; use utils::http::request::{get_request_param, must_get_query_param, parse_query_param}; -use super::models::{ - StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo, - TimelineCreateRequest, TimelineGcRequest, TimelineInfo, -}; use crate::context::{DownloadBehavior, RequestContext}; use crate::deletion_queue::DeletionQueueClient; use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL}; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::task_mgr::TaskKind; use crate::tenant::config::{LocationConf, TenantConfOpt}; +use crate::tenant::mgr::GetActiveTenantError; use crate::tenant::mgr::{ GetTenantError, SetNewTenantConfigError, TenantManager, TenantMapError, TenantMapInsertError, TenantSlotError, TenantSlotUpsertError, TenantStateError, }; +use crate::tenant::secondary::SecondaryController; use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::LayerAccessStatsReset; use crate::tenant::timeline::CompactFlags; @@ -49,6 +47,10 @@ use crate::tenant::timeline::Timeline; use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, TenantSharedResources}; use crate::{config::PageServerConf, tenant::mgr}; use crate::{disk_usage_eviction_task, tenant}; +use pageserver_api::models::{ + StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo, + TimelineCreateRequest, TimelineGcRequest, TimelineInfo, +}; use utils::{ auth::SwappableJwtAuth, generation::Generation, @@ -64,7 +66,12 @@ use utils::{ }; // Imports only used for testing APIs -use super::models::ConfigureFailpointsRequest; +use pageserver_api::models::ConfigureFailpointsRequest; + +// For APIs that require an Active tenant, how long should we block waiting for that state? +// This is not functionally necessary (clients will retry), but avoids generating a lot of +// failed API calls while tenants are activating. 
+const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000); pub struct State { conf: &'static PageServerConf, @@ -75,9 +82,11 @@ pub struct State { broker_client: storage_broker::BrokerClientChannel, disk_usage_eviction_state: Arc, deletion_queue_client: DeletionQueueClient, + secondary_controller: SecondaryController, } impl State { + #[allow(clippy::too_many_arguments)] pub fn new( conf: &'static PageServerConf, tenant_manager: Arc, @@ -86,6 +95,7 @@ impl State { broker_client: storage_broker::BrokerClientChannel, disk_usage_eviction_state: Arc, deletion_queue_client: DeletionQueueClient, + secondary_controller: SecondaryController, ) -> anyhow::Result { let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml", "/metrics"] .iter() @@ -100,6 +110,7 @@ impl State { broker_client, disk_usage_eviction_state, deletion_queue_client, + secondary_controller, }) } @@ -136,11 +147,6 @@ impl From for ApiError { fn from(pre: PageReconstructError) -> ApiError { match pre { PageReconstructError::Other(pre) => ApiError::InternalServerError(pre), - PageReconstructError::NeedsDownload(_, _) => { - // This shouldn't happen, because we use a RequestContext that requests to - // download any missing layer files on-demand. - ApiError::InternalServerError(anyhow::anyhow!("need to download remote layer file")) - } PageReconstructError::Cancelled => { ApiError::InternalServerError(anyhow::anyhow!("request was cancelled")) } @@ -233,6 +239,19 @@ impl From for ApiError { } } +impl From for ApiError { + fn from(e: GetActiveTenantError) -> ApiError { + match e { + GetActiveTenantError::WillNotBecomeActive(_) => ApiError::Conflict(format!("{}", e)), + GetActiveTenantError::Cancelled => ApiError::ShuttingDown, + GetActiveTenantError::NotFound(gte) => gte.into(), + GetActiveTenantError::WaitForActiveTimeout { .. 
} => { + ApiError::ResourceUnavailable(format!("{}", e).into()) + } + } + } +} + impl From for ApiError { fn from(e: SetNewTenantConfigError) -> ApiError { match e { @@ -435,7 +454,10 @@ async fn timeline_create_handler( let state = get_state(&request); async { - let tenant = state.tenant_manager.get_attached_tenant_shard(tenant_shard_id, true)?; + let tenant = state.tenant_manager.get_attached_tenant_shard(tenant_shard_id, false)?; + + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + match tenant.create_timeline( new_timeline_id, request_data.ancestor_timeline_id.map(TimelineId::from), @@ -453,7 +475,7 @@ async fn timeline_create_handler( .map_err(ApiError::InternalServerError)?; json_response(StatusCode::CREATED, timeline_info) } - Err(tenant::CreateTimelineError::AlreadyExists) => { + Err(tenant::CreateTimelineError::Conflict | tenant::CreateTimelineError::AlreadyCreating) => { json_response(StatusCode::CONFLICT, ()) } Err(tenant::CreateTimelineError::AncestorLsn(err)) => { @@ -694,11 +716,23 @@ async fn timeline_delete_handler( let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; - let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); let state = get_state(&request); - state.tenant_manager.delete_timeline(tenant_shard_id, timeline_id, &ctx) - .instrument(info_span!("timeline_delete", tenant_id=%tenant_shard_id.tenant_id, shard=%tenant_shard_id.shard_slug(), %timeline_id)) + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id, false) + .map_err(|e| { + match e { + // GetTenantError has a built-in conversion to ApiError, but in this context we don't + // want to treat missing tenants as 404, to avoid ambiguity with successful deletions. + GetTenantError::NotFound(_) => ApiError::PreconditionFailed( + "Requested tenant is missing".to_string().into_boxed_str(), + ), + e => e.into(), + } + })?; + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + tenant.delete_timeline(timeline_id).instrument(info_span!("timeline_delete", tenant_id=%tenant_shard_id.tenant_id, shard=%tenant_shard_id.shard_slug(), %timeline_id)) .await?; json_response(StatusCode::ACCEPTED, ()) @@ -1136,7 +1170,10 @@ async fn tenant_create_handler( // We created the tenant. Existing API semantics are that the tenant // is Active when this function returns. - if let res @ Err(_) = new_tenant.wait_to_become_active().await { + if let res @ Err(_) = new_tenant + .wait_to_become_active(ACTIVE_TENANT_TIMEOUT) + .await + { // This shouldn't happen because we just created the tenant directory // in tenant::mgr::create_tenant, and there aren't any remote timelines // to load, so, nothing can really fail during load. 
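Reviewer note: the HTTP handlers touched above now share the same shape. Below is a minimal sketch (not part of the diff) of that pattern, using names introduced in this patch (get_attached_tenant_shard, wait_to_become_active, ACTIVE_TENANT_TIMEOUT); the require_active_tenant helper and the Arc<Tenant> return type are assumptions for illustration. Error mapping relies on the new From<GetActiveTenantError> for ApiError impl and the existing GetTenantError conversion.

    // Sketch only: resolve the shard, then wait (bounded) for it to become Active
    // so requests issued during startup/attach do not fail spuriously.
    async fn require_active_tenant(
        state: &State,
        tenant_shard_id: TenantShardId,
    ) -> Result<Arc<Tenant>, ApiError> {
        let tenant = state
            .tenant_manager
            .get_attached_tenant_shard(tenant_shard_id, false)?;
        tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
        Ok(tenant)
    }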
@@ -1487,69 +1524,6 @@ async fn timeline_collect_keyspace( let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; - struct Partitioning { - keys: crate::keyspace::KeySpace, - - at_lsn: Lsn, - } - - impl serde::Serialize for Partitioning { - fn serialize(&self, serializer: S) -> std::result::Result - where - S: serde::Serializer, - { - use serde::ser::SerializeMap; - let mut map = serializer.serialize_map(Some(2))?; - map.serialize_key("keys")?; - map.serialize_value(&KeySpace(&self.keys))?; - map.serialize_key("at_lsn")?; - map.serialize_value(&WithDisplay(&self.at_lsn))?; - map.end() - } - } - - struct WithDisplay<'a, T>(&'a T); - - impl<'a, T: std::fmt::Display> serde::Serialize for WithDisplay<'a, T> { - fn serialize(&self, serializer: S) -> std::result::Result - where - S: serde::Serializer, - { - serializer.collect_str(&self.0) - } - } - - struct KeySpace<'a>(&'a crate::keyspace::KeySpace); - - impl<'a> serde::Serialize for KeySpace<'a> { - fn serialize(&self, serializer: S) -> std::result::Result - where - S: serde::Serializer, - { - use serde::ser::SerializeSeq; - let mut seq = serializer.serialize_seq(Some(self.0.ranges.len()))?; - for kr in &self.0.ranges { - seq.serialize_element(&KeyRange(kr))?; - } - seq.end() - } - } - - struct KeyRange<'a>(&'a std::ops::Range); - - impl<'a> serde::Serialize for KeyRange<'a> { - fn serialize(&self, serializer: S) -> Result - where - S: serde::Serializer, - { - use serde::ser::SerializeTuple; - let mut t = serializer.serialize_tuple(2)?; - t.serialize_element(&WithDisplay(&self.0.start))?; - t.serialize_element(&WithDisplay(&self.0.end))?; - t.end() - } - } - let at_lsn: Option = parse_query_param(&request, "at_lsn")?; async { @@ -1561,7 +1535,9 @@ async fn timeline_collect_keyspace( .await .map_err(|e| ApiError::InternalServerError(e.into()))?; - json_response(StatusCode::OK, Partitioning { keys, at_lsn }) + let res = pageserver_api::models::partitioning::Partitioning { keys, at_lsn }; + + json_response(StatusCode::OK, res) } .instrument(info_span!("timeline_collect_keyspace", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) .await @@ -1593,7 +1569,7 @@ async fn always_panic_handler( async fn disk_usage_eviction_run( mut r: Request, - _cancel: CancellationToken, + cancel: CancellationToken, ) -> Result, ApiError> { check_permission(&r, None)?; @@ -1621,57 +1597,48 @@ async fn disk_usage_eviction_run( } } - let config = json_request::(&mut r) - .await - .map_err(|_| ApiError::BadRequest(anyhow::anyhow!("invalid JSON body")))?; + let config = json_request::(&mut r).await?; let usage = Usage { config, freed_bytes: 0, }; - let (tx, rx) = tokio::sync::oneshot::channel(); - let state = get_state(&r); - if state.remote_storage.as_ref().is_none() { + let Some(storage) = state.remote_storage.as_ref() else { return Err(ApiError::InternalServerError(anyhow::anyhow!( "remote storage not configured, cannot run eviction iteration" ))); - } + }; let state = state.disk_usage_eviction_state.clone(); - let cancel = CancellationToken::new(); - let child_cancel = cancel.clone(); - let _g = cancel.drop_guard(); + let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl( + &state, storage, usage, &cancel, + ) + .await; - crate::task_mgr::spawn( - crate::task_mgr::BACKGROUND_RUNTIME.handle(), - TaskKind::DiskUsageEviction, - None, - None, - "ondemand disk usage eviction", - false, - async move { - let res = 
crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl( - &state, - usage, - &child_cancel, - ) - .await; + info!(?res, "disk_usage_eviction_task_iteration_impl finished"); - info!(?res, "disk_usage_eviction_task_iteration_impl finished"); + let res = res.map_err(ApiError::InternalServerError)?; - let _ = tx.send(res); - Ok(()) - } - .in_current_span(), - ); + json_response(StatusCode::OK, res) +} - let response = rx.await.unwrap().map_err(ApiError::InternalServerError)?; +async fn secondary_upload_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let state = get_state(&request); + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + state + .secondary_controller + .upload_tenant(tenant_shard_id) + .await + .map_err(ApiError::InternalServerError)?; - json_response(StatusCode::OK, response) + json_response(StatusCode::OK, ()) } async fn handler_404(_: Request) -> Result, ApiError> { @@ -1933,6 +1900,9 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name", |r| api_handler(r, evict_timeline_layer_handler), ) + .post("/v1/tenant/:tenant_shard_id/heatmap_upload", |r| { + api_handler(r, secondary_upload_handler) + }) .put("/v1/disk_usage_eviction/run", |r| { api_handler(r, disk_usage_eviction_run) }) diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 797cb6f944..58adf6e8c4 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -10,7 +10,7 @@ pub mod deletion_queue; pub mod disk_usage_eviction_task; pub mod http; pub mod import_datadir; -pub mod keyspace; +pub use pageserver_api::keyspace; pub mod metrics; pub mod page_cache; pub mod page_service; diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 7cc0333ee5..45c01b71d1 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -2,9 +2,10 @@ use enum_map::EnumMap; use metrics::metric_vec_duration::DurationResultObserver; use metrics::{ register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec, - register_int_counter, register_int_counter_vec, register_int_gauge, register_int_gauge_vec, - register_uint_gauge, register_uint_gauge_vec, Counter, CounterVec, GaugeVec, Histogram, - HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec, + register_int_counter, register_int_counter_pair_vec, register_int_counter_vec, + register_int_gauge, register_int_gauge_vec, register_uint_gauge, register_uint_gauge_vec, + Counter, CounterVec, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPairVec, + IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec, }; use once_cell::sync::Lazy; use pageserver_api::shard::TenantShardId; @@ -683,14 +684,54 @@ pub static STARTUP_IS_LOADING: Lazy = Lazy::new(|| { .expect("Failed to register pageserver_startup_is_loading") }); -/// How long did tenants take to go from construction to active state? -pub(crate) static TENANT_ACTIVATION: Lazy = Lazy::new(|| { - register_histogram!( +/// Metrics related to the lifecycle of a [`crate::tenant::Tenant`] object: things +/// like how long it took to load. +/// +/// Note that these are process-global metrics, _not_ per-tenant metrics. Per-tenant +/// metrics are rather expensive, and usually fine grained stuff makes more sense +/// at a timeline level than tenant level. +pub(crate) struct TenantMetrics { + /// How long did tenants take to go from construction to active state? 
+ pub(crate) activation: Histogram, + pub(crate) preload: Histogram, + pub(crate) attach: Histogram, + + /// How many tenants are included in the initial startup of the pagesrever? + pub(crate) startup_scheduled: IntCounter, + pub(crate) startup_complete: IntCounter, +} + +pub(crate) static TENANT: Lazy = Lazy::new(|| { + TenantMetrics { + activation: register_histogram!( "pageserver_tenant_activation_seconds", "Time taken by tenants to activate, in seconds", CRITICAL_OP_BUCKETS.into() ) - .expect("Failed to register pageserver_tenant_activation_seconds metric") + .expect("Failed to register metric"), + preload: register_histogram!( + "pageserver_tenant_preload_seconds", + "Time taken by tenants to load remote metadata on startup/attach, in seconds", + CRITICAL_OP_BUCKETS.into() + ) + .expect("Failed to register metric"), + attach: register_histogram!( + "pageserver_tenant_attach_seconds", + "Time taken by tenants to intialize, after remote metadata is already loaded", + CRITICAL_OP_BUCKETS.into() + ) + .expect("Failed to register metric"), + startup_scheduled: register_int_counter!( + "pageserver_tenant_startup_scheduled", + "Number of tenants included in pageserver startup (doesn't count tenants attached later)" + ).expect("Failed to register metric"), + startup_complete: register_int_counter!( + "pageserver_tenant_startup_complete", + "Number of tenants that have completed warm-up, or activated on-demand during initial startup: \ + should eventually reach `pageserver_tenant_startup_scheduled_total`. Does not include broken \ + tenants: such cases will lead to this metric never reaching the scheduled count." + ).expect("Failed to register metric"), +} }); /// Each `Timeline`'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric. @@ -1270,6 +1311,28 @@ pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| WalIngestMet ) .expect("failed to define a metric"), }); +pub(crate) struct SecondaryModeMetrics { + pub(crate) upload_heatmap: IntCounter, + pub(crate) upload_heatmap_errors: IntCounter, + pub(crate) upload_heatmap_duration: Histogram, +} +pub(crate) static SECONDARY_MODE: Lazy = Lazy::new(|| SecondaryModeMetrics { + upload_heatmap: register_int_counter!( + "pageserver_secondary_upload_heatmap", + "Number of heatmaps written to remote storage by attached tenants" + ) + .expect("failed to define a metric"), + upload_heatmap_errors: register_int_counter!( + "pageserver_secondary_upload_heatmap_errors", + "Failures writing heatmap to remote storage" + ) + .expect("failed to define a metric"), + upload_heatmap_duration: register_histogram!( + "pageserver_secondary_upload_heatmap_duration", + "Time to build and upload a heatmap, including any waiting inside the S3 client" + ) + .expect("failed to define a metric"), +}); #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum RemoteOpKind { @@ -1321,25 +1384,16 @@ pub(crate) static TENANT_TASK_EVENTS: Lazy = Lazy::new(|| { .expect("Failed to register tenant_task_events metric") }); -pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_START_COUNT: Lazy = - Lazy::new(|| { - register_int_counter_vec!( - "pageserver_background_loop_semaphore_wait_start_count", - "Counter for background loop concurrency-limiting semaphore acquire calls started", - &["task"], - ) - .unwrap() - }); - -pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_FINISH_COUNT: Lazy = - Lazy::new(|| { - register_int_counter_vec!( - "pageserver_background_loop_semaphore_wait_finish_count", - "Counter for background loop concurrency-limiting semaphore acquire calls finished", - &["task"], 
- ) - .unwrap() - }); +pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE: Lazy = Lazy::new(|| { + register_int_counter_pair_vec!( + "pageserver_background_loop_semaphore_wait_start_count", + "Counter for background loop concurrency-limiting semaphore acquire calls started", + "pageserver_background_loop_semaphore_wait_finish_count", + "Counter for background loop concurrency-limiting semaphore acquire calls finished", + &["task"], + ) + .unwrap() +}); pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy = Lazy::new(|| { register_int_counter_vec!( @@ -2199,6 +2253,9 @@ pub fn preinitialize_metrics() { // Deletion queue stats Lazy::force(&DELETION_QUEUE); + // Tenant stats + Lazy::force(&TENANT); + // Tenant manager stats Lazy::force(&TENANT_MANAGER); diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 97d731bf49..c726139524 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -2,38 +2,11 @@ use crate::walrecord::NeonWalRecord; use anyhow::Result; use bytes::Bytes; use serde::{Deserialize, Serialize}; -use std::ops::{AddAssign, Range}; +use std::ops::AddAssign; use std::time::Duration; pub use pageserver_api::key::{Key, KEY_SIZE}; -pub fn key_range_size(key_range: &Range) -> u32 { - let start = key_range.start; - let end = key_range.end; - - if end.field1 != start.field1 - || end.field2 != start.field2 - || end.field3 != start.field3 - || end.field4 != start.field4 - { - return u32::MAX; - } - - let start = (start.field5 as u64) << 32 | start.field6 as u64; - let end = (end.field5 as u64) << 32 | end.field6 as u64; - - let diff = end - start; - if diff > u32::MAX as u64 { - u32::MAX - } else { - diff as u32 - } -} - -pub fn singleton_range(key: Key) -> Range { - key..key.next() -} - /// A 'value' stored for a one Key. #[derive(Debug, Clone, Serialize, Deserialize)] #[cfg_attr(test, derive(PartialEq))] diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 8747d9ad50..cb1b2b8011 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -258,6 +258,9 @@ pub enum TaskKind { /// See [`crate::disk_usage_eviction_task`]. DiskUsageEviction, + /// See [`crate::tenant::secondary`]. + SecondaryUploads, + // Initial logical size calculation InitialLogicalSizeCalculation, @@ -558,9 +561,14 @@ pub async fn shutdown_watcher() { /// cancelled. It can however be moved to other tasks, such as `tokio::task::spawn_blocking` or /// `tokio::task::JoinSet::spawn`. pub fn shutdown_token() -> CancellationToken { - SHUTDOWN_TOKEN - .try_with(|t| t.clone()) - .expect("shutdown_token() called in an unexpected task or thread") + let res = SHUTDOWN_TOKEN.try_with(|t| t.clone()); + + if cfg!(test) { + // in tests this method is called from non-taskmgr spawned tasks, and that is all ok. + res.unwrap_or_default() + } else { + res.expect("shutdown_token() called in an unexpected task or thread") + } } /// Has the current task been requested to shut down? 
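Aside on the metrics change above: BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE collapses the two independent start/finish counter vectors into a single `register_int_counter_pair_vec!`, so the value of interest is the difference between the paired counters (effectively an "in flight" gauge) while both exported series stay monotonic. A minimal, self-contained sketch of that idea follows; `CounterPair` and `InFlightGuard` are hypothetical stand-ins for illustration, not the real `IntCounterPairVec` API from the metrics crate.

use std::sync::atomic::{AtomicU64, Ordering};

/// Hypothetical stand-in for a paired counter: two monotonic counters whose
/// difference is the number of semaphore acquisitions currently in flight.
#[derive(Default)]
struct CounterPair {
    started: AtomicU64,
    finished: AtomicU64,
}

/// RAII guard: `started` is bumped on creation, `finished` on drop, so the
/// pair stays balanced even on early returns or panics.
struct InFlightGuard<'a>(&'a CounterPair);

impl CounterPair {
    fn guard(&self) -> InFlightGuard<'_> {
        self.started.fetch_add(1, Ordering::Relaxed);
        InFlightGuard(self)
    }

    fn in_flight(&self) -> u64 {
        self.started
            .load(Ordering::Relaxed)
            .saturating_sub(self.finished.load(Ordering::Relaxed))
    }
}

impl Drop for InFlightGuard<'_> {
    fn drop(&mut self) {
        self.0.finished.fetch_add(1, Ordering::Relaxed);
    }
}

Exporting the two raw counters rather than a single gauge keeps the series monotonic, which behaves better across scrapes and restarts; the subtraction happens at query time.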
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index a8e8b4cbfa..eceef6bf78 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -36,6 +36,8 @@ use utils::crashsafe::path_with_suffix_extension; use utils::fs_ext; use utils::sync::gate::Gate; use utils::sync::gate::GateGuard; +use utils::timeout::timeout_cancellable; +use utils::timeout::TimeoutCancellableError; use self::config::AttachedLocationConfig; use self::config::AttachmentMode; @@ -48,6 +50,7 @@ use self::mgr::GetActiveTenantError; use self::mgr::GetTenantError; use self::mgr::TenantsMap; use self::remote_timeline_client::RemoteTimelineClient; +use self::timeline::uninit::TimelineExclusionError; use self::timeline::uninit::TimelineUninitMark; use self::timeline::uninit::UninitializedTimeline; use self::timeline::EvictionTaskTenantState; @@ -58,7 +61,7 @@ use crate::deletion_queue::DeletionQueueClient; use crate::deletion_queue::DeletionQueueError; use crate::import_datadir; use crate::is_uninit_mark; -use crate::metrics::TENANT_ACTIVATION; +use crate::metrics::TENANT; use crate::metrics::{remove_tenant_metrics, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC}; use crate::repository::GcResult; use crate::task_mgr; @@ -87,7 +90,6 @@ use std::process::Stdio; use std::sync::atomic::AtomicU64; use std::sync::atomic::Ordering; use std::sync::Arc; -use std::sync::MutexGuard; use std::sync::{Mutex, RwLock}; use std::time::{Duration, Instant}; @@ -144,6 +146,7 @@ pub mod storage_layer; pub mod config; pub mod delete; pub mod mgr; +pub mod secondary; pub mod tasks; pub mod upload_queue; @@ -225,7 +228,7 @@ pub struct Tenant { /// The value creation timestamp, used to measure activation delay, see: /// - loading_started_at: Instant, + constructed_at: Instant, state: watch::Sender, @@ -248,6 +251,12 @@ pub struct Tenant { generation: Generation, timelines: Mutex>>, + + /// During timeline creation, we first insert the TimelineId to the + /// creating map, then `timelines`, then remove it from the creating map. + /// **Lock order**: if acquring both, acquire`timelines` before `timelines_creating` + timelines_creating: std::sync::Mutex>, + // This mutex prevents creation of new timelines during GC. // Adding yet another mutex (in addition to `timelines`) is needed because holding // `timelines` mutex during all GC iteration @@ -269,6 +278,11 @@ pub struct Tenant { eviction_task_tenant_state: tokio::sync::Mutex, + /// If the tenant is in Activating state, notify this to encourage it + /// to proceed to Active as soon as possible, rather than waiting for lazy + /// background warmup. + pub(crate) activate_now_sem: tokio::sync::Semaphore, + pub(crate) delete_progress: Arc>, // Cancellation token fires when we have entered shutdown(). This is a parent of @@ -406,8 +420,10 @@ impl Debug for SetStoppingError { #[derive(thiserror::Error, Debug)] pub enum CreateTimelineError { - #[error("a timeline with the given ID already exists")] - AlreadyExists, + #[error("creation of timeline with the given ID is in progress")] + AlreadyCreating, + #[error("timeline already exists with different parameters")] + Conflict, #[error(transparent)] AncestorLsn(anyhow::Error), #[error("ancestor timeline is not active")] @@ -613,6 +629,14 @@ impl Tenant { "attach tenant", false, async move { + // Is this tenant being spawned as part of process startup? + let starting_up = init_order.is_some(); + scopeguard::defer! 
{ + if starting_up { + TENANT.startup_complete.inc(); + } + } + // Ideally we should use Tenant::set_broken_no_wait, but it is not supposed to be used when tenant is in loading state. let make_broken = |t: &Tenant, err: anyhow::Error| { @@ -639,8 +663,62 @@ impl Tenant { .as_mut() .and_then(|x| x.initial_tenant_load_remote.take()); + enum AttachType<'a> { + // During pageserver startup, we are attaching this tenant lazily in the background + Warmup(tokio::sync::SemaphorePermit<'a>), + // During pageserver startup, we are attaching this tenant as soon as we can, + // because a client tried to access it. + OnDemand, + // During normal operations after startup, we are attaching a tenant. + Normal, + } + + // Before doing any I/O, wait for either or: + // - A client to attempt to access to this tenant (on-demand loading) + // - A permit to become available in the warmup semaphore (background warmup) + // + // Some-ness of init_order is how we know if we're attaching during startup or later + // in process lifetime. + let attach_type = if init_order.is_some() { + tokio::select!( + _ = tenant_clone.activate_now_sem.acquire() => { + tracing::info!("Activating tenant (on-demand)"); + AttachType::OnDemand + }, + permit_result = conf.concurrent_tenant_warmup.inner().acquire() => { + match permit_result { + Ok(p) => { + tracing::info!("Activating tenant (warmup)"); + AttachType::Warmup(p) + } + Err(_) => { + // This is unexpected: the warmup semaphore should stay alive + // for the lifetime of init_order. Log a warning and proceed. + tracing::warn!("warmup_limit semaphore unexpectedly closed"); + AttachType::Normal + } + } + + } + _ = tenant_clone.cancel.cancelled() => { + // This is safe, but should be pretty rare: it is interesting if a tenant + // stayed in Activating for such a long time that shutdown found it in + // that state. + tracing::info!(state=%tenant_clone.current_state(), "Tenant shut down before activation"); + return Ok(()); + }, + ) + } else { + AttachType::Normal + }; + + let preload_timer = TENANT.preload.start_timer(); let preload = match mode { - SpawnMode::Create => {None}, + SpawnMode::Create => { + // Don't count the skipped preload into the histogram of preload durations + preload_timer.stop_and_discard(); + None + }, SpawnMode::Normal => { match &remote_storage { Some(remote_storage) => Some( @@ -650,7 +728,11 @@ impl Tenant { tracing::info_span!(parent: None, "attach_preload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()), ) .await { - Ok(p) => p, + Ok(p) => { + preload_timer.observe_duration(); + p + } + , Err(e) => { make_broken(&tenant_clone, anyhow::anyhow!(e)); return Ok(()); @@ -712,15 +794,43 @@ impl Tenant { } } + // We will time the duration of the attach phase unless this is a creation (attach will do no work) + let attach_timer = match mode { + SpawnMode::Create => None, + SpawnMode::Normal => {Some(TENANT.attach.start_timer())} + }; match tenant_clone.attach(preload, &ctx).await { Ok(()) => { info!("attach finished, activating"); + if let Some(t)= attach_timer {t.observe_duration();} tenant_clone.activate(broker_client, None, &ctx); } Err(e) => { + if let Some(t)= attach_timer {t.observe_duration();} make_broken(&tenant_clone, anyhow::anyhow!(e)); } } + + // If we are doing an opportunistic warmup attachment at startup, initialize + // logical size at the same time. This is better than starting a bunch of idle tenants + // with cold caches and then coming back later to initialize their logical sizes. 
+ // + // It also prevents the warmup proccess competing with the concurrency limit on + // logical size calculations: if logical size calculation semaphore is saturated, + // then warmup will wait for that before proceeding to the next tenant. + if let AttachType::Warmup(_permit) = attach_type { + let mut futs = FuturesUnordered::new(); + let timelines: Vec<_> = tenant_clone.timelines.lock().unwrap().values().cloned().collect(); + for t in timelines { + futs.push(t.await_initial_logical_size()) + } + tracing::info!("Waiting for initial logical sizes while warming up..."); + while futs.next().await.is_some() { + + } + tracing::info!("Warm-up complete"); + } + Ok(()) } .instrument({ @@ -1457,7 +1567,7 @@ impl Tenant { /// For tests, use `DatadirModification::init_empty_test_timeline` + `commit` to setup the /// minimum amount of keys required to get a writable timeline. /// (Without it, `put` might fail due to `repartition` failing.) - pub async fn create_empty_timeline( + pub(crate) async fn create_empty_timeline( &self, new_timeline_id: TimelineId, initdb_lsn: Lsn, @@ -1469,10 +1579,7 @@ impl Tenant { "Cannot create empty timelines on inactive tenant" ); - let timeline_uninit_mark = { - let timelines = self.timelines.lock().unwrap(); - self.create_timeline_uninit_mark(new_timeline_id, &timelines)? - }; + let timeline_uninit_mark = self.create_timeline_uninit_mark(new_timeline_id)?; let new_metadata = TimelineMetadata::new( // Initialize disk_consistent LSN to 0, The caller must import some data to // make it valid, before calling finish_creation() @@ -1549,7 +1656,7 @@ impl Tenant { /// If the caller specified the timeline ID to use (`new_timeline_id`), and timeline with /// the same timeline ID already exists, returns CreateTimelineError::AlreadyExists. #[allow(clippy::too_many_arguments)] - pub async fn create_timeline( + pub(crate) async fn create_timeline( &self, new_timeline_id: TimelineId, ancestor_timeline_id: Option, @@ -1570,26 +1677,51 @@ impl Tenant { .enter() .map_err(|_| CreateTimelineError::ShuttingDown)?; - if let Ok(existing) = self.get_timeline(new_timeline_id, false) { - debug!("timeline {new_timeline_id} already exists"); - - if let Some(remote_client) = existing.remote_client.as_ref() { - // Wait for uploads to complete, so that when we return Ok, the timeline - // is known to be durable on remote storage. Just like we do at the end of - // this function, after we have created the timeline ourselves. - // - // We only really care that the initial version of `index_part.json` has - // been uploaded. That's enough to remember that the timeline - // exists. However, there is no function to wait specifically for that so - // we just wait for all in-progress uploads to finish. - remote_client - .wait_completion() - .await - .context("wait for timeline uploads to complete")?; + // Get exclusive access to the timeline ID: this ensures that it does not already exist, + // and that no other creation attempts will be allowed in while we are working. The + // uninit_mark is a guard. + let uninit_mark = match self.create_timeline_uninit_mark(new_timeline_id) { + Ok(m) => m, + Err(TimelineExclusionError::AlreadyCreating) => { + // Creation is in progress, we cannot create it again, and we cannot + // check if this request matches the existing one, so caller must try + // again later. 
+ return Err(CreateTimelineError::AlreadyCreating); } + Err(TimelineExclusionError::Other(e)) => { + return Err(CreateTimelineError::Other(e)); + } + Err(TimelineExclusionError::AlreadyExists(existing)) => { + debug!("timeline {new_timeline_id} already exists"); - return Err(CreateTimelineError::AlreadyExists); - } + // Idempotency: creating the same timeline twice is not an error, unless + // the second creation has different parameters. + if existing.get_ancestor_timeline_id() != ancestor_timeline_id + || existing.pg_version != pg_version + || (ancestor_start_lsn.is_some() + && ancestor_start_lsn != Some(existing.get_ancestor_lsn())) + { + return Err(CreateTimelineError::Conflict); + } + + if let Some(remote_client) = existing.remote_client.as_ref() { + // Wait for uploads to complete, so that when we return Ok, the timeline + // is known to be durable on remote storage. Just like we do at the end of + // this function, after we have created the timeline ourselves. + // + // We only really care that the initial version of `index_part.json` has + // been uploaded. That's enough to remember that the timeline + // exists. However, there is no function to wait specifically for that so + // we just wait for all in-progress uploads to finish. + remote_client + .wait_completion() + .await + .context("wait for timeline uploads to complete")?; + } + + return Ok(existing); + } + }; let loaded_timeline = match ancestor_timeline_id { Some(ancestor_timeline_id) => { @@ -1626,18 +1758,32 @@ impl Tenant { ancestor_timeline.wait_lsn(*lsn, ctx).await?; } - self.branch_timeline(&ancestor_timeline, new_timeline_id, ancestor_start_lsn, ctx) - .await? + self.branch_timeline( + &ancestor_timeline, + new_timeline_id, + ancestor_start_lsn, + uninit_mark, + ctx, + ) + .await? } None => { - self.bootstrap_timeline(new_timeline_id, pg_version, load_existing_initdb, ctx) - .await? + self.bootstrap_timeline( + new_timeline_id, + pg_version, + load_existing_initdb, + uninit_mark, + ctx, + ) + .await? } }; + // At this point we have dropped our guard on [`Self::timelines_creating`], and + // the timeline is visible in [`Self::timelines`], but it is _not_ durable yet. We must + // not send a success to the caller until it is. The same applies to handling retries, + // see the handling of [`TimelineExclusionError::AlreadyExists`] above. if let Some(remote_client) = loaded_timeline.remote_client.as_ref() { - // Wait for the upload of the 'index_part.json` file to finish, so that when we return - // Ok, the timeline is durable in remote storage. let kind = ancestor_timeline_id .map(|_| "branched") .unwrap_or("bootstrapped"); @@ -1651,6 +1797,15 @@ impl Tenant { Ok(loaded_timeline) } + pub(crate) async fn delete_timeline( + self: Arc, + timeline_id: TimelineId, + ) -> Result<(), DeleteTimelineError> { + DeleteTimelineFlow::run(&self, timeline_id, false).await?; + + Ok(()) + } + /// perform one garbage collection iteration, removing old data files from disk. /// this function is periodically called by gc task. /// also it can be explicitly requested through page server api 'do_gc' command. 
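The `create_timeline` rework above routes every creation attempt through `create_timeline_uninit_mark`, which now reports a `TimelineExclusionError`: `AlreadyCreating` while another creation holds the claim, `AlreadyExists(existing)` so retries can be handled idempotently (or rejected as `Conflict` when parameters differ), and `Other` for real failures. A rough sketch of the in-memory half of that exclusion (the `timelines` / `timelines_creating` pair), using simplified stand-in types rather than the real `Tenant` internals:

use std::collections::{HashMap, HashSet};
use std::sync::{Arc, Mutex};

type TimelineId = u64; // stand-in; the real id type is larger
struct Timeline;       // stand-in

enum ExclusionError {
    AlreadyExists(Arc<Timeline>),
    AlreadyCreating,
}

/// Dropping the guard releases the in-memory claim, whether creation
/// ultimately succeeded or failed.
struct CreateGuard<'a> {
    creating: &'a Mutex<HashSet<TimelineId>>,
    id: TimelineId,
}

impl Drop for CreateGuard<'_> {
    fn drop(&mut self) {
        self.creating.lock().unwrap().remove(&self.id);
    }
}

fn claim_timeline_id<'a>(
    timelines: &Mutex<HashMap<TimelineId, Arc<Timeline>>>,
    creating: &'a Mutex<HashSet<TimelineId>>,
    id: TimelineId,
) -> Result<CreateGuard<'a>, ExclusionError> {
    // Lock order from the field comment above: `timelines` before `timelines_creating`.
    let timelines_locked = timelines.lock().unwrap();
    let mut creating_locked = creating.lock().unwrap();
    if let Some(existing) = timelines_locked.get(&id) {
        return Err(ExclusionError::AlreadyExists(existing.clone()));
    }
    if !creating_locked.insert(id) {
        return Err(ExclusionError::AlreadyCreating);
    }
    drop(creating_locked);
    drop(timelines_locked);
    Ok(CreateGuard { creating, id })
}

The real code additionally writes the on-disk uninit mark only after taking the in-memory claim, so colliding creations during process runtime are caught in memory before any filesystem write, as noted in the comments in `create_timeline_uninit_mark`.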
@@ -1812,7 +1967,7 @@ impl Tenant { ); *current_state = TenantState::Active; - let elapsed = self.loading_started_at.elapsed(); + let elapsed = self.constructed_at.elapsed(); let total_timelines = timelines_accessor.len(); // log a lot of stuff, because some tenants sometimes suffer from user-visible @@ -1827,7 +1982,7 @@ impl Tenant { "activation attempt finished" ); - TENANT_ACTIVATION.observe(elapsed.as_secs_f64()); + TENANT.activation.observe(elapsed.as_secs_f64()); }); } } @@ -2082,18 +2237,41 @@ impl Tenant { self.state.subscribe() } - pub(crate) async fn wait_to_become_active(&self) -> Result<(), GetActiveTenantError> { + /// The activate_now semaphore is initialized with zero units. As soon as + /// we add a unit, waiters will be able to acquire a unit and proceed. + pub(crate) fn activate_now(&self) { + self.activate_now_sem.add_permits(1); + } + + pub(crate) async fn wait_to_become_active( + &self, + timeout: Duration, + ) -> Result<(), GetActiveTenantError> { let mut receiver = self.state.subscribe(); loop { let current_state = receiver.borrow_and_update().clone(); match current_state { TenantState::Loading | TenantState::Attaching | TenantState::Activating(_) => { // in these states, there's a chance that we can reach ::Active - receiver.changed().await.map_err( - |_e: tokio::sync::watch::error::RecvError| - // Tenant existed but was dropped: report it as non-existent - GetActiveTenantError::NotFound(GetTenantError::NotFound(self.tenant_shard_id.tenant_id)) - )?; + self.activate_now(); + match timeout_cancellable(timeout, &self.cancel, receiver.changed()).await { + Ok(r) => { + r.map_err( + |_e: tokio::sync::watch::error::RecvError| + // Tenant existed but was dropped: report it as non-existent + GetActiveTenantError::NotFound(GetTenantError::NotFound(self.tenant_shard_id.tenant_id)) + )? + } + Err(TimeoutCancellableError::Cancelled) => { + return Err(GetActiveTenantError::Cancelled); + } + Err(TimeoutCancellableError::Timeout) => { + return Err(GetActiveTenantError::WaitForActiveTimeout { + latest_state: Some(self.current_state()), + wait_time: timeout, + }); + } + } } TenantState::Active { .. } => { return Ok(()); @@ -2114,6 +2292,14 @@ impl Tenant { .attach_mode .clone() } + + pub(crate) fn get_tenant_shard_id(&self) -> &TenantShardId { + &self.tenant_shard_id + } + + pub(crate) fn get_generation(&self) -> Generation { + self.generation + } } /// Given a Vec of timelines and their ancestors (timeline_id, ancestor_id), @@ -2252,6 +2438,18 @@ impl Tenant { .or(self.conf.default_tenant_conf.min_resident_size_override) } + pub fn get_heatmap_period(&self) -> Option { + let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let heatmap_period = tenant_conf + .heatmap_period + .unwrap_or(self.conf.default_tenant_conf.heatmap_period); + if heatmap_period.is_zero() { + None + } else { + Some(heatmap_period) + } + } + pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) { self.tenant_conf.write().unwrap().tenant_conf = new_tenant_conf; // Don't hold self.timelines.lock() during the notifies. @@ -2398,9 +2596,10 @@ impl Tenant { conf, // using now here is good enough approximation to catch tenants with really long // activation times. 
- loading_started_at: Instant::now(), + constructed_at: Instant::now(), tenant_conf: Arc::new(RwLock::new(attached_conf)), timelines: Mutex::new(HashMap::new()), + timelines_creating: Mutex::new(HashSet::new()), gc_cs: tokio::sync::Mutex::new(()), walredo_mgr, remote_storage, @@ -2409,6 +2608,7 @@ impl Tenant { cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()), cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)), eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()), + activate_now_sem: tokio::sync::Semaphore::new(0), delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTenantFlow::default())), cancel: CancellationToken::default(), gate: Gate::new(format!("Tenant<{tenant_shard_id}>")), @@ -2792,8 +2992,9 @@ impl Tenant { start_lsn: Option, ctx: &RequestContext, ) -> Result, CreateTimelineError> { + let uninit_mark = self.create_timeline_uninit_mark(dst_id).unwrap(); let tl = self - .branch_timeline_impl(src_timeline, dst_id, start_lsn, ctx) + .branch_timeline_impl(src_timeline, dst_id, start_lsn, uninit_mark, ctx) .await?; tl.set_state(TimelineState::Active); Ok(tl) @@ -2807,9 +3008,10 @@ impl Tenant { src_timeline: &Arc, dst_id: TimelineId, start_lsn: Option, + timeline_uninit_mark: TimelineUninitMark<'_>, ctx: &RequestContext, ) -> Result, CreateTimelineError> { - self.branch_timeline_impl(src_timeline, dst_id, start_lsn, ctx) + self.branch_timeline_impl(src_timeline, dst_id, start_lsn, timeline_uninit_mark, ctx) .await } @@ -2818,13 +3020,14 @@ impl Tenant { src_timeline: &Arc, dst_id: TimelineId, start_lsn: Option, + timeline_uninit_mark: TimelineUninitMark<'_>, _ctx: &RequestContext, ) -> Result, CreateTimelineError> { let src_id = src_timeline.timeline_id; - // First acquire the GC lock so that another task cannot advance the GC - // cutoff in 'gc_info', and make 'start_lsn' invalid, while we are - // creating the branch. + // We will validate our ancestor LSN in this function. Acquire the GC lock so that + // this check cannot race with GC, and the ancestor LSN is guaranteed to remain + // valid while we are creating the branch. let _gc_cs = self.gc_cs.lock().await; // If no start LSN is specified, we branch the new timeline from the source timeline's last record LSN @@ -2834,13 +3037,6 @@ impl Tenant { lsn }); - // Create a placeholder for the new branch. This will error - // out if the new timeline ID is already in use. - let timeline_uninit_mark = { - let timelines = self.timelines.lock().unwrap(); - self.create_timeline_uninit_mark(dst_id, &timelines)? - }; - // Ensure that `start_lsn` is valid, i.e. the LSN is within the PITR // horizon on the source timeline // @@ -2932,21 +3128,38 @@ impl Tenant { Ok(new_timeline) } - /// - run initdb to init temporary instance and get bootstrap data - /// - after initialization completes, tar up the temp dir and upload it to S3. - /// - /// The caller is responsible for activating the returned timeline. - pub(crate) async fn bootstrap_timeline( + /// For unit tests, make this visible so that other modules can directly create timelines + #[cfg(test)] + pub(crate) async fn bootstrap_timeline_test( &self, timeline_id: TimelineId, pg_version: u32, load_existing_initdb: Option, ctx: &RequestContext, ) -> anyhow::Result> { - let timeline_uninit_mark = { - let timelines = self.timelines.lock().unwrap(); - self.create_timeline_uninit_mark(timeline_id, &timelines)? 
- }; + let uninit_mark = self.create_timeline_uninit_mark(timeline_id).unwrap(); + self.bootstrap_timeline( + timeline_id, + pg_version, + load_existing_initdb, + uninit_mark, + ctx, + ) + .await + } + + /// - run initdb to init temporary instance and get bootstrap data + /// - after initialization completes, tar up the temp dir and upload it to S3. + /// + /// The caller is responsible for activating the returned timeline. + async fn bootstrap_timeline( + &self, + timeline_id: TimelineId, + pg_version: u32, + load_existing_initdb: Option, + timeline_uninit_mark: TimelineUninitMark<'_>, + ctx: &RequestContext, + ) -> anyhow::Result> { // create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/` // temporary directory for basebackup files for the given timeline. @@ -2980,6 +3193,7 @@ impl Tenant { storage, &self.tenant_shard_id, &existing_initdb_timeline_id, + &self.cancel, ) .await .context("download initdb tar")?; @@ -3020,6 +3234,7 @@ impl Tenant { &timeline_id, pgdata_zstd.try_clone().await?, tar_zst_size, + &self.cancel, ) .await }, @@ -3027,8 +3242,7 @@ impl Tenant { 3, u32::MAX, "persist_initdb_tar_zst", - // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066) - backoff::Cancel::new(CancellationToken::new(), || unreachable!()), + backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Cancelled")), ) .await?; @@ -3143,11 +3357,11 @@ impl Tenant { /// at 'disk_consistent_lsn'. After any initial data has been imported, call /// `finish_creation` to insert the Timeline into the timelines map and to remove the /// uninit mark file. - async fn prepare_new_timeline( - &self, + async fn prepare_new_timeline<'a>( + &'a self, new_timeline_id: TimelineId, new_metadata: &TimelineMetadata, - uninit_mark: TimelineUninitMark, + uninit_mark: TimelineUninitMark<'a>, start_lsn: Lsn, ancestor: Option>, ) -> anyhow::Result { @@ -3220,23 +3434,38 @@ impl Tenant { fn create_timeline_uninit_mark( &self, timeline_id: TimelineId, - timelines: &MutexGuard>>, - ) -> anyhow::Result { + ) -> Result { let tenant_shard_id = self.tenant_shard_id; - anyhow::ensure!( - timelines.get(&timeline_id).is_none(), - "Timeline {tenant_shard_id}/{timeline_id} already exists in pageserver's memory" - ); - let timeline_path = self.conf.timeline_path(&tenant_shard_id, &timeline_id); - anyhow::ensure!( - !timeline_path.exists(), - "Timeline {timeline_path} already exists, cannot create its uninit mark file", - ); - let uninit_mark_path = self .conf .timeline_uninit_mark_file_path(tenant_shard_id, timeline_id); + let timeline_path = self.conf.timeline_path(&tenant_shard_id, &timeline_id); + + let uninit_mark = TimelineUninitMark::new( + self, + timeline_id, + uninit_mark_path.clone(), + timeline_path.clone(), + )?; + + // At this stage, we have got exclusive access to in-memory state for this timeline ID + // for creation. + // A timeline directory should never exist on disk already: + // - a previous failed creation would have cleaned up after itself + // - a pageserver restart would clean up timeline directories that don't have valid remote state + // + // Therefore it is an unexpected internal error to encounter a timeline directory already existing here, + // this error may indicate a bug in cleanup on failed creations. + if timeline_path.exists() { + return Err(TimelineExclusionError::Other(anyhow::anyhow!( + "Timeline directory already exists! This is a bug." 
+ ))); + } + + // Create the on-disk uninit mark _after_ the in-memory acquisition of the tenant ID: guarantees + // that during process runtime, colliding creations will be caught in-memory without getting + // as far as failing to write a file. fs::OpenOptions::new() .write(true) .create_new(true) @@ -3250,8 +3479,6 @@ impl Tenant { format!("Failed to crate uninit mark for timeline {tenant_shard_id}/{timeline_id}") })?; - let uninit_mark = TimelineUninitMark::new(uninit_mark_path, timeline_path); - Ok(uninit_mark) } @@ -3694,6 +3921,7 @@ pub(crate) mod harness { tenant_conf.evictions_low_residence_duration_metric_threshold, ), gc_feedback: Some(tenant_conf.gc_feedback), + heatmap_period: Some(tenant_conf.heatmap_period), } } } @@ -4000,13 +4228,7 @@ mod tests { .await { Ok(_) => panic!("duplicate timeline creation should fail"), - Err(e) => assert_eq!( - e.to_string(), - format!( - "Timeline {}/{} already exists in pageserver's memory", - tenant.tenant_shard_id, TIMELINE_ID - ) - ), + Err(e) => assert_eq!(e.to_string(), "Already exists".to_string()), } Ok(()) diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 7a454b53d2..25d97f51ce 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -334,6 +334,11 @@ pub struct TenantConf { #[serde(with = "humantime_serde")] pub evictions_low_residence_duration_metric_threshold: Duration, pub gc_feedback: bool, + + /// If non-zero, the period between uploads of a heatmap from attached tenants. This + /// may be disabled if a Tenant will not have secondary locations: only secondary + /// locations will use the heatmap uploaded by attached locations. + pub heatmap_period: Duration, } /// Same as TenantConf, but this struct preserves the information about @@ -414,6 +419,11 @@ pub struct TenantConfOpt { #[serde(skip_serializing_if = "Option::is_none")] #[serde(default)] pub gc_feedback: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(with = "humantime_serde")] + #[serde(default)] + pub heatmap_period: Option, } #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] @@ -482,6 +492,7 @@ impl TenantConfOpt { .evictions_low_residence_duration_metric_threshold .unwrap_or(global_conf.evictions_low_residence_duration_metric_threshold), gc_feedback: self.gc_feedback.unwrap_or(global_conf.gc_feedback), + heatmap_period: self.heatmap_period.unwrap_or(global_conf.heatmap_period), } } } @@ -519,6 +530,7 @@ impl Default for TenantConf { ) .expect("cannot parse default evictions_low_residence_duration_metric_threshold"), gc_feedback: false, + heatmap_period: Duration::ZERO, } } } diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs index acd311ace6..e8491f26db 100644 --- a/pageserver/src/tenant/delete.rs +++ b/pageserver/src/tenant/delete.rs @@ -71,6 +71,7 @@ async fn create_remote_delete_mark( conf: &PageServerConf, remote_storage: &GenericRemoteStorage, tenant_shard_id: &TenantShardId, + cancel: &CancellationToken, ) -> Result<(), DeleteTenantError> { let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?; @@ -87,8 +88,7 @@ async fn create_remote_delete_mark( FAILED_UPLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, "mark_upload", - // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066) - backoff::Cancel::new(CancellationToken::new(), || unreachable!()), + backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled")), ) .await .context("mark_upload")?; @@ -170,6 +170,7 @@ async fn 
remove_tenant_remote_delete_mark( conf: &PageServerConf, remote_storage: Option<&GenericRemoteStorage>, tenant_shard_id: &TenantShardId, + cancel: &CancellationToken, ) -> Result<(), DeleteTenantError> { if let Some(remote_storage) = remote_storage { let path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?; @@ -179,8 +180,7 @@ async fn remove_tenant_remote_delete_mark( FAILED_UPLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, "remove_tenant_remote_delete_mark", - // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066) - backoff::Cancel::new(CancellationToken::new(), || unreachable!()), + backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled")), ) .await .context("remove_tenant_remote_delete_mark")?; @@ -322,9 +322,15 @@ impl DeleteTenantFlow { // Though sounds scary, different mark name? // Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state. if let Some(remote_storage) = &remote_storage { - create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id) - .await - .context("remote_mark")? + create_remote_delete_mark( + conf, + remote_storage, + &tenant.tenant_shard_id, + // Can't use tenant.cancel, it's already shut down. TODO: wire in an appropriate token + &CancellationToken::new(), + ) + .await + .context("remote_mark")? } fail::fail_point!("tenant-delete-before-create-local-mark", |_| { @@ -524,8 +530,14 @@ impl DeleteTenantFlow { .context("timelines dir not empty")?; } - remove_tenant_remote_delete_mark(conf, remote_storage.as_ref(), &tenant.tenant_shard_id) - .await?; + remove_tenant_remote_delete_mark( + conf, + remote_storage.as_ref(), + &tenant.tenant_shard_id, + // Can't use tenant.cancel, it's already shut down. TODO: wire in an appropriate token + &CancellationToken::new(), + ) + .await?; fail::fail_point!("tenant-delete-before-cleanup-remaining-fs-traces", |_| { Err(anyhow::anyhow!( diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 4d7bd4259f..b2f14db9f7 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -28,7 +28,7 @@ use crate::control_plane_client::{ ControlPlaneClient, ControlPlaneGenerationsApi, RetryForeverError, }; use crate::deletion_queue::DeletionQueueClient; -use crate::metrics::TENANT_MANAGER as METRICS; +use crate::metrics::{TENANT, TENANT_MANAGER as METRICS}; use crate::task_mgr::{self, TaskKind}; use crate::tenant::config::{ AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, TenantConfOpt, @@ -44,7 +44,6 @@ use utils::generation::Generation; use utils::id::{TenantId, TimelineId}; use super::delete::DeleteTenantError; -use super::timeline::delete::DeleteTimelineFlow; use super::TenantSharedResources; /// For a tenant that appears in TenantsMap, it may either be @@ -430,6 +429,13 @@ pub async fn init_tenant_mgr( let tenant_generations = init_load_generations(conf, &tenant_configs, &resources, &cancel).await?; + tracing::info!( + "Attaching {} tenants at startup, warming up {} at a time", + tenant_configs.len(), + conf.concurrent_tenant_warmup.initial_permits() + ); + TENANT.startup_scheduled.inc_by(tenant_configs.len() as u64); + // Construct `Tenant` objects and start them running for (tenant_shard_id, location_conf) in tenant_configs { let tenant_dir_path = conf.tenant_path(&tenant_shard_id); @@ -807,6 +813,12 @@ pub(crate) async fn set_new_tenant_config( } impl TenantManager { + /// Convenience function so that anyone with a TenantManager can get at the global configuration, without + /// 
having to pass it around everywhere as a separate object. + pub(crate) fn get_conf(&self) -> &'static PageServerConf { + self.conf + } + /// Gets the attached tenant from the in-memory data, erroring if it's absent, in secondary mode, or is not fitting to the query. /// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants. pub(crate) fn get_attached_tenant_shard( @@ -842,17 +854,6 @@ impl TenantManager { } } - pub(crate) async fn delete_timeline( - &self, - tenant_shard_id: TenantShardId, - timeline_id: TimelineId, - _ctx: &RequestContext, - ) -> Result<(), DeleteTimelineError> { - let tenant = self.get_attached_tenant_shard(tenant_shard_id, true)?; - DeleteTimelineFlow::run(&tenant, timeline_id, false).await?; - Ok(()) - } - #[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))] pub(crate) async fn upsert_location( &self, @@ -1087,6 +1088,20 @@ impl TenantManager { Ok(()) } + + pub(crate) fn get_attached_active_tenant_shards(&self) -> Vec> { + let locked = self.tenants.read().unwrap(); + match &*locked { + TenantsMap::Initializing => Vec::new(), + TenantsMap::Open(map) | TenantsMap::ShuttingDown(map) => map + .values() + .filter_map(|slot| { + slot.get_attached() + .and_then(|t| if t.is_active() { Some(t.clone()) } else { None }) + }) + .collect(), + } + } } #[derive(Debug, thiserror::Error)] @@ -1201,7 +1216,10 @@ pub(crate) async fn get_active_tenant_with_timeout( // Fast path: we don't need to do any async waiting. return Ok(tenant.clone()); } - _ => (WaitFor::Tenant(tenant.clone()), tenant_shard_id), + _ => { + tenant.activate_now(); + (WaitFor::Tenant(tenant.clone()), tenant_shard_id) + } } } Some(TenantSlot::Secondary) => { @@ -1255,28 +1273,10 @@ pub(crate) async fn get_active_tenant_with_timeout( }; tracing::debug!("Waiting for tenant to enter active state..."); - match timeout_cancellable( - deadline.duration_since(Instant::now()), - cancel, - tenant.wait_to_become_active(), - ) - .await - { - Ok(Ok(())) => Ok(tenant), - Ok(Err(e)) => Err(e), - Err(TimeoutCancellableError::Timeout) => { - let latest_state = tenant.current_state(); - if latest_state == TenantState::Active { - Ok(tenant) - } else { - Err(GetActiveTenantError::WaitForActiveTimeout { - latest_state: Some(latest_state), - wait_time: timeout, - }) - } - } - Err(TimeoutCancellableError::Cancelled) => Err(GetActiveTenantError::Cancelled), - } + tenant + .wait_to_become_active(deadline.duration_since(Instant::now())) + .await?; + Ok(tenant) } pub(crate) async fn delete_tenant( diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 3765ff6e7a..52ee8f49ce 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -180,7 +180,7 @@ //! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync //! 
[`Timeline::load_layer_map`]: super::Timeline::load_layer_map -mod download; +pub(crate) mod download; pub mod index; mod upload; @@ -196,10 +196,12 @@ pub(crate) use upload::upload_initdb_dir; use utils::backoff::{ self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, }; +use utils::timeout::{timeout_cancellable, TimeoutCancellableError}; use std::collections::{HashMap, VecDeque}; use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::{Arc, Mutex}; +use std::time::Duration; use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath}; use std::ops::DerefMut; @@ -316,6 +318,47 @@ pub struct RemoteTimelineClient { storage_impl: GenericRemoteStorage, deletion_queue_client: DeletionQueueClient, + + cancel: CancellationToken, +} + +/// This timeout is intended to deal with hangs in lower layers, e.g. stuck TCP flows. It is not +/// intended to be snappy enough for prompt shutdown, as we have a CancellationToken for that. +const UPLOAD_TIMEOUT: Duration = Duration::from_secs(120); +const DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(120); + +/// Wrapper for timeout_cancellable that flattens result and converts TimeoutCancellableError to anyhow. +/// +/// This is a convenience for the various upload functions. In future +/// the anyhow::Error result should be replaced with a more structured type that +/// enables callers to avoid handling shutdown as an error. +async fn upload_cancellable(cancel: &CancellationToken, future: F) -> anyhow::Result<()> +where + F: std::future::Future>, +{ + match timeout_cancellable(UPLOAD_TIMEOUT, cancel, future).await { + Ok(Ok(())) => Ok(()), + Ok(Err(e)) => Err(e), + Err(TimeoutCancellableError::Timeout) => Err(anyhow::anyhow!("Timeout")), + Err(TimeoutCancellableError::Cancelled) => Err(anyhow::anyhow!("Shutting down")), + } +} +/// Wrapper for timeout_cancellable that flattens result and converts TimeoutCancellableError to DownloaDError. 
+async fn download_cancellable( + cancel: &CancellationToken, + future: F, +) -> Result +where + F: std::future::Future>, +{ + match timeout_cancellable(DOWNLOAD_TIMEOUT, cancel, future).await { + Ok(Ok(r)) => Ok(r), + Ok(Err(e)) => Err(e), + Err(TimeoutCancellableError::Timeout) => { + Err(DownloadError::Other(anyhow::anyhow!("Timed out"))) + } + Err(TimeoutCancellableError::Cancelled) => Err(DownloadError::Cancelled), + } } impl RemoteTimelineClient { @@ -351,6 +394,7 @@ impl RemoteTimelineClient { &tenant_shard_id, &timeline_id, )), + cancel: CancellationToken::new(), } } @@ -501,6 +545,7 @@ impl RemoteTimelineClient { &self, layer_file_name: &LayerFileName, layer_metadata: &LayerFileMetadata, + cancel: &CancellationToken, ) -> anyhow::Result { let downloaded_size = { let _unfinished_gauge_guard = self.metrics.call_begin( @@ -517,6 +562,7 @@ impl RemoteTimelineClient { self.timeline_id, layer_file_name, layer_metadata, + cancel, ) .measure_remote_op( self.tenant_shard_id.tenant_id, @@ -971,6 +1017,7 @@ impl RemoteTimelineClient { &self.timeline_id, self.generation, &index_part_with_deleted_at, + &self.cancel, ) }, |_e| false, @@ -980,8 +1027,7 @@ impl RemoteTimelineClient { // when executed as part of tenant deletion this happens in the background 2, "persist_index_part_with_deleted_flag", - // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066) - backoff::Cancel::new(CancellationToken::new(), || unreachable!()), + backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Cancelled")), ) .await?; @@ -1281,6 +1327,7 @@ impl RemoteTimelineClient { path, layer_metadata, self.generation, + &self.cancel, ) .measure_remote_op( self.tenant_shard_id.tenant_id, @@ -1307,6 +1354,7 @@ impl RemoteTimelineClient { &self.timeline_id, self.generation, index_part, + &self.cancel, ) .measure_remote_op( self.tenant_shard_id.tenant_id, @@ -1604,6 +1652,23 @@ impl RemoteTimelineClient { } } } + + pub(crate) fn get_layers_metadata( + &self, + layers: Vec, + ) -> anyhow::Result>> { + let q = self.upload_queue.lock().unwrap(); + let q = match &*q { + UploadQueue::Stopped(_) | UploadQueue::Uninitialized => { + anyhow::bail!("queue is in state {}", q.as_str()) + } + UploadQueue::Initialized(inner) => inner, + }; + + let decorated = layers.into_iter().map(|l| q.latest_files.get(&l).cloned()); + + Ok(decorated.collect()) + } } pub fn remote_timelines_path(tenant_shard_id: &TenantShardId) -> RemotePath { @@ -1659,6 +1724,13 @@ pub fn remote_index_path( .expect("Failed to construct path") } +pub const HEATMAP_BASENAME: &str = "heatmap-v1.json"; + +pub(crate) fn remote_heatmap_path(tenant_shard_id: &TenantShardId) -> RemotePath { + RemotePath::from_string(&format!("tenants/{tenant_shard_id}/{HEATMAP_BASENAME}")) + .expect("Failed to construct path") +} + /// Given the key of an index, parse out the generation part of the name pub fn parse_remote_index_path(path: RemotePath) -> Option { let file_name = match path.get_path().file_name() { @@ -1804,6 +1876,7 @@ mod tests { &self.harness.tenant_shard_id, &TIMELINE_ID, )), + cancel: CancellationToken::new(), }) } diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index ed32c4eed9..d3956163c8 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -5,7 +5,6 @@ use std::collections::HashSet; use std::future::Future; -use std::time::Duration; use anyhow::{anyhow, Context}; use 
camino::{Utf8Path, Utf8PathBuf}; @@ -14,13 +13,17 @@ use tokio::fs::{self, File, OpenOptions}; use tokio::io::{AsyncSeekExt, AsyncWriteExt}; use tokio_util::sync::CancellationToken; use tracing::warn; +use utils::timeout::timeout_cancellable; use utils::{backoff, crashsafe}; use crate::config::PageServerConf; -use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path}; +use crate::tenant::remote_timeline_client::{ + download_cancellable, remote_layer_path, remote_timelines_path, DOWNLOAD_TIMEOUT, +}; use crate::tenant::storage_layer::LayerFileName; use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::Generation; +use crate::virtual_file::on_fatal_io_error; use crate::TEMP_FILE_SUFFIX; use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode}; use utils::crashsafe::path_with_suffix_extension; @@ -32,8 +35,6 @@ use super::{ FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, INITDB_PATH, }; -static MAX_DOWNLOAD_DURATION: Duration = Duration::from_secs(120); - /// /// If 'metadata' is given, we will validate that the downloaded file's size matches that /// in the metadata. (In the future, we might do more cross-checks, like CRC validation) @@ -46,6 +47,7 @@ pub async fn download_layer_file<'a>( timeline_id: TimelineId, layer_file_name: &'a LayerFileName, layer_metadata: &'a LayerFileMetadata, + cancel: &CancellationToken, ) -> Result { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -73,14 +75,18 @@ pub async fn download_layer_file<'a>( // If pageserver crashes the temp file will be deleted on startup and re-downloaded. let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION); + let cancel_inner = cancel.clone(); let (mut destination_file, bytes_amount) = download_retry( || async { let destination_file = tokio::fs::File::create(&temp_file_path) .await .with_context(|| format!("create a destination file for layer '{temp_file_path}'")) .map_err(DownloadError::Other)?; - let download = storage - .download(&remote_path) + + // Cancellation safety: it is safe to cancel this future, because it isn't writing to a local + // file: the write to local file doesn't start until after the request header is returned + // and we start draining the body stream below + let download = download_cancellable(&cancel_inner, storage.download(&remote_path)) .await .with_context(|| { format!( @@ -94,12 +100,33 @@ pub async fn download_layer_file<'a>( let mut reader = tokio_util::io::StreamReader::new(download.download_stream); - let bytes_amount = tokio::time::timeout( - MAX_DOWNLOAD_DURATION, + // Cancellation safety: it is safe to cancel this future because it is writing into a temporary file, + // and we will unlink the temporary file if there is an error. This unlink is important because we + // are in a retry loop, and we wouldn't want to leave behind a rogue write I/O to a file that + // we will imminiently try and write to again. + let bytes_amount: u64 = match timeout_cancellable( + DOWNLOAD_TIMEOUT, + &cancel_inner, tokio::io::copy_buf(&mut reader, &mut destination_file), ) .await - .map_err(|e| DownloadError::Other(anyhow::anyhow!("Timed out {:?}", e)))? + .with_context(|| { + format!( + "download layer at remote path '{remote_path:?}' into file {temp_file_path:?}" + ) + }) + .map_err(DownloadError::Other)? + { + Ok(b) => Ok(b), + Err(e) => { + // Remove incomplete files: on restart Timeline would do this anyway, but we must + // do it here for the retry case. 
+ if let Err(e) = tokio::fs::remove_file(&temp_file_path).await { + on_fatal_io_error(&e, &format!("Removing temporary file {temp_file_path}")); + } + Err(e) + } + } .with_context(|| { format!( "download layer at remote path '{remote_path:?}' into file {temp_file_path:?}" @@ -112,6 +139,7 @@ pub async fn download_layer_file<'a>( Ok((destination_file, bytes_amount)) }, &format!("download {remote_path:?}"), + cancel, ) .await?; @@ -188,8 +216,14 @@ pub async fn list_remote_timelines( anyhow::bail!("storage-sync-list-remote-timelines"); }); + let cancel_inner = cancel.clone(); let listing = download_retry_forever( - || storage.list(Some(&remote_path), ListingMode::WithDelimiter), + || { + download_cancellable( + &cancel_inner, + storage.list(Some(&remote_path), ListingMode::WithDelimiter), + ) + }, &format!("list timelines for {tenant_shard_id}"), cancel, ) @@ -230,9 +264,13 @@ async fn do_download_index_part( let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation); + let cancel_inner = cancel.clone(); let index_part_bytes = download_retry_forever( || async { - let index_part_download = storage.download(&remote_path).await?; + // Cancellation: if is safe to cancel this future because we're just downloading into + // a memory buffer, not touching local disk. + let index_part_download = + download_cancellable(&cancel_inner, storage.download(&remote_path)).await?; let mut index_part_bytes = Vec::new(); let mut stream = std::pin::pin!(index_part_download.download_stream); @@ -347,10 +385,7 @@ pub(super) async fn download_index_part( FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, "listing index_part files", - // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066) - backoff::Cancel::new(CancellationToken::new(), || -> anyhow::Error { - unreachable!() - }), + backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled")), ) .await .map_err(DownloadError::Other)?; @@ -389,6 +424,7 @@ pub(crate) async fn download_initdb_tar_zst( storage: &GenericRemoteStorage, tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, + cancel: &CancellationToken, ) -> Result<(Utf8PathBuf, File), DownloadError> { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -406,6 +442,8 @@ pub(crate) async fn download_initdb_tar_zst( "{INITDB_PATH}.download-{timeline_id}.{TEMP_FILE_SUFFIX}" )); + let cancel_inner = cancel.clone(); + let file = download_retry( || async { let file = OpenOptions::new() @@ -418,10 +456,14 @@ pub(crate) async fn download_initdb_tar_zst( .with_context(|| format!("tempfile creation {temp_path}")) .map_err(DownloadError::Other)?; - let download = storage.download(&remote_path).await?; + let download = + download_cancellable(&cancel_inner, storage.download(&remote_path)).await?; let mut download = tokio_util::io::StreamReader::new(download.download_stream); let mut writer = tokio::io::BufWriter::with_capacity(8 * 1024, file); + // TODO: this consumption of the response body should be subject to timeout + cancellation, but + // not without thinking carefully about how to recover safely from cancelling a write to + // local storage (e.g. 
by writing into a temp file as we do in download_layer) tokio::io::copy_buf(&mut download, &mut writer) .await .with_context(|| format!("download initdb.tar.zst at {remote_path:?}")) @@ -437,6 +479,7 @@ pub(crate) async fn download_initdb_tar_zst( Ok(file) }, &format!("download {remote_path}"), + cancel, ) .await .map_err(|e| { @@ -460,7 +503,11 @@ pub(crate) async fn download_initdb_tar_zst( /// with backoff. /// /// (See similar logic for uploads in `perform_upload_task`) -async fn download_retry(op: O, description: &str) -> Result +async fn download_retry( + op: O, + description: &str, + cancel: &CancellationToken, +) -> Result where O: FnMut() -> F, F: Future>, @@ -471,10 +518,7 @@ where FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, description, - // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066) - backoff::Cancel::new(CancellationToken::new(), || -> DownloadError { - unreachable!() - }), + backoff::Cancel::new(cancel.clone(), || DownloadError::Cancelled), ) .await } diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs index d0744e7c83..11c6956875 100644 --- a/pageserver/src/tenant/remote_timeline_client/upload.rs +++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -4,14 +4,17 @@ use anyhow::{bail, Context}; use camino::Utf8Path; use fail::fail_point; use pageserver_api::shard::TenantShardId; -use std::io::ErrorKind; +use std::io::{ErrorKind, SeekFrom}; use tokio::fs::{self, File}; +use tokio::io::AsyncSeekExt; +use tokio_util::sync::CancellationToken; use super::Generation; use crate::{ config::PageServerConf, tenant::remote_timeline_client::{ index::IndexPart, remote_index_path, remote_initdb_archive_path, remote_path, + upload_cancellable, }, }; use remote_storage::GenericRemoteStorage; @@ -28,6 +31,7 @@ pub(super) async fn upload_index_part<'a>( timeline_id: &TimelineId, generation: Generation, index_part: &'a IndexPart, + cancel: &CancellationToken, ) -> anyhow::Result<()> { tracing::trace!("uploading new index part"); @@ -43,14 +47,16 @@ pub(super) async fn upload_index_part<'a>( let index_part_bytes = bytes::Bytes::from(index_part_bytes); let remote_path = remote_index_path(tenant_shard_id, timeline_id, generation); - storage - .upload_storage_object( + upload_cancellable( + cancel, + storage.upload_storage_object( futures::stream::once(futures::future::ready(Ok(index_part_bytes))), index_part_size, &remote_path, - ) - .await - .with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'")) + ), + ) + .await + .with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'")) } /// Attempts to upload given layer files. 
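Both wrappers introduced above (`upload_cancellable`, `download_cancellable`) lean on `utils::timeout::timeout_cancellable`, whose implementation is not part of this diff. A plausible shape for it, assuming it simply races the future against a timer and the CancellationToken, is:

use std::future::Future;
use std::time::Duration;
use tokio_util::sync::CancellationToken;

#[derive(Debug)]
pub enum TimeoutCancellableError {
    Timeout,
    Cancelled,
}

/// Race `future` against a deadline and a cancellation token; callers map
/// Timeout/Cancelled into their own error type (anyhow, DownloadError, ...).
pub async fn timeout_cancellable<F: Future>(
    duration: Duration,
    cancel: &CancellationToken,
    future: F,
) -> Result<F::Output, TimeoutCancellableError> {
    tokio::select! {
        res = future => Ok(res),
        _ = tokio::time::sleep(duration) => Err(TimeoutCancellableError::Timeout),
        _ = cancel.cancelled() => Err(TimeoutCancellableError::Cancelled),
    }
}

Dropping the raced future this way is only sound when the future is cancellation-safe, which is why the call sites in this patch either download into an in-memory buffer or clean up partially written temp files (and re-seek `initdb_tar_zst` to the start) before retrying.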
@@ -63,6 +69,7 @@ pub(super) async fn upload_timeline_layer<'a>( source_path: &'a Utf8Path, known_metadata: &'a LayerFileMetadata, generation: Generation, + cancel: &CancellationToken, ) -> anyhow::Result<()> { fail_point!("before-upload-layer", |_| { bail!("failpoint before-upload-layer") @@ -106,8 +113,7 @@ pub(super) async fn upload_timeline_layer<'a>( let reader = tokio_util::io::ReaderStream::with_capacity(source_file, super::BUFFER_SIZE); - storage - .upload(reader, fs_size, &storage_path, None) + upload_cancellable(cancel, storage.upload(reader, fs_size, &storage_path, None)) .await .with_context(|| format!("upload layer from local path '{source_path}'"))?; @@ -119,16 +125,22 @@ pub(crate) async fn upload_initdb_dir( storage: &GenericRemoteStorage, tenant_id: &TenantId, timeline_id: &TimelineId, - initdb_tar_zst: File, + mut initdb_tar_zst: File, size: u64, + cancel: &CancellationToken, ) -> anyhow::Result<()> { tracing::trace!("uploading initdb dir"); + // We might have read somewhat into the file already in the prior retry attempt + initdb_tar_zst.seek(SeekFrom::Start(0)).await?; + let file = tokio_util::io::ReaderStream::with_capacity(initdb_tar_zst, super::BUFFER_SIZE); let remote_path = remote_initdb_archive_path(tenant_id, timeline_id); - storage - .upload_storage_object(file, size as usize, &remote_path) - .await - .with_context(|| format!("upload initdb dir for '{tenant_id} / {timeline_id}'")) + upload_cancellable( + cancel, + storage.upload_storage_object(file, size as usize, &remote_path), + ) + .await + .with_context(|| format!("upload initdb dir for '{tenant_id} / {timeline_id}'")) } diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs new file mode 100644 index 0000000000..d25fe56b92 --- /dev/null +++ b/pageserver/src/tenant/secondary.rs @@ -0,0 +1,104 @@ +pub mod heatmap; +mod heatmap_uploader; + +use std::sync::Arc; + +use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}; + +use self::heatmap_uploader::heatmap_uploader_task; + +use super::mgr::TenantManager; + +use pageserver_api::shard::TenantShardId; +use remote_storage::GenericRemoteStorage; + +use tokio_util::sync::CancellationToken; +use utils::completion::Barrier; + +enum UploadCommand { + Upload(TenantShardId), +} + +struct CommandRequest { + payload: T, + response_tx: tokio::sync::oneshot::Sender, +} + +struct CommandResponse { + result: anyhow::Result<()>, +} + +/// The SecondaryController is a pseudo-rpc client for administrative control of secondary mode downloads, +/// and heatmap uploads. This is not a hot data path: it's primarily a hook for tests, +/// where we want to immediately upload/download for a particular tenant. In normal operation +/// uploads & downloads are autonomous and not driven by this interface. 
+pub struct SecondaryController { + upload_req_tx: tokio::sync::mpsc::Sender>, +} + +impl SecondaryController { + async fn dispatch( + &self, + queue: &tokio::sync::mpsc::Sender>, + payload: T, + ) -> anyhow::Result<()> { + let (response_tx, response_rx) = tokio::sync::oneshot::channel(); + + queue + .send(CommandRequest { + payload, + response_tx, + }) + .await + .map_err(|_| anyhow::anyhow!("Receiver shut down"))?; + + let response = response_rx + .await + .map_err(|_| anyhow::anyhow!("Request dropped"))?; + + response.result + } + + pub async fn upload_tenant(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> { + self.dispatch(&self.upload_req_tx, UploadCommand::Upload(tenant_shard_id)) + .await + } +} + +pub fn spawn_tasks( + tenant_manager: Arc, + remote_storage: GenericRemoteStorage, + background_jobs_can_start: Barrier, + cancel: CancellationToken, +) -> SecondaryController { + let (upload_req_tx, upload_req_rx) = + tokio::sync::mpsc::channel::>(16); + + task_mgr::spawn( + BACKGROUND_RUNTIME.handle(), + TaskKind::SecondaryUploads, + None, + None, + "heatmap uploads", + false, + async move { + heatmap_uploader_task( + tenant_manager, + remote_storage, + upload_req_rx, + background_jobs_can_start, + cancel, + ) + .await + }, + ); + + SecondaryController { upload_req_tx } +} + +/// For running with remote storage disabled: a SecondaryController that is connected to nothing. +pub fn null_controller() -> SecondaryController { + let (upload_req_tx, _upload_req_rx) = + tokio::sync::mpsc::channel::>(16); + SecondaryController { upload_req_tx } +} diff --git a/pageserver/src/tenant/secondary/heatmap.rs b/pageserver/src/tenant/secondary/heatmap.rs new file mode 100644 index 0000000000..99aaaeb8c8 --- /dev/null +++ b/pageserver/src/tenant/secondary/heatmap.rs @@ -0,0 +1,64 @@ +use std::time::SystemTime; + +use crate::tenant::{ + remote_timeline_client::index::IndexLayerMetadata, storage_layer::LayerFileName, +}; + +use serde::{Deserialize, Serialize}; +use serde_with::{serde_as, DisplayFromStr, TimestampSeconds}; + +use utils::{generation::Generation, id::TimelineId}; + +#[derive(Serialize, Deserialize)] +pub(super) struct HeatMapTenant { + /// Generation of the attached location that uploaded the heatmap: this is not required + /// for correctness, but acts as a hint to secondary locations in order to detect thrashing + /// in the unlikely event that two attached locations are both uploading conflicting heatmaps. + pub(super) generation: Generation, + + pub(super) timelines: Vec, +} + +#[serde_as] +#[derive(Serialize, Deserialize)] +pub(crate) struct HeatMapTimeline { + #[serde_as(as = "DisplayFromStr")] + pub(super) timeline_id: TimelineId, + + pub(super) layers: Vec, +} + +#[serde_as] +#[derive(Serialize, Deserialize)] +pub(crate) struct HeatMapLayer { + pub(super) name: LayerFileName, + pub(super) metadata: IndexLayerMetadata, + + #[serde_as(as = "TimestampSeconds")] + pub(super) access_time: SystemTime, + // TODO: an actual 'heat' score that would let secondary locations prioritize downloading + // the hottest layers, rather than trying to simply mirror whatever layers are on-disk on the primary. 
+} + +impl HeatMapLayer { + pub(crate) fn new( + name: LayerFileName, + metadata: IndexLayerMetadata, + access_time: SystemTime, + ) -> Self { + Self { + name, + metadata, + access_time, + } + } +} + +impl HeatMapTimeline { + pub(crate) fn new(timeline_id: TimelineId, layers: Vec) -> Self { + Self { + timeline_id, + layers, + } + } +} diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs new file mode 100644 index 0000000000..ece2b93ce1 --- /dev/null +++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs @@ -0,0 +1,582 @@ +use std::{ + collections::HashMap, + sync::{Arc, Weak}, + time::{Duration, Instant}, +}; + +use crate::{ + metrics::SECONDARY_MODE, + tenant::{ + config::AttachmentMode, mgr::TenantManager, remote_timeline_client::remote_heatmap_path, + secondary::CommandResponse, span::debug_assert_current_span_has_tenant_id, Tenant, + }, +}; + +use md5; +use pageserver_api::shard::TenantShardId; +use remote_storage::GenericRemoteStorage; + +use tokio::task::JoinSet; +use tokio_util::sync::CancellationToken; +use tracing::instrument; +use utils::{backoff, completion::Barrier}; + +use super::{heatmap::HeatMapTenant, CommandRequest, UploadCommand}; + +/// Period between heatmap uploader walking Tenants to look for work to do. +/// If any tenants have a heatmap upload period lower than this, it will be adjusted +/// downward to match. +const DEFAULT_SCHEDULING_INTERVAL: Duration = Duration::from_millis(60000); +const MIN_SCHEDULING_INTERVAL: Duration = Duration::from_millis(1000); + +struct WriteInProgress { + barrier: Barrier, +} + +struct UploadPending { + tenant: Arc, + last_digest: Option, +} + +struct WriteComplete { + tenant_shard_id: TenantShardId, + completed_at: Instant, + digest: Option, + next_upload: Option, +} + +/// The heatmap uploader keeps a little bit of per-tenant state, mainly to remember +/// when we last did a write. We only populate this after doing at least one +/// write for a tenant -- this avoids holding state for tenants that have +/// uploads disabled. + +struct UploaderTenantState { + // This Weak only exists to enable culling idle instances of this type + // when the Tenant has been deallocated. + tenant: Weak, + + /// Digest of the serialized heatmap that we last successfully uploaded + /// + /// md5 is generally a bad hash. We use it because it's convenient for interop with AWS S3's ETag, + /// which is also an md5sum. + last_digest: Option, + + /// When the last upload attempt completed (may have been successful or failed) + last_upload: Option, + + /// When should we next do an upload? None means never. + next_upload: Option, +} + +/// This type is owned by a single task ([`heatmap_uploader_task`]) which runs an event +/// handling loop and mutates it as needed: there are no locks here, because that event loop +/// can hold &mut references to this type throughout. +struct HeatmapUploader { + tenant_manager: Arc, + remote_storage: GenericRemoteStorage, + cancel: CancellationToken, + + tenants: HashMap, + + /// Tenants with work to do, for which tasks should be spawned as soon as concurrency + /// limits permit it. + tenants_pending: std::collections::VecDeque, + + /// Tenants for which a task in `tasks` has been spawned. 
+ tenants_uploading: HashMap, + + tasks: JoinSet<()>, + + /// Channel for our child tasks to send results to: we use a channel for results rather than + /// just getting task results via JoinSet because we need the channel's recv() "sleep until something + /// is available" semantic, rather than JoinSet::join_next()'s "sleep until next thing is available _or_ I'm empty" + /// behavior. + task_result_tx: tokio::sync::mpsc::UnboundedSender, + task_result_rx: tokio::sync::mpsc::UnboundedReceiver, + + concurrent_uploads: usize, + + scheduling_interval: Duration, +} + +/// The uploader task runs a loop that periodically wakes up and schedules tasks for +/// tenants that require an upload, or handles any commands that have been sent into +/// `command_queue`. No I/O is done in this loop: that all happens in the tasks we +/// spawn. +/// +/// Scheduling iterations are somewhat infrequent. However, each one will enqueue +/// all tenants that require an upload, and in between scheduling iterations we will +/// continue to spawn new tasks for pending tenants, as our concurrency limit permits. +/// +/// While we take a CancellationToken here, it is subordinate to the CancellationTokens +/// of tenants: i.e. we expect all Tenants to have been shut down before we are shut down, otherwise +/// we might block waiting on a Tenant. +pub(super) async fn heatmap_uploader_task( + tenant_manager: Arc, + remote_storage: GenericRemoteStorage, + mut command_queue: tokio::sync::mpsc::Receiver>, + background_jobs_can_start: Barrier, + cancel: CancellationToken, +) -> anyhow::Result<()> { + let concurrent_uploads = tenant_manager.get_conf().heatmap_upload_concurrency; + + let (result_tx, result_rx) = tokio::sync::mpsc::unbounded_channel(); + + let mut uploader = HeatmapUploader { + tenant_manager, + remote_storage, + cancel: cancel.clone(), + tasks: JoinSet::new(), + tenants: HashMap::new(), + tenants_pending: std::collections::VecDeque::new(), + tenants_uploading: HashMap::new(), + task_result_tx: result_tx, + task_result_rx: result_rx, + concurrent_uploads, + scheduling_interval: DEFAULT_SCHEDULING_INTERVAL, + }; + + tracing::info!("Waiting for background_jobs_can start..."); + background_jobs_can_start.wait().await; + tracing::info!("background_jobs_can is ready, proceeding."); + + while !cancel.is_cancelled() { + // Look for new work: this is relatively expensive because we have to go acquire the lock on + // the tenant manager to retrieve tenants, and then iterate over them to figure out which ones + // require an upload. + uploader.schedule_iteration().await?; + + // Between scheduling iterations, we will: + // - Drain any complete tasks and spawn pending tasks + // - Handle incoming administrative commands + // - Check our cancellation token + let next_scheduling_iteration = Instant::now() + .checked_add(uploader.scheduling_interval) + .unwrap_or_else(|| { + tracing::warn!( + "Scheduling interval invalid ({}s), running immediately!", + uploader.scheduling_interval.as_secs_f64() + ); + Instant::now() + }); + loop { + tokio::select! { + _ = cancel.cancelled() => { + // We do not simply drop the JoinSet, in order to have an orderly shutdown without cancellation. 
+ tracing::info!("Heatmap uploader joining tasks"); + while let Some(_r) = uploader.tasks.join_next().await {}; + tracing::info!("Heatmap uploader terminating"); + + break; + }, + _ = tokio::time::sleep(next_scheduling_iteration.duration_since(Instant::now())) => { + tracing::debug!("heatmap_uploader_task: woke for scheduling interval"); + break;}, + cmd = command_queue.recv() => { + tracing::debug!("heatmap_uploader_task: woke for command queue"); + let cmd = match cmd { + Some(c) =>c, + None => { + // SecondaryController was destroyed, and this has raced with + // our CancellationToken + tracing::info!("Heatmap uploader terminating"); + cancel.cancel(); + break; + } + }; + + let CommandRequest{ + response_tx, + payload + } = cmd; + uploader.handle_command(payload, response_tx); + }, + _ = uploader.process_next_completion() => { + if !cancel.is_cancelled() { + uploader.spawn_pending(); + } + } + } + } + } + + Ok(()) +} + +impl HeatmapUploader { + /// Periodic execution phase: inspect all attached tenants and schedule any work they require. + async fn schedule_iteration(&mut self) -> anyhow::Result<()> { + // Cull any entries in self.tenants whose Arc is gone + self.tenants + .retain(|_k, v| v.tenant.upgrade().is_some() && v.next_upload.is_some()); + + // The priority order of previously scheduled work may be invalidated by current state: drop + // all pending work (it will be re-scheduled if still needed) + self.tenants_pending.clear(); + + // Used a fixed 'now' through the following loop, for efficiency and fairness. + let now = Instant::now(); + + // While iterating over the potentially-long list of tenants, we will periodically yield + // to avoid blocking executor. + const YIELD_ITERATIONS: usize = 1000; + + // Iterate over tenants looking for work to do. + let tenants = self.tenant_manager.get_attached_active_tenant_shards(); + for (i, tenant) in tenants.into_iter().enumerate() { + // Process is shutting down, drop out + if self.cancel.is_cancelled() { + return Ok(()); + } + + // Skip tenants that already have a write in flight + if self + .tenants_uploading + .contains_key(tenant.get_tenant_shard_id()) + { + continue; + } + + self.maybe_schedule_upload(&now, tenant); + + if i + 1 % YIELD_ITERATIONS == 0 { + tokio::task::yield_now().await; + } + } + + // Spawn tasks for as many of our pending tenants as we can. + self.spawn_pending(); + + Ok(()) + } + + /// + /// Cancellation: this method is cancel-safe. + async fn process_next_completion(&mut self) { + match self.task_result_rx.recv().await { + Some(r) => { + self.on_completion(r); + } + None => { + unreachable!("Result sender is stored on Self"); + } + } + } + + /// The 'maybe' refers to the tenant's state: whether it is configured + /// for heatmap uploads at all, and whether sufficient time has passed + /// since the last upload. + fn maybe_schedule_upload(&mut self, now: &Instant, tenant: Arc) { + match tenant.get_heatmap_period() { + None => { + // Heatmaps are disabled for this tenant + return; + } + Some(period) => { + // If any tenant has asked for uploads more frequent than our scheduling interval, + // reduce it to match so that we can keep up. This is mainly useful in testing, where + // we may set rather short intervals. 
+ if period < self.scheduling_interval { + self.scheduling_interval = std::cmp::max(period, MIN_SCHEDULING_INTERVAL); + } + } + } + + // Stale attachments do not upload anything: if we are in this state, there is probably some + // other attachment in mode Single or Multi running on another pageserver, and we don't + // want to thrash and overwrite their heatmap uploads. + if tenant.get_attach_mode() == AttachmentMode::Stale { + return; + } + + // Create an entry in self.tenants if one doesn't already exist: this will later be updated + // with the completion time in on_completion. + let state = self + .tenants + .entry(*tenant.get_tenant_shard_id()) + .or_insert_with(|| UploaderTenantState { + tenant: Arc::downgrade(&tenant), + last_upload: None, + next_upload: Some(Instant::now()), + last_digest: None, + }); + + // Decline to do the upload if insufficient time has passed + if state.next_upload.map(|nu| &nu > now).unwrap_or(false) { + return; + } + + let last_digest = state.last_digest; + self.tenants_pending.push_back(UploadPending { + tenant, + last_digest, + }) + } + + fn spawn_pending(&mut self) { + while !self.tenants_pending.is_empty() + && self.tenants_uploading.len() < self.concurrent_uploads + { + // unwrap: loop condition includes !is_empty() + let pending = self.tenants_pending.pop_front().unwrap(); + self.spawn_upload(pending.tenant, pending.last_digest); + } + } + + fn spawn_upload(&mut self, tenant: Arc, last_digest: Option) { + let remote_storage = self.remote_storage.clone(); + let tenant_shard_id = *tenant.get_tenant_shard_id(); + let (completion, barrier) = utils::completion::channel(); + let result_tx = self.task_result_tx.clone(); + self.tasks.spawn(async move { + // Guard for the barrier in [`WriteInProgress`] + let _completion = completion; + + let started_at = Instant::now(); + let digest = match upload_tenant_heatmap(remote_storage, &tenant, last_digest).await { + Ok(UploadHeatmapOutcome::Uploaded(digest)) => { + let duration = Instant::now().duration_since(started_at); + SECONDARY_MODE + .upload_heatmap_duration + .observe(duration.as_secs_f64()); + SECONDARY_MODE.upload_heatmap.inc(); + Some(digest) + } + Ok(UploadHeatmapOutcome::NoChange | UploadHeatmapOutcome::Skipped) => last_digest, + Err(UploadHeatmapError::Upload(e)) => { + tracing::warn!( + "Failed to upload heatmap for tenant {}: {e:#}", + tenant.get_tenant_shard_id(), + ); + let duration = Instant::now().duration_since(started_at); + SECONDARY_MODE + .upload_heatmap_duration + .observe(duration.as_secs_f64()); + SECONDARY_MODE.upload_heatmap_errors.inc(); + last_digest + } + Err(UploadHeatmapError::Cancelled) => { + tracing::info!("Cancelled heatmap upload, shutting down"); + last_digest + } + }; + + let now = Instant::now(); + let next_upload = tenant + .get_heatmap_period() + .and_then(|period| now.checked_add(period)); + + result_tx + .send(WriteComplete { + tenant_shard_id: *tenant.get_tenant_shard_id(), + completed_at: now, + digest, + next_upload, + }) + .ok(); + }); + + self.tenants_uploading + .insert(tenant_shard_id, WriteInProgress { barrier }); + } + + #[instrument(skip_all, fields(tenant_id=%completion.tenant_shard_id.tenant_id, shard_id=%completion.tenant_shard_id.shard_slug()))] + fn on_completion(&mut self, completion: WriteComplete) { + tracing::debug!("Heatmap upload completed"); + let WriteComplete { + tenant_shard_id, + completed_at, + digest, + next_upload, + } = completion; + self.tenants_uploading.remove(&tenant_shard_id); + use std::collections::hash_map::Entry; + match 
self.tenants.entry(tenant_shard_id) { + Entry::Vacant(_) => { + // Tenant state was dropped, nothing to update. + } + Entry::Occupied(mut entry) => { + entry.get_mut().last_upload = Some(completed_at); + entry.get_mut().last_digest = digest; + entry.get_mut().next_upload = next_upload + } + } + } + + fn handle_command( + &mut self, + command: UploadCommand, + response_tx: tokio::sync::oneshot::Sender, + ) { + match command { + UploadCommand::Upload(tenant_shard_id) => { + // If an upload was ongoing for this tenant, let it finish first. + let barrier = if let Some(writing_state) = + self.tenants_uploading.get(&tenant_shard_id) + { + tracing::info!( + tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), + "Waiting for heatmap write to complete"); + writing_state.barrier.clone() + } else { + // Spawn the upload then immediately wait for it. This will block processing of other commands and + // starting of other background work. + tracing::info!( + tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), + "Starting heatmap write on command"); + let tenant = match self + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id, true) + { + Ok(t) => t, + Err(e) => { + // Drop result of send: we don't care if caller dropped their receiver + drop(response_tx.send(CommandResponse { + result: Err(e.into()), + })); + return; + } + }; + self.spawn_upload(tenant, None); + let writing_state = self + .tenants_uploading + .get(&tenant_shard_id) + .expect("We just inserted this"); + tracing::info!( + tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), + "Waiting for heatmap upload to complete"); + + writing_state.barrier.clone() + }; + + // This task does no I/O: it only listens for a barrier's completion and then + // sends to the command response channel. It is therefore safe to spawn this without + // any gates/task_mgr hooks. + tokio::task::spawn(async move { + barrier.wait().await; + + tracing::info!( + tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), + "Heatmap upload complete"); + + // Drop result of send: we don't care if caller dropped their receiver + drop(response_tx.send(CommandResponse { result: Ok(()) })) + }); + } + } + } +} + +enum UploadHeatmapOutcome { + /// We successfully wrote to remote storage, with this digest. + Uploaded(md5::Digest), + /// We did not upload because the heatmap digest was unchanged since the last upload + NoChange, + /// We skipped the upload for some reason, such as tenant/timeline not ready + Skipped, +} + +#[derive(thiserror::Error, Debug)] +enum UploadHeatmapError { + #[error("Cancelled")] + Cancelled, + + #[error(transparent)] + Upload(#[from] anyhow::Error), +} + +/// The inner upload operation. This will skip if `last_digest` is Some and matches the digest +/// of the object we would have uploaded. +#[instrument(skip_all, fields(tenant_id = %tenant.get_tenant_shard_id().tenant_id, shard_id = %tenant.get_tenant_shard_id().shard_slug()))] +async fn upload_tenant_heatmap( + remote_storage: GenericRemoteStorage, + tenant: &Arc, + last_digest: Option, +) -> Result { + debug_assert_current_span_has_tenant_id(); + + let generation = tenant.get_generation(); + if generation.is_none() { + // We do not expect this: generations were implemented before heatmap uploads. 
However, + // handle it so that we don't have to make the generation in the heatmap an Option<> + // (Generation::none is not serializable) + tracing::warn!("Skipping heatmap upload for tenant with generation==None"); + return Ok(UploadHeatmapOutcome::Skipped); + } + + let mut heatmap = HeatMapTenant { + timelines: Vec::new(), + generation, + }; + let timelines = tenant.timelines.lock().unwrap().clone(); + + let tenant_cancel = tenant.cancel.clone(); + + // Ensure that Tenant::shutdown waits for any upload in flight: this is needed because otherwise + // when we delete a tenant, we might race with an upload in flight and end up leaving a heatmap behind + // in remote storage. + let _guard = match tenant.gate.enter() { + Ok(g) => g, + Err(_) => { + tracing::info!("Skipping heatmap upload for tenant which is shutting down"); + return Err(UploadHeatmapError::Cancelled); + } + }; + + for (timeline_id, timeline) in timelines { + let heatmap_timeline = timeline.generate_heatmap().await; + match heatmap_timeline { + None => { + tracing::debug!( + "Skipping heatmap upload because timeline {timeline_id} is not ready" + ); + return Ok(UploadHeatmapOutcome::Skipped); + } + Some(heatmap_timeline) => { + heatmap.timelines.push(heatmap_timeline); + } + } + } + + // Serialize the heatmap + let bytes = serde_json::to_vec(&heatmap).map_err(|e| anyhow::anyhow!(e))?; + let size = bytes.len(); + + // Drop out early if nothing changed since our last upload + let digest = md5::compute(&bytes); + if Some(digest) == last_digest { + return Ok(UploadHeatmapOutcome::NoChange); + } + + let path = remote_heatmap_path(tenant.get_tenant_shard_id()); + + // Write the heatmap. + tracing::debug!("Uploading {size} byte heatmap to {path}"); + if let Err(e) = backoff::retry( + || async { + let bytes = futures::stream::once(futures::future::ready(Ok(bytes::Bytes::from( + bytes.clone(), + )))); + remote_storage + .upload_storage_object(bytes, size, &path) + .await + }, + |_| false, + 3, + u32::MAX, + "Uploading heatmap", + backoff::Cancel::new(tenant_cancel.clone(), || anyhow::anyhow!("Shutting down")), + ) + .await + { + if tenant_cancel.is_cancelled() { + return Err(UploadHeatmapError::Cancelled); + } else { + return Err(e.into()); + } + } + + tracing::info!("Successfully uploaded {size} byte heatmap to {path}"); + + Ok(UploadHeatmapOutcome::Uploaded(digest)) +} diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 112128ead8..a4b102c314 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -457,6 +457,8 @@ struct LayerInner { /// For loaded layers, this may be some other value if the tenant has undergone /// a shard split since the layer was originally written. 
shard: ShardIndex, + + last_evicted_at: std::sync::Mutex>, } impl std::fmt::Display for LayerInner { @@ -587,6 +589,7 @@ impl LayerInner { consecutive_failures: AtomicUsize::new(0), generation, shard, + last_evicted_at: std::sync::Mutex::default(), } } @@ -722,6 +725,14 @@ impl LayerInner { permit }; + let since_last_eviction = + self.last_evicted_at.lock().unwrap().map(|ts| ts.elapsed()); + if let Some(since_last_eviction) = since_last_eviction { + // FIXME: this will not always be recorded correctly until #6028 (the no + // download needed branch above) + LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction); + } + let res = Arc::new(DownloadedLayer { owner: Arc::downgrade(self), kind: tokio::sync::OnceCell::default(), @@ -851,6 +862,7 @@ impl LayerInner { let result = client.download_layer_file( &this.desc.filename(), &this.metadata(), + &crate::task_mgr::shutdown_token() ) .await; @@ -1117,6 +1129,8 @@ impl LayerInner { // we are still holding the permit, so no new spawn_download_and_wait can happen drop(self.status.send(Status::Evicted)); + *self.last_evicted_at.lock().unwrap() = Some(std::time::Instant::now()); + res } @@ -1421,6 +1435,7 @@ pub(crate) struct LayerImplMetrics { rare_counters: enum_map::EnumMap, inits_cancelled: metrics::core::GenericCounter, + redownload_after: metrics::Histogram, } impl Default for LayerImplMetrics { @@ -1496,6 +1511,26 @@ impl Default for LayerImplMetrics { ) .unwrap(); + let redownload_after = { + let minute = 60.0; + let hour = 60.0 * minute; + metrics::register_histogram!( + "pageserver_layer_redownloaded_after", + "Time between evicting and re-downloading.", + vec![ + 10.0, + 30.0, + minute, + 5.0 * minute, + 15.0 * minute, + 30.0 * minute, + hour, + 12.0 * hour, + ] + ) + .unwrap() + }; + Self { started_evictions, completed_evictions, @@ -1507,6 +1542,7 @@ impl Default for LayerImplMetrics { rare_counters, inits_cancelled, + redownload_after, } } } @@ -1574,6 +1610,10 @@ impl LayerImplMetrics { fn inc_init_cancelled(&self) { self.inits_cancelled.inc() } + + fn record_redownloaded_after(&self, duration: std::time::Duration) { + self.redownload_after.observe(duration.as_secs_f64()) + } } #[derive(enum_map::Enum)] diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index dc23030218..7ff1873eda 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -54,31 +54,18 @@ impl BackgroundLoopKind { } } -pub(crate) enum RateLimitError { - Cancelled, -} - -pub(crate) async fn concurrent_background_tasks_rate_limit( +/// Cancellation safe. +pub(crate) async fn concurrent_background_tasks_rate_limit_permit( loop_kind: BackgroundLoopKind, _ctx: &RequestContext, - cancel: &CancellationToken, -) -> Result { - crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_START_COUNT +) -> impl Drop { + let _guard = crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE .with_label_values(&[loop_kind.as_static_str()]) - .inc(); - scopeguard::defer!( - crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_FINISH_COUNT.with_label_values(&[loop_kind.as_static_str()]).inc(); - ); - tokio::select! 
{ - permit = CONCURRENT_BACKGROUND_TASKS.acquire() => { - match permit { - Ok(permit) => Ok(permit), - Err(_closed) => unreachable!("we never close the semaphore"), - } - }, - _ = cancel.cancelled() => { - Err(RateLimitError::Cancelled) - } + .guard(); + + match CONCURRENT_BACKGROUND_TASKS.acquire().await { + Ok(permit) => permit, + Err(_closed) => unreachable!("we never close the semaphore"), } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 81dbc04793..1e84fa1848 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -29,7 +29,7 @@ use tokio::{ }; use tokio_util::sync::CancellationToken; use tracing::*; -use utils::{id::TenantTimelineId, sync::gate::Gate}; +use utils::sync::gate::Gate; use std::collections::{BinaryHeap, HashMap, HashSet}; use std::ops::{Deref, Range}; @@ -51,7 +51,7 @@ use crate::tenant::storage_layer::{ LayerAccessStatsReset, LayerFileName, ResidentLayer, ValueReconstructResult, ValueReconstructState, }; -use crate::tenant::tasks::{BackgroundLoopKind, RateLimitError}; +use crate::tenant::tasks::BackgroundLoopKind; use crate::tenant::timeline::logical_size::CurrentLogicalSize; use crate::tenant::{ layer_map::{LayerMap, SearchResult}, @@ -98,8 +98,9 @@ use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; use super::config::TenantConf; -use super::remote_timeline_client::index::IndexPart; +use super::remote_timeline_client::index::{IndexLayerMetadata, IndexPart}; use super::remote_timeline_client::RemoteTimelineClient; +use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline}; use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf}; #[derive(Debug, PartialEq, Eq, Clone, Copy)] @@ -377,9 +378,6 @@ pub enum PageReconstructError { #[error(transparent)] Other(#[from] anyhow::Error), - /// The operation would require downloading a layer that is missing locally. 
- NeedsDownload(TenantTimelineId, LayerFileName), - /// The operation was cancelled Cancelled, @@ -408,14 +406,6 @@ impl std::fmt::Debug for PageReconstructError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { match self { Self::Other(err) => err.fmt(f), - Self::NeedsDownload(tenant_timeline_id, layer_file_name) => { - write!( - f, - "layer {}/{} needs download", - tenant_timeline_id, - layer_file_name.file_name() - ) - } Self::Cancelled => write!(f, "cancelled"), Self::AncestorStopping(timeline_id) => { write!(f, "ancestor timeline {timeline_id} is being stopped") @@ -429,14 +419,6 @@ impl std::fmt::Display for PageReconstructError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { match self { Self::Other(err) => err.fmt(f), - Self::NeedsDownload(tenant_timeline_id, layer_file_name) => { - write!( - f, - "layer {}/{} needs download", - tenant_timeline_id, - layer_file_name.file_name() - ) - } Self::Cancelled => write!(f, "cancelled"), Self::AncestorStopping(timeline_id) => { write!(f, "ancestor timeline {timeline_id} is being stopped") @@ -464,6 +446,12 @@ pub(crate) enum CompactFlags { ForceRepartition, } +impl std::fmt::Debug for Timeline { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "Timeline<{}>", self.timeline_id) + } +} + /// Public interface functions impl Timeline { /// Get the LSN where this branch was created @@ -727,19 +715,27 @@ impl Timeline { flags: EnumSet, ctx: &RequestContext, ) -> Result<(), CompactionError> { - let _g = self.compaction_lock.lock().await; + // most likely the cancellation token is from background task, but in tests it could be the + // request task as well. + + let prepare = async move { + let guard = self.compaction_lock.lock().await; + + let permit = super::tasks::concurrent_background_tasks_rate_limit_permit( + BackgroundLoopKind::Compaction, + ctx, + ) + .await; + + (guard, permit) + }; // this wait probably never needs any "long time spent" logging, because we already nag if // compaction task goes over it's period (20s) which is quite often in production. - let _permit = match super::tasks::concurrent_background_tasks_rate_limit( - BackgroundLoopKind::Compaction, - ctx, - cancel, - ) - .await - { - Ok(permit) => permit, - Err(RateLimitError::Cancelled) => return Ok(()), + let (_guard, _permit) = tokio::select! { + tuple = prepare => { tuple }, + _ = self.cancel.cancelled() => return Ok(()), + _ = cancel.cancelled() => return Ok(()), }; let last_record_lsn = self.get_last_record_lsn(); @@ -1118,8 +1114,9 @@ impl Timeline { Ok(Some(true)) } - /// Like [`evict_layer_batch`](Self::evict_layer_batch), but for just one layer. - /// Additional case `Ok(None)` covers the case where the layer could not be found by its `layer_file_name`. + /// Evict just one layer. + /// + /// Returns `Ok(None)` in the case where the layer could not be found by its `layer_file_name`. pub async fn evict_layer(&self, layer_file_name: &str) -> anyhow::Result> { let _gate = self .gate @@ -1130,109 +1127,17 @@ impl Timeline { return Ok(None); }; - let Some(local_layer) = local_layer.keep_resident().await? 
else { - return Ok(Some(false)); - }; - - let local_layer: Layer = local_layer.into(); - - let remote_client = self + let rtc = self .remote_client .as_ref() .ok_or_else(|| anyhow::anyhow!("remote storage not configured; cannot evict"))?; - let results = self - .evict_layer_batch(remote_client, &[local_layer]) - .await?; - assert_eq!(results.len(), 1); - let result: Option> = results.into_iter().next().unwrap(); - match result { - None => anyhow::bail!("task_mgr shutdown requested"), - Some(Ok(())) => Ok(Some(true)), - Some(Err(e)) => Err(anyhow::Error::new(e)), + match local_layer.evict_and_wait(rtc).await { + Ok(()) => Ok(Some(true)), + Err(EvictionError::NotFound) => Ok(Some(false)), + Err(EvictionError::Downloaded) => Ok(Some(false)), } } - - /// Evict a batch of layers. - pub(crate) async fn evict_layers( - &self, - layers_to_evict: &[Layer], - ) -> anyhow::Result>>> { - let _gate = self - .gate - .enter() - .map_err(|_| anyhow::anyhow!("Shutting down"))?; - - let remote_client = self - .remote_client - .as_ref() - .context("timeline must have RemoteTimelineClient")?; - - self.evict_layer_batch(remote_client, layers_to_evict).await - } - - /// Evict multiple layers at once, continuing through errors. - /// - /// The `remote_client` should be this timeline's `self.remote_client`. - /// We make the caller provide it so that they are responsible for handling the case - /// where someone wants to evict the layer but no remote storage is configured. - /// - /// Returns either `Err()` or `Ok(results)` where `results.len() == layers_to_evict.len()`. - /// If `Err()` is returned, no eviction was attempted. - /// Each position of `Ok(results)` corresponds to the layer in `layers_to_evict`. - /// Meaning of each `result[i]`: - /// - `Some(Err(...))` if layer replacement failed for some reason - /// - replacement failed for an expectable reason (e.g., layer removed by GC before we grabbed all locks) - /// - `Some(Ok(()))` if everything went well. - /// - `None` if no eviction attempt was made for the layer because `cancel.is_cancelled() == true`. - async fn evict_layer_batch( - &self, - remote_client: &Arc, - layers_to_evict: &[Layer], - ) -> anyhow::Result>>> { - { - // to avoid racing with detach and delete_timeline - let state = self.current_state(); - anyhow::ensure!( - state == TimelineState::Active, - "timeline is not active but {state:?}" - ); - } - - let mut results = Vec::with_capacity(layers_to_evict.len()); - for _ in 0..layers_to_evict.len() { - results.push(None); - } - - let mut js = tokio::task::JoinSet::new(); - - for (i, l) in layers_to_evict.iter().enumerate() { - js.spawn({ - let l = l.to_owned(); - let remote_client = remote_client.clone(); - async move { (i, l.evict_and_wait(&remote_client).await) } - }); - } - - let join = async { - while let Some(next) = js.join_next().await { - match next { - Ok((i, res)) => results[i] = Some(res), - Err(je) if je.is_cancelled() => unreachable!("not used"), - Err(je) if je.is_panic() => { /* already logged */ } - Err(je) => tracing::error!("unknown JoinError: {je:?}"), - } - } - }; - - tokio::select! { - _ = self.cancel.cancelled() => {}, - _ = join => {} - } - - assert_eq!(results.len(), layers_to_evict.len()); - Ok(results) - } } /// Number of times we will compute partition within a checkpoint distance. 
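Aside on the `compact` change above (and the `gc` change further below in this file): the old `RateLimitError::Cancelled` early return is replaced by racing the lock/permit acquisition against both the timeline's own token and the caller's token with `tokio::select!`. A minimal standalone sketch of that pattern, using hypothetical names rather than the actual pageserver types:

```rust
use tokio::sync::{Mutex, MutexGuard};
use tokio_util::sync::CancellationToken;

/// Illustrative only: acquire an async lock, but give up as soon as either the
/// per-object token or the caller-supplied token is cancelled.
async fn lock_or_cancel<'a, T>(
    lock: &'a Mutex<T>,
    self_cancel: &CancellationToken,
    caller_cancel: &CancellationToken,
) -> Option<MutexGuard<'a, T>> {
    tokio::select! {
        // Lock acquired before any shutdown signal: proceed with the guarded work.
        guard = lock.lock() => Some(guard),
        // Either token fired: callers treat this as "skip the work and return
        // cleanly", which is what `compact` and `gc` do in this diff.
        _ = self_cancel.cancelled() => None,
        _ = caller_cancel.cancelled() => None,
    }
}
```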
@@ -1829,6 +1734,7 @@ impl Timeline { self.current_logical_size.current_size().accuracy(), logical_size::Accuracy::Exact, ); + self.current_logical_size.initialized.add_permits(1); return; }; @@ -1874,6 +1780,11 @@ impl Timeline { cancel: CancellationToken, background_ctx: RequestContext, ) { + scopeguard::defer! { + // Irrespective of the outcome of this operation, we should unblock anyone waiting for it. + self.current_logical_size.initialized.add_permits(1); + } + enum BackgroundCalculationError { Cancelled, Other(anyhow::Error), @@ -1885,22 +1796,22 @@ impl Timeline { let skip_concurrency_limiter = &skip_concurrency_limiter; async move { let cancel = task_mgr::shutdown_token(); - let wait_for_permit = super::tasks::concurrent_background_tasks_rate_limit( + let wait_for_permit = super::tasks::concurrent_background_tasks_rate_limit_permit( BackgroundLoopKind::InitialLogicalSizeCalculation, background_ctx, - &cancel, ); use crate::metrics::initial_logical_size::StartCircumstances; let (_maybe_permit, circumstances) = tokio::select! { - res = wait_for_permit => { - match res { - Ok(permit) => (Some(permit), StartCircumstances::AfterBackgroundTasksRateLimit), - Err(RateLimitError::Cancelled) => { - return Err(BackgroundCalculationError::Cancelled); - } - } + permit = wait_for_permit => { + (Some(permit), StartCircumstances::AfterBackgroundTasksRateLimit) } + _ = self_ref.cancel.cancelled() => { + return Err(BackgroundCalculationError::Cancelled); + } + _ = cancel.cancelled() => { + return Err(BackgroundCalculationError::Cancelled); + }, () = skip_concurrency_limiter.cancelled() => { // Some action that is part of a end user interaction requested logical size // => break out of the rate limit @@ -2165,6 +2076,55 @@ impl Timeline { None } + + /// The timeline heatmap is a hint to secondary locations from the primary location, + /// indicating which layers are currently on-disk on the primary. + /// + /// None is returned if the Timeline is in a state where uploading a heatmap + /// doesn't make sense, such as shutting down or initializing. The caller + /// should treat this as a cue to simply skip doing any heatmap uploading + /// for this timeline. + pub(crate) async fn generate_heatmap(&self) -> Option { + let eviction_info = self.get_local_layers_for_disk_usage_eviction().await; + + let remote_client = match &self.remote_client { + Some(c) => c, + None => return None, + }; + + let layer_file_names = eviction_info + .resident_layers + .iter() + .map(|l| l.layer.layer_desc().filename()) + .collect::>(); + + let decorated = match remote_client.get_layers_metadata(layer_file_names) { + Ok(d) => d, + Err(_) => { + // Getting metadata only fails on Timeline in bad state. + return None; + } + }; + + let heatmap_layers = std::iter::zip( + eviction_info.resident_layers.into_iter(), + decorated.into_iter(), + ) + .filter_map(|(layer, remote_info)| { + remote_info.map(|remote_info| { + HeatMapLayer::new( + layer.layer.layer_desc().filename(), + IndexLayerMetadata::from(remote_info), + layer.last_activity_ts, + ) + }) + }); + + Some(HeatMapTimeline::new( + self.timeline_id, + heatmap_layers.collect(), + )) + } } type TraversalId = String; @@ -3150,6 +3110,32 @@ impl Timeline { Ok(image_layers) } + + /// Wait until the background initial logical size calculation is complete, or + /// this Timeline is shut down. Calling this function will cause the initial + /// logical size calculation to skip waiting for the background jobs barrier. 
+ pub(crate) async fn await_initial_logical_size(self: Arc) { + if let Some(await_bg_cancel) = self + .current_logical_size + .cancel_wait_for_background_loop_concurrency_limit_semaphore + .get() + { + await_bg_cancel.cancel(); + } else { + // We should not wait if we were not able to explicitly instruct + // the logical size cancellation to skip the concurrency limit semaphore. + // TODO: this is an unexpected case. We should restructure so that it + // can't happen. + tracing::info!( + "await_initial_logical_size: can't get semaphore cancel token, skipping" + ); + } + + tokio::select!( + _ = self.current_logical_size.initialized.acquire() => {}, + _ = self.cancel.cancelled() => {} + ) + } } #[derive(Default)] @@ -3906,7 +3892,14 @@ impl Timeline { /// within a layer file. We can only remove the whole file if it's fully /// obsolete. pub(super) async fn gc(&self) -> anyhow::Result { - let _g = self.gc_lock.lock().await; + // this is most likely the background tasks, but it might be the spawned task from + // immediate_gc + let cancel = crate::task_mgr::shutdown_token(); + let _g = tokio::select! { + guard = self.gc_lock.lock() => guard, + _ = self.cancel.cancelled() => return Ok(GcResult::default()), + _ = cancel.cancelled() => return Ok(GcResult::default()), + }; let timer = self.metrics.garbage_collect_histo.start_timer(); fail_point!("before-timeline-gc"); @@ -4605,7 +4598,7 @@ mod tests { .await .unwrap(); - let rc = timeline + let rtc = timeline .remote_client .clone() .expect("just configured this"); @@ -4618,16 +4611,12 @@ mod tests { .expect("should had been resident") .drop_eviction_guard(); - let batch = [layer]; - - let first = async { timeline.evict_layer_batch(&rc, &batch).await.unwrap() }; - let second = async { timeline.evict_layer_batch(&rc, &batch).await.unwrap() }; + let first = async { layer.evict_and_wait(&rtc).await }; + let second = async { layer.evict_and_wait(&rtc).await }; let (first, second) = tokio::join!(first, second); - let (first, second) = (only_one(first), only_one(second)); - - let res = batch[0].keep_resident().await; + let res = layer.keep_resident().await; assert!(matches!(res, Ok(None)), "{res:?}"); match (first, second) { @@ -4648,14 +4637,6 @@ mod tests { RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error) } - fn only_one(mut input: Vec>) -> T { - assert_eq!(1, input.len()); - input - .pop() - .expect("length just checked") - .expect("no cancellation") - } - async fn find_some_layer(timeline: &Timeline) -> Layer { let layers = timeline.layers.read().await; let desc = layers diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 020c5a9e9f..ea5f5f5fa7 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -30,7 +30,7 @@ use crate::{ task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, tenant::{ config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold}, - tasks::{BackgroundLoopKind, RateLimitError}, + tasks::BackgroundLoopKind, timeline::EvictionError, LogicalSizeCalculationCause, Tenant, }, @@ -158,15 +158,15 @@ impl Timeline { ) -> ControlFlow<()> { let now = SystemTime::now(); - let _permit = match crate::tenant::tasks::concurrent_background_tasks_rate_limit( + let acquire_permit = crate::tenant::tasks::concurrent_background_tasks_rate_limit_permit( BackgroundLoopKind::Eviction, ctx, - cancel, - ) - .await - { - Ok(permit) => permit, - Err(RateLimitError::Cancelled) => return ControlFlow::Break(()), + ); + + let 
_permit = tokio::select! { + permit = acquire_permit => permit, + _ = cancel.cancelled() => return ControlFlow::Break(()), + _ = self.cancel.cancelled() => return ControlFlow::Break(()), }; // If we evict layers but keep cached values derived from those layers, then @@ -212,11 +212,21 @@ impl Timeline { // Gather layers for eviction. // NB: all the checks can be invalidated as soon as we release the layer map lock. // We don't want to hold the layer map lock during eviction. + // So, we just need to deal with this. - let candidates: Vec<_> = { + + let remote_client = match self.remote_client.as_ref() { + Some(c) => c, + None => { + error!("no remote storage configured, cannot evict layers"); + return ControlFlow::Continue(()); + } + }; + + let mut js = tokio::task::JoinSet::new(); + { let guard = self.layers.read().await; let layers = guard.layer_map(); - let mut candidates = Vec::new(); for hist_layer in layers.iter_historic_layers() { let hist_layer = guard.get_from_desc(&hist_layer); @@ -262,54 +272,49 @@ impl Timeline { continue; } }; + let layer = guard.drop_eviction_guard(); if no_activity_for > p.threshold { - candidates.push(guard.drop_eviction_guard()) + let remote_client = remote_client.clone(); + // this could cause a lot of allocations in some cases + js.spawn(async move { layer.evict_and_wait(&remote_client).await }); + stats.candidates += 1; } } - candidates - }; - stats.candidates = candidates.len(); - - let remote_client = match self.remote_client.as_ref() { - None => { - error!( - num_candidates = candidates.len(), - "no remote storage configured, cannot evict layers" - ); - return ControlFlow::Continue(()); - } - Some(c) => c, }; - let results = match self.evict_layer_batch(remote_client, &candidates).await { - Err(pre_err) => { - stats.errors += candidates.len(); - error!("could not do any evictions: {pre_err:#}"); - return ControlFlow::Continue(()); + let join_all = async move { + while let Some(next) = js.join_next().await { + match next { + Ok(Ok(())) => stats.evicted += 1, + Ok(Err(EvictionError::NotFound | EvictionError::Downloaded)) => { + stats.not_evictable += 1; + } + Err(je) if je.is_cancelled() => unreachable!("not used"), + Err(je) if je.is_panic() => { + /* already logged */ + stats.errors += 1; + } + Err(je) => tracing::error!("unknown JoinError: {je:?}"), + } } - Ok(results) => results, + stats }; - assert_eq!(results.len(), candidates.len()); - for result in results { - match result { - None => { - stats.skipped_for_shutdown += 1; - } - Some(Ok(())) => { - stats.evicted += 1; - } - Some(Err(EvictionError::NotFound | EvictionError::Downloaded)) => { - stats.not_evictable += 1; + + tokio::select! 
{ + stats = join_all => { + if stats.candidates == stats.not_evictable { + debug!(stats=?stats, "eviction iteration complete"); + } else if stats.errors > 0 || stats.not_evictable > 0 { + warn!(stats=?stats, "eviction iteration complete"); + } else { + info!(stats=?stats, "eviction iteration complete"); } } + _ = cancel.cancelled() => { + // just drop the joinset to "abort" + } } - if stats.candidates == stats.not_evictable { - debug!(stats=?stats, "eviction iteration complete"); - } else if stats.errors > 0 || stats.not_evictable > 0 { - warn!(stats=?stats, "eviction iteration complete"); - } else { - info!(stats=?stats, "eviction iteration complete"); - } + ControlFlow::Continue(()) } diff --git a/pageserver/src/tenant/timeline/logical_size.rs b/pageserver/src/tenant/timeline/logical_size.rs index f2db8c91fc..03bc59ea38 100644 --- a/pageserver/src/tenant/timeline/logical_size.rs +++ b/pageserver/src/tenant/timeline/logical_size.rs @@ -34,6 +34,9 @@ pub(super) struct LogicalSize { pub(crate) cancel_wait_for_background_loop_concurrency_limit_semaphore: OnceCell, + /// Once the initial logical size is initialized, this is notified. + pub(crate) initialized: tokio::sync::Semaphore, + /// Latest Lsn that has its size uncalculated, could be absent for freshly created timelines. pub initial_part_end: Option, @@ -125,6 +128,7 @@ impl LogicalSize { initial_part_end: None, size_added_after_initial: AtomicI64::new(0), did_return_approximate_to_walreceiver: AtomicBool::new(false), + initialized: tokio::sync::Semaphore::new(0), } } @@ -135,6 +139,7 @@ impl LogicalSize { initial_part_end: Some(compute_to), size_added_after_initial: AtomicI64::new(0), did_return_approximate_to_walreceiver: AtomicBool::new(false), + initialized: tokio::sync::Semaphore::new(0), } } diff --git a/pageserver/src/tenant/timeline/uninit.rs b/pageserver/src/tenant/timeline/uninit.rs index 61130f541a..27d6fd9c28 100644 --- a/pageserver/src/tenant/timeline/uninit.rs +++ b/pageserver/src/tenant/timeline/uninit.rs @@ -19,14 +19,14 @@ use super::Timeline; pub struct UninitializedTimeline<'t> { pub(crate) owning_tenant: &'t Tenant, timeline_id: TimelineId, - raw_timeline: Option<(Arc, TimelineUninitMark)>, + raw_timeline: Option<(Arc, TimelineUninitMark<'t>)>, } impl<'t> UninitializedTimeline<'t> { pub(crate) fn new( owning_tenant: &'t Tenant, timeline_id: TimelineId, - raw_timeline: Option<(Arc, TimelineUninitMark)>, + raw_timeline: Option<(Arc, TimelineUninitMark<'t>)>, ) -> Self { Self { owning_tenant, @@ -169,18 +169,55 @@ pub(crate) fn cleanup_timeline_directory(uninit_mark: TimelineUninitMark) { /// /// XXX: it's important to create it near the timeline dir, not inside it to ensure timeline dir gets removed first. #[must_use] -pub(crate) struct TimelineUninitMark { +pub(crate) struct TimelineUninitMark<'t> { + owning_tenant: &'t Tenant, + timeline_id: TimelineId, uninit_mark_deleted: bool, uninit_mark_path: Utf8PathBuf, pub(crate) timeline_path: Utf8PathBuf, } -impl TimelineUninitMark { - pub(crate) fn new(uninit_mark_path: Utf8PathBuf, timeline_path: Utf8PathBuf) -> Self { - Self { - uninit_mark_deleted: false, - uninit_mark_path, - timeline_path, +/// Errors when acquiring exclusive access to a timeline ID for creation +#[derive(thiserror::Error, Debug)] +pub(crate) enum TimelineExclusionError { + #[error("Already exists")] + AlreadyExists(Arc), + #[error("Already creating")] + AlreadyCreating, + + // e.g. 
I/O errors, or some failure deep in postgres initdb + #[error(transparent)] + Other(#[from] anyhow::Error), +} + +impl<'t> TimelineUninitMark<'t> { + pub(crate) fn new( + owning_tenant: &'t Tenant, + timeline_id: TimelineId, + uninit_mark_path: Utf8PathBuf, + timeline_path: Utf8PathBuf, + ) -> Result { + // Lock order: this is the only place we take both locks. During drop() we only + // lock creating_timelines + let timelines = owning_tenant.timelines.lock().unwrap(); + let mut creating_timelines: std::sync::MutexGuard< + '_, + std::collections::HashSet, + > = owning_tenant.timelines_creating.lock().unwrap(); + + if let Some(existing) = timelines.get(&timeline_id) { + Err(TimelineExclusionError::AlreadyExists(existing.clone())) + } else if creating_timelines.contains(&timeline_id) { + Err(TimelineExclusionError::AlreadyCreating) + } else { + creating_timelines.insert(timeline_id); + Ok(Self { + owning_tenant, + timeline_id, + uninit_mark_deleted: false, + uninit_mark_path, + timeline_path, + }) } } @@ -207,7 +244,7 @@ impl TimelineUninitMark { } } -impl Drop for TimelineUninitMark { +impl Drop for TimelineUninitMark<'_> { fn drop(&mut self) { if !self.uninit_mark_deleted { if self.timeline_path.exists() { @@ -226,5 +263,11 @@ impl Drop for TimelineUninitMark { } } } + + self.owning_tenant + .timelines_creating + .lock() + .unwrap() + .remove(&self.timeline_id); } } diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 738216afa5..16b245c488 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -2191,7 +2191,7 @@ mod tests { .load() .await; let tline = tenant - .bootstrap_timeline(TIMELINE_ID, pg_version, None, &ctx) + .bootstrap_timeline_test(TIMELINE_ID, pg_version, None, &ctx) .await .unwrap(); diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 1fa2d5599f..ae4c42bcb1 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -7,6 +7,8 @@ use proxy::console; use proxy::console::provider::AllowedIpsCache; use proxy::console::provider::NodeInfoCache; use proxy::http; +use proxy::rate_limiter::EndpointRateLimiter; +use proxy::rate_limiter::RateBucketInfo; use proxy::rate_limiter::RateLimiterConfig; use proxy::usage_metrics; @@ -14,6 +16,7 @@ use anyhow::bail; use proxy::config::{self, ProxyConfig}; use proxy::serverless; use std::pin::pin; +use std::sync::Arc; use std::{borrow::Cow, net::SocketAddr}; use tokio::net::TcpListener; use tokio::task::JoinSet; @@ -113,8 +116,11 @@ struct ProxyCliArgs { #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] rate_limiter_timeout: tokio::time::Duration, /// Endpoint rate limiter max number of requests per second. - #[clap(long, default_value_t = 300)] - endpoint_rps_limit: u32, + /// + /// Provided in the form '@'. + /// Can be given multiple times for different bucket sizes. + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)] + endpoint_rps_limit: Vec, /// Initial limit for dynamic rate limiter. Makes sense only if `rate_limit_algorithm` is *not* `None`. #[clap(long, default_value_t = 100)] initial_limit: usize, @@ -157,6 +163,8 @@ async fn main() -> anyhow::Result<()> { let proxy_listener = TcpListener::bind(proxy_address).await?; let cancellation_token = CancellationToken::new(); + let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(&config.endpoint_rps_limit)); + // client facing tasks. 
these will exit on error or on cancellation // cancellation returns Ok(()) let mut client_tasks = JoinSet::new(); @@ -164,6 +172,7 @@ async fn main() -> anyhow::Result<()> { config, proxy_listener, cancellation_token.clone(), + endpoint_rate_limiter.clone(), )); // TODO: rename the argument to something like serverless. @@ -177,6 +186,7 @@ async fn main() -> anyhow::Result<()> { config, serverless_listener, cancellation_token.clone(), + endpoint_rate_limiter.clone(), )); } @@ -311,6 +321,10 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { let authentication_config = AuthenticationConfig { scram_protocol_timeout: args.scram_protocol_timeout, }; + + let mut endpoint_rps_limit = args.endpoint_rps_limit.clone(); + RateBucketInfo::validate(&mut endpoint_rps_limit)?; + let config = Box::leak(Box::new(ProxyConfig { tls_config, auth_backend, @@ -320,8 +334,35 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { authentication_config, require_client_ip: args.require_client_ip, disable_ip_check_for_http: args.disable_ip_check_for_http, - endpoint_rps_limit: args.endpoint_rps_limit, + endpoint_rps_limit, })); Ok(config) } + +#[cfg(test)] +mod tests { + use std::time::Duration; + + use clap::Parser; + use proxy::rate_limiter::RateBucketInfo; + + #[test] + fn parse_endpoint_rps_limit() { + let config = super::ProxyCliArgs::parse_from([ + "proxy", + "--endpoint-rps-limit", + "100@1s", + "--endpoint-rps-limit", + "20@30s", + ]); + + assert_eq!( + config.endpoint_rps_limit, + vec![ + RateBucketInfo::new(100, Duration::from_secs(1)), + RateBucketInfo::new(20, Duration::from_secs(30)), + ] + ); + } +} diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 78c56300a5..f5f7270bf4 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -1,9 +1,13 @@ use crate::{ - auth::parse_endpoint_param, cancellation::CancelClosure, console::errors::WakeComputeError, - error::UserFacingError, proxy::neon_option, + auth::parse_endpoint_param, + cancellation::CancelClosure, + console::errors::WakeComputeError, + error::UserFacingError, + proxy::{neon_option, NUM_DB_CONNECTIONS_GAUGE}, }; use futures::{FutureExt, TryFutureExt}; use itertools::Itertools; +use metrics::IntCounterPairGuard; use pq_proto::StartupMessageParams; use std::{io, net::SocketAddr, time::Duration}; use thiserror::Error; @@ -223,6 +227,8 @@ pub struct PostgresConnection { pub params: std::collections::HashMap, /// Query cancellation token. 
pub cancel_closure: CancelClosure, + + _guage: IntCounterPairGuard, } impl ConnCfg { @@ -231,6 +237,7 @@ impl ConnCfg { &self, allow_self_signed_compute: bool, timeout: Duration, + proto: &'static str, ) -> Result { let (socket_addr, stream, host) = self.connect_raw(timeout).await?; @@ -264,6 +271,7 @@ impl ConnCfg { stream, params, cancel_closure, + _guage: NUM_DB_CONNECTIONS_GAUGE.with_label_values(&[proto]).guard(), }; Ok(connection) diff --git a/proxy/src/config.rs b/proxy/src/config.rs index dea446eb22..f932df4058 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,4 +1,4 @@ -use crate::auth; +use crate::{auth, rate_limiter::RateBucketInfo}; use anyhow::{bail, ensure, Context, Ok}; use rustls::{sign, Certificate, PrivateKey}; use sha2::{Digest, Sha256}; @@ -20,7 +20,7 @@ pub struct ProxyConfig { pub authentication_config: AuthenticationConfig, pub require_client_ip: bool, pub disable_ip_check_for_http: bool, - pub endpoint_rps_limit: u32, + pub endpoint_rps_limit: Vec, } #[derive(Debug)] diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index ae8b294841..da65065179 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -17,7 +17,10 @@ use anyhow::{bail, Context}; use async_trait::async_trait; use futures::TryFutureExt; use itertools::Itertools; -use metrics::{exponential_buckets, register_int_counter_vec, IntCounterVec}; +use metrics::{ + exponential_buckets, register_int_counter_pair_vec, register_int_counter_vec, + IntCounterPairVec, IntCounterVec, +}; use once_cell::sync::{Lazy, OnceCell}; use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams}; use prometheus::{ @@ -44,17 +47,10 @@ const RETRY_WAIT_EXPONENT_BASE: f64 = std::f64::consts::SQRT_2; const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; const ERR_PROTO_VIOLATION: &str = "protocol violation"; -pub static NUM_DB_CONNECTIONS_OPENED_COUNTER: Lazy = Lazy::new(|| { - register_int_counter_vec!( +pub static NUM_DB_CONNECTIONS_GAUGE: Lazy = Lazy::new(|| { + register_int_counter_pair_vec!( "proxy_opened_db_connections_total", "Number of opened connections to a database.", - &["protocol"], - ) - .unwrap() -}); - -pub static NUM_DB_CONNECTIONS_CLOSED_COUNTER: Lazy = Lazy::new(|| { - register_int_counter_vec!( "proxy_closed_db_connections_total", "Number of closed connections to a database.", &["protocol"], @@ -62,17 +58,10 @@ pub static NUM_DB_CONNECTIONS_CLOSED_COUNTER: Lazy = Lazy::new(|| .unwrap() }); -pub static NUM_CLIENT_CONNECTION_OPENED_COUNTER: Lazy = Lazy::new(|| { - register_int_counter_vec!( +pub static NUM_CLIENT_CONNECTION_GAUGE: Lazy = Lazy::new(|| { + register_int_counter_pair_vec!( "proxy_opened_client_connections_total", "Number of opened connections from a client.", - &["protocol"], - ) - .unwrap() -}); - -pub static NUM_CLIENT_CONNECTION_CLOSED_COUNTER: Lazy = Lazy::new(|| { - register_int_counter_vec!( "proxy_closed_client_connections_total", "Number of closed connections from a client.", &["protocol"], @@ -80,17 +69,10 @@ pub static NUM_CLIENT_CONNECTION_CLOSED_COUNTER: Lazy = Lazy::new .unwrap() }); -pub static NUM_CONNECTIONS_ACCEPTED_COUNTER: Lazy = Lazy::new(|| { - register_int_counter_vec!( +pub static NUM_CONNECTION_REQUESTS_GAUGE: Lazy = Lazy::new(|| { + register_int_counter_pair_vec!( "proxy_accepted_connections_total", "Number of client connections accepted.", - &["protocol"], - ) - .unwrap() -}); - -pub static NUM_CONNECTIONS_CLOSED_COUNTER: Lazy = Lazy::new(|| { - register_int_counter_vec!( "proxy_closed_connections_total", 
"Number of client connections closed.", &["protocol"], @@ -297,6 +279,7 @@ pub async fn task_main( config: &'static ProxyConfig, listener: tokio::net::TcpListener, cancellation_token: CancellationToken, + endpoint_rate_limiter: Arc, ) -> anyhow::Result<()> { scopeguard::defer! { info!("proxy has shut down"); @@ -308,7 +291,6 @@ pub async fn task_main( let connections = tokio_util::task::task_tracker::TaskTracker::new(); let cancel_map = Arc::new(CancelMap::default()); - let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(config.endpoint_rps_limit)); while let Some(accept_result) = run_until_cancelled(listener.accept(), &cancellation_token).await @@ -428,16 +410,12 @@ pub async fn handle_client( ); let proto = mode.protocol_label(); - NUM_CLIENT_CONNECTION_OPENED_COUNTER + let _client_gauge = NUM_CLIENT_CONNECTION_GAUGE .with_label_values(&[proto]) - .inc(); - NUM_CONNECTIONS_ACCEPTED_COUNTER + .guard(); + let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE .with_label_values(&[proto]) - .inc(); - scopeguard::defer! { - NUM_CLIENT_CONNECTION_CLOSED_COUNTER.with_label_values(&[proto]).inc(); - NUM_CONNECTIONS_CLOSED_COUNTER.with_label_values(&[proto]).inc(); - } + .guard(); let tls = config.tls_config.as_ref(); @@ -584,12 +562,13 @@ pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> compute::ConnCfg async fn connect_to_compute_once( node_info: &console::CachedNodeInfo, timeout: time::Duration, + proto: &'static str, ) -> Result { let allow_self_signed_compute = node_info.allow_self_signed_compute; node_info .config - .connect(allow_self_signed_compute, timeout) + .connect(allow_self_signed_compute, timeout, proto) .await } @@ -610,6 +589,7 @@ pub trait ConnectMechanism { pub struct TcpMechanism<'a> { /// KV-dictionary with PostgreSQL connection params. pub params: &'a StartupMessageParams, + pub proto: &'static str, } #[async_trait] @@ -623,7 +603,7 @@ impl ConnectMechanism for TcpMechanism<'_> { node_info: &console::CachedNodeInfo, timeout: time::Duration, ) -> Result { - connect_to_compute_once(node_info, timeout).await + connect_to_compute_once(node_info, timeout, self.proto).await } fn update_connect_config(&self, config: &mut compute::ConnCfg) { @@ -1028,7 +1008,7 @@ impl Client<'_, S> { let aux = node_info.aux.clone(); let mut node = connect_to_compute( - &TcpMechanism { params }, + &TcpMechanism { params, proto }, node_info, &extra, &creds, @@ -1037,13 +1017,6 @@ impl Client<'_, S> { .or_else(|e| stream.throw_error(e)) .await?; - NUM_DB_CONNECTIONS_OPENED_COUNTER - .with_label_values(&[proto]) - .inc(); - scopeguard::defer! { - NUM_DB_CONNECTIONS_CLOSED_COUNTER.with_label_values(&[proto]).inc(); - } - prepare_client_connection(&node, session, &mut stream).await?; // Before proxy passing, forward to compute whatever data is left in the // PqStream input buffer. 
Normally there is none, but our serverless npm diff --git a/proxy/src/rate_limiter.rs b/proxy/src/rate_limiter.rs index f40b8dbd1c..b26386d159 100644 --- a/proxy/src/rate_limiter.rs +++ b/proxy/src/rate_limiter.rs @@ -3,5 +3,5 @@ mod limit_algorithm; mod limiter; pub use aimd::Aimd; pub use limit_algorithm::{AimdConfig, Fixed, RateLimitAlgorithm, RateLimiterConfig}; -pub use limiter::EndpointRateLimiter; pub use limiter::Limiter; +pub use limiter::{EndpointRateLimiter, RateBucketInfo}; diff --git a/proxy/src/rate_limiter/aimd.rs b/proxy/src/rate_limiter/aimd.rs index c6c532ae53..2c14a54a6c 100644 --- a/proxy/src/rate_limiter/aimd.rs +++ b/proxy/src/rate_limiter/aimd.rs @@ -33,39 +33,6 @@ impl Aimd { min_utilisation_threshold: config.aimd_min_utilisation_threshold, } } - - pub fn decrease_factor(self, factor: f32) -> Self { - assert!((0.5..1.0).contains(&factor)); - Self { - decrease_factor: factor, - ..self - } - } - - pub fn increase_by(self, increase: usize) -> Self { - assert!(increase > 0); - Self { - increase_by: increase, - ..self - } - } - - pub fn with_max_limit(self, max: usize) -> Self { - assert!(max > 0); - Self { - max_limit: max, - ..self - } - } - - /// A threshold below which the limit won't be increased. 0.5 = 50%. - pub fn with_min_utilisation_threshold(self, min_util: f32) -> Self { - assert!(min_util > 0. && min_util < 1.); - Self { - min_utilisation_threshold: min_util, - ..self - } - } } #[async_trait] diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index 9d28bb67b3..8dfdfcd3db 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -1,16 +1,19 @@ use std::{ + collections::hash_map::RandomState, + hash::BuildHasher, sync::{ atomic::{AtomicUsize, Ordering}, - Arc, + Arc, Mutex, }, - time::Duration, }; +use anyhow::bail; use dashmap::DashMap; -use parking_lot::Mutex; +use itertools::Itertools; +use rand::{rngs::StdRng, Rng, SeedableRng}; use smol_str::SmolStr; use tokio::sync::{Mutex as AsyncMutex, Semaphore, SemaphorePermit}; -use tokio::time::{timeout, Instant}; +use tokio::time::{timeout, Duration, Instant}; use tracing::info; use super::{ @@ -29,60 +32,166 @@ use super::{ // saw SNI, before doing TLS handshake. User-side error messages in that case // does not look very nice (`SSL SYSCALL error: Undefined error: 0`), so for now // I went with a more expensive way that yields user-friendlier error messages. -// -// TODO: add a better bucketing here, e.g. not more than 300 requests per second, -// and not more than 1000 requests per 10 seconds, etc. Short bursts of reconnects -// are noramal during redeployments, so we should not block them. 
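The hunks below replace the proxy's single per-second `max_rps` window with a set of fixed-window buckets (`RateBucketInfo`), configured as `rps@interval` strings (for example `300@1s`) and defaulting to 300 rps over 1s, 200 rps over 60s and 100 rps over 600s; a request is admitted only when every bucket still has room, which lets short reconnect bursts through while still capping sustained load. A minimal self-contained sketch of that scheme (std-only; the names here are illustrative, not the proxy's actual types):

use std::time::{Duration, Instant};

// Simplified model of the bucketed scheme introduced in the hunks below.
struct Bucket {
    interval: Duration,
    max_requests_per_interval: u32, // "max_rpi": rps * interval, as in RateBucketInfo::new
    window_start: Instant,
    count: u32,
}

impl Bucket {
    fn new(max_rps: u32, interval: Duration) -> Self {
        Self {
            interval,
            max_requests_per_interval: (max_rps as u128 * interval.as_millis() / 1000) as u32,
            window_start: Instant::now(),
            count: 0,
        }
    }

    fn has_room(&mut self, now: Instant) -> bool {
        if now - self.window_start >= self.interval {
            // fixed window expired: reset and start counting again
            self.window_start = now;
            self.count = 0;
        }
        self.count < self.max_requests_per_interval
    }
}

// Admit a request only if *every* bucket has room, then charge all of them,
// mirroring what EndpointRateLimiter::check does below.
fn check(buckets: &mut [Bucket]) -> bool {
    let now = Instant::now();
    let allowed = buckets.iter_mut().all(|b| b.has_room(now));
    if allowed {
        buckets.iter_mut().for_each(|b| b.count += 1);
    }
    allowed
}

fn main() {
    // Roughly the DEFAULT_SET from the patch: 300 rps over 1s, 200 rps over 60s, 100 rps over 10min.
    let mut buckets = vec![
        Bucket::new(300, Duration::from_secs(1)),
        Bucket::new(200, Duration::from_secs(60)),
        Bucket::new(100, Duration::from_secs(600)),
    ];
    assert!(check(&mut buckets));
}

`RateBucketInfo::validate` in the patch additionally sorts the buckets by interval and rejects configurations where a longer bucket permits fewer requests per bucket than a shorter one.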
-pub struct EndpointRateLimiter { - map: DashMap>>, - max_rps: u32, +pub struct EndpointRateLimiter { + map: DashMap, Hasher>, + info: &'static [RateBucketInfo], access_count: AtomicUsize, + rand: Mutex, +} + +#[derive(Clone, Copy)] +struct RateBucket { + start: Instant, + count: u32, +} + +impl RateBucket { + fn should_allow_request(&mut self, info: &RateBucketInfo, now: Instant) -> bool { + if now - self.start < info.interval { + self.count < info.max_rpi + } else { + // bucket expired, reset + self.count = 0; + self.start = now; + + true + } + } + + fn inc(&mut self) { + self.count += 1; + } +} + +#[derive(Clone, Copy, PartialEq)] +pub struct RateBucketInfo { + pub interval: Duration, + // requests per interval + pub max_rpi: u32, +} + +impl std::fmt::Display for RateBucketInfo { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let rps = self.max_rpi * 1000 / self.interval.as_millis() as u32; + write!(f, "{rps}@{}", humantime::format_duration(self.interval)) + } +} + +impl std::fmt::Debug for RateBucketInfo { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{self}") + } +} + +impl std::str::FromStr for RateBucketInfo { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + let Some((max_rps, interval)) = s.split_once('@') else { + bail!("invalid rate info") + }; + let max_rps = max_rps.parse()?; + let interval = humantime::parse_duration(interval)?; + Ok(Self::new(max_rps, interval)) + } +} + +impl RateBucketInfo { + pub const DEFAULT_SET: [Self; 3] = [ + Self::new(300, Duration::from_secs(1)), + Self::new(200, Duration::from_secs(60)), + Self::new(100, Duration::from_secs(600)), + ]; + + pub fn validate(info: &mut [Self]) -> anyhow::Result<()> { + info.sort_unstable_by_key(|info| info.interval); + let invalid = info + .iter() + .tuple_windows() + .find(|(a, b)| a.max_rpi > b.max_rpi); + if let Some((a, b)) = invalid { + bail!( + "invalid endpoint RPS limits. {b} allows fewer requests per bucket than {a} ({} vs {})", + b.max_rpi, + a.max_rpi, + ); + } + + Ok(()) + } + + pub const fn new(max_rps: u32, interval: Duration) -> Self { + Self { + interval, + max_rpi: max_rps * interval.as_millis() as u32 / 1000, + } + } } impl EndpointRateLimiter { - pub fn new(max_rps: u32) -> Self { + pub fn new(info: &'static [RateBucketInfo]) -> Self { + Self::new_with_rand_and_hasher(info, StdRng::from_entropy(), RandomState::new()) + } +} + +impl EndpointRateLimiter { + fn new_with_rand_and_hasher(info: &'static [RateBucketInfo], rand: R, hasher: S) -> Self { + info!(buckets = ?info, "endpoint rate limiter"); Self { - map: DashMap::new(), - max_rps, + info, + map: DashMap::with_hasher_and_shard_amount(hasher, 64), access_count: AtomicUsize::new(1), // start from 1 to avoid GC on the first request + rand: Mutex::new(rand), } } /// Check that number of connections to the endpoint is below `max_rps` rps. pub fn check(&self, endpoint: SmolStr) -> bool { - // do GC every 100k requests (worst case memory usage is about 10MB) - if self.access_count.fetch_add(1, Ordering::AcqRel) % 100_000 == 0 { + // do a partial GC every 2k requests. This cleans up ~ 1/64th of the map. 
+ // worst case memory usage is about: + // = 2 * 2048 * 64 * (48B + 72B) + // = 30MB + if self.access_count.fetch_add(1, Ordering::AcqRel) % 2048 == 0 { self.do_gc(); } - let now = chrono::Utc::now().naive_utc().time(); - let entry = self - .map - .entry(endpoint) - .or_insert_with(|| Arc::new(Mutex::new((now, 0)))); - let mut entry = entry.lock(); - let (last_time, count) = *entry; + let now = Instant::now(); + let mut entry = self.map.entry(endpoint).or_insert_with(|| { + vec![ + RateBucket { + start: now, + count: 0, + }; + self.info.len() + ] + }); - if now - last_time < chrono::Duration::seconds(1) { - if count >= self.max_rps { - return false; - } - *entry = (last_time, count + 1); - } else { - *entry = (now, 1); + let should_allow_request = entry + .iter_mut() + .zip(self.info) + .all(|(bucket, info)| bucket.should_allow_request(info, now)); + + if should_allow_request { + // only increment the bucket counts if the request will actually be accepted + entry.iter_mut().for_each(RateBucket::inc); } - true + + should_allow_request } - /// Clean the map. Simple strategy: remove all entries. At worst, we'll - /// double the effective max_rps during the cleanup. But that way deletion - /// does not aquire mutex on each entry access. + /// Clean the map. Simple strategy: remove all entries in a random shard. + /// At worst, we'll double the effective max_rps during the cleanup. + /// But that way deletion does not aquire mutex on each entry access. pub fn do_gc(&self) { info!( "cleaning up endpoint rate limiter, current size = {}", self.map.len() ); - self.map.clear(); + let n = self.map.shards().len(); + // this lock is ok as the periodic cycle of do_gc makes this very unlikely to collide + // (impossible, infact, unless we have 2048 threads) + let shard = self.rand.lock().unwrap().gen_range(0..n); + self.map.shards()[shard].write().clear(); } } @@ -124,7 +233,6 @@ pub struct Token<'t> { #[derive(Debug, Clone, Copy)] pub struct LimiterState { limit: usize, - available: usize, in_flight: usize, } @@ -302,11 +410,7 @@ impl Limiter { pub fn state(&self) -> LimiterState { let limit = self.limits.load(Ordering::Relaxed); let in_flight = self.in_flight.load(Ordering::Relaxed); - LimiterState { - limit, - available: limit.saturating_sub(in_flight), - in_flight, - } + LimiterState { limit, in_flight } } } @@ -319,13 +423,6 @@ impl<'t> Token<'t> { } } - #[cfg(test)] - pub fn set_latency(&mut self, latency: Duration) { - use std::ops::Sub; - - self.start = Instant::now().sub(latency); - } - pub fn forget(&mut self) { if let Some(permit) = self.permit.take() { permit.forget(); @@ -344,10 +441,6 @@ impl LimiterState { pub fn limit(&self) -> usize { self.limit } - /// The amount of concurrency available to use. - pub fn available(&self) -> usize { - self.available - } /// The number of jobs in flight. 
pub fn in_flight(&self) -> usize { self.in_flight @@ -395,12 +488,16 @@ impl reqwest_middleware::Middleware for Limiter { #[cfg(test)] mod tests { - use std::{pin::pin, task::Context, time::Duration}; + use std::{hash::BuildHasherDefault, pin::pin, task::Context, time::Duration}; use futures::{task::noop_waker_ref, Future}; + use rand::SeedableRng; + use rustc_hash::FxHasher; + use smol_str::SmolStr; + use tokio::time; - use super::{Limiter, Outcome}; - use crate::rate_limiter::RateLimitAlgorithm; + use super::{EndpointRateLimiter, Limiter, Outcome}; + use crate::rate_limiter::{RateBucketInfo, RateLimitAlgorithm}; #[tokio::test] async fn it_works() { @@ -509,4 +606,105 @@ mod tests { limiter.release(token1, None).await; limiter.release(token2, None).await; } + + #[test] + fn rate_bucket_rpi() { + let rate_bucket = RateBucketInfo::new(50, Duration::from_secs(5)); + assert_eq!(rate_bucket.max_rpi, 50 * 5); + + let rate_bucket = RateBucketInfo::new(50, Duration::from_millis(500)); + assert_eq!(rate_bucket.max_rpi, 50 / 2); + } + + #[test] + fn rate_bucket_parse() { + let rate_bucket: RateBucketInfo = "100@10s".parse().unwrap(); + assert_eq!(rate_bucket.interval, Duration::from_secs(10)); + assert_eq!(rate_bucket.max_rpi, 100 * 10); + assert_eq!(rate_bucket.to_string(), "100@10s"); + + let rate_bucket: RateBucketInfo = "100@1m".parse().unwrap(); + assert_eq!(rate_bucket.interval, Duration::from_secs(60)); + assert_eq!(rate_bucket.max_rpi, 100 * 60); + assert_eq!(rate_bucket.to_string(), "100@1m"); + } + + #[test] + fn default_rate_buckets() { + let mut defaults = RateBucketInfo::DEFAULT_SET; + RateBucketInfo::validate(&mut defaults[..]).unwrap(); + } + + #[test] + #[should_panic = "invalid endpoint RPS limits. 10@10s allows fewer requests per bucket than 300@1s (100 vs 300)"] + fn rate_buckets_validate() { + let mut rates: Vec = ["300@1s", "10@10s"] + .into_iter() + .map(|s| s.parse().unwrap()) + .collect(); + RateBucketInfo::validate(&mut rates).unwrap(); + } + + #[tokio::test] + async fn test_rate_limits() { + let mut rates: Vec = ["100@1s", "20@30s"] + .into_iter() + .map(|s| s.parse().unwrap()) + .collect(); + RateBucketInfo::validate(&mut rates).unwrap(); + let limiter = EndpointRateLimiter::new(Vec::leak(rates)); + + let endpoint = SmolStr::from("ep-my-endpoint-1234"); + + time::pause(); + + for _ in 0..100 { + assert!(limiter.check(endpoint.clone())); + } + // more connections fail + assert!(!limiter.check(endpoint.clone())); + + // fail even after 500ms as it's in the same bucket + time::advance(time::Duration::from_millis(500)).await; + assert!(!limiter.check(endpoint.clone())); + + // after a full 1s, 100 requests are allowed again + time::advance(time::Duration::from_millis(500)).await; + for _ in 1..6 { + for _ in 0..100 { + assert!(limiter.check(endpoint.clone())); + } + time::advance(time::Duration::from_millis(1000)).await; + } + + // more connections after 600 will exceed the 20rps@30s limit + assert!(!limiter.check(endpoint.clone())); + + // will still fail before the 30 second limit + time::advance(time::Duration::from_millis(30_000 - 6_000 - 1)).await; + assert!(!limiter.check(endpoint.clone())); + + // after the full 30 seconds, 100 requests are allowed again + time::advance(time::Duration::from_millis(1)).await; + for _ in 0..100 { + assert!(limiter.check(endpoint.clone())); + } + } + + #[tokio::test] + async fn test_rate_limits_gc() { + // fixed seeded random/hasher to ensure that the test is not flaky + let rand = rand::rngs::StdRng::from_seed([1; 32]); + let hasher = 
BuildHasherDefault::::default(); + + let limiter = EndpointRateLimiter::new_with_rand_and_hasher( + &RateBucketInfo::DEFAULT_SET, + rand, + hasher, + ); + for i in 0..1_000_000 { + limiter.check(format!("{i}").into()); + } + assert!(limiter.map.len() < 150_000); + } } diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index 92d6e2d851..870e9c1103 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -8,12 +8,13 @@ mod websocket; use anyhow::bail; use hyper::StatusCode; +use metrics::IntCounterPairGuard; pub use reqwest_middleware::{ClientWithMiddleware, Error}; pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; use tokio_util::task::TaskTracker; use crate::protocol2::{ProxyProtocolAccept, WithClientIp}; -use crate::proxy::{NUM_CLIENT_CONNECTION_CLOSED_COUNTER, NUM_CLIENT_CONNECTION_OPENED_COUNTER}; +use crate::proxy::NUM_CLIENT_CONNECTION_GAUGE; use crate::rate_limiter::EndpointRateLimiter; use crate::{cancellation::CancelMap, config::ProxyConfig}; use futures::StreamExt; @@ -38,13 +39,13 @@ pub async fn task_main( config: &'static ProxyConfig, ws_listener: TcpListener, cancellation_token: CancellationToken, + endpoint_rate_limiter: Arc, ) -> anyhow::Result<()> { scopeguard::defer! { info!("websocket server has shut down"); } let conn_pool = conn_pool::GlobalConnPool::new(config); - let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(config.endpoint_rps_limit)); // shutdown the connection pool tokio::spawn({ @@ -149,22 +150,17 @@ pub async fn task_main( struct MetricService { inner: S, + _gauge: IntCounterPairGuard, } impl MetricService { fn new(inner: S) -> MetricService { - NUM_CLIENT_CONNECTION_OPENED_COUNTER - .with_label_values(&["http"]) - .inc(); - MetricService { inner } - } -} - -impl Drop for MetricService { - fn drop(&mut self) { - NUM_CLIENT_CONNECTION_CLOSED_COUNTER - .with_label_values(&["http"]) - .inc(); + MetricService { + inner, + _gauge: NUM_CLIENT_CONNECTION_GAUGE + .with_label_values(&["http"]) + .guard(), + } } } @@ -248,7 +244,7 @@ async fn request_handler( .header("Access-Control-Allow-Origin", "*") .header( "Access-Control-Allow-Headers", - "Neon-Connection-String, Neon-Raw-Text-Output, Neon-Array-Mode, Neon-Pool-Opt-In", + "Neon-Connection-String, Neon-Raw-Text-Output, Neon-Array-Mode, Neon-Pool-Opt-In, Neon-Batch-Read-Only, Neon-Batch-Isolation-Level", ) .header("Access-Control-Max-Age", "86400" /* 24 hours */) .status(StatusCode::OK) // 204 is also valid, but see: https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods/OPTIONS#status_code diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 4f3b31b9be..69198d79d3 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -24,10 +24,7 @@ use tokio_postgres::{AsyncMessage, ReadyForQueryStatus}; use crate::{ auth::{self, backend::ComputeUserInfo, check_peer_addr_is_in_list}, console, - proxy::{ - neon_options, LatencyTimer, NUM_DB_CONNECTIONS_CLOSED_COUNTER, - NUM_DB_CONNECTIONS_OPENED_COUNTER, - }, + proxy::{neon_options, LatencyTimer, NUM_DB_CONNECTIONS_GAUGE}, usage_metrics::{Ids, MetricCounter, USAGE_METRICS}, }; use crate::{compute, config}; @@ -477,6 +474,11 @@ async fn connect_to_compute_once( .connect_timeout(timeout) .connect(tokio_postgres::NoTls) .await?; + + let conn_gauge = NUM_DB_CONNECTIONS_GAUGE + .with_label_values(&["http"]) + .guard(); + tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id())); let (tx, mut rx) = 
tokio::sync::watch::channel(session); @@ -492,10 +494,7 @@ async fn connect_to_compute_once( tokio::spawn( async move { - NUM_DB_CONNECTIONS_OPENED_COUNTER.with_label_values(&["http"]).inc(); - scopeguard::defer! { - NUM_DB_CONNECTIONS_CLOSED_COUNTER.with_label_values(&["http"]).inc(); - } + let _conn_gauge = conn_gauge; poll_fn(move |cx| { if matches!(rx.has_changed(), Ok(true)) { session = *rx.borrow_and_update(); diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 6e80260193..795ba819c1 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -29,7 +29,7 @@ use utils::http::error::ApiError; use utils::http::json::json_response; use crate::config::HttpConfig; -use crate::proxy::{NUM_CONNECTIONS_ACCEPTED_COUNTER, NUM_CONNECTIONS_CLOSED_COUNTER}; +use crate::proxy::NUM_CONNECTION_REQUESTS_GAUGE; use super::conn_pool::ConnInfo; use super::conn_pool::GlobalConnPool; @@ -303,12 +303,9 @@ async fn handle_inner( session_id: uuid::Uuid, peer_addr: IpAddr, ) -> anyhow::Result> { - NUM_CONNECTIONS_ACCEPTED_COUNTER + let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE .with_label_values(&["http"]) - .inc(); - scopeguard::defer! { - NUM_CONNECTIONS_CLOSED_COUNTER.with_label_values(&["http"]).inc(); - } + .guard(); // // Determine the destination and connection params diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index cd6184cdee..071add3bca 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -27,15 +27,15 @@ use sync_wrapper::SyncWrapper; pin_project! { /// This is a wrapper around a [`WebSocketStream`] that /// implements [`AsyncRead`] and [`AsyncWrite`]. - pub struct WebSocketRw { + pub struct WebSocketRw { #[pin] - stream: SyncWrapper>, + stream: SyncWrapper>, bytes: Bytes, } } -impl WebSocketRw { - pub fn new(stream: WebSocketStream) -> Self { +impl WebSocketRw { + pub fn new(stream: WebSocketStream) -> Self { Self { stream: stream.into(), bytes: Bytes::new(), @@ -43,7 +43,7 @@ impl WebSocketRw { } } -impl AsyncWrite for WebSocketRw { +impl AsyncWrite for WebSocketRw { fn poll_write( self: Pin<&mut Self>, cx: &mut Context<'_>, @@ -69,7 +69,7 @@ impl AsyncWrite for WebSocketRw { } } -impl AsyncRead for WebSocketRw { +impl AsyncRead for WebSocketRw { fn poll_read( mut self: Pin<&mut Self>, cx: &mut Context<'_>, @@ -86,7 +86,7 @@ impl AsyncRead for WebSocketRw { } } -impl AsyncBufRead for WebSocketRw { +impl AsyncBufRead for WebSocketRw { fn poll_fill_buf(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { // Please refer to poll_fill_buf's documentation. 
const EOF: Poll> = Poll::Ready(Ok(&[])); @@ -151,3 +151,60 @@ pub async fn serve_websocket( .await?; Ok(()) } + +#[cfg(test)] +mod tests { + use std::pin::pin; + + use futures::{SinkExt, StreamExt}; + use hyper_tungstenite::{ + tungstenite::{protocol::Role, Message}, + WebSocketStream, + }; + use tokio::{ + io::{duplex, AsyncReadExt, AsyncWriteExt}, + task::JoinSet, + }; + + use super::WebSocketRw; + + #[tokio::test] + async fn websocket_stream_wrapper_happy_path() { + let (stream1, stream2) = duplex(1024); + + let mut js = JoinSet::new(); + + js.spawn(async move { + let mut client = WebSocketStream::from_raw_socket(stream1, Role::Client, None).await; + + client + .send(Message::Binary(b"hello world".to_vec())) + .await + .unwrap(); + + let message = client.next().await.unwrap().unwrap(); + assert_eq!(message, Message::Binary(b"websockets are cool".to_vec())); + + client.close(None).await.unwrap(); + }); + + js.spawn(async move { + let mut rw = pin!(WebSocketRw::new( + WebSocketStream::from_raw_socket(stream2, Role::Server, None).await + )); + + let mut buf = vec![0; 1024]; + let n = rw.read(&mut buf).await.unwrap(); + assert_eq!(&buf[..n], b"hello world"); + + rw.write_all(b"websockets are cool").await.unwrap(); + rw.flush().await.unwrap(); + + let n = rw.read_to_end(&mut buf).await.unwrap(); + assert_eq!(n, 0); + }); + + js.join_next().await.unwrap().unwrap(); + js.join_next().await.unwrap().unwrap(); + } +} diff --git a/s3_scrubber/Cargo.toml b/s3_scrubber/Cargo.toml index e26f2c6d6b..fdae378d55 100644 --- a/s3_scrubber/Cargo.toml +++ b/s3_scrubber/Cargo.toml @@ -31,6 +31,7 @@ reqwest = { workspace = true, default-features = false, features = ["rustls-tls" aws-config = { workspace = true, default-features = false, features = ["rustls", "sso"] } pageserver = { path = "../pageserver" } +pageserver_api = { path = "../libs/pageserver_api" } remote_storage = { path = "../libs/remote_storage" } tracing.workspace = true diff --git a/s3_scrubber/src/checks.rs b/s3_scrubber/src/checks.rs index a15a908212..2acbb2352b 100644 --- a/s3_scrubber/src/checks.rs +++ b/s3_scrubber/src/checks.rs @@ -7,13 +7,12 @@ use utils::generation::Generation; use crate::cloud_admin_api::BranchData; use crate::metadata_stream::stream_listing; -use crate::{download_object_with_retries, RootTarget}; +use crate::{download_object_with_retries, RootTarget, TenantShardTimelineId}; use futures_util::{pin_mut, StreamExt}; use pageserver::tenant::remote_timeline_client::parse_remote_index_path; use pageserver::tenant::storage_layer::LayerFileName; use pageserver::tenant::IndexPart; use remote_storage::RemotePath; -use utils::id::TenantTimelineId; pub(crate) struct TimelineAnalysis { /// Anomalies detected @@ -39,8 +38,8 @@ impl TimelineAnalysis { } } -pub(crate) async fn branch_cleanup_and_check_errors( - id: &TenantTimelineId, +pub(crate) fn branch_cleanup_and_check_errors( + id: &TenantShardTimelineId, s3_root: &RootTarget, s3_active_branch: Option<&BranchData>, console_branch: Option, @@ -238,7 +237,7 @@ fn parse_layer_object_name(name: &str) -> Result<(LayerFileName, Generation), St pub(crate) async fn list_timeline_blobs( s3_client: &Client, - id: TenantTimelineId, + id: TenantShardTimelineId, s3_root: &RootTarget, ) -> anyhow::Result { let mut s3_layers = HashSet::new(); diff --git a/s3_scrubber/src/garbage.rs b/s3_scrubber/src/garbage.rs index f27e1d7f65..7192afb91b 100644 --- a/s3_scrubber/src/garbage.rs +++ b/s3_scrubber/src/garbage.rs @@ -10,15 +10,16 @@ use aws_sdk_s3::{ Client, }; use futures_util::{pin_mut, 
TryStreamExt}; +use pageserver_api::shard::TenantShardId; use serde::{Deserialize, Serialize}; use tokio_stream::StreamExt; -use utils::id::{TenantId, TenantTimelineId}; +use utils::id::TenantId; use crate::{ cloud_admin_api::{CloudAdminApiClient, MaybeDeleted, ProjectData}, init_remote, metadata_stream::{stream_listing, stream_tenant_timelines, stream_tenants}, - BucketConfig, ConsoleConfig, NodeKind, RootTarget, TraversingDepth, + BucketConfig, ConsoleConfig, NodeKind, RootTarget, TenantShardTimelineId, TraversingDepth, }; #[derive(Serialize, Deserialize, Debug)] @@ -29,8 +30,8 @@ enum GarbageReason { #[derive(Serialize, Deserialize, Debug)] enum GarbageEntity { - Tenant(TenantId), - Timeline(TenantTimelineId), + Tenant(TenantShardId), + Timeline(TenantShardTimelineId), } #[derive(Serialize, Deserialize, Debug)] @@ -142,6 +143,9 @@ async fn find_garbage_inner( console_projects.len() ); + // TODO(sharding): batch calls into Console so that we only call once for each TenantId, + // rather than checking the same TenantId for multiple TenantShardId + // Enumerate Tenants in S3, and check if each one exists in Console tracing::info!("Finding all tenants in bucket {}...", bucket_config.bucket); let tenants = stream_tenants(&s3_client, &target); @@ -149,10 +153,10 @@ async fn find_garbage_inner( let api_client = cloud_admin_api_client.clone(); let console_projects = &console_projects; async move { - match console_projects.get(&t) { + match console_projects.get(&t.tenant_id) { Some(project_data) => Ok((t, Some(project_data.clone()))), None => api_client - .find_tenant_project(t) + .find_tenant_project(t.tenant_id) .await .map_err(|e| anyhow::anyhow!(e)) .map(|r| (t, r)), @@ -166,21 +170,21 @@ async fn find_garbage_inner( // checks if they are enabled by the `depth` parameter. pin_mut!(tenants_checked); let mut garbage = GarbageList::new(node_kind, bucket_config); - let mut active_tenants: Vec = vec![]; + let mut active_tenants: Vec = vec![]; let mut counter = 0; while let Some(result) = tenants_checked.next().await { - let (tenant_id, console_result) = result?; + let (tenant_shard_id, console_result) = result?; // Paranoia check if let Some(project) = &console_result { - assert!(project.tenant == tenant_id); + assert!(project.tenant == tenant_shard_id.tenant_id); } - if garbage.maybe_append(GarbageEntity::Tenant(tenant_id), console_result) { - tracing::debug!("Tenant {tenant_id} is garbage"); + if garbage.maybe_append(GarbageEntity::Tenant(tenant_shard_id), console_result) { + tracing::debug!("Tenant {tenant_shard_id} is garbage"); } else { - tracing::debug!("Tenant {tenant_id} is active"); - active_tenants.push(tenant_id); + tracing::debug!("Tenant {tenant_shard_id} is active"); + active_tenants.push(tenant_shard_id); } counter += 1; @@ -266,13 +270,13 @@ impl std::fmt::Display for PurgeMode { pub async fn get_tenant_objects( s3_client: &Arc, target: RootTarget, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, ) -> anyhow::Result> { - tracing::debug!("Listing objects in tenant {tenant_id}"); + tracing::debug!("Listing objects in tenant {tenant_shard_id}"); // TODO: apply extra validation based on object modification time. Don't purge // tenants where any timeline's index_part.json has been touched recently. - let mut tenant_root = target.tenant_root(&tenant_id); + let mut tenant_root = target.tenant_root(&tenant_shard_id); // Remove delimiter, so that object listing lists all keys in the prefix and not just // common prefixes. 
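From here on the scrubber keys its listings by `TenantShardId` / `TenantShardTimelineId` rather than `TenantId` / `TenantTimelineId`, so several shards of the same tenant can appear as distinct S3 prefixes that all map to one Console project; the `TODO(sharding)` note above is about collapsing those duplicate Console lookups. A small hedged sketch of that idea (`group_by_tenant` is illustrative only and not part of this patch):

use std::collections::HashMap;

use pageserver_api::shard::TenantShardId;
use utils::id::TenantId;

// Hypothetical helper (not in this patch): bucket shard IDs by their parent
// tenant so a Console lookup can be issued once per TenantId and its result
// shared by all of that tenant's shards.
fn group_by_tenant(
    shards: impl IntoIterator<Item = TenantShardId>,
) -> HashMap<TenantId, Vec<TenantShardId>> {
    let mut grouped: HashMap<TenantId, Vec<TenantShardId>> = HashMap::new();
    for shard in shards {
        grouped.entry(shard.tenant_id).or_default().push(shard);
    }
    grouped
}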
@@ -285,7 +289,7 @@ pub async fn get_tenant_objects( pub async fn get_timeline_objects( s3_client: &Arc, target: RootTarget, - ttid: TenantTimelineId, + ttid: TenantShardTimelineId, ) -> anyhow::Result> { tracing::debug!("Listing objects in timeline {ttid}"); let mut timeline_root = target.timeline_root(&ttid); diff --git a/s3_scrubber/src/lib.rs b/s3_scrubber/src/lib.rs index 6607db21e6..d2338c21e5 100644 --- a/s3_scrubber/src/lib.rs +++ b/s3_scrubber/src/lib.rs @@ -22,6 +22,7 @@ use aws_sdk_s3::{Client, Config}; use clap::ValueEnum; use pageserver::tenant::TENANTS_SEGMENT_NAME; +use pageserver_api::shard::TenantShardId; use reqwest::Url; use serde::{Deserialize, Serialize}; use std::io::IsTerminal; @@ -29,7 +30,7 @@ use tokio::io::AsyncReadExt; use tracing::error; use tracing_appender::non_blocking::WorkerGuard; use tracing_subscriber::{fmt, prelude::*, EnvFilter}; -use utils::id::{TenantId, TenantTimelineId}; +use utils::id::TimelineId; const MAX_RETRIES: usize = 20; const CLOUD_ADMIN_API_TOKEN_ENV_VAR: &str = "CLOUD_ADMIN_API_TOKEN"; @@ -44,6 +45,35 @@ pub struct S3Target { pub delimiter: String, } +/// Convenience for referring to timelines within a particular shard: more ergonomic +/// than using a 2-tuple. +/// +/// This is the shard-aware equivalent of TenantTimelineId. It's defined here rather +/// than somewhere more broadly exposed, because this kind of thing is rarely needed +/// in the pageserver, as all timeline objects existing in the scope of a particular +/// tenant: the scrubber is different in that it handles collections of data referring to many +/// TenantShardTimelineIds in on place. +#[derive(Serialize, Deserialize, Debug, Clone, Copy, Hash, PartialEq, Eq)] +pub struct TenantShardTimelineId { + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, +} + +impl TenantShardTimelineId { + fn new(tenant_shard_id: TenantShardId, timeline_id: TimelineId) -> Self { + Self { + tenant_shard_id, + timeline_id, + } + } +} + +impl Display for TenantShardTimelineId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}/{}", self.tenant_shard_id, self.timeline_id) + } +} + #[derive(clap::ValueEnum, Debug, Clone, Copy, PartialEq, Eq)] pub enum TraversingDepth { Tenant, @@ -110,19 +140,19 @@ impl RootTarget { } } - pub fn tenant_root(&self, tenant_id: &TenantId) -> S3Target { + pub fn tenant_root(&self, tenant_id: &TenantShardId) -> S3Target { self.tenants_root().with_sub_segment(&tenant_id.to_string()) } - pub fn timelines_root(&self, tenant_id: &TenantId) -> S3Target { + pub fn timelines_root(&self, tenant_id: &TenantShardId) -> S3Target { match self { Self::Pageserver(_) => self.tenant_root(tenant_id).with_sub_segment("timelines"), Self::Safekeeper(_) => self.tenant_root(tenant_id), } } - pub fn timeline_root(&self, id: &TenantTimelineId) -> S3Target { - self.timelines_root(&id.tenant_id) + pub fn timeline_root(&self, id: &TenantShardTimelineId) -> S3Target { + self.timelines_root(&id.tenant_shard_id) .with_sub_segment(&id.timeline_id.to_string()) } diff --git a/s3_scrubber/src/metadata_stream.rs b/s3_scrubber/src/metadata_stream.rs index 4cfa77cfc1..073f37f319 100644 --- a/s3_scrubber/src/metadata_stream.rs +++ b/s3_scrubber/src/metadata_stream.rs @@ -3,14 +3,15 @@ use async_stream::{stream, try_stream}; use aws_sdk_s3::{types::ObjectIdentifier, Client}; use tokio_stream::Stream; -use crate::{list_objects_with_retries, RootTarget, S3Target, TenantId}; -use utils::id::{TenantTimelineId, TimelineId}; +use crate::{list_objects_with_retries, 
RootTarget, S3Target, TenantShardTimelineId}; +use pageserver_api::shard::TenantShardId; +use utils::id::TimelineId; /// Given an S3 bucket, output a stream of TenantIds discovered via ListObjectsv2 pub fn stream_tenants<'a>( s3_client: &'a Client, target: &'a RootTarget, -) -> impl Stream> + 'a { +) -> impl Stream> + 'a { try_stream! { let mut continuation_token = None; let tenants_target = target.tenants_root(); @@ -44,14 +45,14 @@ pub fn stream_tenants<'a>( } } -/// Given a TenantId, output a stream of the timelines within that tenant, discovered +/// Given a TenantShardId, output a stream of the timelines within that tenant, discovered /// using ListObjectsv2. The listing is done before the stream is built, so that this /// function can be used to generate concurrency on a stream using buffer_unordered. pub async fn stream_tenant_timelines<'a>( s3_client: &'a Client, target: &'a RootTarget, - tenant: TenantId, -) -> anyhow::Result> + 'a> { + tenant: TenantShardId, +) -> anyhow::Result> + 'a> { let mut timeline_ids: Vec> = Vec::new(); let mut continuation_token = None; let timelines_target = target.timelines_root(&tenant); @@ -98,7 +99,7 @@ pub async fn stream_tenant_timelines<'a>( Ok(stream! { for i in timeline_ids { let id = i?; - yield Ok(TenantTimelineId::new(tenant, id)); + yield Ok(TenantShardTimelineId::new(tenant, id)); } }) } diff --git a/s3_scrubber/src/scan_metadata.rs b/s3_scrubber/src/scan_metadata.rs index 228f8d6763..91347ca21b 100644 --- a/s3_scrubber/src/scan_metadata.rs +++ b/s3_scrubber/src/scan_metadata.rs @@ -5,20 +5,19 @@ use crate::checks::{ TimelineAnalysis, }; use crate::metadata_stream::{stream_tenant_timelines, stream_tenants}; -use crate::{init_remote, BucketConfig, NodeKind, RootTarget}; +use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId}; use aws_sdk_s3::Client; use futures_util::{pin_mut, StreamExt, TryStreamExt}; use histogram::Histogram; use pageserver::tenant::IndexPart; use serde::Serialize; -use utils::id::TenantTimelineId; #[derive(Serialize)] pub struct MetadataSummary { count: usize, - with_errors: HashSet, - with_warnings: HashSet, - with_garbage: HashSet, + with_errors: HashSet, + with_warnings: HashSet, + with_garbage: HashSet, indices_by_version: HashMap, layer_count: MinMaxHisto, @@ -132,7 +131,7 @@ impl MetadataSummary { } } - fn update_analysis(&mut self, id: &TenantTimelineId, analysis: &TimelineAnalysis) { + fn update_analysis(&mut self, id: &TenantShardTimelineId, analysis: &TimelineAnalysis) { if !analysis.errors.is_empty() { self.with_errors.insert(*id); } @@ -199,8 +198,8 @@ pub async fn scan_metadata(bucket_config: BucketConfig) -> anyhow::Result anyhow::Result<(TenantTimelineId, S3TimelineBlobData)> { + ttid: TenantShardTimelineId, + ) -> anyhow::Result<(TenantShardTimelineId, S3TimelineBlobData)> { let data = list_timeline_blobs(s3_client, ttid, target).await?; Ok((ttid, data)) } @@ -213,8 +212,7 @@ pub async fn scan_metadata(bucket_config: BucketConfig) -> anyhow::Result postgres_backend::Handler let cmd = parse_cmd(query_string)?; let cmd_str = cmd_to_string(&cmd); - PG_QUERIES_RECEIVED.with_label_values(&[cmd_str]).inc(); - scopeguard::defer! 
{ - PG_QUERIES_FINISHED.with_label_values(&[cmd_str]).inc(); - } + let _guard = PG_QUERIES_GAUGE.with_label_values(&[cmd_str]).guard(); info!("got query {:?}", query_string); diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index 0711beb290..11a3f48922 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -11,7 +11,8 @@ use futures::Future; use metrics::{ core::{AtomicU64, Collector, Desc, GenericCounter, GenericGaugeVec, Opts}, proto::MetricFamily, - register_int_counter, register_int_counter_vec, Gauge, IntCounter, IntCounterVec, IntGaugeVec, + register_int_counter, register_int_counter_pair_vec, register_int_counter_vec, Gauge, + IntCounter, IntCounterPairVec, IntCounterVec, IntGaugeVec, }; use once_cell::sync::Lazy; @@ -89,16 +90,10 @@ pub static BROKER_PULLED_UPDATES: Lazy = Lazy::new(|| { ) .expect("Failed to register safekeeper_broker_pulled_updates_total counter") }); -pub static PG_QUERIES_RECEIVED: Lazy = Lazy::new(|| { - register_int_counter_vec!( +pub static PG_QUERIES_GAUGE: Lazy = Lazy::new(|| { + register_int_counter_pair_vec!( "safekeeper_pg_queries_received_total", "Number of queries received through pg protocol", - &["query"] - ) - .expect("Failed to register safekeeper_pg_queries_received_total counter") -}); -pub static PG_QUERIES_FINISHED: Lazy = Lazy::new(|| { - register_int_counter_vec!( "safekeeper_pg_queries_finished_total", "Number of queries finished through pg protocol", &["query"] diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 4b23650960..42e122cefe 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1870,11 +1870,12 @@ class NeonPageserver(PgProtocol): tenant_id: TenantId, conf: Optional[Dict[str, Any]] = None, auth_token: Optional[str] = None, + generation: Optional[int] = None, ) -> TenantId: + if generation is None: + generation = self.maybe_get_generation(tenant_id) client = self.http_client(auth_token=auth_token) - return client.tenant_create( - tenant_id, conf, generation=self.maybe_get_generation(tenant_id) - ) + return client.tenant_create(tenant_id, conf, generation=generation) def tenant_load(self, tenant_id: TenantId): client = self.http_client() @@ -2944,7 +2945,7 @@ class Safekeeper: tli_dir = self.timeline_dir(tenant_id, timeline_id) segments = [] for _, _, filenames in os.walk(tli_dir): - segments.extend([f for f in filenames if f != "safekeeper.control"]) + segments.extend([f for f in filenames if not f.startswith("safekeeper.control")]) segments.sort() return segments diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index 5c3ae3ce4b..74c6bddf23 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -79,6 +79,9 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = ( # AWS S3 may emit 500 errors for keys in a DeleteObjects response: we retry these # and it is not a failure of our code when it happens. ".*DeleteObjects.*We encountered an internal error. Please try again.*", + # During shutdown, DownloadError::Cancelled may be logged as an error. 
Cleaning this + # up is tracked in https://github.com/neondatabase/neon/issues/6096 + ".*Cancelled, shutting down.*", ) diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index b46ddf5527..eda8813c36 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -322,6 +322,10 @@ class PageserverHttpClient(requests.Session): self.verbose_error(res) return TenantConfig.from_json(res.json()) + def tenant_heatmap_upload(self, tenant_id: TenantId): + res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/heatmap_upload") + self.verbose_error(res) + def set_tenant_config(self, tenant_id: TenantId, config: dict[str, Any]): assert "tenant_id" not in config.keys() res = self.put( diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py index 824531bea4..c0c2383feb 100644 --- a/test_runner/fixtures/remote_storage.py +++ b/test_runner/fixtures/remote_storage.py @@ -16,6 +16,7 @@ from fixtures.log_helper import log from fixtures.types import TenantId, TimelineId TIMELINE_INDEX_PART_FILE_NAME = "index_part.json" +TENANT_HEATMAP_FILE_NAME = "heatmap-v1.json" @enum.unique @@ -133,6 +134,13 @@ class LocalFsStorage: with self.index_path(tenant_id, timeline_id).open("r") as f: return json.load(f) + def heatmap_path(self, tenant_id: TenantId) -> Path: + return self.tenant_path(tenant_id) / TENANT_HEATMAP_FILE_NAME + + def heatmap_content(self, tenant_id): + with self.heatmap_path(tenant_id).open("r") as f: + return json.load(f) + def to_toml_inline_table(self) -> str: rv = { "local_path": str(self.root), diff --git a/test_runner/performance/test_bulk_insert.py b/test_runner/performance/test_bulk_insert.py index a146e011cc..a2a1fa11e5 100644 --- a/test_runner/performance/test_bulk_insert.py +++ b/test_runner/performance/test_bulk_insert.py @@ -55,9 +55,20 @@ def measure_recovery_time(env: NeonCompare): # Delete the Tenant in the pageserver: this will drop local and remote layers, such that # when we "create" the Tenant again, we will replay the WAL from the beginning. + # + # This is a "weird" thing to do, and can confuse the attachment service as we're re-using + # the same tenant ID for a tenant that is logically different from the pageserver's point + # of view, but the same as far as the safekeeper/WAL is concerned. To work around that, + # we will explicitly create the tenant in the same generation that it was previously + # attached in. 
+ assert env.env.attachment_service is not None + attach_status = env.env.attachment_service.inspect(tenant_id=env.tenant) + assert attach_status is not None + (attach_gen, _) = attach_status + client.tenant_delete(env.tenant) wait_tenant_status_404(client, env.tenant, iterations=60, interval=0.5) - env.env.pageserver.tenant_create(tenant_id=env.tenant) + env.env.pageserver.tenant_create(tenant_id=env.tenant, generation=attach_gen) # Measure recovery time with env.record_duration("wal_recovery"): diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 70d386a566..352ec13884 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -52,7 +52,16 @@ def negative_env(neon_env_builder: NeonEnvBuilder) -> Generator[NegativeTests, N TenantId(t["id"]) for t in ps_http.tenant_list() ], "tenant should not be attached after negative test" - env.pageserver.allowed_errors.append(".*Error processing HTTP request: Bad request") + env.pageserver.allowed_errors.extend( + [ + # This fixture detaches the tenant, and tests using it will tend to re-attach it + # shortly after. There may be un-processed deletion_queue validations from the + # initial attachment + ".*Dropped remote consistent LSN updates.*", + # This fixture is for tests that will intentionally generate 400 responses + ".*Error processing HTTP request: Bad request", + ] + ) def log_contains_bad_request(): env.pageserver.log_contains(".*Error processing HTTP request: Bad request") @@ -163,6 +172,7 @@ def test_fully_custom_config(positive_env: NeonEnv): "gc_feedback": True, "gc_horizon": 23 * (1024 * 1024), "gc_period": "2h 13m", + "heatmap_period": "10m", "image_creation_threshold": 7, "pitr_interval": "1m", "lagging_wal_timeout": "23m", diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py index 82ca985d01..9a0b91b54e 100644 --- a/test_runner/regress/test_branching.py +++ b/test_runner/regress/test_branching.py @@ -1,8 +1,7 @@ import random import threading import time -from queue import SimpleQueue -from typing import Any, Dict, List, Union +from typing import List import pytest from fixtures.log_helper import log @@ -239,92 +238,6 @@ def test_cannot_branch_from_non_uploaded_branch(neon_env_builder: NeonEnvBuilder t.join() -def test_competing_branchings_from_loading_race_to_ok_or_err(neon_env_builder: NeonEnvBuilder): - """ - If the activate only after upload is used, then retries could become competing. 
- """ - - env = neon_env_builder.init_configs() - env.start() - - env.pageserver.allowed_errors.extend( - [ - ".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*", - ".*Error processing HTTP request: InternalServerError\\(Timeline .*/.* already exists in pageserver's memory", - ] - ) - ps_http = env.pageserver.http_client() - - # pause all uploads - ps_http.configure_failpoints(("before-upload-index-pausable", "pause")) - env.pageserver.tenant_create(env.initial_tenant) - - def start_creating_timeline(): - ps_http.timeline_create( - env.pg_version, env.initial_tenant, env.initial_timeline, timeout=60 - ) - - create_root = threading.Thread(target=start_creating_timeline) - - branch_id = TimelineId.generate() - - queue: SimpleQueue[Union[Dict[Any, Any], Exception]] = SimpleQueue() - barrier = threading.Barrier(3) - - def try_branch(): - barrier.wait() - barrier.wait() - try: - ret = ps_http.timeline_create( - env.pg_version, - env.initial_tenant, - branch_id, - ancestor_timeline_id=env.initial_timeline, - timeout=5, - ) - queue.put(ret) - except Exception as e: - queue.put(e) - - threads = [threading.Thread(target=try_branch) for _ in range(2)] - - try: - create_root.start() - - for t in threads: - t.start() - - wait_until_paused(env, "before-upload-index-pausable") - - barrier.wait() - ps_http.configure_failpoints(("before-upload-index-pausable", "off")) - barrier.wait() - - # now both requests race to branch, only one can win because they take gc_cs, Tenant::timelines or marker files - first = queue.get() - second = queue.get() - - log.info(first) - log.info(second) - - (succeeded, failed) = (first, second) if isinstance(second, Exception) else (second, first) - assert isinstance(failed, Exception) - assert isinstance(succeeded, Dict) - - # there's multiple valid status codes: - # - Timeline x/y already exists - # - whatever 409 response says, but that is a subclass of PageserverApiException - assert isinstance(failed, PageserverApiException) - assert succeeded["state"] == "Active" - finally: - # we might still have the failpoint active - env.pageserver.stop(immediate=True) - - for t in threads: - t.join() - create_root.join() - - def test_non_uploaded_root_timeline_is_deleted_after_restart(neon_env_builder: NeonEnvBuilder): """ Check that a timeline is deleted locally on subsequent restart if it never successfully uploaded during creation. diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 3f5de100fd..5a9c2782e6 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -273,9 +273,24 @@ def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, r timeline_id = env.initial_timeline pg_version = env.pg_version - shutil.rmtree(repo_dir / "local_fs_remote_storage") + # Delete all files from local_fs_remote_storage except initdb.tar.zst, + # the file is required for `timeline_create` with `existing_initdb_timeline_id`. 
+ # + # TODO: switch to Path.walk() in Python 3.12 + # for dirpath, _dirnames, filenames in (repo_dir / "local_fs_remote_storage").walk(): + for dirpath, _dirnames, filenames in os.walk(repo_dir / "local_fs_remote_storage"): + for filename in filenames: + if filename != "initdb.tar.zst": + (Path(dirpath) / filename).unlink() + timeline_delete_wait_completed(pageserver_http, tenant_id, timeline_id) - pageserver_http.timeline_create(pg_version, tenant_id, timeline_id) + pageserver_http.timeline_create( + pg_version=pg_version, + tenant_id=tenant_id, + new_timeline_id=timeline_id, + existing_initdb_timeline_id=timeline_id, + ) + pg_bin.run_capture( ["pg_dumpall", f"--dbname={connstr}", f"--file={test_output_dir / 'dump-from-wal.sql'}"] ) diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index b14b7f1328..64ade346aa 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -4,7 +4,7 @@ from typing import Any, Dict, Optional import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver -from fixtures.remote_storage import RemoteStorageKind +from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind from fixtures.types import TenantId, TimelineId from fixtures.utils import wait_until from fixtures.workload import Workload @@ -330,3 +330,46 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder): workload.churn_rows(64, pageserver_b.id) workload.validate(pageserver_b.id) + + +def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder): + """ + Test the sequence of location states that are used in a live migration. + """ + env = neon_env_builder.init_start() # initial_tenant_conf=TENANT_CONF) + assert isinstance(env.pageserver_remote_storage, LocalFsStorage) + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + # Write some data so that we have some layers + workload = Workload(env, tenant_id, timeline_id) + workload.init(env.pageservers[0].id) + + # Write some layers and upload a heatmap + workload.write_rows(256, env.pageservers[0].id) + env.pageserver.http_client().tenant_heatmap_upload(tenant_id) + + def validate_heatmap(heatmap): + assert len(heatmap["timelines"]) == 1 + assert heatmap["timelines"][0]["timeline_id"] == str(timeline_id) + assert len(heatmap["timelines"][0]["layers"]) > 0 + layers = heatmap["timelines"][0]["layers"] + + # Each layer appears at most once + assert len(set(layer["name"] for layer in layers)) == len(layers) + + # Download and inspect the heatmap that the pageserver uploaded + heatmap_first = env.pageserver_remote_storage.heatmap_content(tenant_id) + log.info(f"Read back heatmap: {heatmap_first}") + validate_heatmap(heatmap_first) + + # Do some more I/O to generate more layers + workload.churn_rows(64, env.pageservers[0].id) + env.pageserver.http_client().tenant_heatmap_upload(tenant_id) + + # Ensure that another heatmap upload includes the new layers + heatmap_second = env.pageserver_remote_storage.heatmap_content(tenant_id) + log.info(f"Read back heatmap: {heatmap_second}") + assert heatmap_second != heatmap_first + validate_heatmap(heatmap_second) diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 24cbe34457..6e510b2eba 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -300,7 +300,8 @@ def 
test_timeline_initial_logical_size_calculation_cancellation( env = neon_env_builder.init_start() client = env.pageserver.http_client() - tenant_id, timeline_id = env.neon_cli.create_tenant() + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline # load in some data endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) @@ -732,3 +733,142 @@ def wait_for_timeline_size_init( raise Exception( f"timed out while waiting for current_logical_size of a timeline to reach its non-incremental value, details: {timeline_details}" ) + + +def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): + """ + Tenants warmuping up opportunistically will wait for one another's logical size calculations to complete + before proceeding. However, they skip this if a client is actively trying to access them. + + This test is not purely about logical sizes, but logical size calculation is the phase that we + use as a proxy for "warming up" in this test: it happens within the semaphore guard used + to limit concurrent tenant warm-up. + """ + + # We will run with the limit set to 1, so that once we have one tenant stuck + # in a pausable failpoint, the rest are prevented from proceeding through warmup. + neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = '1'" + + env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() + + # Create some tenants + n_tenants = 10 + tenant_ids = {env.initial_tenant} + for _i in range(0, n_tenants - 1): + tenant_id = TenantId.generate() + env.pageserver.tenant_create(tenant_id) + + # Empty tenants are not subject to waiting for logical size calculations, because + # those hapen on timeline level + timeline_id = TimelineId.generate() + env.neon_cli.create_timeline( + new_branch_name="main", tenant_id=tenant_id, timeline_id=timeline_id + ) + + tenant_ids.add(tenant_id) + + # Restart pageserver with logical size calculations paused + env.pageserver.stop() + env.pageserver.start( + extra_env_vars={"FAILPOINTS": "timeline-calculate-logical-size-pause=pause"} + ) + + def get_tenant_states(): + states = {} + for tenant_id in tenant_ids: + tenant = pageserver_http.tenant_status(tenant_id=tenant_id) + states[tenant_id] = tenant["state"]["slug"] + log.info(f"Tenant states: {states}") + return states + + def at_least_one_active(): + assert "Active" in set(get_tenant_states().values()) + + # One tenant should activate, then get stuck in their logical size calculation + wait_until(10, 1, at_least_one_active) + + # Wait some walltime to gain confidence that other tenants really are stuck and not proceeding to activate + time.sleep(5) + + # We should see one tenant win the activation race, and enter logical size calculation. The rest + # will stay in Attaching state, waiting for the "warmup_limit" semaphore + expect_activated = 1 + states = get_tenant_states() + assert len([s for s in states.values() if s == "Active"]) == expect_activated + assert len([s for s in states.values() if s == "Attaching"]) == n_tenants - expect_activated + + assert ( + pageserver_http.get_metric_value("pageserver_tenant_startup_scheduled_total") == n_tenants + ) + + # This is zero, and subsequent checks are expect_activated - 1, because this counter does not + # count how may tenants are Active, it counts how many have finished warmup. The first tenant + # that reached Active is still stuck in its local size calculation, and has therefore not finished warmup. 
+ assert pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total") == 0 + + # If a client accesses one of the blocked tenants, it should skip waiting for warmup and + # go active as fast as it can. + stuck_tenant_id = list( + [(tid, s) for (tid, s) in get_tenant_states().items() if s == "Attaching"] + )[0][0] + + endpoint = env.endpoints.create_start(branch_name="main", tenant_id=stuck_tenant_id) + endpoint.safe_psql_many( + [ + "CREATE TABLE foo (x INTEGER)", + "INSERT INTO foo SELECT g FROM generate_series(1, 10) g", + ] + ) + endpoint.stop() + + # That one that we successfully accessed is now Active + expect_activated += 1 + assert pageserver_http.tenant_status(tenant_id=stuck_tenant_id)["state"]["slug"] == "Active" + assert ( + pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total") + == expect_activated - 1 + ) + + # The ones we didn't touch are still in Attaching + assert ( + len([s for s in get_tenant_states().values() if s == "Attaching"]) + == n_tenants - expect_activated + ) + + # Timeline creation operations also wake up Attaching tenants + stuck_tenant_id = list( + [(tid, s) for (tid, s) in get_tenant_states().items() if s == "Attaching"] + )[0][0] + pageserver_http.timeline_create(env.pg_version, stuck_tenant_id, TimelineId.generate()) + expect_activated += 1 + assert pageserver_http.tenant_status(tenant_id=stuck_tenant_id)["state"]["slug"] == "Active" + assert ( + len([s for s in get_tenant_states().values() if s == "Attaching"]) + == n_tenants - expect_activated + ) + + assert ( + pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total") + == expect_activated - 1 + ) + + # When we unblock logical size calculation, all tenants should proceed to active state via + # the warmup route. 
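The "warmup route" referred to here is the startup path that this test limits with `concurrent_tenant_warmup = '1'`: attaching tenants queue for a warmup slot (the "warmup_limit" semaphore mentioned above), and, as the earlier steps show, a tenant that a client or timeline-creation request is actively waiting on is allowed to skip that queue. A rough illustration of such a gate, assuming tokio primitives and not claiming to be the pageserver's actual implementation:

use tokio::sync::{Notify, Semaphore};

// Rough illustration only -- not the pageserver's actual code. At most
// `concurrent_tenant_warmup` tenants run their expensive warmup step (e.g.
// initial logical size calculation) at once, but a tenant that a client is
// actively waiting on may jump the queue.
struct WarmupGate {
    slots: Semaphore,
}

impl WarmupGate {
    fn new(concurrent_tenant_warmup: usize) -> Self {
        Self {
            slots: Semaphore::new(concurrent_tenant_warmup),
        }
    }

    async fn warm_up<F>(&self, client_waiting: &Notify, do_warmup: F)
    where
        F: std::future::Future<Output = ()>,
    {
        // Hold a permit only if we obtained one through the normal queue; if a
        // client shows up first, proceed without waiting for a slot.
        let _permit = tokio::select! {
            permit = self.slots.acquire() => permit.ok(),
            _ = client_waiting.notified() => None,
        };
        do_warmup.await;
    }
}

In the test, the paused `timeline-calculate-logical-size-pause` failpoint stands in for a long-running `do_warmup`, and the SQL and timeline-creation accesses stand in for `client_waiting`.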
+ pageserver_http.configure_failpoints(("timeline-calculate-logical-size-pause", "off")) + + def all_active(): + assert all(s == "Active" for s in get_tenant_states().values()) + + wait_until(10, 1, all_active) + + # Final control check: restarting with no failpoints at all results in all tenants coming active + # without being prompted by client I/O + env.pageserver.stop() + env.pageserver.start() + wait_until(10, 1, all_active) + + assert ( + pageserver_http.get_metric_value("pageserver_tenant_startup_scheduled_total") == n_tenants + ) + assert pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total") == n_tenants diff --git a/test_runner/regress/test_wal_restore.py b/test_runner/regress/test_wal_restore.py index 4a9ffeee4b..7d03f644d1 100644 --- a/test_runner/regress/test_wal_restore.py +++ b/test_runner/regress/test_wal_restore.py @@ -1,7 +1,6 @@ import sys import tarfile import tempfile -import time from pathlib import Path import pytest @@ -12,6 +11,7 @@ from fixtures.neon_fixtures import ( PgBin, VanillaPostgres, ) +from fixtures.pageserver.utils import timeline_delete_wait_completed from fixtures.port_distributor import PortDistributor from fixtures.remote_storage import LocalFsStorage from fixtures.types import Lsn, TenantId, TimelineId @@ -128,10 +128,7 @@ def test_wal_restore_initdb( assert restored.safe_psql("select count(*) from t", user="cloud_admin") == [(300000,)] -def test_wal_restore_http( - neon_env_builder: NeonEnvBuilder, - test_output_dir: Path, -): +def test_wal_restore_http(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() endpoint = env.endpoints.create_start("main") endpoint.safe_psql("create table t as select generate_series(1,300000)") @@ -145,15 +142,7 @@ def test_wal_restore_http( assert isinstance(env.pageserver_remote_storage, LocalFsStorage) - test_output_dir / "initdb.tar.zst" - - (env.pageserver_remote_storage.timeline_path(tenant_id, timeline_id) / "initdb.tar.zst") - - ps_client.timeline_delete(tenant_id, timeline_id) - time.sleep(2) - - # verify that it is indeed deleted - # TODO + timeline_delete_wait_completed(ps_client, tenant_id, timeline_id) # issue the restoration command ps_client.timeline_create( diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index dd067cf656..0bb356aa0c 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit dd067cf656f6810a25aca6025633d32d02c5085a +Subproject commit 0bb356aa0cd1582112926fbcf0b5370222c2db6d diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index bc88f53931..24333abb81 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit bc88f539312fcc4bb292ce94ae9db09ab6656e8a +Subproject commit 24333abb81a9ecae4541019478f0bf7d0b289df7 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index e3a22b7292..863b71572b 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit e3a22b72922055f9212eca12700190f118578362 +Subproject commit 863b71572bc441581efb3bbee2ad18af037be1bb diff --git a/vendor/revisions.json b/vendor/revisions.json index c4cea208ee..a9575a2cb7 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { - "postgres-v16": "e3a22b72922055f9212eca12700190f118578362", - "postgres-v15": "bc88f539312fcc4bb292ce94ae9db09ab6656e8a", - "postgres-v14": "dd067cf656f6810a25aca6025633d32d02c5085a" + "postgres-v16": "863b71572bc441581efb3bbee2ad18af037be1bb", + "postgres-v15": "24333abb81a9ecae4541019478f0bf7d0b289df7", + "postgres-v14": 
"0bb356aa0cd1582112926fbcf0b5370222c2db6d" } diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index 6f0ebe5f66..804405293f 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -34,7 +34,7 @@ files: server_tls_sslmode=disable pool_mode=transaction max_client_conn=10000 - default_pool_size=16 + default_pool_size=64 max_prepared_statements=0 - filename: cgconfig.conf content: |