From 4d55d61807ab0b6bd2c1f698bd3386cb1d1784ab Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 2 May 2023 20:36:11 +0300 Subject: [PATCH 01/19] Store basic endpoint info in endpoint.json file. (#4058) It's more convenient than parsing the postgresql.conf file. Extracted from PR #3886. I started working on another patch (to make it safe to run two "neon_local endpoint create" commands concurrently), and realized that this change will make that simpler too. --- control_plane/src/bin/neon_local.rs | 41 +++++------ control_plane/src/endpoint.rs | 107 ++++++++++++++-------------- 2 files changed, 69 insertions(+), 79 deletions(-) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 09278e1726..0e0d71b3f1 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -8,7 +8,7 @@ use anyhow::{anyhow, bail, Context, Result}; use clap::{value_parser, Arg, ArgAction, ArgMatches, Command}; use control_plane::endpoint::ComputeControlPlane; -use control_plane::endpoint::Replication; +use control_plane::endpoint::ComputeMode; use control_plane::local_env::LocalEnv; use control_plane::pageserver::PageServerNode; use control_plane::safekeeper::SafekeeperNode; @@ -481,7 +481,7 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - timeline_id, None, pg_version, - Replication::Primary, + ComputeMode::Primary, )?; println!("Done"); } @@ -568,8 +568,8 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( .iter() .filter(|(_, endpoint)| endpoint.tenant_id == tenant_id) { - let lsn_str = match endpoint.replication { - Replication::Static(lsn) => { + let lsn_str = match endpoint.mode { + ComputeMode::Static(lsn) => { // -> read-only endpoint // Use the node's LSN. lsn.to_string() @@ -632,21 +632,14 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( .copied() .unwrap_or(false); - let replication = match (lsn, hot_standby) { - (Some(lsn), false) => Replication::Static(lsn), - (None, true) => Replication::Replica, - (None, false) => Replication::Primary, + let mode = match (lsn, hot_standby) { + (Some(lsn), false) => ComputeMode::Static(lsn), + (None, true) => ComputeMode::Replica, + (None, false) => ComputeMode::Primary, (Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"), }; - cplane.new_endpoint( - tenant_id, - &endpoint_id, - timeline_id, - port, - pg_version, - replication, - )?; + cplane.new_endpoint(tenant_id, &endpoint_id, timeline_id, port, pg_version, mode)?; } "start" => { let port: Option = sub_args.get_one::("port").copied(); @@ -670,11 +663,11 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( .unwrap_or(false); if let Some(endpoint) = endpoint { - match (&endpoint.replication, hot_standby) { - (Replication::Static(_), true) => { + match (&endpoint.mode, hot_standby) { + (ComputeMode::Static(_), true) => { bail!("Cannot start a node in hot standby mode when it is already configured as a static replica") } - (Replication::Primary, true) => { + (ComputeMode::Primary, true) => { bail!("Cannot start a node as a hot standby replica, it is already configured as primary node") } _ => {} @@ -701,10 +694,10 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( .copied() .context("Failed to `pg-version` from the argument string")?; - let replication = match (lsn, hot_standby) { - (Some(lsn), false) => Replication::Static(lsn), - (None, true) => Replication::Replica, - (None, false) => Replication::Primary, + let mode = match (lsn, hot_standby) { + (Some(lsn), false) => ComputeMode::Static(lsn), + (None, true) => ComputeMode::Replica, + (None, false) => ComputeMode::Primary, (Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"), }; @@ -721,7 +714,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( timeline_id, port, pg_version, - replication, + mode, )?; ep.start(&auth_token)?; } diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 7d3485518f..5a1f93dc99 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -11,15 +11,31 @@ use std::sync::Arc; use std::time::Duration; use anyhow::{Context, Result}; +use serde::{Deserialize, Serialize}; +use serde_with::{serde_as, DisplayFromStr}; use utils::{ id::{TenantId, TimelineId}, lsn::Lsn, }; -use crate::local_env::{LocalEnv, DEFAULT_PG_VERSION}; +use crate::local_env::LocalEnv; use crate::pageserver::PageServerNode; use crate::postgresql_conf::PostgresConf; +// contents of a endpoint.json file +#[serde_as] +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] +pub struct EndpointConf { + name: String, + #[serde_as(as = "DisplayFromStr")] + tenant_id: TenantId, + #[serde_as(as = "DisplayFromStr")] + timeline_id: TimelineId, + mode: ComputeMode, + port: u16, + pg_version: u32, +} + // // ComputeControlPlane // @@ -70,7 +86,7 @@ impl ComputeControlPlane { timeline_id: TimelineId, port: Option, pg_version: u32, - replication: Replication, + mode: ComputeMode, ) -> Result> { let port = port.unwrap_or_else(|| self.get_port()); @@ -80,12 +96,22 @@ impl ComputeControlPlane { env: self.env.clone(), pageserver: Arc::clone(&self.pageserver), timeline_id, - replication, + mode, tenant_id, pg_version, }); - ep.create_pgdata()?; + std::fs::write( + ep.endpoint_path().join("endpoint.json"), + serde_json::to_string_pretty(&EndpointConf { + name: name.to_string(), + tenant_id, + timeline_id, + mode, + port, + pg_version, + })?, + )?; ep.setup_pg_conf()?; self.endpoints.insert(ep.name.clone(), Arc::clone(&ep)); @@ -96,12 +122,13 @@ impl ComputeControlPlane { /////////////////////////////////////////////////////////////////////////////// -#[derive(Debug, Clone, Eq, PartialEq)] -pub enum Replication { +#[serde_as] +#[derive(Serialize, Deserialize, Debug, Clone, Copy, Eq, PartialEq)] +pub enum ComputeMode { // Regular read-write node Primary, // if recovery_target_lsn is provided, and we want to pin the node to a specific LSN - Static(Lsn), + Static(#[serde_as(as = "DisplayFromStr")] Lsn), // Hot standby; read-only replica. // Future versions may want to distinguish between replicas with hot standby // feedback and other kinds of replication configurations. @@ -115,7 +142,7 @@ pub struct Endpoint { pub tenant_id: TenantId, pub timeline_id: TimelineId, // Some(lsn) if this is a read-only endpoint anchored at 'lsn'. None for the primary. - pub replication: Replication, + pub mode: ComputeMode, // port and address of the Postgres server pub address: SocketAddr, @@ -144,50 +171,20 @@ impl Endpoint { let fname = entry.file_name(); let name = fname.to_str().unwrap().to_string(); - // Read config file into memory - let cfg_path = entry.path().join("pgdata").join("postgresql.conf"); - let cfg_path_str = cfg_path.to_string_lossy(); - let mut conf_file = File::open(&cfg_path) - .with_context(|| format!("failed to open config file in {}", cfg_path_str))?; - let conf = PostgresConf::read(&mut conf_file) - .with_context(|| format!("failed to read config file in {}", cfg_path_str))?; - - // Read a few options from the config file - let context = format!("in config file {}", cfg_path_str); - let port: u16 = conf.parse_field("port", &context)?; - let timeline_id: TimelineId = conf.parse_field("neon.timeline_id", &context)?; - let tenant_id: TenantId = conf.parse_field("neon.tenant_id", &context)?; - - // Read postgres version from PG_VERSION file to determine which postgres version binary to use. - // If it doesn't exist, assume broken data directory and use default pg version. - let pg_version_path = entry.path().join("PG_VERSION"); - - let pg_version_str = - fs::read_to_string(pg_version_path).unwrap_or_else(|_| DEFAULT_PG_VERSION.to_string()); - let pg_version = u32::from_str(&pg_version_str)?; - - // parse recovery_target_lsn and primary_conninfo into Recovery Target, if any - let replication = if let Some(lsn_str) = conf.get("recovery_target_lsn") { - Replication::Static(Lsn::from_str(lsn_str)?) - } else if let Some(slot_name) = conf.get("primary_slot_name") { - let slot_name = slot_name.to_string(); - let prefix = format!("repl_{}_", timeline_id); - assert!(slot_name.starts_with(&prefix)); - Replication::Replica - } else { - Replication::Primary - }; + // Read the endpoint.json file + let conf: EndpointConf = + serde_json::from_slice(&std::fs::read(entry.path().join("endpoint.json"))?)?; // ok now Ok(Endpoint { - address: SocketAddr::new("127.0.0.1".parse().unwrap(), port), + address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.port), name, env: env.clone(), pageserver: Arc::clone(pageserver), - timeline_id, - replication, - tenant_id, - pg_version, + timeline_id: conf.timeline_id, + mode: conf.mode, + tenant_id: conf.tenant_id, + pg_version: conf.pg_version, }) } @@ -323,8 +320,8 @@ impl Endpoint { conf.append_line(""); // Replication-related configurations, such as WAL sending - match &self.replication { - Replication::Primary => { + match &self.mode { + ComputeMode::Primary => { // Configure backpressure // - Replication write lag depends on how fast the walreceiver can process incoming WAL. // This lag determines latency of get_page_at_lsn. Speed of applying WAL is about 10MB/sec, @@ -366,10 +363,10 @@ impl Endpoint { conf.append("synchronous_standby_names", "pageserver"); } } - Replication::Static(lsn) => { + ComputeMode::Static(lsn) => { conf.append("recovery_target_lsn", &lsn.to_string()); } - Replication::Replica => { + ComputeMode::Replica => { assert!(!self.env.safekeepers.is_empty()); // TODO: use future host field from safekeeper spec @@ -409,8 +406,8 @@ impl Endpoint { } fn load_basebackup(&self, auth_token: &Option) -> Result<()> { - let backup_lsn = match &self.replication { - Replication::Primary => { + let backup_lsn = match &self.mode { + ComputeMode::Primary => { if !self.env.safekeepers.is_empty() { // LSN 0 means that it is bootstrap and we need to download just // latest data from the pageserver. That is a bit clumsy but whole bootstrap @@ -426,8 +423,8 @@ impl Endpoint { None } } - Replication::Static(lsn) => Some(*lsn), - Replication::Replica => { + ComputeMode::Static(lsn) => Some(*lsn), + ComputeMode::Replica => { None // Take the latest snapshot available to start with } }; @@ -526,7 +523,7 @@ impl Endpoint { // 3. Load basebackup self.load_basebackup(auth_token)?; - if self.replication != Replication::Primary { + if self.mode != ComputeMode::Primary { File::create(self.pgdata().join("standby.signal"))?; } From 47521693eda3066fc1569a2e8f53a8d33f50eeb5 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 2 May 2023 22:28:21 +0300 Subject: [PATCH 02/19] Fix file_cache build warnings (#4118) ## Describe your changes ## Issue ticket number and link ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --- pgxn/neon/file_cache.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index cc46fb5a25..1ab2ae668a 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -96,6 +96,8 @@ static shmem_request_hook_type prev_shmem_request_hook; #endif static int lfc_shrinking_factor; /* power of two by which local cache size will be shrinked when lfc_free_space_watermark is reached */ +void FileCacheMonitorMain(Datum main_arg); + static void lfc_shmem_startup(void) { @@ -378,7 +380,6 @@ lfc_evict(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno) { BufferTag tag; FileCacheEntry* entry; - ssize_t rc; bool found; int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1); uint32 hash; From 474f69c1c06f5b1479915c37d06e20b5f5c51ddd Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 3 May 2023 10:56:49 +0300 Subject: [PATCH 03/19] fix: omit cancellation logging when panicking (#4125) noticed while describing `RequestSpan`, this fix will omit the otherwise logged message about request being cancelled when panicking in the request handler. this was missed on #4064. --- libs/utils/src/http/endpoint.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index b11aef9892..4bfb5bf994 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -131,7 +131,9 @@ impl RequestCancelled { impl Drop for RequestCancelled { fn drop(&mut self) { - if let Some(span) = self.warn.take() { + if std::thread::panicking() { + // we are unwinding due to panicking, assume we are not dropped for cancellation + } else if let Some(span) = self.warn.take() { // the span has all of the info already, but the outer `.instrument(span)` has already // been dropped, so we need to manually re-enter it for this message. // From faebe3177b5234d8e1ebcebd7ca702fc68795923 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 3 May 2023 11:11:55 +0300 Subject: [PATCH 04/19] wal_craft: cleanup to keep editable (#4121) wal_craft had accumulated some trouble by using `use anyhow::*;`. Fixes that, removes redundant conversions (never need to convert a Path to OsStr), especially at the `Process` args. Originally in #4100 but we merged a later PR instead for the fixes. I dropped the `postmaster.pid` polling in favor of just having a longer connect timeout. --- libs/postgres_ffi/wal_craft/src/lib.rs | 52 ++++++++++++-------------- 1 file changed, 23 insertions(+), 29 deletions(-) diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index 88ae41c636..9f3f4dc20d 100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -1,5 +1,4 @@ -use anyhow::*; -use core::time::Duration; +use anyhow::{bail, ensure}; use log::*; use postgres::types::PgLsn; use postgres::Client; @@ -8,7 +7,7 @@ use postgres_ffi::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD}; use std::cmp::Ordering; use std::path::{Path, PathBuf}; use std::process::Command; -use std::time::Instant; +use std::time::{Duration, Instant}; use tempfile::{tempdir, TempDir}; #[derive(Debug, Clone, PartialEq, Eq)] @@ -55,7 +54,7 @@ impl Conf { self.datadir.join("pg_wal") } - fn new_pg_command(&self, command: impl AsRef) -> Result { + fn new_pg_command(&self, command: impl AsRef) -> anyhow::Result { let path = self.pg_bin_dir()?.join(command); ensure!(path.exists(), "Command {:?} does not exist", path); let mut cmd = Command::new(path); @@ -65,7 +64,7 @@ impl Conf { Ok(cmd) } - pub fn initdb(&self) -> Result<()> { + pub fn initdb(&self) -> anyhow::Result<()> { if let Some(parent) = self.datadir.parent() { info!("Pre-creating parent directory {:?}", parent); // Tests may be run concurrently and there may be a race to create `test_output/`. @@ -79,7 +78,7 @@ impl Conf { let output = self .new_pg_command("initdb")? .arg("-D") - .arg(self.datadir.as_os_str()) + .arg(&self.datadir) .args(["-U", "postgres", "--no-instructions", "--no-sync"]) .output()?; debug!("initdb output: {:?}", output); @@ -92,7 +91,7 @@ impl Conf { Ok(()) } - pub fn start_server(&self) -> Result { + pub fn start_server(&self) -> anyhow::Result { info!("Starting Postgres server in {:?}", self.datadir); let unix_socket_dir = tempdir()?; // We need a directory with a short name for Unix socket (up to 108 symbols) let unix_socket_dir_path = unix_socket_dir.path().to_owned(); @@ -100,9 +99,9 @@ impl Conf { .new_pg_command("postgres")? .args(["-c", "listen_addresses="]) .arg("-k") - .arg(unix_socket_dir_path.as_os_str()) + .arg(&unix_socket_dir_path) .arg("-D") - .arg(self.datadir.as_os_str()) + .arg(&self.datadir) .args(REQUIRED_POSTGRES_CONFIG.iter().flat_map(|cfg| ["-c", cfg])) .spawn()?; let server = PostgresServer { @@ -123,7 +122,7 @@ impl Conf { &self, first_segment_name: &str, last_segment_name: &str, - ) -> Result { + ) -> anyhow::Result { let first_segment_file = self.datadir.join(first_segment_name); let last_segment_file = self.datadir.join(last_segment_name); info!( @@ -133,10 +132,7 @@ impl Conf { ); let output = self .new_pg_command("pg_waldump")? - .args([ - &first_segment_file.as_os_str(), - &last_segment_file.as_os_str(), - ]) + .args([&first_segment_file, &last_segment_file]) .output()?; debug!("waldump output: {:?}", output); Ok(output) @@ -144,10 +140,9 @@ impl Conf { } impl PostgresServer { - pub fn connect_with_timeout(&self) -> Result { + pub fn connect_with_timeout(&self) -> anyhow::Result { let retry_until = Instant::now() + *self.client_config.get_connect_timeout().unwrap(); while Instant::now() < retry_until { - use std::result::Result::Ok; if let Ok(client) = self.client_config.connect(postgres::NoTls) { return Ok(client); } @@ -164,7 +159,6 @@ impl PostgresServer { impl Drop for PostgresServer { fn drop(&mut self) { - use std::result::Result::Ok; match self.process.try_wait() { Ok(Some(_)) => return, Ok(None) => { @@ -179,12 +173,12 @@ impl Drop for PostgresServer { } pub trait PostgresClientExt: postgres::GenericClient { - fn pg_current_wal_insert_lsn(&mut self) -> Result { + fn pg_current_wal_insert_lsn(&mut self) -> anyhow::Result { Ok(self .query_one("SELECT pg_current_wal_insert_lsn()", &[])? .get(0)) } - fn pg_current_wal_flush_lsn(&mut self) -> Result { + fn pg_current_wal_flush_lsn(&mut self) -> anyhow::Result { Ok(self .query_one("SELECT pg_current_wal_flush_lsn()", &[])? .get(0)) @@ -193,7 +187,7 @@ pub trait PostgresClientExt: postgres::GenericClient { impl PostgresClientExt for C {} -pub fn ensure_server_config(client: &mut impl postgres::GenericClient) -> Result<()> { +pub fn ensure_server_config(client: &mut impl postgres::GenericClient) -> anyhow::Result<()> { client.execute("create extension if not exists neon_test_utils", &[])?; let wal_keep_size: String = client.query_one("SHOW wal_keep_size", &[])?.get(0); @@ -227,13 +221,13 @@ pub trait Crafter { /// * A vector of some valid "interesting" intermediate LSNs which one may start reading from. /// May include or exclude Lsn(0) and the end-of-wal. /// * The expected end-of-wal LSN. - fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec, PgLsn)>; + fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec, PgLsn)>; } fn craft_internal( client: &mut C, - f: impl Fn(&mut C, PgLsn) -> Result<(Vec, Option)>, -) -> Result<(Vec, PgLsn)> { + f: impl Fn(&mut C, PgLsn) -> anyhow::Result<(Vec, Option)>, +) -> anyhow::Result<(Vec, PgLsn)> { ensure_server_config(client)?; let initial_lsn = client.pg_current_wal_insert_lsn()?; @@ -265,7 +259,7 @@ fn craft_internal( pub struct Simple; impl Crafter for Simple { const NAME: &'static str = "simple"; - fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec, PgLsn)> { + fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec, PgLsn)> { craft_internal(client, |client, _| { client.execute("CREATE table t(x int)", &[])?; Ok((Vec::new(), None)) @@ -276,7 +270,7 @@ impl Crafter for Simple { pub struct LastWalRecordXlogSwitch; impl Crafter for LastWalRecordXlogSwitch { const NAME: &'static str = "last_wal_record_xlog_switch"; - fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec, PgLsn)> { + fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec, PgLsn)> { // Do not use generate_internal because here we end up with flush_lsn exactly on // the segment boundary and insert_lsn after the initial page header, which is unusual. ensure_server_config(client)?; @@ -298,7 +292,7 @@ impl Crafter for LastWalRecordXlogSwitch { pub struct LastWalRecordXlogSwitchEndsOnPageBoundary; impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary { const NAME: &'static str = "last_wal_record_xlog_switch_ends_on_page_boundary"; - fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec, PgLsn)> { + fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec, PgLsn)> { // Do not use generate_internal because here we end up with flush_lsn exactly on // the segment boundary and insert_lsn after the initial page header, which is unusual. ensure_server_config(client)?; @@ -365,7 +359,7 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary { fn craft_single_logical_message( client: &mut impl postgres::GenericClient, transactional: bool, -) -> Result<(Vec, PgLsn)> { +) -> anyhow::Result<(Vec, PgLsn)> { craft_internal(client, |client, initial_lsn| { ensure!( initial_lsn < PgLsn::from(0x0200_0000 - 1024 * 1024), @@ -407,7 +401,7 @@ fn craft_single_logical_message( pub struct WalRecordCrossingSegmentFollowedBySmallOne; impl Crafter for WalRecordCrossingSegmentFollowedBySmallOne { const NAME: &'static str = "wal_record_crossing_segment_followed_by_small_one"; - fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec, PgLsn)> { + fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec, PgLsn)> { craft_single_logical_message(client, true) } } @@ -415,7 +409,7 @@ impl Crafter for WalRecordCrossingSegmentFollowedBySmallOne { pub struct LastWalRecordCrossingSegment; impl Crafter for LastWalRecordCrossingSegment { const NAME: &'static str = "last_wal_record_crossing_segment"; - fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec, PgLsn)> { + fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec, PgLsn)> { craft_single_logical_message(client, false) } } From ecc0cf8cd6a3d411658f2b747a87f0ea325231dc Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 3 May 2023 12:03:13 +0300 Subject: [PATCH 05/19] Treat EPIPE as an expected error. (#4141) If the other end of a TCP connection closes its read end of the socket, you get an EPIPE when you try to send. I saw that happen in the CI once: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-4136/release/4869464644/index.html#suites/c19bc2126511ef8cb145cca25c438215/7ec87b016c0b4b50/ ``` 2023-05-03T07:53:22.394152Z ERROR Task 'serving compute connection task' tenant_id: Some(c204447079e02e7ba8f593cb8bc57e76), timeline_id: Some(b666f26600e6deaa9f43e1aeee5bacb7) exited with error: Postgres connection error Caused by: Broken pipe (os error 32) Stack backtrace: 0: pageserver::page_service::page_service_conn_main::{{closure}} at /__w/neon/neon/pageserver/src/page_service.rs:282:17 as core::future::future::Future>::poll at /rustc/9eb3afe9ebe9c7d2b84b71002d44f4a0edac95e0/library/core/src/panic/unwind_safe.rs:296:9 as core::future::future::Future>::poll::{{closure}} at /__w/neon/neon/.cargo/registry/src/github.com-1ecc6299db9ec823/futures-util-0.3.28/src/future/future/catch_unwind.rs:36:42 as core::ops::function::FnOnce<()>>::call_once at /rustc/9eb3afe9ebe9c7d2b84b71002d44f4a0edac95e0/library/core/src/panic/unwind_safe.rs:271:9 ... ``` In the passing, add a comment to explain what the "expected" in the `is_expected_io_error` function means. --- libs/postgres_backend/src/lib.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index f6bf7c6fc2..453c58431a 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -50,11 +50,14 @@ impl QueryError { } } +/// Returns true if the given error is a normal consequence of a network issue, +/// or the client closing the connection. These errors can happen during normal +/// operations, and don't indicate a bug in our code. pub fn is_expected_io_error(e: &io::Error) -> bool { use io::ErrorKind::*; matches!( e.kind(), - ConnectionRefused | ConnectionAborted | ConnectionReset | TimedOut + BrokenPipe | ConnectionRefused | ConnectionAborted | ConnectionReset | TimedOut ) } From 39ca7c7c0954f8a9424d29700c9366482c890e80 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 3 May 2023 10:40:35 +0100 Subject: [PATCH 06/19] Bump flask from 2.1.3 to 2.2.5 (#4138) --- poetry.lock | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index 7b368cd3b4..141371c925 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.4.1 and should not be changed by hand. +# This file is automatically @generated by Poetry and should not be changed by hand. [[package]] name = "aiohttp" @@ -968,14 +968,14 @@ testing = ["pre-commit"] [[package]] name = "flask" -version = "2.1.3" +version = "2.2.5" description = "A simple framework for building complex web applications." category = "main" optional = false python-versions = ">=3.7" files = [ - {file = "Flask-2.1.3-py3-none-any.whl", hash = "sha256:9013281a7402ad527f8fd56375164f3aa021ecfaff89bfe3825346c24f87e04c"}, - {file = "Flask-2.1.3.tar.gz", hash = "sha256:15972e5017df0575c3d6c090ba168b6db90259e620ac8d7ea813a396bad5b6cb"}, + {file = "Flask-2.2.5-py3-none-any.whl", hash = "sha256:58107ed83443e86067e41eff4631b058178191a355886f8e479e347fa1285fdf"}, + {file = "Flask-2.2.5.tar.gz", hash = "sha256:edee9b0a7ff26621bd5a8c10ff484ae28737a2410d99b0bb9a6850c7fb977aa0"}, ] [package.dependencies] @@ -983,7 +983,7 @@ click = ">=8.0" importlib-metadata = {version = ">=3.6.0", markers = "python_version < \"3.10\""} itsdangerous = ">=2.0" Jinja2 = ">=3.0" -Werkzeug = ">=2.0" +Werkzeug = ">=2.2.2" [package.extras] async = ["asgiref (>=3.2)"] From db81242f4a8b848d03844ffa6dbe4c25aea6e8e2 Mon Sep 17 00:00:00 2001 From: Anton Chaporgin Date: Wed, 3 May 2023 16:14:16 +0300 Subject: [PATCH 07/19] add debug to pg-sni-router install (#4143) --- .github/workflows/deploy-dev.yml | 2 +- .github/workflows/deploy-prod.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/deploy-dev.yml b/.github/workflows/deploy-dev.yml index f37e1b344d..6e77a3b571 100644 --- a/.github/workflows/deploy-dev.yml +++ b/.github/workflows/deploy-dev.yml @@ -274,7 +274,7 @@ jobs: - name: Deploy pg-sni-router run: - helm upgrade neon-pg-sni-router neondatabase/neon-pg-sni-router --namespace neon-pg-sni-router --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.pg-sni-router.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 15m0s + helm upgrade neon-pg-sni-router neondatabase/neon-pg-sni-router --namespace neon-pg-sni-router --create-namespace --install --debug --atomic -f .github/helm-values/${{ matrix.target_cluster }}.pg-sni-router.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 15m0s - name: Cleanup helm folder run: rm -rf ~/.cache diff --git a/.github/workflows/deploy-prod.yml b/.github/workflows/deploy-prod.yml index c5d690db3a..baa44d8094 100644 --- a/.github/workflows/deploy-prod.yml +++ b/.github/workflows/deploy-prod.yml @@ -214,4 +214,4 @@ jobs: - name: Deploy pg-sni-router run: - helm upgrade neon-pg-sni-router neondatabase/neon-pg-sni-router --namespace neon-pg-sni-router --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.pg-sni-router.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 15m0s + helm upgrade neon-pg-sni-router neondatabase/neon-pg-sni-router --namespace neon-pg-sni-router --create-namespace --install --debug --atomic -f .github/helm-values/${{ matrix.target_cluster }}.pg-sni-router.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 15m0s From 586e6e55f8281afa49d1504c236eb12564f361e5 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Wed, 3 May 2023 16:25:19 +0300 Subject: [PATCH 08/19] Print WalReceiver context on WAL waiting timeout (#4090) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes https://github.com/neondatabase/neon/issues/2106 Before: ``` Extracting base backup to create postgres instance: path=/Users/someonetoignore/work/neon/neon_main/test_output/test_pageserver_lsn_wait_error_safekeeper_stop/repo/endpoints/ep-2/pgdata port=15017 stderr: command failed: page server 'basebackup' command failed Caused by: 0: db error: ERROR: Timed out while waiting for WAL record at LSN 0/FFFFFFFF to arrive, last_record_lsn 0/A2C3F58 disk consistent LSN=0/16B5A50 1: ERROR: Timed out while waiting for WAL record at LSN 0/FFFFFFFF to arrive, last_record_lsn 0/A2C3F58 disk consistent LSN=0/16B5A50 Stack backtrace: ``` After: ``` Extracting base backup to create postgres instance: path=/Users/someonetoignore/work/neon/neon/test_output/test_pageserver_lsn_wait_error_safekeeper_stop/repo/endpoints/ep-2/pgdata port=15011 stderr: command failed: page server 'basebackup' command failed Caused by: 0: db error: ERROR: Timed out while waiting for WAL record at LSN 0/FFFFFFFF to arrive, last_record_lsn 0/A2C3F58 disk consistent LSN=0/16B5A50, WalReceiver status (update 2023-04-26 14:20:39): streaming WAL from node 12346, commit|streaming Lsn: 0/A2C3F58|0/A2C3F58, safekeeper candidates (id|update_time|commit_lsn): [(12348|14:20:40|0/A2C3F58), (12346|14:20:40|0/A2C3F58), (12347|14:20:40|0/A2C3F58)] 1: ERROR: Timed out while waiting for WAL record at LSN 0/FFFFFFFF to arrive, last_record_lsn 0/A2C3F58 disk consistent LSN=0/16B5A50, WalReceiver status (update 2023-04-26 14:20:39): streaming WAL from node 12346, commit|streaming Lsn: 0/A2C3F58|0/A2C3F58, safekeeper candidates (id|update_time|commit_lsn): [(12348|14:20:40|0/A2C3F58), (12346|14:20:40|0/A2C3F58), (12347|14:20:40|0/A2C3F58)] Stack backtrace: ``` As the issue requests, the PR adds the context in logs only, but I think we should expose the context via HTTP management API similar way — it should be simple with the new API, but better be done in a separate PR. Co-authored-by: Kirill Bulatov --- pageserver/src/tenant/timeline.rs | 28 +++-- pageserver/src/tenant/timeline/walreceiver.rs | 26 ++-- .../walreceiver/connection_manager.rs | 96 ++++++++++++++- .../walreceiver/walreceiver_connection.rs | 8 +- test_runner/regress/test_wal_receiver.py | 115 ++++++++++++++++++ 5 files changed, 252 insertions(+), 21 deletions(-) create mode 100644 test_runner/regress/test_wal_receiver.py diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 8768841d87..bc55c2091c 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -588,15 +588,25 @@ impl Timeline { let _timer = self.metrics.wait_lsn_time_histo.start_timer(); - self.last_record_lsn.wait_for_timeout(lsn, self.conf.wait_lsn_timeout).await - .with_context(|| - format!( - "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}", - lsn, self.get_last_record_lsn(), self.get_disk_consistent_lsn() - ) - )?; - - Ok(()) + match self + .last_record_lsn + .wait_for_timeout(lsn, self.conf.wait_lsn_timeout) + .await + { + Ok(()) => Ok(()), + seqwait_error => { + drop(_timer); + let walreceiver_status = self.walreceiver.status().await; + seqwait_error.with_context(|| format!( + "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}, {}", + lsn, + self.get_last_record_lsn(), + self.get_disk_consistent_lsn(), + walreceiver_status.map(|status| status.to_human_readable_string()) + .unwrap_or_else(|| "WalReceiver status: Not active".to_string()), + )) + } + } } /// Check that it is valid to request operations with that lsn. diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs index 00f446af38..91f7208194 100644 --- a/pageserver/src/tenant/timeline/walreceiver.rs +++ b/pageserver/src/tenant/timeline/walreceiver.rs @@ -38,12 +38,14 @@ use std::sync::{Arc, Weak}; use std::time::Duration; use storage_broker::BrokerClientChannel; use tokio::select; -use tokio::sync::watch; +use tokio::sync::{watch, RwLock}; use tokio_util::sync::CancellationToken; use tracing::*; use utils::id::TenantTimelineId; +use self::connection_manager::ConnectionManagerStatus; + use super::Timeline; #[derive(Clone)] @@ -63,6 +65,7 @@ pub struct WalReceiver { timeline_ref: Weak, conf: WalReceiverConf, started: AtomicBool, + manager_status: Arc>>, } impl WalReceiver { @@ -76,6 +79,7 @@ impl WalReceiver { timeline_ref, conf, started: AtomicBool::new(false), + manager_status: Arc::new(RwLock::new(None)), } } @@ -96,8 +100,8 @@ impl WalReceiver { let timeline_id = timeline.timeline_id; let walreceiver_ctx = ctx.detached_child(TaskKind::WalReceiverManager, DownloadBehavior::Error); - let wal_receiver_conf = self.conf.clone(); + let loop_status = Arc::clone(&self.manager_status); task_mgr::spawn( WALRECEIVER_RUNTIME.handle(), TaskKind::WalReceiverManager, @@ -115,24 +119,28 @@ impl WalReceiver { select! { _ = task_mgr::shutdown_watcher() => { info!("WAL receiver shutdown requested, shutting down"); - connection_manager_state.shutdown().await; - return Ok(()); + break; }, loop_step_result = connection_manager_loop_step( &mut broker_client, &mut connection_manager_state, &walreceiver_ctx, + &loop_status, ) => match loop_step_result { ControlFlow::Continue(()) => continue, ControlFlow::Break(()) => { info!("Connection manager loop ended, shutting down"); - connection_manager_state.shutdown().await; - return Ok(()); + break; } }, } } - }.instrument(info_span!(parent: None, "wal_connection_manager", tenant = %tenant_id, timeline = %timeline_id)) + + connection_manager_state.shutdown().await; + *loop_status.write().await = None; + Ok(()) + } + .instrument(info_span!(parent: None, "wal_connection_manager", tenant = %tenant_id, timeline = %timeline_id)) ); self.started.store(true, atomic::Ordering::Release); @@ -149,6 +157,10 @@ impl WalReceiver { .await; self.started.store(false, atomic::Ordering::Release); } + + pub(super) async fn status(&self) -> Option { + self.manager_status.read().await.clone() + } } /// A handle of an asynchronous task. diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 731c5c4644..17c66238f2 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -24,6 +24,7 @@ use storage_broker::proto::SubscribeSafekeeperInfoRequest; use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; use storage_broker::BrokerClientChannel; use storage_broker::Streaming; +use tokio::sync::RwLock; use tokio::{select, sync::watch}; use tracing::*; @@ -43,6 +44,7 @@ pub(super) async fn connection_manager_loop_step( broker_client: &mut BrokerClientChannel, connection_manager_state: &mut ConnectionManagerState, ctx: &RequestContext, + manager_status: &RwLock>, ) -> ControlFlow<(), ()> { let mut timeline_state_updates = connection_manager_state .timeline @@ -180,6 +182,7 @@ pub(super) async fn connection_manager_loop_step( .change_connection(new_candidate, ctx) .await } + *manager_status.write().await = Some(connection_manager_state.manager_status()); } } @@ -267,6 +270,78 @@ pub(super) struct ConnectionManagerState { wal_stream_candidates: HashMap, } +/// An information about connection manager's current connection and connection candidates. +#[derive(Debug, Clone)] +pub struct ConnectionManagerStatus { + existing_connection: Option, + wal_stream_candidates: HashMap, +} + +impl ConnectionManagerStatus { + /// Generates a string, describing current connection status in a form, suitable for logging. + pub fn to_human_readable_string(&self) -> String { + let mut resulting_string = "WalReceiver status".to_string(); + match &self.existing_connection { + Some(connection) => { + if connection.has_processed_wal { + resulting_string.push_str(&format!( + " (update {}): streaming WAL from node {}, ", + connection.latest_wal_update.format("%Y-%m-%d %H:%M:%S"), + connection.node, + )); + + match (connection.streaming_lsn, connection.commit_lsn) { + (None, None) => resulting_string.push_str("no streaming data"), + (None, Some(commit_lsn)) => { + resulting_string.push_str(&format!("commit Lsn: {commit_lsn}")) + } + (Some(streaming_lsn), None) => { + resulting_string.push_str(&format!("streaming Lsn: {streaming_lsn}")) + } + (Some(streaming_lsn), Some(commit_lsn)) => resulting_string.push_str( + &format!("commit|streaming Lsn: {commit_lsn}|{streaming_lsn}"), + ), + } + } else if connection.is_connected { + resulting_string.push_str(&format!( + " (update {}): connecting to node {}", + connection + .latest_connection_update + .format("%Y-%m-%d %H:%M:%S"), + connection.node, + )); + } else { + resulting_string.push_str(&format!( + " (update {}): initializing node {} connection", + connection + .latest_connection_update + .format("%Y-%m-%d %H:%M:%S"), + connection.node, + )); + } + } + None => resulting_string.push_str(": disconnected"), + } + + resulting_string.push_str(", safekeeper candidates (id|update_time|commit_lsn): ["); + let mut candidates = self.wal_stream_candidates.iter().peekable(); + while let Some((node_id, candidate_info)) = candidates.next() { + resulting_string.push_str(&format!( + "({}|{}|{})", + node_id, + candidate_info.latest_update.format("%H:%M:%S"), + Lsn(candidate_info.timeline.commit_lsn) + )); + if candidates.peek().is_some() { + resulting_string.push_str(", "); + } + } + resulting_string.push(']'); + + resulting_string + } +} + /// Current connection data. #[derive(Debug)] struct WalConnection { @@ -293,14 +368,14 @@ struct NewCommittedWAL { discovered_at: NaiveDateTime, } -#[derive(Debug)] +#[derive(Debug, Clone, Copy)] struct RetryInfo { next_retry_at: Option, retry_duration_seconds: f64, } /// Data about the timeline to connect to, received from the broker. -#[derive(Debug)] +#[derive(Debug, Clone)] struct BrokerSkTimeline { timeline: SafekeeperTimelineInfo, /// Time at which the data was fetched from the broker last time, to track the stale data. @@ -328,6 +403,7 @@ impl ConnectionManagerState { self.drop_old_connection(true).await; let id = self.id; + let node_id = new_sk.safekeeper_id; let connect_timeout = self.conf.wal_connect_timeout; let timeline = Arc::clone(&self.timeline); let ctx = ctx.detached_child( @@ -343,12 +419,13 @@ impl ConnectionManagerState { cancellation, connect_timeout, ctx, + node_id, ) .await .context("walreceiver connection handling failure") } .instrument( - info_span!("walreceiver_connection", tenant_id = %id.tenant_id, timeline_id = %id.timeline_id, node_id = %new_sk.safekeeper_id), + info_span!("walreceiver_connection", tenant_id = %id.tenant_id, timeline_id = %id.timeline_id, %node_id), ) }); @@ -364,6 +441,7 @@ impl ConnectionManagerState { latest_wal_update: now, streaming_lsn: None, commit_lsn: None, + node: node_id, }, connection_task: connection_handle, discovered_new_wal: None, @@ -725,6 +803,13 @@ impl ConnectionManagerState { wal_connection.connection_task.shutdown().await; } } + + fn manager_status(&self) -> ConnectionManagerStatus { + ConnectionManagerStatus { + existing_connection: self.wal_connection.as_ref().map(|conn| conn.status), + wal_stream_candidates: self.wal_stream_candidates.clone(), + } + } } #[derive(Debug)] @@ -867,6 +952,7 @@ mod tests { latest_wal_update: now, commit_lsn: Some(Lsn(current_lsn)), streaming_lsn: Some(Lsn(current_lsn)), + node: NodeId(1), }; state.conf.max_lsn_wal_lag = NonZeroU64::new(100).unwrap(); @@ -1035,6 +1121,7 @@ mod tests { latest_wal_update: now, commit_lsn: Some(current_lsn), streaming_lsn: Some(current_lsn), + node: connected_sk_id, }; state.wal_connection = Some(WalConnection { @@ -1101,6 +1188,7 @@ mod tests { latest_wal_update: time_over_threshold, commit_lsn: Some(current_lsn), streaming_lsn: Some(current_lsn), + node: NodeId(1), }; state.wal_connection = Some(WalConnection { @@ -1164,6 +1252,7 @@ mod tests { latest_wal_update: time_over_threshold, commit_lsn: Some(current_lsn), streaming_lsn: Some(current_lsn), + node: NodeId(1), }; state.wal_connection = Some(WalConnection { @@ -1261,6 +1350,7 @@ mod tests { latest_wal_update: now, commit_lsn: Some(current_lsn), streaming_lsn: Some(current_lsn), + node: connected_sk_id, }; state.wal_connection = Some(WalConnection { diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index d80c7c5673..801641a534 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -37,8 +37,8 @@ use crate::{ use postgres_backend::is_expected_io_error; use postgres_connection::PgConnectionConfig; use postgres_ffi::waldecoder::WalStreamDecoder; -use utils::lsn::Lsn; use utils::pageserver_feedback::PageserverFeedback; +use utils::{id::NodeId, lsn::Lsn}; /// Status of the connection. #[derive(Debug, Clone, Copy)] @@ -56,6 +56,8 @@ pub(super) struct WalConnectionStatus { pub streaming_lsn: Option, /// Latest commit_lsn received from the safekeeper. Can be zero if no message has been received yet. pub commit_lsn: Option, + /// The node it is connected to + pub node: NodeId, } /// Open a connection to the given safekeeper and receive WAL, sending back progress @@ -67,6 +69,7 @@ pub(super) async fn handle_walreceiver_connection( cancellation: CancellationToken, connect_timeout: Duration, ctx: RequestContext, + node: NodeId, ) -> anyhow::Result<()> { // Connect to the database in replication mode. info!("connecting to {wal_source_connconf:?}"); @@ -100,6 +103,7 @@ pub(super) async fn handle_walreceiver_connection( latest_wal_update: Utc::now().naive_utc(), streaming_lsn: None, commit_lsn: None, + node, }; if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) { warn!("Wal connection event listener dropped right after connection init, aborting the connection: {e}"); @@ -122,7 +126,7 @@ pub(super) async fn handle_walreceiver_connection( false, async move { select! { - connection_result = connection => match connection_result{ + connection_result = connection => match connection_result { Ok(()) => info!("Walreceiver db connection closed"), Err(connection_error) => { if let Err(e) = ignore_expected_errors(connection_error) { diff --git a/test_runner/regress/test_wal_receiver.py b/test_runner/regress/test_wal_receiver.py new file mode 100644 index 0000000000..8e4e154be1 --- /dev/null +++ b/test_runner/regress/test_wal_receiver.py @@ -0,0 +1,115 @@ +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder +from fixtures.types import Lsn, TenantId + + +# Checks that pageserver's walreceiver state is printed in the logs during WAL wait timeout. +# Ensures that walreceiver does not run without any data inserted and only starts after the insertion. +def test_pageserver_lsn_wait_error_start(neon_env_builder: NeonEnvBuilder): + # Trigger WAL wait timeout faster + neon_env_builder.pageserver_config_override = "wait_lsn_timeout = '1s'" + env = neon_env_builder.init_start() + env.pageserver.http_client() + + tenant_id, timeline_id = env.neon_cli.create_tenant() + expected_timeout_error = f"Timed out while waiting for WAL record at LSN {future_lsn} to arrive" + env.pageserver.allowed_errors.append(f".*{expected_timeout_error}.*") + + try: + trigger_wait_lsn_timeout(env, tenant_id) + except Exception as e: + exception_string = str(e) + assert expected_timeout_error in exception_string, "Should time out during waiting for WAL" + assert ( + "WalReceiver status: Not active" in exception_string + ), "Walreceiver should not be active before any data writes" + + insert_test_elements(env, tenant_id, start=0, count=1_000) + try: + trigger_wait_lsn_timeout(env, tenant_id) + except Exception as e: + exception_string = str(e) + assert expected_timeout_error in exception_string, "Should time out during waiting for WAL" + assert ( + "WalReceiver status: Not active" not in exception_string + ), "Should not be inactive anymore after INSERTs are made" + assert "WalReceiver status" in exception_string, "But still should have some other status" + + +# Checks that all active safekeepers are shown in pageserver's walreceiver state printed on WAL wait timeout. +# Kills one of the safekeepers and ensures that only the active ones are printed in the state. +def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuilder): + # Trigger WAL wait timeout faster + neon_env_builder.pageserver_config_override = "wait_lsn_timeout = '1s'" + # Have notable SK ids to ensure we check logs for their presence, not some other random numbers + neon_env_builder.safekeepers_id_start = 12345 + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + env.pageserver.http_client() + + tenant_id, timeline_id = env.neon_cli.create_tenant() + + elements_to_insert = 1_000_000 + expected_timeout_error = f"Timed out while waiting for WAL record at LSN {future_lsn} to arrive" + env.pageserver.allowed_errors.append(f".*{expected_timeout_error}.*") + + insert_test_elements(env, tenant_id, start=0, count=elements_to_insert) + + try: + trigger_wait_lsn_timeout(env, tenant_id) + except Exception as e: + exception_string = str(e) + assert expected_timeout_error in exception_string, "Should time out during waiting for WAL" + + for safekeeper in env.safekeepers: + assert ( + str(safekeeper.id) in exception_string + ), f"Should have safekeeper {safekeeper.id} printed in walreceiver state after WAL wait timeout" + + stopped_safekeeper = env.safekeepers[-1] + stopped_safekeeper_id = stopped_safekeeper.id + log.info(f"Stopping safekeeper {stopped_safekeeper.id}") + stopped_safekeeper.stop() + + # Spend some more time inserting, to ensure SKs report updated statuses and walreceiver in PS have time to update its connection stats. + insert_test_elements(env, tenant_id, start=elements_to_insert + 1, count=elements_to_insert) + + try: + trigger_wait_lsn_timeout(env, tenant_id) + except Exception as e: + exception_string = str(e) + assert expected_timeout_error in exception_string, "Should time out during waiting for WAL" + + for safekeeper in env.safekeepers: + if safekeeper.id == stopped_safekeeper_id: + assert ( + str(safekeeper.id) not in exception_string + ), f"Should not have stopped safekeeper {safekeeper.id} printed in walreceiver state after 2nd WAL wait timeout" + else: + assert ( + str(safekeeper.id) in exception_string + ), f"Should have safekeeper {safekeeper.id} printed in walreceiver state after 2nd WAL wait timeout" + + +def insert_test_elements(env: NeonEnv, tenant_id: TenantId, start: int, count: int): + first_element_id = start + last_element_id = first_element_id + count + with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: + with endpoint.cursor() as cur: + cur.execute("CREATE TABLE IF NOT EXISTS t(key serial primary key, value text)") + cur.execute( + f"INSERT INTO t SELECT i, CONCAT('payload_', i) FROM generate_series({first_element_id},{last_element_id}) as i" + ) + + +future_lsn = Lsn("0/FFFFFFFF") + + +def trigger_wait_lsn_timeout(env: NeonEnv, tenant_id: TenantId): + with env.endpoints.create_start( + "main", + tenant_id=tenant_id, + lsn=future_lsn, + ) as endpoint: + with endpoint.cursor() as cur: + cur.execute("SELECT 1") From 3ceef7b17aa2452d55a4726aeba94050f9bd551e Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Wed, 3 May 2023 17:07:41 +0300 Subject: [PATCH 09/19] Add more safekeeper and walreceiver metrics (#4142) Add essential safekeeper and pageserver::walreceiver metrics. Mostly counters, such as the number of received queries, broker messages, removed WAL segments, or connection switches events in walreceiver. Also logs broker push loop duration. --- Cargo.lock | 1 + pageserver/src/metrics.rs | 56 ++++++++++++++++++- .../walreceiver/connection_manager.rs | 31 +++++++++- .../walreceiver/walreceiver_connection.rs | 4 +- safekeeper/Cargo.toml | 1 + safekeeper/src/broker.rs | 22 +++++++- safekeeper/src/handler.rs | 17 +++++- safekeeper/src/metrics.rs | 54 +++++++++++++++++- safekeeper/src/wal_backup.rs | 9 ++- safekeeper/src/wal_storage.rs | 3 +- 10 files changed, 187 insertions(+), 11 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index bce2d11188..9436b591d6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3674,6 +3674,7 @@ dependencies = [ "remote_storage", "reqwest", "safekeeper_api", + "scopeguard", "serde", "serde_json", "serde_with", diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index b5d7eb0132..ec2f49c85a 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1,9 +1,9 @@ use metrics::core::{AtomicU64, GenericCounter}; use metrics::{ register_counter_vec, register_histogram, register_histogram_vec, register_int_counter, - register_int_counter_vec, register_int_gauge_vec, register_uint_gauge_vec, Counter, CounterVec, - Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, - UIntGaugeVec, + register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge_vec, + Counter, CounterVec, Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, + UIntGauge, UIntGaugeVec, }; use once_cell::sync::Lazy; use pageserver_api::models::TenantState; @@ -478,6 +478,56 @@ pub static TENANT_TASK_EVENTS: Lazy = Lazy::new(|| { .expect("Failed to register tenant_task_events metric") }); +// walreceiver metrics + +pub static WALRECEIVER_STARTED_CONNECTIONS: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_walreceiver_started_connections_total", + "Number of started walreceiver connections" + ) + .expect("failed to define a metric") +}); + +pub static WALRECEIVER_ACTIVE_MANAGERS: Lazy = Lazy::new(|| { + register_int_gauge!( + "pageserver_walreceiver_active_managers", + "Number of active walreceiver managers" + ) + .expect("failed to define a metric") +}); + +pub static WALRECEIVER_SWITCHES: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_walreceiver_switches_total", + "Number of walreceiver manager change_connection calls", + &["reason"] + ) + .expect("failed to define a metric") +}); + +pub static WALRECEIVER_BROKER_UPDATES: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_walreceiver_broker_updates_total", + "Number of received broker updates in walreceiver" + ) + .expect("failed to define a metric") +}); + +pub static WALRECEIVER_CANDIDATES_EVENTS: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_walreceiver_candidates_events_total", + "Number of walreceiver candidate events", + &["event"] + ) + .expect("failed to define a metric") +}); + +pub static WALRECEIVER_CANDIDATES_ADDED: Lazy = + Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["add"])); + +pub static WALRECEIVER_CANDIDATES_REMOVED: Lazy = + Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["remove"])); + // Metrics collected on WAL redo operations // // We collect the time spent in actual WAL redo ('redo'), and time waiting diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 17c66238f2..9cb17ea799 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -13,6 +13,10 @@ use std::{collections::HashMap, num::NonZeroU64, ops::ControlFlow, sync::Arc, ti use super::{TaskStateUpdate, WalReceiverConf}; use crate::context::{DownloadBehavior, RequestContext}; +use crate::metrics::{ + WALRECEIVER_ACTIVE_MANAGERS, WALRECEIVER_BROKER_UPDATES, WALRECEIVER_CANDIDATES_ADDED, + WALRECEIVER_CANDIDATES_REMOVED, WALRECEIVER_SWITCHES, +}; use crate::task_mgr::TaskKind; use crate::tenant::Timeline; use anyhow::Context; @@ -58,6 +62,11 @@ pub(super) async fn connection_manager_loop_step( } } + WALRECEIVER_ACTIVE_MANAGERS.inc(); + scopeguard::defer! { + WALRECEIVER_ACTIVE_MANAGERS.dec(); + } + let id = TenantTimelineId { tenant_id: connection_manager_state.timeline.tenant_id, timeline_id: connection_manager_state.timeline.timeline_id, @@ -400,6 +409,10 @@ impl ConnectionManagerState { /// Shuts down the current connection (if any) and immediately starts another one with the given connection string. async fn change_connection(&mut self, new_sk: NewWalConnectionCandidate, ctx: &RequestContext) { + WALRECEIVER_SWITCHES + .with_label_values(&[new_sk.reason.name()]) + .inc(); + self.drop_old_connection(true).await; let id = self.id; @@ -515,6 +528,8 @@ impl ConnectionManagerState { /// Adds another broker timeline into the state, if its more recent than the one already added there for the same key. fn register_timeline_update(&mut self, timeline_update: SafekeeperTimelineInfo) { + WALRECEIVER_BROKER_UPDATES.inc(); + let new_safekeeper_id = NodeId(timeline_update.safekeeper_id); let old_entry = self.wal_stream_candidates.insert( new_safekeeper_id, @@ -526,6 +541,7 @@ impl ConnectionManagerState { if old_entry.is_none() { info!("New SK node was added: {new_safekeeper_id}"); + WALRECEIVER_CANDIDATES_ADDED.inc(); } } @@ -794,6 +810,7 @@ impl ConnectionManagerState { for node_id in node_ids_to_remove { info!("Safekeeper node {node_id} did not send events for over {lagging_wal_timeout:?}, not retrying the connections"); self.wal_connection_retries.remove(&node_id); + WALRECEIVER_CANDIDATES_REMOVED.inc(); } } } @@ -817,8 +834,6 @@ struct NewWalConnectionCandidate { safekeeper_id: NodeId, wal_source_connconf: PgConnectionConfig, availability_zone: Option, - // This field is used in `derive(Debug)` only. - #[allow(dead_code)] reason: ReconnectReason, } @@ -847,6 +862,18 @@ enum ReconnectReason { }, } +impl ReconnectReason { + fn name(&self) -> &str { + match self { + ReconnectReason::NoExistingConnection => "NoExistingConnection", + ReconnectReason::LaggingWal { .. } => "LaggingWal", + ReconnectReason::SwitchAvailabilityZone => "SwitchAvailabilityZone", + ReconnectReason::NoWalTimeout { .. } => "NoWalTimeout", + ReconnectReason::NoKeepAlives { .. } => "NoKeepAlives", + } + } +} + fn wal_stream_connection_config( TenantTimelineId { tenant_id, diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 801641a534..1cbed3416c 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -24,8 +24,8 @@ use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, trace, warn}; use super::TaskStateUpdate; -use crate::context::RequestContext; use crate::metrics::LIVE_CONNECTIONS_COUNT; +use crate::{context::RequestContext, metrics::WALRECEIVER_STARTED_CONNECTIONS}; use crate::{ task_mgr, task_mgr::TaskKind, @@ -71,6 +71,8 @@ pub(super) async fn handle_walreceiver_connection( ctx: RequestContext, node: NodeId, ) -> anyhow::Result<()> { + WALRECEIVER_STARTED_CONNECTIONS.inc(); + // Connect to the database in replication mode. info!("connecting to {wal_source_connconf:?}"); diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index b6e8497809..393570df6a 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -25,6 +25,7 @@ parking_lot.workspace = true postgres.workspace = true postgres-protocol.workspace = true regex.workspace = true +scopeguard.workspace = true reqwest = { workspace = true, features = ["json"] } serde.workspace = true serde_json.workspace = true diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index 6a98d8fd84..5e25d22ec1 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -14,10 +14,13 @@ use storage_broker::proto::SubscribeSafekeeperInfoRequest; use storage_broker::Request; use std::time::Duration; +use std::time::Instant; use tokio::task::JoinHandle; use tokio::{runtime, time::sleep}; use tracing::*; +use crate::metrics::BROKER_PULLED_UPDATES; +use crate::metrics::BROKER_PUSHED_UPDATES; use crate::GlobalTimelines; use crate::SafeKeeperConf; @@ -49,12 +52,17 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { // is under plain mutex. That's ok, all this code is not performance // sensitive and there is no risk of deadlock as we don't await while // lock is held. + let now = Instant::now(); let mut active_tlis = GlobalTimelines::get_all(); active_tlis.retain(|tli| tli.is_active()); for tli in &active_tlis { let sk_info = tli.get_safekeeper_info(&conf); yield sk_info; + BROKER_PUSHED_UPDATES.inc(); } + let elapsed = now.elapsed(); + // Log duration every second. Should be about 10MB of logs per day. + info!("pushed {} timeline updates to broker in {:?}", active_tlis.len(), elapsed); sleep(push_interval).await; } }; @@ -79,6 +87,10 @@ async fn pull_loop(conf: SafeKeeperConf) -> Result<()> { .context("subscribe_safekeper_info request failed")? .into_inner(); + let ok_counter = BROKER_PULLED_UPDATES.with_label_values(&["ok"]); + let not_found = BROKER_PULLED_UPDATES.with_label_values(&["not_found"]); + let err_counter = BROKER_PULLED_UPDATES.with_label_values(&["error"]); + while let Some(msg) = stream.message().await? { let proto_ttid = msg .tenant_timeline_id @@ -91,7 +103,15 @@ async fn pull_loop(conf: SafeKeeperConf) -> Result<()> { // connection to the broker. // note: there are blocking operations below, but it's considered fine for now - tli.record_safekeeper_info(msg).await? + let res = tli.record_safekeeper_info(msg).await; + if res.is_ok() { + ok_counter.inc(); + } else { + err_counter.inc(); + } + res?; + } else { + not_found.inc(); } } bail!("end of stream"); diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index 2c3d1cea0e..7d25ced449 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -10,7 +10,7 @@ use tracing::{info, info_span, Instrument}; use crate::auth::check_permission; use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage}; -use crate::metrics::TrafficMetrics; +use crate::metrics::{TrafficMetrics, PG_QUERIES_FINISHED, PG_QUERIES_RECEIVED}; use crate::wal_service::ConnectionId; use crate::{GlobalTimelines, SafeKeeperConf}; use postgres_backend::QueryError; @@ -72,6 +72,15 @@ fn parse_cmd(cmd: &str) -> anyhow::Result { } } +fn cmd_to_string(cmd: &SafekeeperPostgresCommand) -> &str { + match cmd { + SafekeeperPostgresCommand::StartWalPush => "START_WAL_PUSH", + SafekeeperPostgresCommand::StartReplication { .. } => "START_REPLICATION", + SafekeeperPostgresCommand::IdentifySystem => "IDENTIFY_SYSTEM", + SafekeeperPostgresCommand::JSONCtrl { .. } => "JSON_CTRL", + } +} + #[async_trait::async_trait] impl postgres_backend::Handler for SafekeeperPostgresHandler @@ -168,6 +177,12 @@ impl postgres_backend::Handler } let cmd = parse_cmd(query_string)?; + let cmd_str = cmd_to_string(&cmd); + + PG_QUERIES_RECEIVED.with_label_values(&[cmd_str]).inc(); + scopeguard::defer! { + PG_QUERIES_FINISHED.with_label_values(&[cmd_str]).inc(); + } info!( "got query {:?} in timeline {:?}", diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index eafee557d7..189af2b044 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -10,7 +10,7 @@ use anyhow::Result; use metrics::{ core::{AtomicU64, Collector, Desc, GenericCounter, GenericGaugeVec, Opts}, proto::MetricFamily, - register_int_counter_vec, Gauge, IntCounterVec, IntGaugeVec, + register_int_counter, register_int_counter_vec, Gauge, IntCounter, IntCounterVec, IntGaugeVec, }; use once_cell::sync::Lazy; @@ -73,6 +73,58 @@ pub static PG_IO_BYTES: Lazy = Lazy::new(|| { ) .expect("Failed to register safekeeper_pg_io_bytes gauge") }); +pub static BROKER_PUSHED_UPDATES: Lazy = Lazy::new(|| { + register_int_counter!( + "safekeeper_broker_pushed_updates_total", + "Number of timeline updates pushed to the broker" + ) + .expect("Failed to register safekeeper_broker_pushed_updates_total counter") +}); +pub static BROKER_PULLED_UPDATES: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "safekeeper_broker_pulled_updates_total", + "Number of timeline updates pulled and processed from the broker", + &["result"] + ) + .expect("Failed to register safekeeper_broker_pulled_updates_total counter") +}); +pub static PG_QUERIES_RECEIVED: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "safekeeper_pg_queries_received_total", + "Number of queries received through pg protocol", + &["query"] + ) + .expect("Failed to register safekeeper_pg_queries_received_total counter") +}); +pub static PG_QUERIES_FINISHED: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "safekeeper_pg_queries_finished_total", + "Number of queries finished through pg protocol", + &["query"] + ) + .expect("Failed to register safekeeper_pg_queries_finished_total counter") +}); +pub static REMOVED_WAL_SEGMENTS: Lazy = Lazy::new(|| { + register_int_counter!( + "safekeeper_removed_wal_segments_total", + "Number of WAL segments removed from the disk" + ) + .expect("Failed to register safekeeper_removed_wal_segments_total counter") +}); +pub static BACKED_UP_SEGMENTS: Lazy = Lazy::new(|| { + register_int_counter!( + "safekeeper_backed_up_segments_total", + "Number of WAL segments backed up to the broker" + ) + .expect("Failed to register safekeeper_backed_up_segments_total counter") +}); +pub static BACKUP_ERRORS: Lazy = Lazy::new(|| { + register_int_counter!( + "safekeeper_backup_errors_total", + "Number of errors during backup" + ) + .expect("Failed to register safekeeper_backup_errors_total counter") +}); pub const LABEL_UNKNOWN: &str = "unknown"; diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 163ac99be8..953c7d0022 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -25,6 +25,7 @@ use tracing::*; use utils::{id::TenantTimelineId, lsn::Lsn}; +use crate::metrics::{BACKED_UP_SEGMENTS, BACKUP_ERRORS}; use crate::timeline::{PeerInfo, Timeline}; use crate::{GlobalTimelines, SafeKeeperConf}; @@ -394,7 +395,13 @@ async fn backup_single_segment( ) })?; - backup_object(&segment_file_path, &remote_segment_path, seg.size()).await?; + let res = backup_object(&segment_file_path, &remote_segment_path, seg.size()).await; + if res.is_ok() { + BACKED_UP_SEGMENTS.inc(); + } else { + BACKUP_ERRORS.inc(); + } + res?; debug!("Backup of {} done", segment_file_path.display()); Ok(()) diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 5ef22b2f6a..1b82bd754e 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -27,7 +27,7 @@ use tracing::*; use utils::{id::TenantTimelineId, lsn::Lsn}; -use crate::metrics::{time_io_closure, WalStorageMetrics}; +use crate::metrics::{time_io_closure, WalStorageMetrics, REMOVED_WAL_SEGMENTS}; use crate::safekeeper::SafeKeeperState; use crate::wal_backup::read_object; @@ -455,6 +455,7 @@ fn remove_segments_from_disk( n_removed += 1; min_removed = min(min_removed, segno); max_removed = max(max_removed, segno); + REMOVED_WAL_SEGMENTS.inc(); } } } From b114ef26c25bc0cb7fd93de96efd2d10726096e9 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 3 May 2023 15:38:49 +0100 Subject: [PATCH 10/19] GitHub Autocomment: add a note if no tests were run (#4109) - Always (if not cancelled) add a comment to a PR - Mention in the comment if no tests were run / reports were not generated. --- .github/workflows/build_and_test.yml | 5 +---- scripts/pr-comment-test-report.js | 4 ++-- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index e5ba7aa3eb..90f0395c7c 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -418,10 +418,7 @@ jobs: - uses: actions/github-script@v6 if: > !cancelled() && - github.event_name == 'pull_request' && ( - steps.create-allure-report-debug.outputs.report-url || - steps.create-allure-report-release.outputs.report-url - ) + github.event_name == 'pull_request' with: # Retry script for 5XX server errors: https://github.com/actions/github-script#retries retries: 5 diff --git a/scripts/pr-comment-test-report.js b/scripts/pr-comment-test-report.js index 8df7248c4e..21386d3898 100644 --- a/scripts/pr-comment-test-report.js +++ b/scripts/pr-comment-test-report.js @@ -37,7 +37,7 @@ module.exports = async ({ github, context, fetch, reports }) => { const {buildType, reportUrl, jsonUrl} = report if (!reportUrl || !jsonUrl) { - console.warn(`"reportUrl" or "jsonUrl" aren't set for ${buildType} build`) + commentBody += `#### ${buildType} build: no tests were run or test report is not available\n` continue } @@ -78,7 +78,7 @@ module.exports = async ({ github, context, fetch, reports }) => { } const totalTestsCount = failedTests.length + passedTests.length + skippedTests.length - commentBody += `#### ${buildType} build: ${totalTestsCount} tests run: ${passedTests.length} passed, ${failedTests.length} failed, ${skippedTests.length} ([full report](${reportUrl}))\n` + commentBody += `#### ${buildType} build: ${totalTestsCount} tests run: ${passedTests.length} passed, ${failedTests.length} failed, ${skippedTests.length} skipped ([full report](${reportUrl}))\n` if (failedTests.length > 0) { commentBody += `Failed tests:\n` for (const test of failedTests) { From ce1bbc9fa76145f4b0d7ced59ff60f9b8d25d924 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Thu, 4 May 2023 00:07:45 +0300 Subject: [PATCH 11/19] Always send the latest commit_lsn in send_wal (#4150) When a new connection is established to the safekeeper, the 'end_pos' field is initially set to Lsn::INVALID (i.e 0/0). If there is no WAL to send to the client, we send KeepAlive messages with Lsn::INVALID. That confuses the pageserver: it thinks that safekeeper is lagging very much behind the tip of the branch, and will reconnect to a different safekeeper. Then the same thing happens with the new safekeeper, until some WAL is streamed which sets 'end_pos' to a valid value. This fix always sets `end_pos` to the most recent `commit_lsn` value. This is useful to send the latest `commit_lsn` to the receiver, so it will know how advanced this safekeeper compared to the others. Fixes https://github.com/neondatabase/neon/issues/3972 Supersedes https://github.com/neondatabase/neon/pull/4144 --- safekeeper/src/send_wal.rs | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 6b303eb0fe..f502500e7d 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -384,6 +384,8 @@ impl SafekeeperPostgresHandler { self.appname.clone(), )); + let commit_lsn_watch_rx = tli.get_commit_lsn_watch_rx(); + // Walproposer gets special handling: safekeeper must give proposer all // local WAL till the end, whether committed or not (walproposer will // hang otherwise). That's because walproposer runs the consensus and @@ -399,7 +401,14 @@ impl SafekeeperPostgresHandler { } else { None }; - let end_pos = stop_pos.unwrap_or(Lsn::INVALID); + + // take the latest commit_lsn if don't have stop_pos + let mut end_pos = stop_pos.unwrap_or(*commit_lsn_watch_rx.borrow()); + + if end_pos < start_pos { + warn!("start_pos {} is ahead of end_pos {}", start_pos, end_pos); + end_pos = start_pos; + } info!( "starting streaming from {:?} till {:?}", @@ -429,7 +438,7 @@ impl SafekeeperPostgresHandler { start_pos, end_pos, stop_pos, - commit_lsn_watch_rx: tli.get_commit_lsn_watch_rx(), + commit_lsn_watch_rx, ws_guard: ws_guard.clone(), wal_reader, send_buf: [0; MAX_SEND_SIZE], @@ -456,6 +465,11 @@ struct WalSender<'a, IO> { // Position since which we are sending next chunk. start_pos: Lsn, // WAL up to this position is known to be locally available. + // Usually this is the same as the latest commit_lsn, but in case of + // walproposer recovery, this is flush_lsn. + // + // We send this LSN to the receiver as wal_end, so that it knows how much + // WAL this safekeeper has. This LSN should be as fresh as possible. end_pos: Lsn, // If present, terminate after reaching this position; used by walproposer // in recovery. @@ -530,10 +544,18 @@ impl WalSender<'_, IO> { /// exit in the meanwhile async fn wait_wal(&mut self) -> Result<(), CopyStreamHandlerEnd> { loop { + self.end_pos = *self.commit_lsn_watch_rx.borrow(); + if self.end_pos > self.start_pos { + // We have something to send. + return Ok(()); + } + + // Wait for WAL to appear, now self.end_pos == self.start_pos. if let Some(lsn) = wait_for_lsn(&mut self.commit_lsn_watch_rx, self.start_pos).await? { self.end_pos = lsn; return Ok(()); } + // Timed out waiting for WAL, check for termination and send KA if let Some(remote_consistent_lsn) = self .ws_guard @@ -618,11 +640,6 @@ const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(1); /// - Ok(None) if timeout expired; /// - Err in case of error (if watch channel is in trouble, shouldn't happen). async fn wait_for_lsn(rx: &mut Receiver, lsn: Lsn) -> anyhow::Result> { - let commit_lsn: Lsn = *rx.borrow(); - if commit_lsn > lsn { - return Ok(Some(commit_lsn)); - } - let res = timeout(POLL_STATE_TIMEOUT, async move { let mut commit_lsn; loop { From f9839a0dd931181dd849eb4ee2987fcc93d5e61a Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 4 May 2023 09:23:49 +0200 Subject: [PATCH 12/19] import_basebackup_from_tar: don't load local layers twice (#4111) PR #4104 removed these bits as part of a revert of a larger change. follow-up to https://github.com/neondatabase/neon/pull/4104#discussion_r1180444952 --- Let's not merge this before the release. --- pageserver/src/tenant.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 5cfc466111..d69d5e4b45 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -271,7 +271,10 @@ impl UninitializedTimeline<'_> { .await .context("Failed to flush after basebackup import")?; - self.initialize(ctx) + // Initialize without loading the layer map. We started with an empty layer map, and already + // updated it for the layers that we created during the import. + let mut timelines = self.owning_tenant.timelines.lock().unwrap(); + self.initialize_with_lock(ctx, &mut timelines, false, true) } fn raw_timeline(&self) -> anyhow::Result<&Arc> { @@ -2352,6 +2355,8 @@ impl Tenant { ) })?; + // Initialize the timeline without loading the layer map, because we already updated the layer + // map above, when we imported the datadir. let timeline = { let mut timelines = self.timelines.lock().unwrap(); raw_timeline.initialize_with_lock(ctx, &mut timelines, false, true)? From b5d64a1e32884e618b06fc9bf051947fe1cd21df Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 4 May 2023 14:41:15 +0300 Subject: [PATCH 13/19] Rename field, to match field name in XLogData struct and in rust-postgres (#4149) The field means the same thing as the `wal_end` field in the XLogData struct. And in the postgres-protocol crate's corresponding PrimaryKeepAlive struct, it's also called `wal_end`. Let's be consistent. As noted by Arthur at https://github.com/neondatabase/neon/pull/4144#pullrequestreview-1411031881 --- libs/pq_proto/src/lib.rs | 4 ++-- safekeeper/src/send_wal.rs | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index 2143ad2530..8e361b757c 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -613,7 +613,7 @@ pub struct XLogDataBody<'a> { #[derive(Debug)] pub struct WalSndKeepAlive { - pub sent_ptr: u64, + pub wal_end: u64, // current end of WAL on the server pub timestamp: i64, pub request_reply: bool, } @@ -924,7 +924,7 @@ impl<'a> BeMessage<'a> { buf.put_u8(b'd'); write_body(buf, |buf| { buf.put_u8(b'k'); - buf.put_u64(req.sent_ptr); + buf.put_u64(req.wal_end); buf.put_i64(req.timestamp); buf.put_u8(u8::from(req.request_reply)); }); diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index f502500e7d..fb420cba64 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -573,7 +573,7 @@ impl WalSender<'_, IO> { self.pgb .write_message(&BeMessage::KeepAlive(WalSndKeepAlive { - sent_ptr: self.end_pos.0, + wal_end: self.end_pos.0, timestamp: get_current_timestamp(), request_reply: true, })) From 7dd9553bbb68f023f69b77d351ac3a66031b3d80 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 4 May 2023 16:16:48 +0200 Subject: [PATCH 14/19] eviction: regression test + distinguish layer write from map insert (#4005) This patch adds a regression test for the threshold-based layer eviction. The test asserts the basic invariant that, if left alone, the residence statuses will stabilize, with some layers resident and some layers evicted. Thereby, we cover both the aspect of last-access-time-threshold-based eviction, and the "imitate access" hacks that we put in recently. The aggressive `period` and `threshold` values revealed a subtle bug which is also fixed in this patch. The symptom was that, without the Rust changes of this patch, there would be occasional test failures due to `WARN... unexpectedly downloading` log messages. These log messages were caused by the "imitate access" calls of the eviction task. But, the whole point of the "imitate access" hack was to prevent eviction of the layers that we access there. After some digging, I found the root cause, which is the following race condition: 1. Compact: Write out an L1 layer from several L0 layers. This records residence event `LayerCreate` with the current timestamp. 2. Eviction: imitate access logical size calculation. This accesses the L0 layers because the L1 layer is not yet in the layer map. 3. Compact: Grab layer map lock, add the new L1 to layer map and remove the L0s, release layer map lock. 4. Eviction: observes the new L1 layer whose only activity timestamp is the `LayerCreate` event. The L1 layer had no chance of being accessed until after (3). So, if enough time passes between (1) and (3), then (4) will observe a layer with `now-last_activity > threshold` and evict it The fix is to require the first `record_residence_event` to happen while we already hold the layer map lock. The API requires a ref to a `BatchedUpdates` as a witness that we are inside a layer map lock. That is not fool-proof, e.g., new call sites for `insert_historic` could just completely forget to record the residence event. It would be nice to prevent this at the type level. In the meantime, we have a rate-limited log messages to warn us, if such an implementation error sneaks in in the future. fixes https://github.com/neondatabase/neon/issues/3593 fixes https://github.com/neondatabase/neon/issues/3942 --------- Co-authored-by: Joonas Koivunen --- libs/utils/src/lib.rs | 2 + libs/utils/src/rate_limit.rs | 66 +++++++ pageserver/src/tenant/storage_layer.rs | 116 +++++++++--- .../src/tenant/storage_layer/delta_layer.rs | 6 +- .../src/tenant/storage_layer/image_layer.rs | 6 +- .../src/tenant/storage_layer/remote_layer.rs | 21 +- pageserver/src/tenant/timeline.rs | 61 ++++-- .../src/tenant/timeline/eviction_task.rs | 9 +- test_runner/fixtures/neon_fixtures.py | 6 + test_runner/regress/test_metric_collection.py | 7 - .../regress/test_threshold_based_eviction.py | 179 ++++++++++++++++++ 11 files changed, 415 insertions(+), 64 deletions(-) create mode 100644 libs/utils/src/rate_limit.rs create mode 100644 test_runner/regress/test_threshold_based_eviction.py diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 82701ed4b0..0615d50c1a 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -58,6 +58,8 @@ pub mod pageserver_feedback; pub mod tracing_span_assert; +pub mod rate_limit; + /// use with fail::cfg("$name", "return(2000)") #[macro_export] macro_rules! failpoint_sleep_millis_async { diff --git a/libs/utils/src/rate_limit.rs b/libs/utils/src/rate_limit.rs new file mode 100644 index 0000000000..557955bb88 --- /dev/null +++ b/libs/utils/src/rate_limit.rs @@ -0,0 +1,66 @@ +//! A helper to rate limit operations. + +use std::time::{Duration, Instant}; + +pub struct RateLimit { + last: Option, + interval: Duration, +} + +impl RateLimit { + pub fn new(interval: Duration) -> Self { + Self { + last: None, + interval, + } + } + + /// Call `f` if the rate limit allows. + /// Don't call it otherwise. + pub fn call(&mut self, f: F) { + let now = Instant::now(); + match self.last { + Some(last) if now - last <= self.interval => { + // ratelimit + } + _ => { + self.last = Some(now); + f(); + } + } + } +} + +#[cfg(test)] +mod tests { + use std::sync::atomic::AtomicUsize; + + #[test] + fn basics() { + use super::RateLimit; + use std::sync::atomic::Ordering::Relaxed; + use std::time::Duration; + + let called = AtomicUsize::new(0); + let mut f = RateLimit::new(Duration::from_millis(100)); + + let cl = || { + called.fetch_add(1, Relaxed); + }; + + f.call(cl); + assert_eq!(called.load(Relaxed), 1); + f.call(cl); + assert_eq!(called.load(Relaxed), 1); + f.call(cl); + assert_eq!(called.load(Relaxed), 1); + std::thread::sleep(Duration::from_millis(100)); + f.call(cl); + assert_eq!(called.load(Relaxed), 2); + f.call(cl); + assert_eq!(called.load(Relaxed), 2); + std::thread::sleep(Duration::from_millis(100)); + f.call(cl); + assert_eq!(called.load(Relaxed), 3); + } +} diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 2ee723e7c3..d30d6c5c6e 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -13,9 +13,9 @@ use crate::task_mgr::TaskKind; use crate::walrecord::NeonWalRecord; use anyhow::Result; use bytes::Bytes; -use either::Either; use enum_map::EnumMap; use enumset::EnumSet; +use once_cell::sync::Lazy; use pageserver_api::models::LayerAccessKind; use pageserver_api::models::{ HistoricLayerInfo, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus, @@ -23,8 +23,10 @@ use pageserver_api::models::{ use std::ops::Range; use std::path::PathBuf; use std::sync::{Arc, Mutex}; -use std::time::{SystemTime, UNIX_EPOCH}; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; +use tracing::warn; use utils::history_buffer::HistoryBufferWithDropCounter; +use utils::rate_limit::RateLimit; use utils::{ id::{TenantId, TimelineId}, @@ -37,6 +39,8 @@ pub use image_layer::{ImageLayer, ImageLayerWriter}; pub use inmemory_layer::InMemoryLayer; pub use remote_layer::RemoteLayer; +use super::layer_map::BatchedUpdates; + pub fn range_overlaps(a: &Range, b: &Range) -> bool where T: PartialOrd, @@ -158,41 +162,82 @@ impl LayerAccessStatFullDetails { } impl LayerAccessStats { - pub(crate) fn for_loading_layer(status: LayerResidenceStatus) -> Self { - let new = LayerAccessStats(Mutex::new(LayerAccessStatsLocked::default())); - new.record_residence_event(status, LayerResidenceEventReason::LayerLoad); - new + /// Create an empty stats object. + /// + /// The caller is responsible for recording a residence event + /// using [`record_residence_event`] before calling `latest_activity`. + /// If they don't, [`latest_activity`] will return `None`. + pub(crate) fn empty_will_record_residence_event_later() -> Self { + LayerAccessStats(Mutex::default()) } - pub(crate) fn for_new_layer_file() -> Self { + /// Create an empty stats object and record a [`LayerLoad`] event with the given residence status. + /// + /// See [`record_residence_event`] for why you need to do this while holding the layer map lock. + pub(crate) fn for_loading_layer( + layer_map_lock_held_witness: &BatchedUpdates<'_, L>, + status: LayerResidenceStatus, + ) -> Self + where + L: ?Sized + Layer, + { let new = LayerAccessStats(Mutex::new(LayerAccessStatsLocked::default())); new.record_residence_event( - LayerResidenceStatus::Resident, - LayerResidenceEventReason::LayerCreate, + layer_map_lock_held_witness, + status, + LayerResidenceEventReason::LayerLoad, ); new } /// Creates a clone of `self` and records `new_status` in the clone. - /// The `new_status` is not recorded in `self` - pub(crate) fn clone_for_residence_change( + /// + /// The `new_status` is not recorded in `self`. + /// + /// See [`record_residence_event`] for why you need to do this while holding the layer map lock. + pub(crate) fn clone_for_residence_change( &self, + layer_map_lock_held_witness: &BatchedUpdates<'_, L>, new_status: LayerResidenceStatus, - ) -> LayerAccessStats { + ) -> LayerAccessStats + where + L: ?Sized + Layer, + { let clone = { let inner = self.0.lock().unwrap(); inner.clone() }; let new = LayerAccessStats(Mutex::new(clone)); - new.record_residence_event(new_status, LayerResidenceEventReason::ResidenceChange); + new.record_residence_event( + layer_map_lock_held_witness, + new_status, + LayerResidenceEventReason::ResidenceChange, + ); new } - fn record_residence_event( + /// Record a change in layer residency. + /// + /// Recording the event must happen while holding the layer map lock to + /// ensure that latest-activity-threshold-based layer eviction (eviction_task.rs) + /// can do an "imitate access" to this layer, before it observes `now-latest_activity() > threshold`. + /// + /// If we instead recorded the residence event with a timestamp from before grabbing the layer map lock, + /// the following race could happen: + /// + /// - Compact: Write out an L1 layer from several L0 layers. This records residence event LayerCreate with the current timestamp. + /// - Eviction: imitate access logical size calculation. This accesses the L0 layers because the L1 layer is not yet in the layer map. + /// - Compact: Grab layer map lock, add the new L1 to layer map and remove the L0s, release layer map lock. + /// - Eviction: observes the new L1 layer whose only activity timestamp is the LayerCreate event. + /// + pub(crate) fn record_residence_event( &self, + _layer_map_lock_held_witness: &BatchedUpdates<'_, L>, status: LayerResidenceStatus, reason: LayerResidenceEventReason, - ) { + ) where + L: ?Sized + Layer, + { let mut locked = self.0.lock().unwrap(); locked.iter_mut().for_each(|inner| { inner @@ -255,24 +300,37 @@ impl LayerAccessStats { ret } - fn most_recent_access_or_residence_event( - &self, - ) -> Either { + /// Get the latest access timestamp, falling back to latest residence event. + /// + /// This function can only return `None` if there has not yet been a call to the + /// [`record_residence_event`] method. That would generally be considered an + /// implementation error. This function logs a rate-limited warning in that case. + /// + /// TODO: use type system to avoid the need for `fallback`. + /// The approach in https://github.com/neondatabase/neon/pull/3775 + /// could be used to enforce that a residence event is recorded + /// before a layer is added to the layer map. We could also have + /// a layer wrapper type that holds the LayerAccessStats, and ensure + /// that that type can only be produced by inserting into the layer map. + pub(crate) fn latest_activity(&self) -> Option { let locked = self.0.lock().unwrap(); let inner = &locked.for_eviction_policy; match inner.last_accesses.recent() { - Some(a) => Either::Left(*a), + Some(a) => Some(a.when), None => match inner.last_residence_changes.recent() { - Some(e) => Either::Right(e.clone()), - None => unreachable!("constructors for LayerAccessStats ensure that there's always a residence change event"), - } - } - } - - pub(crate) fn latest_activity(&self) -> SystemTime { - match self.most_recent_access_or_residence_event() { - Either::Left(mra) => mra.when, - Either::Right(re) => re.timestamp, + Some(e) => Some(e.timestamp), + None => { + static WARN_RATE_LIMIT: Lazy> = + Lazy::new(|| Mutex::new((0, RateLimit::new(Duration::from_secs(10))))); + let mut guard = WARN_RATE_LIMIT.lock().unwrap(); + guard.0 += 1; + let occurences = guard.0; + guard.1.call(move || { + warn!(parent: None, occurences, "latest_activity not available, this is an implementation bug, using fallback value"); + }); + None + } + }, } } } diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 98cbcc5f07..ba3ab6dd4c 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -57,7 +57,7 @@ use utils::{ use super::{ DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, LayerFileName, LayerIter, - LayerKeyIter, LayerResidenceStatus, PathOrConf, + LayerKeyIter, PathOrConf, }; /// @@ -637,7 +637,7 @@ impl DeltaLayer { key_range: summary.key_range, lsn_range: summary.lsn_range, file_size: metadata.len(), - access_stats: LayerAccessStats::for_loading_layer(LayerResidenceStatus::Resident), + access_stats: LayerAccessStats::empty_will_record_residence_event_later(), inner: RwLock::new(DeltaLayerInner { loaded: false, file: None, @@ -808,7 +808,7 @@ impl DeltaLayerWriterInner { key_range: self.key_start..key_end, lsn_range: self.lsn_range.clone(), file_size: metadata.len(), - access_stats: LayerAccessStats::for_new_layer_file(), + access_stats: LayerAccessStats::empty_will_record_residence_event_later(), inner: RwLock::new(DeltaLayerInner { loaded: false, file: None, diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index a99b1b491f..d298b3e852 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -53,7 +53,7 @@ use utils::{ }; use super::filename::{ImageFileName, LayerFileName}; -use super::{Layer, LayerAccessStatsReset, LayerIter, LayerResidenceStatus, PathOrConf}; +use super::{Layer, LayerAccessStatsReset, LayerIter, PathOrConf}; /// /// Header stored in the beginning of the file @@ -438,7 +438,7 @@ impl ImageLayer { key_range: summary.key_range, lsn: summary.lsn, file_size: metadata.len(), - access_stats: LayerAccessStats::for_loading_layer(LayerResidenceStatus::Resident), + access_stats: LayerAccessStats::empty_will_record_residence_event_later(), inner: RwLock::new(ImageLayerInner { file: None, loaded: false, @@ -598,7 +598,7 @@ impl ImageLayerWriterInner { key_range: self.key_range.clone(), lsn: self.lsn, file_size: metadata.len(), - access_stats: LayerAccessStats::for_new_layer_file(), + access_stats: LayerAccessStats::empty_will_record_residence_event_later(), inner: RwLock::new(ImageLayerInner { loaded: false, file: None, diff --git a/pageserver/src/tenant/storage_layer/remote_layer.rs b/pageserver/src/tenant/storage_layer/remote_layer.rs index 2eb7eb0cb6..2106587ab2 100644 --- a/pageserver/src/tenant/storage_layer/remote_layer.rs +++ b/pageserver/src/tenant/storage_layer/remote_layer.rs @@ -4,6 +4,7 @@ use crate::config::PageServerConf; use crate::context::RequestContext; use crate::repository::Key; +use crate::tenant::layer_map::BatchedUpdates; use crate::tenant::remote_timeline_client::index::LayerFileMetadata; use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}; use anyhow::{bail, Result}; @@ -246,11 +247,15 @@ impl RemoteLayer { } /// Create a Layer struct representing this layer, after it has been downloaded. - pub fn create_downloaded_layer( + pub fn create_downloaded_layer( &self, + layer_map_lock_held_witness: &BatchedUpdates<'_, L>, conf: &'static PageServerConf, file_size: u64, - ) -> Arc { + ) -> Arc + where + L: ?Sized + Layer, + { if self.is_delta { let fname = DeltaFileName { key_range: self.key_range.clone(), @@ -262,8 +267,10 @@ impl RemoteLayer { self.tenantid, &fname, file_size, - self.access_stats - .clone_for_residence_change(LayerResidenceStatus::Resident), + self.access_stats.clone_for_residence_change( + layer_map_lock_held_witness, + LayerResidenceStatus::Resident, + ), )) } else { let fname = ImageFileName { @@ -276,8 +283,10 @@ impl RemoteLayer { self.tenantid, &fname, file_size, - self.access_stats - .clone_for_residence_change(LayerResidenceStatus::Resident), + self.access_stats.clone_for_residence_change( + layer_map_lock_held_witness, + LayerResidenceStatus::Resident, + ), )) } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index bc55c2091c..90f2951aef 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -11,7 +11,8 @@ use itertools::Itertools; use once_cell::sync::OnceCell; use pageserver_api::models::{ DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, - DownloadRemoteLayersTaskState, LayerMapInfo, LayerResidenceStatus, TimelineState, + DownloadRemoteLayersTaskState, LayerMapInfo, LayerResidenceEventReason, LayerResidenceStatus, + TimelineState, }; use remote_storage::GenericRemoteStorage; use storage_broker::BrokerClientChannel; @@ -1111,7 +1112,7 @@ impl Timeline { &layer_metadata, local_layer .access_stats() - .clone_for_residence_change(LayerResidenceStatus::Evicted), + .clone_for_residence_change(batch_updates, LayerResidenceStatus::Evicted), ), LayerFileName::Delta(delta_name) => RemoteLayer::new_delta( self.tenant_id, @@ -1120,7 +1121,7 @@ impl Timeline { &layer_metadata, local_layer .access_stats() - .clone_for_residence_change(LayerResidenceStatus::Evicted), + .clone_for_residence_change(batch_updates, LayerResidenceStatus::Evicted), ), }); @@ -1489,7 +1490,7 @@ impl Timeline { self.tenant_id, &imgfilename, file_size, - LayerAccessStats::for_loading_layer(LayerResidenceStatus::Resident), + LayerAccessStats::for_loading_layer(&updates, LayerResidenceStatus::Resident), ); trace!("found layer {}", layer.path().display()); @@ -1521,7 +1522,7 @@ impl Timeline { self.tenant_id, &deltafilename, file_size, - LayerAccessStats::for_loading_layer(LayerResidenceStatus::Resident), + LayerAccessStats::for_loading_layer(&updates, LayerResidenceStatus::Resident), ); trace!("found layer {}", layer.path().display()); @@ -1657,7 +1658,10 @@ impl Timeline { self.timeline_id, imgfilename, &remote_layer_metadata, - LayerAccessStats::for_loading_layer(LayerResidenceStatus::Evicted), + LayerAccessStats::for_loading_layer( + &updates, + LayerResidenceStatus::Evicted, + ), ); let remote_layer = Arc::new(remote_layer); @@ -1682,7 +1686,10 @@ impl Timeline { self.timeline_id, deltafilename, &remote_layer_metadata, - LayerAccessStats::for_loading_layer(LayerResidenceStatus::Evicted), + LayerAccessStats::for_loading_layer( + &updates, + LayerResidenceStatus::Evicted, + ), ); let remote_layer = Arc::new(remote_layer); updates.insert_historic(remote_layer); @@ -2729,11 +2736,16 @@ impl Timeline { ])?; // Add it to the layer map - self.layers - .write() - .unwrap() - .batch_update() - .insert_historic(Arc::new(new_delta)); + let l = Arc::new(new_delta); + let mut layers = self.layers.write().unwrap(); + let mut batch_updates = layers.batch_update(); + l.access_stats().record_residence_event( + &batch_updates, + LayerResidenceStatus::Resident, + LayerResidenceEventReason::LayerCreate, + ); + batch_updates.insert_historic(l); + batch_updates.flush(); // update the timeline's physical size let sz = new_delta_path.metadata()?.len(); @@ -2938,7 +2950,13 @@ impl Timeline { self.metrics .resident_physical_size_gauge .add(metadata.len()); - updates.insert_historic(Arc::new(l)); + let l = Arc::new(l); + l.access_stats().record_residence_event( + &updates, + LayerResidenceStatus::Resident, + LayerResidenceEventReason::LayerCreate, + ); + updates.insert_historic(l); } updates.flush(); drop(layers); @@ -3371,6 +3389,11 @@ impl Timeline { new_layer_paths.insert(new_delta_path, LayerFileMetadata::new(metadata.len())); let x: Arc = Arc::new(l); + x.access_stats().record_residence_event( + &updates, + LayerResidenceStatus::Resident, + LayerResidenceEventReason::LayerCreate, + ); updates.insert_historic(x); } @@ -3881,9 +3904,9 @@ impl Timeline { // Download complete. Replace the RemoteLayer with the corresponding // Delta- or ImageLayer in the layer map. - let new_layer = remote_layer.create_downloaded_layer(self_clone.conf, *size); let mut layers = self_clone.layers.write().unwrap(); let mut updates = layers.batch_update(); + let new_layer = remote_layer.create_downloaded_layer(&updates, self_clone.conf, *size); { use crate::tenant::layer_map::Replacement; let l: Arc = remote_layer.clone(); @@ -4155,7 +4178,15 @@ impl Timeline { continue; } - let last_activity_ts = l.access_stats().latest_activity(); + let last_activity_ts = l + .access_stats() + .latest_activity() + .unwrap_or_else(|| { + // We only use this fallback if there's an implementation error. + // `latest_activity` already does rate-limited warn!() log. + debug!(layer=%l.filename().file_name(), "last_activity returns None, using SystemTime::now"); + SystemTime::now() + }); resident_layers.push(LocalLayerInfoForDiskUsageEviction { layer: l, diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index cf799a8808..58321762ea 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -184,7 +184,14 @@ impl Timeline { if hist_layer.is_remote_layer() { continue; } - let last_activity_ts = hist_layer.access_stats().latest_activity(); + + let last_activity_ts = hist_layer.access_stats().latest_activity().unwrap_or_else(|| { + // We only use this fallback if there's an implementation error. + // `latest_activity` already does rate-limited warn!() log. + debug!(layer=%hist_layer.filename().file_name(), "last_activity returns None, using SystemTime::now"); + SystemTime::now() + }); + let no_activity_for = match now.duration_since(last_activity_ts) { Ok(d) => d, Err(_e) => { diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 79b2e5b290..93fdc60900 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -292,6 +292,12 @@ def port_distributor(worker_base_port: int) -> PortDistributor: return PortDistributor(base_port=worker_base_port, port_number=WORKER_PORT_NUM) +@pytest.fixture(scope="session") +def httpserver_listen_address(port_distributor: PortDistributor): + port = port_distributor.get_port() + return ("localhost", port) + + @pytest.fixture(scope="function") def default_broker( port_distributor: PortDistributor, diff --git a/test_runner/regress/test_metric_collection.py b/test_runner/regress/test_metric_collection.py index df542fb84a..1231188896 100644 --- a/test_runner/regress/test_metric_collection.py +++ b/test_runner/regress/test_metric_collection.py @@ -24,13 +24,6 @@ from pytest_httpserver import HTTPServer from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response - -@pytest.fixture(scope="session") -def httpserver_listen_address(port_distributor: PortDistributor): - port = port_distributor.get_port() - return ("localhost", port) - - # ============================================================================== # Storage metrics tests # ============================================================================== diff --git a/test_runner/regress/test_threshold_based_eviction.py b/test_runner/regress/test_threshold_based_eviction.py new file mode 100644 index 0000000000..c7083d92be --- /dev/null +++ b/test_runner/regress/test_threshold_based_eviction.py @@ -0,0 +1,179 @@ +import time +from dataclasses import dataclass +from typing import List, Set, Tuple + +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + PgBin, + RemoteStorageKind, + last_flush_lsn_upload, +) +from fixtures.pageserver.http import LayerMapInfo +from fixtures.types import TimelineId +from pytest_httpserver import HTTPServer + +# NB: basic config change tests are in test_tenant_conf.py + + +def test_threshold_based_eviction( + request, + httpserver: HTTPServer, + httpserver_listen_address, + pg_bin: PgBin, + neon_env_builder: NeonEnvBuilder, +): + neon_env_builder.enable_remote_storage(RemoteStorageKind.LOCAL_FS, f"{request.node.name}") + + # Start with metrics collection enabled, so that the eviction task + # imitates its accesses. We'll use a non-existent endpoint to make it fail. + # The synthetic size calculation will run regardless. + host, port = httpserver_listen_address + neon_env_builder.pageserver_config_override = f""" + metric_collection_interval="1s" + synthetic_size_calculation_interval="2s" + metric_collection_endpoint="http://{host}:{port}/nonexistent" + """ + metrics_refused_log_line = ".*metrics endpoint refused the sent metrics.*/nonexistent.*" + env = neon_env_builder.init_start() + env.pageserver.allowed_errors.append(metrics_refused_log_line) + + tenant_id, timeline_id = env.initial_tenant, env.initial_timeline + assert isinstance(timeline_id, TimelineId) + + ps_http = env.pageserver.http_client() + assert ps_http.tenant_config(tenant_id).effective_config["eviction_policy"] == { + "kind": "NoEviction" + } + + eviction_threshold = 5 + eviction_period = 1 + ps_http.set_tenant_config( + tenant_id, + { + "eviction_policy": { + "kind": "LayerAccessThreshold", + "threshold": f"{eviction_threshold}s", + "period": f"{eviction_period}s", + }, + }, + ) + assert ps_http.tenant_config(tenant_id).effective_config["eviction_policy"] == { + "kind": "LayerAccessThreshold", + "threshold": f"{eviction_threshold}s", + "period": f"{eviction_period}s", + } + + # restart because changing tenant config is not instant + env.pageserver.stop() + env.pageserver.start() + + assert ps_http.tenant_config(tenant_id).effective_config["eviction_policy"] == { + "kind": "LayerAccessThreshold", + "threshold": f"{eviction_threshold}s", + "period": f"{eviction_period}s", + } + + # create a bunch of L1s, only the least of which will need to be resident + compaction_threshold = 3 # create L1 layers quickly + ps_http.patch_tenant_config_client_side( + tenant_id, + inserts={ + # Disable gc and compaction to avoid on-demand downloads from their side. + # The only on-demand downloads should be from the eviction tasks's "imitate access" functions. + "gc_period": "0s", + "compaction_period": "0s", + # low checkpoint_distance so that pgbench creates many layers + "checkpoint_distance": 1024**2, + # Low compaction target size to create many L1's with tight key ranges. + # This is so that the "imitate access" don't download all the layers. + "compaction_target_size": 1 * 1024**2, # all keys into one L1 + # Turn L0's into L1's fast. + "compaction_threshold": compaction_threshold, + # Prevent compaction from collapsing L1 delta layers into image layers. We want many layers here. + "image_creation_threshold": 100, + # Much larger so that synthetic size caluclation worker, which is part of metric collection, + # computes logical size for initdb_lsn every time, instead of some moving lsn as we insert data. + # This makes the set of downloaded layers predictable, + # thereby allowing the residence statuses to stabilize below. + "gc_horizon": 1024**4, + }, + ) + + # create a bunch of layers + with env.endpoints.create_start("main", tenant_id=tenant_id) as pg: + pg_bin.run(["pgbench", "-i", "-s", "3", pg.connstr()]) + last_flush_lsn_upload(env, pg, tenant_id, timeline_id) + # wrap up and shutdown safekeepers so that no more layers will be created after the final checkpoint + for sk in env.safekeepers: + sk.stop() + ps_http.timeline_checkpoint(tenant_id, timeline_id) + + # wait for evictions and assert that they stabilize + @dataclass + class ByLocalAndRemote: + remote_layers: Set[str] + local_layers: Set[str] + + class MapInfoProjection: + def __init__(self, info: LayerMapInfo): + self.info = info + + def by_local_and_remote(self) -> ByLocalAndRemote: + return ByLocalAndRemote( + remote_layers={ + layer.layer_file_name for layer in self.info.historic_layers if layer.remote + }, + local_layers={ + layer.layer_file_name for layer in self.info.historic_layers if not layer.remote + }, + ) + + def __eq__(self, other): + if not isinstance(other, MapInfoProjection): + return False + return self.by_local_and_remote() == other.by_local_and_remote() + + def __repr__(self) -> str: + out = ["MapInfoProjection:"] + for layer in sorted(self.info.historic_layers, key=lambda layer: layer.layer_file_name): + remote = "R" if layer.remote else "L" + out += [f" {remote} {layer.layer_file_name}"] + return "\n".join(out) + + observation_window = 8 * eviction_threshold + consider_stable_when_no_change_for_seconds = 3 * eviction_threshold + poll_interval = eviction_threshold / 3 + started_waiting_at = time.time() + map_info_changes: List[Tuple[float, MapInfoProjection]] = [] + while time.time() - started_waiting_at < observation_window: + current = ( + time.time(), + MapInfoProjection(ps_http.layer_map_info(tenant_id, timeline_id)), + ) + last = map_info_changes[-1] if map_info_changes else (0, None) + if last[1] is None or current[1] != last[1]: + map_info_changes.append(current) + log.info("change in layer map\n before: %s\n after: %s", last, current) + else: + stable_for = current[0] - last[0] + log.info("residencies stable for %s", stable_for) + if stable_for > consider_stable_when_no_change_for_seconds: + break + time.sleep(poll_interval) + + log.info("len(map_info_changes)=%s", len(map_info_changes)) + + # TODO: can we be more precise here? E.g., require we're stable _within_ X*threshold, + # instead of what we do here, i.e., stable _for at least_ X*threshold toward the end of the observation window + assert ( + stable_for > consider_stable_when_no_change_for_seconds + ), "layer residencies did not become stable within the observation window" + + post = map_info_changes[-1][1].by_local_and_remote() + assert len(post.remote_layers) > 0, "some layers should be evicted once it's stabilized" + assert len(post.local_layers) > 0, "the imitate accesses should keep some layers resident" + + assert env.pageserver.log_contains( + metrics_refused_log_line + ), "ensure the metrics collection worker ran" From b627fa71e47c01a63d5bdd7770fdd7c9ed9b1ec5 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 4 May 2023 17:41:42 +0300 Subject: [PATCH 15/19] Make read-only replicas explicit in compute spec (#4136) This builds on top of PR #4058, and supersedes #4018 --- Cargo.lock | 2 + compute_tools/src/compute.rs | 62 +++++++++-------------------- compute_tools/src/config.rs | 28 ++++++++----- control_plane/Cargo.toml | 1 + control_plane/src/bin/neon_local.rs | 2 +- control_plane/src/endpoint.rs | 16 +------- libs/compute_api/Cargo.toml | 1 + libs/compute_api/src/spec.rs | 23 ++++++++++- 8 files changed, 66 insertions(+), 69 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9436b591d6..167acf0e69 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1032,6 +1032,7 @@ dependencies = [ "serde", "serde_json", "serde_with", + "utils", "workspace_hack", ] @@ -1105,6 +1106,7 @@ dependencies = [ "anyhow", "clap 4.2.2", "comfy-table", + "compute_api", "git-version", "nix", "once_cell", diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index b6bc234beb..dca6becd39 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -30,7 +30,7 @@ use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; use compute_api::responses::{ComputeMetrics, ComputeStatus}; -use compute_api::spec::ComputeSpec; +use compute_api::spec::{ComputeMode, ComputeSpec}; use crate::config; use crate::pg_helpers::*; @@ -249,38 +249,10 @@ impl ComputeNode { /// safekeepers sync, basebackup, etc. #[instrument(skip(self, compute_state))] pub fn prepare_pgdata(&self, compute_state: &ComputeState) -> Result<()> { - #[derive(Clone)] - enum Replication { - Primary, - Static { lsn: Lsn }, - HotStandby, - } - let pspec = compute_state.pspec.as_ref().expect("spec must be set"); let spec = &pspec.spec; let pgdata_path = Path::new(&self.pgdata); - let hot_replica = if let Some(option) = spec.cluster.settings.find_ref("hot_standby") { - if let Some(value) = &option.value { - anyhow::ensure!(option.vartype == "bool"); - matches!(value.as_str(), "on" | "yes" | "true") - } else { - false - } - } else { - false - }; - - let replication = if hot_replica { - Replication::HotStandby - } else if let Some(lsn) = spec.cluster.settings.find("recovery_target_lsn") { - Replication::Static { - lsn: Lsn::from_str(&lsn)?, - } - } else { - Replication::Primary - }; - // Remove/create an empty pgdata directory and put configuration there. self.create_pgdata()?; config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &pspec.spec)?; @@ -288,8 +260,8 @@ impl ComputeNode { // Syncing safekeepers is only safe with primary nodes: if a primary // is already connected it will be kicked out, so a secondary (standby) // cannot sync safekeepers. - let lsn = match &replication { - Replication::Primary => { + let lsn = match spec.mode { + ComputeMode::Primary => { info!("starting safekeepers syncing"); let lsn = self .sync_safekeepers(pspec.storage_auth_token.clone()) @@ -297,11 +269,11 @@ impl ComputeNode { info!("safekeepers synced at LSN {}", lsn); lsn } - Replication::Static { lsn } => { + ComputeMode::Static(lsn) => { info!("Starting read-only node at static LSN {}", lsn); - *lsn + lsn } - Replication::HotStandby => { + ComputeMode::Replica => { info!("Initializing standby from latest Pageserver LSN"); Lsn(0) } @@ -321,9 +293,9 @@ impl ComputeNode { // Update pg_hba.conf received with basebackup. update_pg_hba(pgdata_path)?; - match &replication { - Replication::Primary | Replication::Static { .. } => {} - Replication::HotStandby => { + match spec.mode { + ComputeMode::Primary | ComputeMode::Static(..) => {} + ComputeMode::Replica => { add_standby_signal(pgdata_path)?; } } @@ -430,11 +402,13 @@ impl ComputeNode { self.pg_reload_conf(&mut client)?; // Proceed with post-startup configuration. Note, that order of operations is important. - handle_roles(&spec, &mut client)?; - handle_databases(&spec, &mut client)?; - handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?; - handle_grants(&spec, self.connstr.as_str(), &mut client)?; - handle_extensions(&spec, &mut client)?; + if spec.mode == ComputeMode::Primary { + handle_roles(&spec, &mut client)?; + handle_databases(&spec, &mut client)?; + handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?; + handle_grants(&spec, self.connstr.as_str(), &mut client)?; + handle_extensions(&spec, &mut client)?; + } // 'Close' connection drop(client); @@ -467,7 +441,9 @@ impl ComputeNode { let pg = self.start_postgres(spec.storage_auth_token.clone())?; - self.apply_config(&compute_state)?; + if spec.spec.mode == ComputeMode::Primary { + self.apply_config(&compute_state)?; + } let startup_end_time = Utc::now(); { diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index d25eb9b2fc..1168f3876a 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -6,7 +6,7 @@ use std::path::Path; use anyhow::Result; use crate::pg_helpers::PgOptionsSerialize; -use compute_api::spec::ComputeSpec; +use compute_api::spec::{ComputeMode, ComputeSpec}; /// Check that `line` is inside a text file and put it there if it is not. /// Create file if it doesn't exist. @@ -34,17 +34,25 @@ pub fn line_in_file(path: &Path, line: &str) -> Result { /// Create or completely rewrite configuration file specified by `path` pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> { // File::create() destroys the file content if it exists. - let mut postgres_conf = File::create(path)?; + let mut file = File::create(path)?; - write_auto_managed_block(&mut postgres_conf, &spec.cluster.settings.as_pg_settings())?; - - Ok(()) -} - -// Write Postgres config block wrapped with generated comment section -fn write_auto_managed_block(file: &mut File, buf: &str) -> Result<()> { writeln!(file, "# Managed by compute_ctl: begin")?; - writeln!(file, "{}", buf)?; + + write!(file, "{}", &spec.cluster.settings.as_pg_settings())?; + + match spec.mode { + ComputeMode::Primary => {} + ComputeMode::Static(lsn) => { + // hot_standby is 'on' by default, but let's be explicit + writeln!(file, "hot_standby=on")?; + writeln!(file, "recovery_target_lsn='{lsn}'")?; + } + ComputeMode::Replica => { + // hot_standby is 'on' by default, but let's be explicit + writeln!(file, "hot_standby=on")?; + } + } + writeln!(file, "# Managed by compute_ctl: end")?; Ok(()) diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index ba39747e03..a341ff0263 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -30,4 +30,5 @@ postgres_connection.workspace = true storage_broker.workspace = true utils.workspace = true +compute_api.workspace = true workspace_hack.workspace = true diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 0e0d71b3f1..30880565ab 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -7,8 +7,8 @@ //! use anyhow::{anyhow, bail, Context, Result}; use clap::{value_parser, Arg, ArgAction, ArgMatches, Command}; +use compute_api::spec::ComputeMode; use control_plane::endpoint::ComputeControlPlane; -use control_plane::endpoint::ComputeMode; use control_plane::local_env::LocalEnv; use control_plane::pageserver::PageServerNode; use control_plane::safekeeper::SafekeeperNode; diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 5a1f93dc99..6431f66e1c 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -22,6 +22,8 @@ use crate::local_env::LocalEnv; use crate::pageserver::PageServerNode; use crate::postgresql_conf::PostgresConf; +use compute_api::spec::ComputeMode; + // contents of a endpoint.json file #[serde_as] #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] @@ -122,26 +124,12 @@ impl ComputeControlPlane { /////////////////////////////////////////////////////////////////////////////// -#[serde_as] -#[derive(Serialize, Deserialize, Debug, Clone, Copy, Eq, PartialEq)] -pub enum ComputeMode { - // Regular read-write node - Primary, - // if recovery_target_lsn is provided, and we want to pin the node to a specific LSN - Static(#[serde_as(as = "DisplayFromStr")] Lsn), - // Hot standby; read-only replica. - // Future versions may want to distinguish between replicas with hot standby - // feedback and other kinds of replication configurations. - Replica, -} - #[derive(Debug)] pub struct Endpoint { /// used as the directory name name: String, pub tenant_id: TenantId, pub timeline_id: TimelineId, - // Some(lsn) if this is a read-only endpoint anchored at 'lsn'. None for the primary. pub mode: ComputeMode, // port and address of the Postgres server diff --git a/libs/compute_api/Cargo.toml b/libs/compute_api/Cargo.toml index 533a091207..428d031a93 100644 --- a/libs/compute_api/Cargo.toml +++ b/libs/compute_api/Cargo.toml @@ -11,4 +11,5 @@ serde.workspace = true serde_with.workspace = true serde_json.workspace = true +utils = { path = "../utils" } workspace_hack.workspace = true diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index f771910329..87c0edd687 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -3,8 +3,10 @@ //! The spec.json file is used to pass information to 'compute_ctl'. It contains //! all the information needed to start up the right version of PostgreSQL, //! and connect it to the storage nodes. -use serde::Deserialize; +use serde::{Deserialize, Serialize}; +use serde_with::{serde_as, DisplayFromStr}; use std::collections::HashMap; +use utils::lsn::Lsn; /// String type alias representing Postgres identifier and /// intended to be used for DB / role names. @@ -12,6 +14,7 @@ pub type PgIdent = String; /// Cluster spec or configuration represented as an optional number of /// delta operations + final cluster state description. +#[serde_as] #[derive(Clone, Debug, Default, Deserialize)] pub struct ComputeSpec { pub format_version: f32, @@ -24,11 +27,29 @@ pub struct ComputeSpec { pub cluster: Cluster, pub delta_operations: Option>, + #[serde(default)] + pub mode: ComputeMode, + pub storage_auth_token: Option, pub startup_tracing_context: Option>, } +#[serde_as] +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Deserialize, Serialize)] +pub enum ComputeMode { + /// A read-write node + #[default] + Primary, + /// A read-only node, pinned at a particular LSN + Static(#[serde_as(as = "DisplayFromStr")] Lsn), + /// A read-only node that follows the tip of the branch in hot standby mode + /// + /// Future versions may want to distinguish between replicas with hot standby + /// feedback and other kinds of replication configurations. + Replica, +} + #[derive(Clone, Debug, Default, Deserialize)] pub struct Cluster { pub cluster_id: String, From 88f39c11d47ae28d44d7dcea4995a5bdedea6512 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 4 May 2023 17:10:40 +0200 Subject: [PATCH 16/19] refactor: the code that builds TenantConfOpt from mgmt API requests (#4152) - extract code that builds TenantConfOpt from requests into a From<> impl - move map_err(ApiError::BadRequest) into callers --- pageserver/src/http/routes.rs | 176 +------------------------------- pageserver/src/tenant/config.rs | 175 +++++++++++++++++++++++++++++++ 2 files changed, 177 insertions(+), 174 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index b1251123b2..58ee7367ca 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -691,16 +691,6 @@ pub fn html_response(status: StatusCode, data: String) -> Result, Ok(response) } -// Helper function to standardize the error messages we produce on bad durations -// -// Intended to be used with anyhow's `with_context`, e.g.: -// -// let value = result.with_context(bad_duration("name", &value))?; -// -fn bad_duration<'a>(field_name: &'static str, value: &'a str) -> impl 'a + Fn() -> String { - move || format!("Cannot parse `{field_name}` duration {value:?}") -} - async fn tenant_create_handler(mut request: Request) -> Result, ApiError> { check_permission(&request, None)?; @@ -708,91 +698,7 @@ async fn tenant_create_handler(mut request: Request) -> Result(field_name: &'static str, value: &'a str) -> impl 'a + Fn() -> String { + move || format!("Cannot parse `{field_name}` duration {value:?}") +} + +impl TryFrom<&'_ TenantCreateRequest> for TenantConfOpt { + type Error = anyhow::Error; + + fn try_from(request_data: &TenantCreateRequest) -> Result { + let mut tenant_conf = TenantConfOpt::default(); + + if let Some(gc_period) = &request_data.gc_period { + tenant_conf.gc_period = Some( + humantime::parse_duration(gc_period) + .with_context(bad_duration("gc_period", gc_period))?, + ); + } + tenant_conf.gc_horizon = request_data.gc_horizon; + tenant_conf.image_creation_threshold = request_data.image_creation_threshold; + + if let Some(pitr_interval) = &request_data.pitr_interval { + tenant_conf.pitr_interval = Some( + humantime::parse_duration(pitr_interval) + .with_context(bad_duration("pitr_interval", pitr_interval))?, + ); + } + + if let Some(walreceiver_connect_timeout) = &request_data.walreceiver_connect_timeout { + tenant_conf.walreceiver_connect_timeout = Some( + humantime::parse_duration(walreceiver_connect_timeout).with_context( + bad_duration("walreceiver_connect_timeout", walreceiver_connect_timeout), + )?, + ); + } + if let Some(lagging_wal_timeout) = &request_data.lagging_wal_timeout { + tenant_conf.lagging_wal_timeout = Some( + humantime::parse_duration(lagging_wal_timeout) + .with_context(bad_duration("lagging_wal_timeout", lagging_wal_timeout))?, + ); + } + if let Some(max_lsn_wal_lag) = request_data.max_lsn_wal_lag { + tenant_conf.max_lsn_wal_lag = Some(max_lsn_wal_lag); + } + if let Some(trace_read_requests) = request_data.trace_read_requests { + tenant_conf.trace_read_requests = Some(trace_read_requests); + } + + tenant_conf.checkpoint_distance = request_data.checkpoint_distance; + if let Some(checkpoint_timeout) = &request_data.checkpoint_timeout { + tenant_conf.checkpoint_timeout = Some( + humantime::parse_duration(checkpoint_timeout) + .with_context(bad_duration("checkpoint_timeout", checkpoint_timeout))?, + ); + } + + tenant_conf.compaction_target_size = request_data.compaction_target_size; + tenant_conf.compaction_threshold = request_data.compaction_threshold; + + if let Some(compaction_period) = &request_data.compaction_period { + tenant_conf.compaction_period = Some( + humantime::parse_duration(compaction_period) + .with_context(bad_duration("compaction_period", compaction_period))?, + ); + } + + if let Some(eviction_policy) = &request_data.eviction_policy { + tenant_conf.eviction_policy = Some( + serde::Deserialize::deserialize(eviction_policy) + .context("parse field `eviction_policy`")?, + ); + } + + tenant_conf.min_resident_size_override = request_data.min_resident_size_override; + + if let Some(evictions_low_residence_duration_metric_threshold) = + &request_data.evictions_low_residence_duration_metric_threshold + { + tenant_conf.evictions_low_residence_duration_metric_threshold = Some( + humantime::parse_duration(evictions_low_residence_duration_metric_threshold) + .with_context(bad_duration( + "evictions_low_residence_duration_metric_threshold", + evictions_low_residence_duration_metric_threshold, + ))?, + ); + } + + Ok(tenant_conf) + } +} + +impl TryFrom<&'_ TenantConfigRequest> for TenantConfOpt { + type Error = anyhow::Error; + + fn try_from(request_data: &TenantConfigRequest) -> Result { + let mut tenant_conf = TenantConfOpt::default(); + if let Some(gc_period) = &request_data.gc_period { + tenant_conf.gc_period = Some( + humantime::parse_duration(gc_period) + .with_context(bad_duration("gc_period", gc_period))?, + ); + } + tenant_conf.gc_horizon = request_data.gc_horizon; + tenant_conf.image_creation_threshold = request_data.image_creation_threshold; + + if let Some(pitr_interval) = &request_data.pitr_interval { + tenant_conf.pitr_interval = Some( + humantime::parse_duration(pitr_interval) + .with_context(bad_duration("pitr_interval", pitr_interval))?, + ); + } + if let Some(walreceiver_connect_timeout) = &request_data.walreceiver_connect_timeout { + tenant_conf.walreceiver_connect_timeout = Some( + humantime::parse_duration(walreceiver_connect_timeout).with_context( + bad_duration("walreceiver_connect_timeout", walreceiver_connect_timeout), + )?, + ); + } + if let Some(lagging_wal_timeout) = &request_data.lagging_wal_timeout { + tenant_conf.lagging_wal_timeout = Some( + humantime::parse_duration(lagging_wal_timeout) + .with_context(bad_duration("lagging_wal_timeout", lagging_wal_timeout))?, + ); + } + tenant_conf.max_lsn_wal_lag = request_data.max_lsn_wal_lag; + tenant_conf.trace_read_requests = request_data.trace_read_requests; + + tenant_conf.checkpoint_distance = request_data.checkpoint_distance; + if let Some(checkpoint_timeout) = &request_data.checkpoint_timeout { + tenant_conf.checkpoint_timeout = Some( + humantime::parse_duration(checkpoint_timeout) + .with_context(bad_duration("checkpoint_timeout", checkpoint_timeout))?, + ); + } + tenant_conf.compaction_target_size = request_data.compaction_target_size; + tenant_conf.compaction_threshold = request_data.compaction_threshold; + + if let Some(compaction_period) = &request_data.compaction_period { + tenant_conf.compaction_period = Some( + humantime::parse_duration(compaction_period) + .with_context(bad_duration("compaction_period", compaction_period))?, + ); + } + + if let Some(eviction_policy) = &request_data.eviction_policy { + tenant_conf.eviction_policy = Some( + serde::Deserialize::deserialize(eviction_policy) + .context("parse field `eviction_policy`")?, + ); + } + + tenant_conf.min_resident_size_override = request_data.min_resident_size_override; + + if let Some(evictions_low_residence_duration_metric_threshold) = + &request_data.evictions_low_residence_duration_metric_threshold + { + tenant_conf.evictions_low_residence_duration_metric_threshold = Some( + humantime::parse_duration(evictions_low_residence_duration_metric_threshold) + .with_context(bad_duration( + "evictions_low_residence_duration_metric_threshold", + evictions_low_residence_duration_metric_threshold, + ))?, + ); + } + + Ok(tenant_conf) + } +} + #[cfg(test)] mod tests { use super::*; From 291b4f0d41603c8d0ef974c81ccce8a3f1eb2589 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 4 May 2023 18:22:04 +0100 Subject: [PATCH 17/19] Update client libs for test_runner/pg_clients to their latest versions (#4092) Also, use Workaround D for `swift/PostgresClientKitExample`, which supports neither SNI nor connections options --- .../csharp/npgsql/csharp-npgsql.csproj | 2 +- test_runner/pg_clients/java/jdbc/.gitignore | 1 - test_runner/pg_clients/java/jdbc/Dockerfile | 4 +- .../pg_clients/python/pg8000/requirements.txt | 1 + .../pg_clients/rust/tokio-postgres/Cargo.lock | 323 +++++++++++------- .../pg_clients/rust/tokio-postgres/Cargo.toml | 4 +- .../pg_clients/rust/tokio-postgres/Dockerfile | 2 +- .../swift/PostgresClientKitExample/Dockerfile | 4 +- .../PostgresClientKitExample/Package.swift | 2 +- .../PostgresClientKitExample/main.swift | 29 +- .../swift/PostgresNIOExample/Dockerfile | 4 +- .../swift/PostgresNIOExample/Package.resolved | 20 +- .../swift/PostgresNIOExample/Package.swift | 4 +- .../Sources/PostgresNIOExample/main.swift | 16 +- test_runner/pg_clients/test_pg_clients.py | 7 +- .../typescript/postgresql-client/Dockerfile | 2 +- .../postgresql-client/package-lock.json | 38 +-- .../typescript/postgresql-client/package.json | 2 +- .../typescript/serverless-driver/index.js | 5 +- .../serverless-driver/package-lock.json | 112 +++++- .../typescript/serverless-driver/package.json | 4 +- 21 files changed, 375 insertions(+), 211 deletions(-) delete mode 100644 test_runner/pg_clients/java/jdbc/.gitignore diff --git a/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj b/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj index 91181943d5..bb4427f2c4 100644 --- a/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj +++ b/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj @@ -8,7 +8,7 @@ - + diff --git a/test_runner/pg_clients/java/jdbc/.gitignore b/test_runner/pg_clients/java/jdbc/.gitignore deleted file mode 100644 index 8b13789179..0000000000 --- a/test_runner/pg_clients/java/jdbc/.gitignore +++ /dev/null @@ -1 +0,0 @@ - diff --git a/test_runner/pg_clients/java/jdbc/Dockerfile b/test_runner/pg_clients/java/jdbc/Dockerfile index 0b7d03e636..74eb9bdc32 100644 --- a/test_runner/pg_clients/java/jdbc/Dockerfile +++ b/test_runner/pg_clients/java/jdbc/Dockerfile @@ -1,10 +1,10 @@ -FROM openjdk:17 +FROM openjdk:20 WORKDIR /source COPY . . WORKDIR /app -RUN curl --output postgresql.jar https://jdbc.postgresql.org/download/postgresql-42.5.1.jar && \ +RUN curl --output postgresql.jar https://jdbc.postgresql.org/download/postgresql-42.6.0.jar && \ javac -d /app /source/Example.java CMD ["java", "-cp", "/app/postgresql.jar:.", "Example"] diff --git a/test_runner/pg_clients/python/pg8000/requirements.txt b/test_runner/pg_clients/python/pg8000/requirements.txt index 3dbb98d6a1..7bba8da06d 100644 --- a/test_runner/pg_clients/python/pg8000/requirements.txt +++ b/test_runner/pg_clients/python/pg8000/requirements.txt @@ -1 +1,2 @@ pg8000==1.29.4 +scramp>=1.4.3 diff --git a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock index a0067b183e..30deb3ff20 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock +++ b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock @@ -4,9 +4,9 @@ version = 3 [[package]] name = "async-trait" -version = "0.1.65" +version = "0.1.68" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "095183a3539c7c7649b2beb87c2d3f0591f3a7fed07761cc546d244e27e0238c" +checksum = "b9ccdd8f2a161be9bd5c023df56f1b2a0bd1d83872ae53b71a84a12c9bf6e842" dependencies = [ "proc-macro2", "quote", @@ -21,9 +21,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "base64" -version = "0.13.1" +version = "0.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" +checksum = "a4a4ddaa51a5bc52a6948f74c06d20aaaddb71924eab79b8c97a8c556e942d6a" [[package]] name = "bitflags" @@ -33,9 +33,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "block-buffer" -version = "0.10.3" +version = "0.10.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69cce20737498f97b993470a6e536b8523f0af7892a4f928cceb1ac5e52ebe7e" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" dependencies = [ "generic-array", ] @@ -76,15 +76,15 @@ dependencies = [ [[package]] name = "core-foundation-sys" -version = "0.8.3" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5827cebf4670468b8772dd191856768aedcb1b0278a04f989f7766351917b9dc" +checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" [[package]] name = "cpufeatures" -version = "0.2.5" +version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28d997bd5e24a5928dd43e46dc529867e207907fe0b239c3477d924f7f2ca320" +checksum = "3e4c1eaa2012c47becbbad2ab175484c2a84d1185b566fb2cc5b8707343dfe58" dependencies = [ "libc", ] @@ -112,13 +112,13 @@ dependencies = [ [[package]] name = "errno" -version = "0.2.8" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1" +checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a" dependencies = [ "errno-dragonfly", "libc", - "winapi", + "windows-sys 0.48.0", ] [[package]] @@ -163,9 +163,9 @@ checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" [[package]] name = "futures" -version = "0.3.26" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13e2792b0ff0340399d58445b88fd9770e3489eff258a4cbc1523418f12abf84" +checksum = "23342abe12aba583913b2e62f22225ff9c950774065e4bfb61a19cd9770fec40" dependencies = [ "futures-channel", "futures-core", @@ -178,9 +178,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.26" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e5317663a9089767a1ec00a487df42e0ca174b61b4483213ac24448e4664df5" +checksum = "955518d47e09b25bbebc7a18df10b81f0c766eaf4c4f1cccef2fca5f2a4fb5f2" dependencies = [ "futures-core", "futures-sink", @@ -188,15 +188,15 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.26" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec90ff4d0fe1f57d600049061dc6bb68ed03c7d2fbd697274c41805dcb3f8608" +checksum = "4bca583b7e26f571124fe5b7561d49cb2868d79116cfa0eefce955557c6fee8c" [[package]] name = "futures-executor" -version = "0.3.26" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8de0a35a6ab97ec8869e32a2473f4b1324459e14c29275d14b10cb1fd19b50e" +checksum = "ccecee823288125bd88b4d7f565c9e58e41858e47ab72e8ea2d64e93624386e0" dependencies = [ "futures-core", "futures-task", @@ -205,15 +205,15 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.26" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfb8371b6fb2aeb2d280374607aeabfc99d95c72edfe51692e42d3d7f0d08531" +checksum = "4fff74096e71ed47f8e023204cfd0aa1289cd54ae5430a9523be060cdb849964" [[package]] name = "futures-macro" -version = "0.3.26" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95a73af87da33b5acf53acfebdc339fe592ecf5357ac7c0a7734ab9d8c876a70" +checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" dependencies = [ "proc-macro2", "quote", @@ -222,21 +222,21 @@ dependencies = [ [[package]] name = "futures-sink" -version = "0.3.26" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f310820bb3e8cfd46c80db4d7fb8353e15dfff853a127158425f31e0be6c8364" +checksum = "f43be4fe21a13b9781a69afa4985b0f6ee0e1afab2c6f454a8cf30e2b2237b6e" [[package]] name = "futures-task" -version = "0.3.26" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dcf79a1bf610b10f42aea489289c5a2c478a786509693b80cd39c44ccd936366" +checksum = "76d3d132be6c0e6aa1534069c705a74a5997a356c0dc2f86a47765e5617c5b65" [[package]] name = "futures-util" -version = "0.3.26" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c1d6de3acfef38d2be4b1f543f553131788603495be83da675e180c8d6b7bd1" +checksum = "26b01e40b772d54cf6c6d721c1d1abd0647a0106a12ecaa1c186273392a69533" dependencies = [ "futures-channel", "futures-core", @@ -252,9 +252,9 @@ dependencies = [ [[package]] name = "generic-array" -version = "0.14.6" +version = "0.14.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bff49e947297f3312447abdca79f45f4738097cc82b06e72054d2223f601f1b9" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" dependencies = [ "typenum", "version_check", @@ -262,15 +262,21 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.8" +version = "0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c05aeb6a22b8f62540c194aac980f2115af067bfe15a0734d7277a768d396b31" +checksum = "c85e1d9ab2eadba7e5040d4e09cbd6d072b76a557ad64e797c2cb9d4da21d7e4" dependencies = [ "cfg-if", "libc", "wasi", ] +[[package]] +name = "hermit-abi" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286" + [[package]] name = "hmac" version = "0.12.1" @@ -291,12 +297,13 @@ dependencies = [ [[package]] name = "io-lifetimes" -version = "1.0.5" +version = "1.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1abeb7a0dd0f8181267ff8adc397075586500b81b28a73e8a0208b00fc170fb3" +checksum = "9c66c74d2ae7e79a5a8f7ac924adbe38ee42a859c6539ad869eb51f0b52dc220" dependencies = [ + "hermit-abi", "libc", - "windows-sys 0.45.0", + "windows-sys 0.48.0", ] [[package]] @@ -307,15 +314,15 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" [[package]] name = "libc" -version = "0.2.139" +version = "0.2.142" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "201de327520df007757c1f0adce6e827fe8562fbc28bfd9c15571c66ca1f5f79" +checksum = "6a987beff54b60ffa6d51982e1aa1146bc42f19bd26be28b0586f252fccf5317" [[package]] name = "linux-raw-sys" -version = "0.1.4" +version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4" +checksum = "36eb31c1778188ae1e64398743890d0877fef36d11521ac60406b42016e8c2cf" [[package]] name = "lock_api" @@ -389,9 +396,9 @@ checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3" [[package]] name = "openssl" -version = "0.10.48" +version = "0.10.52" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "518915b97df115dd36109bfa429a48b8f737bd05508cf9588977b599648926d2" +checksum = "01b8574602df80f7b85fdfc5392fa884a4e3b3f4f35402c070ab34c3d3f78d56" dependencies = [ "bitflags", "cfg-if", @@ -404,9 +411,9 @@ dependencies = [ [[package]] name = "openssl-macros" -version = "0.1.0" +version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b501e44f11665960c7e7fcf062c7d96a14ade4aa98116c004b2e37b5be7d736c" +checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", @@ -421,11 +428,10 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "openssl-sys" -version = "0.9.83" +version = "0.9.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "666416d899cf077260dac8698d60a60b435a46d57e82acb1be3d0dad87284e5b" +checksum = "8e17f59264b2809d77ae94f0e1ebabc434773f370d6ca667bd223ea10e06cc7e" dependencies = [ - "autocfg", "cc", "libc", "pkg-config", @@ -450,7 +456,7 @@ checksum = "9069cbb9f99e3a5083476ccb29ceb1de18b9118cafa53e90c9551235de2b9521" dependencies = [ "cfg-if", "libc", - "redox_syscall", + "redox_syscall 0.2.16", "smallvec", "windows-sys 0.45.0", ] @@ -512,9 +518,9 @@ dependencies = [ [[package]] name = "postgres-protocol" -version = "0.6.4" +version = "0.6.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "878c6cbf956e03af9aa8204b407b9cbf47c072164800aa918c516cd4b056c50c" +checksum = "78b7fa9f396f51dffd61546fd8573ee20592287996568e6175ceb0f8699ad75d" dependencies = [ "base64", "byteorder", @@ -530,9 +536,9 @@ dependencies = [ [[package]] name = "postgres-types" -version = "0.2.4" +version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73d946ec7d256b04dfadc4e6a3292324e6f417124750fc5c0950f981b703a0f1" +checksum = "f028f05971fe20f512bcc679e2c10227e57809a3af86a7606304435bc8896cd6" dependencies = [ "bytes", "fallible-iterator", @@ -547,18 +553,18 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" [[package]] name = "proc-macro2" -version = "1.0.51" +version = "1.0.56" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d727cae5b39d21da60fa540906919ad737832fe0b1c165da3a34d6548c849d6" +checksum = "2b63bdb0cd06f1f4dedf69b254734f9b45af66e4a031e42a7480257d9898b435" dependencies = [ "unicode-ident", ] [[package]] name = "quote" -version = "1.0.23" +version = "1.0.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8856d8364d252a14d474036ea1358d63c9e6965c8e5c1885c18f73d70bff9c7b" +checksum = "4424af4bf778aae2051a77b60283332f386554255d722233d09fbfc7e30da2fc" dependencies = [ "proc-macro2", ] @@ -602,6 +608,15 @@ dependencies = [ "bitflags", ] +[[package]] +name = "redox_syscall" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" +dependencies = [ + "bitflags", +] + [[package]] name = "rust-neon-example" version = "0.1.0" @@ -614,16 +629,16 @@ dependencies = [ [[package]] name = "rustix" -version = "0.36.9" +version = "0.37.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd5c6ff11fecd55b40746d1995a02f2eb375bf8c00d192d521ee09f42bef37bc" +checksum = "a0661814f891c57c930a610266415528da53c4933e6dea5fb350cbfe048a9ece" dependencies = [ "bitflags", "errno", "io-lifetimes", "libc", "linux-raw-sys", - "windows-sys 0.45.0", + "windows-sys 0.48.0", ] [[package]] @@ -706,6 +721,16 @@ dependencies = [ "winapi", ] +[[package]] +name = "socket2" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d283f86695ae989d1e18440a943880967156325ba025f05049946bff47bcc2b" +dependencies = [ + "libc", + "windows-sys 0.48.0", +] + [[package]] name = "stringprep" version = "0.1.2" @@ -724,9 +749,9 @@ checksum = "6bdef32e8150c2a081110b42772ffe7d7c9032b606bc226c8260fd97e0976601" [[package]] name = "syn" -version = "1.0.109" +version = "2.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +checksum = "a34fcf3e8b60f57e6a14301a2e916d323af98b0ea63c599441eec8558660c822" dependencies = [ "proc-macro2", "quote", @@ -735,15 +760,15 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.4.0" +version = "3.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af18f7ae1acd354b992402e9ec5864359d693cd8a79dcbef59f76891701c1e95" +checksum = "b9fbec84f381d5795b08656e4912bec604d162bff9291d6189a78f4c8ab87998" dependencies = [ "cfg-if", "fastrand", - "redox_syscall", + "redox_syscall 0.3.5", "rustix", - "windows-sys 0.42.0", + "windows-sys 0.45.0", ] [[package]] @@ -763,26 +788,25 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.26.0" +version = "1.28.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03201d01c3c27a29c8a5cee5b55a93ddae1ccf6f08f65365c2c918f8c1b76f64" +checksum = "c3c786bf8134e5a3a166db9b29ab8f48134739014a3eca7bc6bfa95d673b136f" dependencies = [ "autocfg", "bytes", "libc", - "memchr", "mio", "pin-project-lite", - "socket2", + "socket2 0.4.9", "tokio-macros", - "windows-sys 0.45.0", + "windows-sys 0.48.0", ] [[package]] name = "tokio-macros" -version = "1.8.2" +version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d266c00fde287f55d3f1c3e96c500c362a2b8c695076ec180f27918820bc6df8" +checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e" dependencies = [ "proc-macro2", "quote", @@ -801,9 +825,9 @@ dependencies = [ [[package]] name = "tokio-postgres" -version = "0.7.7" +version = "0.7.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29a12c1b3e0704ae7dfc25562629798b29c72e6b1d0a681b6f29ab4ae5e7f7bf" +checksum = "6e89f6234aa8fd43779746012fcf53603cdb91fdd8399aa0de868c2d56b6dde1" dependencies = [ "async-trait", "byteorder", @@ -818,16 +842,16 @@ dependencies = [ "pin-project-lite", "postgres-protocol", "postgres-types", - "socket2", + "socket2 0.5.2", "tokio", "tokio-util", ] [[package]] name = "tokio-util" -version = "0.7.7" +version = "0.7.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5427d89453009325de0d8f342c9490009f76e999cb7672d77e46267448f7e6b2" +checksum = "806fe8c2c87eccc8b3267cbae29ed3ab2d0bd37fca70ab622e46aaa9375ddb7d" dependencies = [ "bytes", "futures-core", @@ -839,11 +863,10 @@ dependencies = [ [[package]] name = "tracing" -version = "0.1.37" +version = "0.1.38" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ce8c33a8d48bd45d624a6e523445fd21ec13d3653cd51f681abf67418f54eb8" +checksum = "cf9cf6a813d3f40c88b0b6b6f29a5c95c6cdbf97c1f9cc53fb820200f5ad814d" dependencies = [ - "cfg-if", "pin-project-lite", "tracing-core", ] @@ -865,15 +888,15 @@ checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba" [[package]] name = "unicode-bidi" -version = "0.3.10" +version = "0.3.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d54675592c1dbefd78cbd98db9bacd89886e1ca50692a0692baefffdeb92dd58" +checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460" [[package]] name = "unicode-ident" -version = "1.0.7" +version = "1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "775c11906edafc97bc378816b94585fbd9a054eabaf86fdd0ced94af449efab7" +checksum = "e5464a87b239f13a63a501f2701565754bae92d243d4bb7eb12f6d57d2269bf4" [[package]] name = "unicode-normalization" @@ -930,13 +953,13 @@ version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_aarch64_gnullvm 0.42.2", + "windows_aarch64_msvc 0.42.2", + "windows_i686_gnu 0.42.2", + "windows_i686_msvc 0.42.2", + "windows_x86_64_gnu 0.42.2", + "windows_x86_64_gnullvm 0.42.2", + "windows_x86_64_msvc 0.42.2", ] [[package]] @@ -945,62 +968,128 @@ version = "0.45.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0" dependencies = [ - "windows-targets", + "windows-targets 0.42.2", +] + +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets 0.48.0", ] [[package]] name = "windows-targets" -version = "0.42.1" +version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e2522491fbfcd58cc84d47aeb2958948c4b8982e9a2d8a2a35bbaed431390e7" +checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_aarch64_gnullvm 0.42.2", + "windows_aarch64_msvc 0.42.2", + "windows_i686_gnu 0.42.2", + "windows_i686_msvc 0.42.2", + "windows_x86_64_gnu 0.42.2", + "windows_x86_64_gnullvm 0.42.2", + "windows_x86_64_msvc 0.42.2", +] + +[[package]] +name = "windows-targets" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b1eb6f0cd7c80c79759c929114ef071b87354ce476d9d94271031c0497adfd5" +dependencies = [ + "windows_aarch64_gnullvm 0.48.0", + "windows_aarch64_msvc 0.48.0", + "windows_i686_gnu 0.48.0", + "windows_i686_msvc 0.48.0", + "windows_x86_64_gnu 0.48.0", + "windows_x86_64_gnullvm 0.48.0", + "windows_x86_64_msvc 0.48.0", ] [[package]] name = "windows_aarch64_gnullvm" -version = "0.42.1" +version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c9864e83243fdec7fc9c5444389dcbbfd258f745e7853198f365e3c4968a608" +checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8" + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" [[package]] name = "windows_aarch64_msvc" -version = "0.42.1" +version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c8b1b673ffc16c47a9ff48570a9d85e25d265735c503681332589af6253c6c7" +checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" [[package]] name = "windows_i686_gnu" -version = "0.42.1" +version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de3887528ad530ba7bdbb1faa8275ec7a1155a45ffa57c37993960277145d640" +checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f" + +[[package]] +name = "windows_i686_gnu" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" [[package]] name = "windows_i686_msvc" -version = "0.42.1" +version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf4d1122317eddd6ff351aa852118a2418ad4214e6613a50e0191f7004372605" +checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060" + +[[package]] +name = "windows_i686_msvc" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" [[package]] name = "windows_x86_64_gnu" -version = "0.42.1" +version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1040f221285e17ebccbc2591ffdc2d44ee1f9186324dd3e84e99ac68d699c45" +checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" [[package]] name = "windows_x86_64_gnullvm" -version = "0.42.1" +version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "628bfdf232daa22b0d64fdb62b09fcc36bb01f05a3939e20ab73aaf9470d0463" +checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" [[package]] name = "windows_x86_64_msvc" -version = "0.42.1" +version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "447660ad36a13288b1db4d4248e857b510e8c3a225c822ba4fb748c0aafecffd" +checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" diff --git a/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml b/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml index f1b519bdb2..4675ac8a3f 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml +++ b/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml @@ -9,8 +9,8 @@ publish = false [dependencies] native-tls = "0.2.11" postgres-native-tls = "0.5.0" -tokio = { version = "1.26", features=["rt", "macros"] } -tokio-postgres = "0.7.7" +tokio = { version = "1.28", features=["rt", "macros"] } +tokio-postgres = "0.7.8" # This is not part of the main 'neon' workspace diff --git a/test_runner/pg_clients/rust/tokio-postgres/Dockerfile b/test_runner/pg_clients/rust/tokio-postgres/Dockerfile index c3a15b5d85..43fc6f6c92 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Dockerfile +++ b/test_runner/pg_clients/rust/tokio-postgres/Dockerfile @@ -1,4 +1,4 @@ -FROM rust:1.67 +FROM rust:1.69 WORKDIR /source COPY . . diff --git a/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile b/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile index f6a49a2892..9538cf4ed4 100644 --- a/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile +++ b/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile @@ -1,11 +1,11 @@ -FROM swift:5.7 AS build +FROM swift:5.8 AS build RUN apt-get -q update && apt-get -q install -y libssl-dev WORKDIR /source COPY . . RUN swift build --configuration release -FROM swift:5.7 +FROM swift:5.8 WORKDIR /app COPY --from=build /source/.build/release . CMD ["/app/PostgresClientKitExample"] diff --git a/test_runner/pg_clients/swift/PostgresClientKitExample/Package.swift b/test_runner/pg_clients/swift/PostgresClientKitExample/Package.swift index ba666cba06..48320dd023 100644 --- a/test_runner/pg_clients/swift/PostgresClientKitExample/Package.swift +++ b/test_runner/pg_clients/swift/PostgresClientKitExample/Package.swift @@ -1,4 +1,4 @@ -// swift-tools-version:5.7 +// swift-tools-version:5.8 import PackageDescription let package = Package( diff --git a/test_runner/pg_clients/swift/PostgresClientKitExample/Sources/PostgresClientKitExample/main.swift b/test_runner/pg_clients/swift/PostgresClientKitExample/Sources/PostgresClientKitExample/main.swift index e559e9c184..dabca6d431 100644 --- a/test_runner/pg_clients/swift/PostgresClientKitExample/Sources/PostgresClientKitExample/main.swift +++ b/test_runner/pg_clients/swift/PostgresClientKitExample/Sources/PostgresClientKitExample/main.swift @@ -3,21 +3,22 @@ import Foundation import PostgresClientKit do { - var configuration = PostgresClientKit.ConnectionConfiguration() - let env = ProcessInfo.processInfo.environment - if let host = env["NEON_HOST"] { - configuration.host = host - } - if let database = env["NEON_DATABASE"] { - configuration.database = database - } - if let user = env["NEON_USER"] { - configuration.user = user - } - if let password = env["NEON_PASSWORD"] { - configuration.credential = .cleartextPassword(password: password) - } + + var configuration = PostgresClientKit.ConnectionConfiguration() + let host = env["NEON_HOST"] ?? "" + configuration.host = host + configuration.port = 5432 + configuration.database = env["NEON_DATABASE"] ?? "" + configuration.user = env["NEON_USER"] ?? "" + + // PostgresClientKit uses Kitura/BlueSSLService which doesn't support SNI + // PostgresClientKit doesn't support setting connection options, so we use "Workaround D" + // See https://neon.tech/sni + let password = env["NEON_PASSWORD"] ?? "" + let endpoint_id = host.split(separator: ".")[0] + let workaroundD = "project=\(endpoint_id);\(password)" + configuration.credential = .cleartextPassword(password: workaroundD) let connection = try PostgresClientKit.Connection(configuration: configuration) defer { connection.close() } diff --git a/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile b/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile index 629422d220..61e1d1bba6 100644 --- a/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile +++ b/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile @@ -1,10 +1,10 @@ -FROM swift:5.7 AS build +FROM swift:5.8 AS build WORKDIR /source COPY . . RUN swift build --configuration release -FROM swift:5.7 +FROM swift:5.8 WORKDIR /app COPY --from=build /source/.build/release . CMD ["/app/PostgresNIOExample"] diff --git a/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved b/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved index 8246567b24..cc12acda4c 100644 --- a/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved +++ b/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved @@ -5,8 +5,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/vapor/postgres-nio.git", "state" : { - "revision" : "7daf026e145de2c07d6e37f4171b1acb4b5f22b1", - "version" : "1.12.1" + "revision" : "dbf9c2eb596df39cba8ff3f74d74b2e6a31bd937", + "version" : "1.14.1" } }, { @@ -14,8 +14,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/apple/swift-atomics.git", "state" : { - "revision" : "ff3d2212b6b093db7f177d0855adbc4ef9c5f036", - "version" : "1.0.3" + "revision" : "6c89474e62719ddcc1e9614989fff2f68208fe10", + "version" : "1.1.0" } }, { @@ -59,8 +59,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/apple/swift-nio.git", "state" : { - "revision" : "45167b8006448c79dda4b7bd604e07a034c15c49", - "version" : "2.48.0" + "revision" : "d1690f85419fdac8d54e350fb6d2ab9fd95afd75", + "version" : "2.51.1" } }, { @@ -68,8 +68,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/apple/swift-nio-ssl.git", "state" : { - "revision" : "4fb7ead803e38949eb1d6fabb849206a72c580f3", - "version" : "2.23.0" + "revision" : "e866a626e105042a6a72a870c88b4c531ba05f83", + "version" : "2.24.0" } }, { @@ -77,8 +77,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/apple/swift-nio-transport-services.git", "state" : { - "revision" : "c0d9a144cfaec8d3d596aadde3039286a266c15c", - "version" : "1.15.0" + "revision" : "41f4098903878418537020075a4d8a6e20a0b182", + "version" : "1.17.0" } } ], diff --git a/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift b/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift index c64013b9ee..ac32b982e2 100644 --- a/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift +++ b/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift @@ -1,10 +1,10 @@ -// swift-tools-version:5.7 +// swift-tools-version:5.8 import PackageDescription let package = Package( name: "PostgresNIOExample", dependencies: [ - .package(url: "https://github.com/vapor/postgres-nio.git", from: "1.8.0") + .package(url: "https://github.com/vapor/postgres-nio.git", from: "1.14.1") ], targets: [ .executableTarget( diff --git a/test_runner/pg_clients/swift/PostgresNIOExample/Sources/PostgresNIOExample/main.swift b/test_runner/pg_clients/swift/PostgresNIOExample/Sources/PostgresNIOExample/main.swift index 092a0b31f3..ca7e22dab1 100644 --- a/test_runner/pg_clients/swift/PostgresNIOExample/Sources/PostgresNIOExample/main.swift +++ b/test_runner/pg_clients/swift/PostgresNIOExample/Sources/PostgresNIOExample/main.swift @@ -14,15 +14,11 @@ await Task { let sslContext = try! NIOSSLContext(configuration: .makeClientConfiguration()) let config = PostgresConnection.Configuration( - connection: .init( - host: env["NEON_HOST"] ?? "", - port: 5432 - ), - authentication: .init( - username: env["NEON_USER"] ?? "", - database: env["NEON_DATABASE"] ?? "", - password: env["NEON_PASSWORD"] ?? "" - ), + host: env["NEON_HOST"] ?? "", + port: 5432, + username: env["NEON_USER"] ?? "", + password: env["NEON_PASSWORD"] ?? "", + database: env["NEON_DATABASE"] ?? "", tls: .require(sslContext) ) @@ -42,7 +38,7 @@ await Task { try await connection.close() // Shutdown the EventLoopGroup, once all connections are closed. - try eventLoopGroup.syncShutdownGracefully() + try await eventLoopGroup.shutdownGracefully() } catch { print(error) } diff --git a/test_runner/pg_clients/test_pg_clients.py b/test_runner/pg_clients/test_pg_clients.py index 18e3f379da..7c20bac399 100644 --- a/test_runner/pg_clients/test_pg_clients.py +++ b/test_runner/pg_clients/test_pg_clients.py @@ -16,10 +16,9 @@ from fixtures.utils import subprocess_capture "rust/tokio-postgres", "python/asyncpg", "python/pg8000", - pytest.param( - "swift/PostgresClientKitExample", # See https://github.com/neondatabase/neon/pull/2008#discussion_r911896592 - marks=pytest.mark.xfail(reason="Neither SNI nor parameters is supported"), - ), + # PostgresClientKitExample does not support SNI or connection options, so it uses workaround D (https://neon.tech/sni) + # See example itself: test_runner/pg_clients/swift/PostgresClientKitExample/Sources/PostgresClientKitExample/main.swift + "swift/PostgresClientKitExample", "swift/PostgresNIOExample", "typescript/postgresql-client", "typescript/serverless-driver", diff --git a/test_runner/pg_clients/typescript/postgresql-client/Dockerfile b/test_runner/pg_clients/typescript/postgresql-client/Dockerfile index a5ad832a5c..07e98c586b 100644 --- a/test_runner/pg_clients/typescript/postgresql-client/Dockerfile +++ b/test_runner/pg_clients/typescript/postgresql-client/Dockerfile @@ -1,4 +1,4 @@ -FROM node:18 +FROM node:20 WORKDIR /source COPY . . diff --git a/test_runner/pg_clients/typescript/postgresql-client/package-lock.json b/test_runner/pg_clients/typescript/postgresql-client/package-lock.json index 5586fe883e..e4dfd1dd9d 100644 --- a/test_runner/pg_clients/typescript/postgresql-client/package-lock.json +++ b/test_runner/pg_clients/typescript/postgresql-client/package-lock.json @@ -5,7 +5,7 @@ "packages": { "": { "dependencies": { - "postgresql-client": "2.5.2" + "postgresql-client": "2.5.5" } }, "node_modules/debug": { @@ -63,18 +63,18 @@ } }, "node_modules/postgresql-client": { - "version": "2.5.2", - "resolved": "https://registry.npmjs.org/postgresql-client/-/postgresql-client-2.5.2.tgz", - "integrity": "sha512-BaVDEmPlZllcmXBbw48194a6sB1YEe+ACX8c3SfgpGeW9+xJ5vShQ/ruNZtI+nuPW95BjL1WQGaxy+SfxSQgUQ==", + "version": "2.5.5", + "resolved": "https://registry.npmjs.org/postgresql-client/-/postgresql-client-2.5.5.tgz", + "integrity": "sha512-2Mu3i+6NQ9cnkoZNd0XeSZo9WoUpuWf4ZSiCCoDWSj82T93py2/SKXZ1aUaP8mVaU0oKpyyGe0IwLYZ1VHShnA==", "dependencies": { "debug": "^4.3.4", - "doublylinked": "^2.5.1", - "lightning-pool": "^4.2.0", + "doublylinked": "^2.5.2", + "lightning-pool": "^4.2.1", "postgres-bytea": "^3.0.0", - "power-tasks": "^1.5.0", - "putil-merge": "^3.9.0", - "putil-promisify": "^1.8.5", - "putil-varhelpers": "^1.6.4" + "power-tasks": "^1.6.4", + "putil-merge": "^3.10.3", + "putil-promisify": "^1.10.0", + "putil-varhelpers": "^1.6.5" }, "engines": { "node": ">=14.0", @@ -82,9 +82,9 @@ } }, "node_modules/power-tasks": { - "version": "1.6.3", - "resolved": "https://registry.npmjs.org/power-tasks/-/power-tasks-1.6.3.tgz", - "integrity": "sha512-nBqzjbiCxvftEKsJtbEz5ZVKVl6RdwA5I7Ts3Z7DCe3lkvFsv9d8J4qp+b9GbdddsfV1KyIPSqPyLWq2YJQh6g==", + "version": "1.6.4", + "resolved": "https://registry.npmjs.org/power-tasks/-/power-tasks-1.6.4.tgz", + "integrity": "sha512-LX8GGgEIP1N7jsZqlqZ275e6f1Ehq97APCEGj8uVO0NoEoB+77QUX12BFv3LmlNKfq4fIuNSPiHhyHFjqn2gfA==", "dependencies": { "debug": "^4.3.4", "doublylinked": "^2.5.2", @@ -96,9 +96,9 @@ } }, "node_modules/putil-merge": { - "version": "3.10.1", - "resolved": "https://registry.npmjs.org/putil-merge/-/putil-merge-3.10.1.tgz", - "integrity": "sha512-t3cLn14qccFvmb4bYQfNEHoisab//bTjM3lp56Ks8rOsjWF2ssf7Vapg9Lt89GlEawyNdeu+xj5GSrsFqNoCDQ==", + "version": "3.10.3", + "resolved": "https://registry.npmjs.org/putil-merge/-/putil-merge-3.10.3.tgz", + "integrity": "sha512-B18CYi0/SmBYl9+fgowYWkgzJM/8XcLSeafHrFrGzwySQuOzLW0sOGx0CdFVp9zqaxgLctexUdGoSPpm6CPM6A==", "engines": { "node": ">= 10.0" } @@ -112,9 +112,9 @@ } }, "node_modules/putil-varhelpers": { - "version": "1.6.4", - "resolved": "https://registry.npmjs.org/putil-varhelpers/-/putil-varhelpers-1.6.4.tgz", - "integrity": "sha512-nM2nO1HS2yJUyPgz0grd2XZAM0Spr6/tt6F4xXeNDjByV00BV2mq6lZ+sDff8WIfQBI9Hn1Czh93H1xBvKESxw==", + "version": "1.6.5", + "resolved": "https://registry.npmjs.org/putil-varhelpers/-/putil-varhelpers-1.6.5.tgz", + "integrity": "sha512-kyu+lE5xkc65ScgaIi6rNONLXeS7jGBxl1I0rzHVsFGAAQ45D/VkuEev+t09PFB943F+CqdWFLczH6ePk5TPAA==", "engines": { "node": ">= 6.0" } diff --git a/test_runner/pg_clients/typescript/postgresql-client/package.json b/test_runner/pg_clients/typescript/postgresql-client/package.json index 80540dec22..9eaa13437a 100644 --- a/test_runner/pg_clients/typescript/postgresql-client/package.json +++ b/test_runner/pg_clients/typescript/postgresql-client/package.json @@ -1,6 +1,6 @@ { "type": "module", "dependencies": { - "postgresql-client": "2.5.2" + "postgresql-client": "2.5.5" } } diff --git a/test_runner/pg_clients/typescript/serverless-driver/index.js b/test_runner/pg_clients/typescript/serverless-driver/index.js index 91db4037e1..ad9a4c3000 100755 --- a/test_runner/pg_clients/typescript/serverless-driver/index.js +++ b/test_runner/pg_clients/typescript/serverless-driver/index.js @@ -1,8 +1,11 @@ #! /usr/bin/env node -import { Client } from '@neondatabase/serverless' +import ws from 'ws' +import { neonConfig, Client } from '@neondatabase/serverless' (async () => { + neonConfig.webSocketConstructor = ws + const client = new Client({ host: process.env.NEON_HOST, database: process.env.NEON_DATABASE, diff --git a/test_runner/pg_clients/typescript/serverless-driver/package-lock.json b/test_runner/pg_clients/typescript/serverless-driver/package-lock.json index c4d03bffcb..0fb84cf5b7 100644 --- a/test_runner/pg_clients/typescript/serverless-driver/package-lock.json +++ b/test_runner/pg_clients/typescript/serverless-driver/package-lock.json @@ -1,18 +1,99 @@ { "name": "serverless-driver", - "lockfileVersion": 2, + "lockfileVersion": 3, "requires": true, "packages": { "": { "dependencies": { - "@neondatabase/serverless": "^0.2.8", - "ws": "^8.13.0" + "@neondatabase/serverless": "0.4.3", + "ws": "8.13.0" } }, "node_modules/@neondatabase/serverless": { - "version": "0.2.8", - "resolved": "https://registry.npmjs.org/@neondatabase/serverless/-/serverless-0.2.8.tgz", - "integrity": "sha512-+yWjIOJsFnrtt2xvtLVEzWM2lfvemawk/DBg4mD2cZOF/IC6Jn4wEctZyk60TscZMSxfozNkPoxmZvBmNuQ0vA==" + "version": "0.4.3", + "resolved": "https://registry.npmjs.org/@neondatabase/serverless/-/serverless-0.4.3.tgz", + "integrity": "sha512-U8tpuF5f0R5WRsciR7iaJ5S2h54DWa6Z6CEW+J4KgwyvRN3q3qDz0MibdfFXU0WqnRoi/9RSf/2XN4TfeaOCbQ==", + "dependencies": { + "@types/pg": "^8.6.6" + } + }, + "node_modules/@types/node": { + "version": "18.16.3", + "resolved": "https://registry.npmjs.org/@types/node/-/node-18.16.3.tgz", + "integrity": "sha512-OPs5WnnT1xkCBiuQrZA4+YAV4HEJejmHneyraIaxsbev5yCEr6KMwINNFP9wQeFIw8FWcoTqF3vQsa5CDaI+8Q==" + }, + "node_modules/@types/pg": { + "version": "8.6.6", + "resolved": "https://registry.npmjs.org/@types/pg/-/pg-8.6.6.tgz", + "integrity": "sha512-O2xNmXebtwVekJDD+02udOncjVcMZQuTEQEMpKJ0ZRf5E7/9JJX3izhKUcUifBkyKpljyUM6BTgy2trmviKlpw==", + "dependencies": { + "@types/node": "*", + "pg-protocol": "*", + "pg-types": "^2.2.0" + } + }, + "node_modules/pg-int8": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/pg-int8/-/pg-int8-1.0.1.tgz", + "integrity": "sha512-WCtabS6t3c8SkpDBUlb1kjOs7l66xsGdKpIPZsg4wR+B3+u9UAum2odSsF9tnvxg80h4ZxLWMy4pRjOsFIqQpw==", + "engines": { + "node": ">=4.0.0" + } + }, + "node_modules/pg-protocol": { + "version": "1.6.0", + "resolved": "https://registry.npmjs.org/pg-protocol/-/pg-protocol-1.6.0.tgz", + "integrity": "sha512-M+PDm637OY5WM307051+bsDia5Xej6d9IR4GwJse1qA1DIhiKlksvrneZOYQq42OM+spubpcNYEo2FcKQrDk+Q==" + }, + "node_modules/pg-types": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/pg-types/-/pg-types-2.2.0.tgz", + "integrity": "sha512-qTAAlrEsl8s4OiEQY69wDvcMIdQN6wdz5ojQiOy6YRMuynxenON0O5oCpJI6lshc6scgAY8qvJ2On/p+CXY0GA==", + "dependencies": { + "pg-int8": "1.0.1", + "postgres-array": "~2.0.0", + "postgres-bytea": "~1.0.0", + "postgres-date": "~1.0.4", + "postgres-interval": "^1.1.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/postgres-array": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/postgres-array/-/postgres-array-2.0.0.tgz", + "integrity": "sha512-VpZrUqU5A69eQyW2c5CA1jtLecCsN2U/bD6VilrFDWq5+5UIEVO7nazS3TEcHf1zuPYO/sqGvUvW62g86RXZuA==", + "engines": { + "node": ">=4" + } + }, + "node_modules/postgres-bytea": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/postgres-bytea/-/postgres-bytea-1.0.0.tgz", + "integrity": "sha512-xy3pmLuQqRBZBXDULy7KbaitYqLcmxigw14Q5sj8QBVLqEwXfeybIKVWiqAXTlcvdvb0+xkOtDbfQMOf4lST1w==", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/postgres-date": { + "version": "1.0.7", + "resolved": "https://registry.npmjs.org/postgres-date/-/postgres-date-1.0.7.tgz", + "integrity": "sha512-suDmjLVQg78nMK2UZ454hAG+OAW+HQPZ6n++TNDUX+L0+uUlLywnoxJKDou51Zm+zTCjrCl0Nq6J9C5hP9vK/Q==", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/postgres-interval": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/postgres-interval/-/postgres-interval-1.2.0.tgz", + "integrity": "sha512-9ZhXKM/rw350N1ovuWHbGxnGh/SNJ4cnxHiM0rxE4VN41wsg8P8zWn9hv/buK00RP4WvlOyr/RBDiptyxVbkZQ==", + "dependencies": { + "xtend": "^4.0.0" + }, + "engines": { + "node": ">=0.10.0" + } }, "node_modules/ws": { "version": "8.13.0", @@ -33,19 +114,14 @@ "optional": true } } - } - }, - "dependencies": { - "@neondatabase/serverless": { - "version": "0.2.8", - "resolved": "https://registry.npmjs.org/@neondatabase/serverless/-/serverless-0.2.8.tgz", - "integrity": "sha512-+yWjIOJsFnrtt2xvtLVEzWM2lfvemawk/DBg4mD2cZOF/IC6Jn4wEctZyk60TscZMSxfozNkPoxmZvBmNuQ0vA==" }, - "ws": { - "version": "8.13.0", - "resolved": "https://registry.npmjs.org/ws/-/ws-8.13.0.tgz", - "integrity": "sha512-x9vcZYTrFPC7aSIbj7sRCYo7L/Xb8Iy+pW0ng0wt2vCJv7M9HOMy0UoN3rr+IFC7hb7vXoqS+P9ktyLLLhO+LA==", - "requires": {} + "node_modules/xtend": { + "version": "4.0.2", + "resolved": "https://registry.npmjs.org/xtend/-/xtend-4.0.2.tgz", + "integrity": "sha512-LKYU1iAXJXUgAXn9URjiu+MWhyUXHsvfp7mcuYm9dSUKK0/CjtrUwFAxD82/mCWbtLsGjFIad0wIsod4zrTAEQ==", + "engines": { + "node": ">=0.4" + } } } } diff --git a/test_runner/pg_clients/typescript/serverless-driver/package.json b/test_runner/pg_clients/typescript/serverless-driver/package.json index 2b01a22d3e..71ba181afc 100644 --- a/test_runner/pg_clients/typescript/serverless-driver/package.json +++ b/test_runner/pg_clients/typescript/serverless-driver/package.json @@ -1,7 +1,7 @@ { "type": "module", "dependencies": { - "@neondatabase/serverless": "^0.2.8", - "ws": "^8.13.0" + "@neondatabase/serverless": "0.4.3", + "ws": "8.13.0" } } From 653e633c597b7c855e70f4397ac609af4f7897ac Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 5 May 2023 00:57:47 +0100 Subject: [PATCH 18/19] test_runner: add --pg-version pytest argument (#4037) - allows setting Postgres version for testing using --pg-version argument - fixes tests for the non-default Postgres version. --- test_runner/README.md | 3 +- test_runner/conftest.py | 1 + test_runner/fixtures/neon_fixtures.py | 37 +++++------- test_runner/fixtures/pageserver/http.py | 19 +++--- test_runner/fixtures/pg_version.py | 68 ++++++++++++++++++++++ test_runner/regress/test_auth.py | 12 +++- test_runner/regress/test_compatibility.py | 11 ++-- test_runner/regress/test_pageserver_api.py | 13 +++-- test_runner/regress/test_timeline_size.py | 3 +- test_runner/regress/test_wal_acceptor.py | 7 ++- 10 files changed, 129 insertions(+), 45 deletions(-) create mode 100644 test_runner/fixtures/pg_version.py diff --git a/test_runner/README.md b/test_runner/README.md index 877498bae7..96e74659ce 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -71,7 +71,8 @@ a subdirectory for each version with naming convention `v{PG_VERSION}/`. Inside that dir, a `bin/postgres` binary should be present. `DEFAULT_PG_VERSION`: The version of Postgres to use, This is used to construct full path to the postgres binaries. -Format is 2-digit major version nubmer, i.e. `DEFAULT_PG_VERSION="14"` +Format is 2-digit major version nubmer, i.e. `DEFAULT_PG_VERSION="14"`. Alternatively, +you can use `--pg-version` argument. `TEST_OUTPUT`: Set the directory where test state and test output files should go. `TEST_SHARED_FIXTURES`: Try to re-use a single pageserver for all the tests. diff --git a/test_runner/conftest.py b/test_runner/conftest.py index 75242b84ce..4640861936 100644 --- a/test_runner/conftest.py +++ b/test_runner/conftest.py @@ -1,4 +1,5 @@ pytest_plugins = ( + "fixtures.pg_version", "fixtures.neon_fixtures", "fixtures.benchmark_fixture", "fixtures.pg_stats", diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 93fdc60900..1a480e1b04 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -45,6 +45,7 @@ from typing_extensions import Literal from fixtures.log_helper import log from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload +from fixtures.pg_version import PgVersion from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import ( ATTACHMENT_NAME_REGEX, @@ -75,7 +76,6 @@ Env = Dict[str, str] DEFAULT_OUTPUT_DIR: str = "test_output" DEFAULT_BRANCH_NAME: str = "main" -DEFAULT_PG_VERSION_DEFAULT: str = "14" BASE_PORT: int = 15000 WORKER_PORT_NUM: int = 1000 @@ -148,18 +148,7 @@ def top_output_dir(base_dir: Path) -> Iterator[Path]: @pytest.fixture(scope="session") -def pg_version() -> Iterator[str]: - if env_default_pg_version := os.environ.get("DEFAULT_PG_VERSION"): - version = env_default_pg_version - else: - version = DEFAULT_PG_VERSION_DEFAULT - - log.info(f"pg_version is {version}") - yield version - - -@pytest.fixture(scope="session") -def versioned_pg_distrib_dir(pg_distrib_dir: Path, pg_version: str) -> Iterator[Path]: +def versioned_pg_distrib_dir(pg_distrib_dir: Path, pg_version: PgVersion) -> Iterator[Path]: versioned_dir = pg_distrib_dir / f"v{pg_version}" psql_bin_path = versioned_dir / "bin/psql" @@ -592,7 +581,7 @@ class NeonEnvBuilder: mock_s3_server: MockS3Server, neon_binpath: Path, pg_distrib_dir: Path, - pg_version: str, + pg_version: PgVersion, remote_storage: Optional[RemoteStorage] = None, remote_storage_users: RemoteStorageUsers = RemoteStorageUsers.PAGESERVER, pageserver_config_override: Optional[str] = None, @@ -1041,7 +1030,7 @@ def _shared_simple_env( top_output_dir: Path, neon_binpath: Path, pg_distrib_dir: Path, - pg_version: str, + pg_version: PgVersion, ) -> Iterator[NeonEnv]: """ # Internal fixture backing the `neon_simple_env` fixture. If TEST_SHARED_FIXTURES @@ -1098,7 +1087,7 @@ def neon_env_builder( mock_s3_server: MockS3Server, neon_binpath: Path, pg_distrib_dir: Path, - pg_version: str, + pg_version: PgVersion, default_broker: NeonBroker, run_id: uuid.UUID, ) -> Iterator[NeonEnvBuilder]: @@ -1753,7 +1742,7 @@ def append_pageserver_param_overrides( class PgBin: """A helper class for executing postgres binaries""" - def __init__(self, log_dir: Path, pg_distrib_dir: Path, pg_version: str): + def __init__(self, log_dir: Path, pg_distrib_dir: Path, pg_version: PgVersion): self.log_dir = log_dir self.pg_version = pg_version self.pg_bin_path = pg_distrib_dir / f"v{pg_version}" / "bin" @@ -1812,7 +1801,7 @@ class PgBin: @pytest.fixture(scope="function") -def pg_bin(test_output_dir: Path, pg_distrib_dir: Path, pg_version: str) -> PgBin: +def pg_bin(test_output_dir: Path, pg_distrib_dir: Path, pg_version: PgVersion) -> PgBin: return PgBin(test_output_dir, pg_distrib_dir, pg_version) @@ -1900,7 +1889,7 @@ def vanilla_pg( test_output_dir: Path, port_distributor: PortDistributor, pg_distrib_dir: Path, - pg_version: str, + pg_version: PgVersion, ) -> Iterator[VanillaPostgres]: pgdatadir = test_output_dir / "pgdata-vanilla" pg_bin = PgBin(test_output_dir, pg_distrib_dir, pg_version) @@ -1945,7 +1934,7 @@ class RemotePostgres(PgProtocol): @pytest.fixture(scope="function") def remote_pg( - test_output_dir: Path, pg_distrib_dir: Path, pg_version: str + test_output_dir: Path, pg_distrib_dir: Path, pg_version: PgVersion ) -> Iterator[RemotePostgres]: pg_bin = PgBin(test_output_dir, pg_distrib_dir, pg_version) @@ -2629,7 +2618,7 @@ class Safekeeper: @dataclass class SafekeeperTimelineStatus: acceptor_epoch: int - pg_version: int + pg_version: int # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2 flush_lsn: Lsn commit_lsn: Lsn timeline_start_lsn: Lsn @@ -2675,7 +2664,11 @@ class SafekeeperHttpClient(requests.Session): return res_json def timeline_create( - self, tenant_id: TenantId, timeline_id: TimelineId, pg_version: int, commit_lsn: Lsn + self, + tenant_id: TenantId, + timeline_id: TimelineId, + pg_version: int, # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2 + commit_lsn: Lsn, ): body = { "tenant_id": str(tenant_id), diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index cf92aeb6c0..0c4ed60c8d 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -9,6 +9,7 @@ import requests from fixtures.log_helper import log from fixtures.metrics import Metrics, parse_metrics +from fixtures.pg_version import PgVersion from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import Fn @@ -266,19 +267,21 @@ class PageserverHttpClient(requests.Session): def timeline_create( self, + pg_version: PgVersion, tenant_id: TenantId, new_timeline_id: Optional[TimelineId] = None, ancestor_timeline_id: Optional[TimelineId] = None, ancestor_start_lsn: Optional[Lsn] = None, ) -> Dict[Any, Any]: - res = self.post( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline", - json={ - "new_timeline_id": str(new_timeline_id) if new_timeline_id else None, - "ancestor_start_lsn": str(ancestor_start_lsn) if ancestor_start_lsn else None, - "ancestor_timeline_id": str(ancestor_timeline_id) if ancestor_timeline_id else None, - }, - ) + body: Dict[str, Any] = { + "new_timeline_id": str(new_timeline_id) if new_timeline_id else None, + "ancestor_start_lsn": str(ancestor_start_lsn) if ancestor_start_lsn else None, + "ancestor_timeline_id": str(ancestor_timeline_id) if ancestor_timeline_id else None, + } + if pg_version != PgVersion.NOT_SET: + body["pg_version"] = int(pg_version) + + res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline", json=body) self.verbose_error(res) if res.status_code == 409: raise Exception(f"could not create timeline: already exists for id {new_timeline_id}") diff --git a/test_runner/fixtures/pg_version.py b/test_runner/fixtures/pg_version.py new file mode 100644 index 0000000000..904a274197 --- /dev/null +++ b/test_runner/fixtures/pg_version.py @@ -0,0 +1,68 @@ +import enum +import os +from typing import Iterator, Optional + +import pytest +from _pytest.config.argparsing import Parser +from pytest import FixtureRequest + +from fixtures.log_helper import log + +""" +This fixture is used to determine which version of Postgres to use for tests. +""" + + +# Inherit PgVersion from str rather than int to make it easier to pass as a command-line argument +# TODO: use enum.StrEnum for Python >= 3.11 +@enum.unique +class PgVersion(str, enum.Enum): + V14 = "14" + V15 = "15" + # Instead of making version an optional parameter in methods, we can use this fake entry + # to explicitly rely on the default server version (could be different from pg_version fixture value) + NOT_SET = "<-POSTRGRES VERSION IS NOT SET->" + + # Make it less confusing in logs + def __repr__(self) -> str: + return f"'{self.value}'" + + @classmethod + def _missing_(cls, value) -> Optional["PgVersion"]: + known_values = {v.value for _, v in cls.__members__.items()} + + # Allow passing version as a string with "v" prefix (e.g. "v14") + if isinstance(value, str) and value.lower().startswith("v") and value[1:] in known_values: + return cls(value[1:]) + # Allow passing version as an int (e.g. 15 or 150002, both will be converted to PgVersion.V15) + elif isinstance(value, int) and str(value)[:2] in known_values: + return cls(str(value)[:2]) + + # Make mypy happy + # See https://github.com/python/mypy/issues/3974 + return None + + +DEFAULT_VERSION: PgVersion = PgVersion.V14 + + +def pytest_addoption(parser: Parser): + parser.addoption( + "--pg-version", + action="store", + type=PgVersion, + help="Postgres version to use for tests", + ) + + +@pytest.fixture(scope="session") +def pg_version(request: FixtureRequest) -> Iterator[PgVersion]: + if v := request.config.getoption("--pg-version"): + version, source = v, "from --pg-version commad-line argument" + elif v := os.environ.get("DEFAULT_PG_VERSION"): + version, source = PgVersion(v), "from DEFAULT_PG_VERSION environment variable" + else: + version, source = DEFAULT_VERSION, "default verson" + + log.info(f"pg_version is {version} ({source})") + yield version diff --git a/test_runner/regress/test_auth.py b/test_runner/regress/test_auth.py index 3305869dce..3e4a0bfbbb 100644 --- a/test_runner/regress/test_auth.py +++ b/test_runner/regress/test_auth.py @@ -31,11 +31,15 @@ def test_pageserver_auth(neon_env_builder: NeonEnvBuilder): # tenant can create branches tenant_http_client.timeline_create( - tenant_id=env.initial_tenant, ancestor_timeline_id=new_timeline_id + pg_version=env.pg_version, + tenant_id=env.initial_tenant, + ancestor_timeline_id=new_timeline_id, ) # console can create branches for tenant pageserver_http_client.timeline_create( - tenant_id=env.initial_tenant, ancestor_timeline_id=new_timeline_id + pg_version=env.pg_version, + tenant_id=env.initial_tenant, + ancestor_timeline_id=new_timeline_id, ) # fail to create branch using token with different tenant_id @@ -43,7 +47,9 @@ def test_pageserver_auth(neon_env_builder: NeonEnvBuilder): PageserverApiException, match="Forbidden: Tenant id mismatch. Permission denied" ): invalid_tenant_http_client.timeline_create( - tenant_id=env.initial_tenant, ancestor_timeline_id=new_timeline_id + pg_version=env.pg_version, + tenant_id=env.initial_tenant, + ancestor_timeline_id=new_timeline_id, ) # create tenant using management token diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index e262202a73..ff61af8fd3 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -16,6 +16,7 @@ from fixtures.neon_fixtures import ( ) from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload +from fixtures.pg_version import PgVersion from fixtures.types import Lsn from pytest import FixtureRequest @@ -50,7 +51,7 @@ def test_create_snapshot(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, test_o # it creates a new snapshot for releases after we tested the current version against the previous snapshot in `test_backward_compatibility`. # # There's no cleanup here, it allows to adjust the data in `test_backward_compatibility` itself without re-collecting it. - neon_env_builder.pg_version = "14" + neon_env_builder.pg_version = PgVersion.V14 neon_env_builder.num_safekeepers = 3 neon_env_builder.enable_local_fs_remote_storage() neon_env_builder.preserve_database_files = True @@ -98,7 +99,7 @@ def test_backward_compatibility( test_output_dir: Path, neon_binpath: Path, pg_distrib_dir: Path, - pg_version: str, + pg_version: PgVersion, request: FixtureRequest, ): """ @@ -153,7 +154,7 @@ def test_backward_compatibility( def test_forward_compatibility( test_output_dir: Path, port_distributor: PortDistributor, - pg_version: str, + pg_version: PgVersion, request: FixtureRequest, neon_binpath: Path, ): @@ -345,7 +346,7 @@ def check_neon_works( neon_target_binpath: Path, neon_current_binpath: Path, pg_distrib_dir: Path, - pg_version: str, + pg_version: PgVersion, port_distributor: PortDistributor, test_output_dir: Path, pg_bin: PgBin, @@ -404,7 +405,7 @@ def check_neon_works( shutil.rmtree(repo_dir / "local_fs_remote_storage") pageserver_http.timeline_delete(tenant_id, timeline_id) - pageserver_http.timeline_create(tenant_id, timeline_id) + pageserver_http.timeline_create(pg_version, tenant_id, timeline_id) pg_bin.run( ["pg_dumpall", f"--dbname={connstr}", f"--file={test_output_dir / 'dump-from-wal.sql'}"] ) diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index e86cd18f58..28732872df 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -8,6 +8,7 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, ) from fixtures.pageserver.http import PageserverHttpClient +from fixtures.pg_version import PgVersion from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import wait_until @@ -61,7 +62,7 @@ def test_pageserver_init_node_id( assert "has node id already, it cannot be overridden" in bad_update.stderr -def check_client(client: PageserverHttpClient, initial_tenant: TenantId): +def check_client(pg_version: PgVersion, client: PageserverHttpClient, initial_tenant: TenantId): client.check_status() # check initial tenant is there @@ -77,7 +78,11 @@ def check_client(client: PageserverHttpClient, initial_tenant: TenantId): # create timeline timeline_id = TimelineId.generate() - client.timeline_create(tenant_id=tenant_id, new_timeline_id=timeline_id) + client.timeline_create( + pg_version=pg_version, + tenant_id=tenant_id, + new_timeline_id=timeline_id, + ) timelines = client.timeline_list(tenant_id) assert len(timelines) > 0 @@ -174,7 +179,7 @@ def test_pageserver_http_get_wal_receiver_success(neon_simple_env: NeonEnv): def test_pageserver_http_api_client(neon_simple_env: NeonEnv): env = neon_simple_env with env.pageserver.http_client() as client: - check_client(client, env.initial_tenant) + check_client(env.pg_version, client, env.initial_tenant) def test_pageserver_http_api_client_auth_enabled(neon_env_builder: NeonEnvBuilder): @@ -184,4 +189,4 @@ def test_pageserver_http_api_client_auth_enabled(neon_env_builder: NeonEnvBuilde pageserver_token = env.auth_keys.generate_pageserver_token() with env.pageserver.http_client(auth_token=pageserver_token) as client: - check_client(client, env.initial_tenant) + check_client(env.pg_version, client, env.initial_tenant) diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index db278d5646..1460172afe 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -27,6 +27,7 @@ from fixtures.pageserver.utils import ( wait_for_upload_queue_empty, wait_until_tenant_active, ) +from fixtures.pg_version import PgVersion from fixtures.types import TenantId, TimelineId from fixtures.utils import get_timeline_dir_size, wait_until @@ -489,7 +490,7 @@ def test_timeline_size_metrics( test_output_dir: Path, port_distributor: PortDistributor, pg_distrib_dir: Path, - pg_version: str, + pg_version: PgVersion, ): env = neon_simple_env pageserver_http = env.pageserver.http_client() diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index fed5f325ca..2a4141ed30 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -32,6 +32,7 @@ from fixtures.neon_fixtures import ( available_remote_storages, ) from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload +from fixtures.pg_version import PgVersion from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import get_dir_size, query_scalar, start_in_background @@ -582,7 +583,11 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re shutil.copy(f_partial_saved, f_partial_path) # recreate timeline on pageserver from scratch - ps_cli.timeline_create(tenant_id, timeline_id) + ps_cli.timeline_create( + pg_version=PgVersion(pg_version), + tenant_id=tenant_id, + new_timeline_id=timeline_id, + ) wait_lsn_timeout = 60 * 3 started_at = time.time() From dd4fd89dc698dfc039abdbc26224666492e5811a Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Fri, 5 May 2023 11:45:37 +0200 Subject: [PATCH 19/19] [compute_ctl] Do not initialize `last_active` on start (#4137) Our scale-to-zero logic was optimized for short auto-suspend intervals, e.g. minutes or hours. In this case, if compute was restarted by k8s due to some reason (OOM, k8s node went down, pod relocation, etc.), `last_active` got bumped, we start counting auto-suspend timeout again. It's not a big deal, i.e. we suspend completely idle compute not after 5 minutes, but after 10 minutes or so. Yet, some clients may want days or even weeks. And chance that compute could be restarted during this interval is pretty high, but in this case we could be not able to suspend some computes for weeks. After this commit, we won't initialize `last_active` on start, so `/status` could return an unset attribute. This means that there was no user activity since start. Control-plane should deal with it by taking `max()` out of all available activity timestamps: `started_at`, `last_active`, etc. compute_ctl part of neondatabase/cloud#4853 --- compute_tools/src/compute.rs | 7 ++++--- compute_tools/src/http/openapi_spec.yaml | 11 ++++++++--- compute_tools/src/monitor.rs | 7 ++++--- libs/compute_api/src/responses.rs | 12 ++++++++---- 4 files changed, 24 insertions(+), 13 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index dca6becd39..da5ad00da6 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -67,8 +67,9 @@ pub struct ComputeNode { pub struct ComputeState { pub start_time: DateTime, pub status: ComputeStatus, - /// Timestamp of the last Postgres activity - pub last_active: DateTime, + /// Timestamp of the last Postgres activity. It could be `None` if + /// compute wasn't used since start. + pub last_active: Option>, pub error: Option, pub pspec: Option, pub metrics: ComputeMetrics, @@ -79,7 +80,7 @@ impl ComputeState { Self { start_time: Utc::now(), status: ComputeStatus::Empty, - last_active: Utc::now(), + last_active: None, error: None, pspec: None, metrics: ComputeMetrics::default(), diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml index cc8f074a50..2680269756 100644 --- a/compute_tools/src/http/openapi_spec.yaml +++ b/compute_tools/src/http/openapi_spec.yaml @@ -181,8 +181,8 @@ components: ComputeState: type: object required: + - start_time - status - - last_active properties: start_time: type: string @@ -195,11 +195,13 @@ components: $ref: '#/components/schemas/ComputeStatus' last_active: type: string - description: The last detected compute activity timestamp in UTC and RFC3339 format. + description: | + The last detected compute activity timestamp in UTC and RFC3339 format. + It could be empty if compute was never used by user since start. example: "2022-10-12T07:20:50.52Z" error: type: string - description: Text of the error during compute startup, if any. + description: Text of the error during compute startup or reconfiguration, if any. example: "" tenant: type: string @@ -222,9 +224,12 @@ components: ComputeStatus: type: string enum: + - empty - init - failed - running + - configuration_pending + - configuration example: running # diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs index a30b52aed4..d2e7b698dd 100644 --- a/compute_tools/src/monitor.rs +++ b/compute_tools/src/monitor.rs @@ -74,7 +74,7 @@ fn watch_compute_activity(compute: &ComputeNode) { // Found non-idle backend, so the last activity is NOW. // Save it and exit the for loop. Also clear the idle backend // `state_change` timestamps array as it doesn't matter now. - last_active = Utc::now(); + last_active = Some(Utc::now()); idle_backs.clear(); break; } @@ -82,15 +82,16 @@ fn watch_compute_activity(compute: &ComputeNode) { // Get idle backend `state_change` with the max timestamp. if let Some(last) = idle_backs.iter().max() { - last_active = *last; + last_active = Some(*last); } } // Update the last activity in the shared state if we got a more recent one. let mut state = compute.state.lock().unwrap(); + // NB: `Some()` is always greater than `None`. if last_active > state.last_active { state.last_active = last_active; - debug!("set the last compute activity time to: {}", last_active); + debug!("set the last compute activity time to: {:?}", last_active); } } Err(e) => { diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index c409563b56..d181c018b1 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -19,7 +19,7 @@ pub struct ComputeStatusResponse { pub timeline: Option, pub status: ComputeStatus, #[serde(serialize_with = "rfc3339_serialize")] - pub last_active: DateTime, + pub last_active: Option>, pub error: Option, } @@ -29,7 +29,7 @@ pub struct ComputeState { pub status: ComputeStatus, /// Timestamp of the last Postgres activity #[serde(serialize_with = "rfc3339_serialize")] - pub last_active: DateTime, + pub last_active: Option>, pub error: Option, } @@ -54,11 +54,15 @@ pub enum ComputeStatus { Failed, } -fn rfc3339_serialize(x: &DateTime, s: S) -> Result +fn rfc3339_serialize(x: &Option>, s: S) -> Result where S: Serializer, { - x.to_rfc3339().serialize(s) + if let Some(x) = x { + x.to_rfc3339().serialize(s) + } else { + s.serialize_none() + } } /// Response of the /metrics.json API