diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index e5ba7aa3eb..90f0395c7c 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -418,10 +418,7 @@ jobs: - uses: actions/github-script@v6 if: > !cancelled() && - github.event_name == 'pull_request' && ( - steps.create-allure-report-debug.outputs.report-url || - steps.create-allure-report-release.outputs.report-url - ) + github.event_name == 'pull_request' with: # Retry script for 5XX server errors: https://github.com/actions/github-script#retries retries: 5 diff --git a/.github/workflows/deploy-dev.yml b/.github/workflows/deploy-dev.yml index f37e1b344d..6e77a3b571 100644 --- a/.github/workflows/deploy-dev.yml +++ b/.github/workflows/deploy-dev.yml @@ -274,7 +274,7 @@ jobs: - name: Deploy pg-sni-router run: - helm upgrade neon-pg-sni-router neondatabase/neon-pg-sni-router --namespace neon-pg-sni-router --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.pg-sni-router.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 15m0s + helm upgrade neon-pg-sni-router neondatabase/neon-pg-sni-router --namespace neon-pg-sni-router --create-namespace --install --debug --atomic -f .github/helm-values/${{ matrix.target_cluster }}.pg-sni-router.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 15m0s - name: Cleanup helm folder run: rm -rf ~/.cache diff --git a/.github/workflows/deploy-prod.yml b/.github/workflows/deploy-prod.yml index c5d690db3a..baa44d8094 100644 --- a/.github/workflows/deploy-prod.yml +++ b/.github/workflows/deploy-prod.yml @@ -214,4 +214,4 @@ jobs: - name: Deploy pg-sni-router run: - helm upgrade neon-pg-sni-router neondatabase/neon-pg-sni-router --namespace neon-pg-sni-router --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.pg-sni-router.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 15m0s + helm upgrade neon-pg-sni-router neondatabase/neon-pg-sni-router --namespace neon-pg-sni-router --create-namespace --install --debug --atomic -f .github/helm-values/${{ matrix.target_cluster }}.pg-sni-router.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 15m0s diff --git a/Cargo.lock b/Cargo.lock index bce2d11188..167acf0e69 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1032,6 +1032,7 @@ dependencies = [ "serde", "serde_json", "serde_with", + "utils", "workspace_hack", ] @@ -1105,6 +1106,7 @@ dependencies = [ "anyhow", "clap 4.2.2", "comfy-table", + "compute_api", "git-version", "nix", "once_cell", @@ -3674,6 +3676,7 @@ dependencies = [ "remote_storage", "reqwest", "safekeeper_api", + "scopeguard", "serde", "serde_json", "serde_with", diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index b6bc234beb..da5ad00da6 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -30,7 +30,7 @@ use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; use compute_api::responses::{ComputeMetrics, ComputeStatus}; -use compute_api::spec::ComputeSpec; +use compute_api::spec::{ComputeMode, ComputeSpec}; use crate::config; use crate::pg_helpers::*; @@ -67,8 +67,9 @@ pub struct ComputeNode { pub struct ComputeState { pub start_time: DateTime, pub status: ComputeStatus, - /// Timestamp of the last Postgres activity - pub last_active: DateTime, + /// Timestamp of the last Postgres activity. It could be `None` if + /// compute wasn't used since start. + pub last_active: Option>, pub error: Option, pub pspec: Option, pub metrics: ComputeMetrics, @@ -79,7 +80,7 @@ impl ComputeState { Self { start_time: Utc::now(), status: ComputeStatus::Empty, - last_active: Utc::now(), + last_active: None, error: None, pspec: None, metrics: ComputeMetrics::default(), @@ -249,38 +250,10 @@ impl ComputeNode { /// safekeepers sync, basebackup, etc. #[instrument(skip(self, compute_state))] pub fn prepare_pgdata(&self, compute_state: &ComputeState) -> Result<()> { - #[derive(Clone)] - enum Replication { - Primary, - Static { lsn: Lsn }, - HotStandby, - } - let pspec = compute_state.pspec.as_ref().expect("spec must be set"); let spec = &pspec.spec; let pgdata_path = Path::new(&self.pgdata); - let hot_replica = if let Some(option) = spec.cluster.settings.find_ref("hot_standby") { - if let Some(value) = &option.value { - anyhow::ensure!(option.vartype == "bool"); - matches!(value.as_str(), "on" | "yes" | "true") - } else { - false - } - } else { - false - }; - - let replication = if hot_replica { - Replication::HotStandby - } else if let Some(lsn) = spec.cluster.settings.find("recovery_target_lsn") { - Replication::Static { - lsn: Lsn::from_str(&lsn)?, - } - } else { - Replication::Primary - }; - // Remove/create an empty pgdata directory and put configuration there. self.create_pgdata()?; config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &pspec.spec)?; @@ -288,8 +261,8 @@ impl ComputeNode { // Syncing safekeepers is only safe with primary nodes: if a primary // is already connected it will be kicked out, so a secondary (standby) // cannot sync safekeepers. - let lsn = match &replication { - Replication::Primary => { + let lsn = match spec.mode { + ComputeMode::Primary => { info!("starting safekeepers syncing"); let lsn = self .sync_safekeepers(pspec.storage_auth_token.clone()) @@ -297,11 +270,11 @@ impl ComputeNode { info!("safekeepers synced at LSN {}", lsn); lsn } - Replication::Static { lsn } => { + ComputeMode::Static(lsn) => { info!("Starting read-only node at static LSN {}", lsn); - *lsn + lsn } - Replication::HotStandby => { + ComputeMode::Replica => { info!("Initializing standby from latest Pageserver LSN"); Lsn(0) } @@ -321,9 +294,9 @@ impl ComputeNode { // Update pg_hba.conf received with basebackup. update_pg_hba(pgdata_path)?; - match &replication { - Replication::Primary | Replication::Static { .. } => {} - Replication::HotStandby => { + match spec.mode { + ComputeMode::Primary | ComputeMode::Static(..) => {} + ComputeMode::Replica => { add_standby_signal(pgdata_path)?; } } @@ -430,11 +403,13 @@ impl ComputeNode { self.pg_reload_conf(&mut client)?; // Proceed with post-startup configuration. Note, that order of operations is important. - handle_roles(&spec, &mut client)?; - handle_databases(&spec, &mut client)?; - handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?; - handle_grants(&spec, self.connstr.as_str(), &mut client)?; - handle_extensions(&spec, &mut client)?; + if spec.mode == ComputeMode::Primary { + handle_roles(&spec, &mut client)?; + handle_databases(&spec, &mut client)?; + handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?; + handle_grants(&spec, self.connstr.as_str(), &mut client)?; + handle_extensions(&spec, &mut client)?; + } // 'Close' connection drop(client); @@ -467,7 +442,9 @@ impl ComputeNode { let pg = self.start_postgres(spec.storage_auth_token.clone())?; - self.apply_config(&compute_state)?; + if spec.spec.mode == ComputeMode::Primary { + self.apply_config(&compute_state)?; + } let startup_end_time = Utc::now(); { diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index d25eb9b2fc..1168f3876a 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -6,7 +6,7 @@ use std::path::Path; use anyhow::Result; use crate::pg_helpers::PgOptionsSerialize; -use compute_api::spec::ComputeSpec; +use compute_api::spec::{ComputeMode, ComputeSpec}; /// Check that `line` is inside a text file and put it there if it is not. /// Create file if it doesn't exist. @@ -34,17 +34,25 @@ pub fn line_in_file(path: &Path, line: &str) -> Result { /// Create or completely rewrite configuration file specified by `path` pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> { // File::create() destroys the file content if it exists. - let mut postgres_conf = File::create(path)?; + let mut file = File::create(path)?; - write_auto_managed_block(&mut postgres_conf, &spec.cluster.settings.as_pg_settings())?; - - Ok(()) -} - -// Write Postgres config block wrapped with generated comment section -fn write_auto_managed_block(file: &mut File, buf: &str) -> Result<()> { writeln!(file, "# Managed by compute_ctl: begin")?; - writeln!(file, "{}", buf)?; + + write!(file, "{}", &spec.cluster.settings.as_pg_settings())?; + + match spec.mode { + ComputeMode::Primary => {} + ComputeMode::Static(lsn) => { + // hot_standby is 'on' by default, but let's be explicit + writeln!(file, "hot_standby=on")?; + writeln!(file, "recovery_target_lsn='{lsn}'")?; + } + ComputeMode::Replica => { + // hot_standby is 'on' by default, but let's be explicit + writeln!(file, "hot_standby=on")?; + } + } + writeln!(file, "# Managed by compute_ctl: end")?; Ok(()) diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml index cc8f074a50..2680269756 100644 --- a/compute_tools/src/http/openapi_spec.yaml +++ b/compute_tools/src/http/openapi_spec.yaml @@ -181,8 +181,8 @@ components: ComputeState: type: object required: + - start_time - status - - last_active properties: start_time: type: string @@ -195,11 +195,13 @@ components: $ref: '#/components/schemas/ComputeStatus' last_active: type: string - description: The last detected compute activity timestamp in UTC and RFC3339 format. + description: | + The last detected compute activity timestamp in UTC and RFC3339 format. + It could be empty if compute was never used by user since start. example: "2022-10-12T07:20:50.52Z" error: type: string - description: Text of the error during compute startup, if any. + description: Text of the error during compute startup or reconfiguration, if any. example: "" tenant: type: string @@ -222,9 +224,12 @@ components: ComputeStatus: type: string enum: + - empty - init - failed - running + - configuration_pending + - configuration example: running # diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs index a30b52aed4..d2e7b698dd 100644 --- a/compute_tools/src/monitor.rs +++ b/compute_tools/src/monitor.rs @@ -74,7 +74,7 @@ fn watch_compute_activity(compute: &ComputeNode) { // Found non-idle backend, so the last activity is NOW. // Save it and exit the for loop. Also clear the idle backend // `state_change` timestamps array as it doesn't matter now. - last_active = Utc::now(); + last_active = Some(Utc::now()); idle_backs.clear(); break; } @@ -82,15 +82,16 @@ fn watch_compute_activity(compute: &ComputeNode) { // Get idle backend `state_change` with the max timestamp. if let Some(last) = idle_backs.iter().max() { - last_active = *last; + last_active = Some(*last); } } // Update the last activity in the shared state if we got a more recent one. let mut state = compute.state.lock().unwrap(); + // NB: `Some()` is always greater than `None`. if last_active > state.last_active { state.last_active = last_active; - debug!("set the last compute activity time to: {}", last_active); + debug!("set the last compute activity time to: {:?}", last_active); } } Err(e) => { diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index ba39747e03..a341ff0263 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -30,4 +30,5 @@ postgres_connection.workspace = true storage_broker.workspace = true utils.workspace = true +compute_api.workspace = true workspace_hack.workspace = true diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 09278e1726..30880565ab 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -7,8 +7,8 @@ //! use anyhow::{anyhow, bail, Context, Result}; use clap::{value_parser, Arg, ArgAction, ArgMatches, Command}; +use compute_api::spec::ComputeMode; use control_plane::endpoint::ComputeControlPlane; -use control_plane::endpoint::Replication; use control_plane::local_env::LocalEnv; use control_plane::pageserver::PageServerNode; use control_plane::safekeeper::SafekeeperNode; @@ -481,7 +481,7 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - timeline_id, None, pg_version, - Replication::Primary, + ComputeMode::Primary, )?; println!("Done"); } @@ -568,8 +568,8 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( .iter() .filter(|(_, endpoint)| endpoint.tenant_id == tenant_id) { - let lsn_str = match endpoint.replication { - Replication::Static(lsn) => { + let lsn_str = match endpoint.mode { + ComputeMode::Static(lsn) => { // -> read-only endpoint // Use the node's LSN. lsn.to_string() @@ -632,21 +632,14 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( .copied() .unwrap_or(false); - let replication = match (lsn, hot_standby) { - (Some(lsn), false) => Replication::Static(lsn), - (None, true) => Replication::Replica, - (None, false) => Replication::Primary, + let mode = match (lsn, hot_standby) { + (Some(lsn), false) => ComputeMode::Static(lsn), + (None, true) => ComputeMode::Replica, + (None, false) => ComputeMode::Primary, (Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"), }; - cplane.new_endpoint( - tenant_id, - &endpoint_id, - timeline_id, - port, - pg_version, - replication, - )?; + cplane.new_endpoint(tenant_id, &endpoint_id, timeline_id, port, pg_version, mode)?; } "start" => { let port: Option = sub_args.get_one::("port").copied(); @@ -670,11 +663,11 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( .unwrap_or(false); if let Some(endpoint) = endpoint { - match (&endpoint.replication, hot_standby) { - (Replication::Static(_), true) => { + match (&endpoint.mode, hot_standby) { + (ComputeMode::Static(_), true) => { bail!("Cannot start a node in hot standby mode when it is already configured as a static replica") } - (Replication::Primary, true) => { + (ComputeMode::Primary, true) => { bail!("Cannot start a node as a hot standby replica, it is already configured as primary node") } _ => {} @@ -701,10 +694,10 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( .copied() .context("Failed to `pg-version` from the argument string")?; - let replication = match (lsn, hot_standby) { - (Some(lsn), false) => Replication::Static(lsn), - (None, true) => Replication::Replica, - (None, false) => Replication::Primary, + let mode = match (lsn, hot_standby) { + (Some(lsn), false) => ComputeMode::Static(lsn), + (None, true) => ComputeMode::Replica, + (None, false) => ComputeMode::Primary, (Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"), }; @@ -721,7 +714,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( timeline_id, port, pg_version, - replication, + mode, )?; ep.start(&auth_token)?; } diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 7d3485518f..6431f66e1c 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -11,15 +11,33 @@ use std::sync::Arc; use std::time::Duration; use anyhow::{Context, Result}; +use serde::{Deserialize, Serialize}; +use serde_with::{serde_as, DisplayFromStr}; use utils::{ id::{TenantId, TimelineId}, lsn::Lsn, }; -use crate::local_env::{LocalEnv, DEFAULT_PG_VERSION}; +use crate::local_env::LocalEnv; use crate::pageserver::PageServerNode; use crate::postgresql_conf::PostgresConf; +use compute_api::spec::ComputeMode; + +// contents of a endpoint.json file +#[serde_as] +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] +pub struct EndpointConf { + name: String, + #[serde_as(as = "DisplayFromStr")] + tenant_id: TenantId, + #[serde_as(as = "DisplayFromStr")] + timeline_id: TimelineId, + mode: ComputeMode, + port: u16, + pg_version: u32, +} + // // ComputeControlPlane // @@ -70,7 +88,7 @@ impl ComputeControlPlane { timeline_id: TimelineId, port: Option, pg_version: u32, - replication: Replication, + mode: ComputeMode, ) -> Result> { let port = port.unwrap_or_else(|| self.get_port()); @@ -80,12 +98,22 @@ impl ComputeControlPlane { env: self.env.clone(), pageserver: Arc::clone(&self.pageserver), timeline_id, - replication, + mode, tenant_id, pg_version, }); - ep.create_pgdata()?; + std::fs::write( + ep.endpoint_path().join("endpoint.json"), + serde_json::to_string_pretty(&EndpointConf { + name: name.to_string(), + tenant_id, + timeline_id, + mode, + port, + pg_version, + })?, + )?; ep.setup_pg_conf()?; self.endpoints.insert(ep.name.clone(), Arc::clone(&ep)); @@ -96,26 +124,13 @@ impl ComputeControlPlane { /////////////////////////////////////////////////////////////////////////////// -#[derive(Debug, Clone, Eq, PartialEq)] -pub enum Replication { - // Regular read-write node - Primary, - // if recovery_target_lsn is provided, and we want to pin the node to a specific LSN - Static(Lsn), - // Hot standby; read-only replica. - // Future versions may want to distinguish between replicas with hot standby - // feedback and other kinds of replication configurations. - Replica, -} - #[derive(Debug)] pub struct Endpoint { /// used as the directory name name: String, pub tenant_id: TenantId, pub timeline_id: TimelineId, - // Some(lsn) if this is a read-only endpoint anchored at 'lsn'. None for the primary. - pub replication: Replication, + pub mode: ComputeMode, // port and address of the Postgres server pub address: SocketAddr, @@ -144,50 +159,20 @@ impl Endpoint { let fname = entry.file_name(); let name = fname.to_str().unwrap().to_string(); - // Read config file into memory - let cfg_path = entry.path().join("pgdata").join("postgresql.conf"); - let cfg_path_str = cfg_path.to_string_lossy(); - let mut conf_file = File::open(&cfg_path) - .with_context(|| format!("failed to open config file in {}", cfg_path_str))?; - let conf = PostgresConf::read(&mut conf_file) - .with_context(|| format!("failed to read config file in {}", cfg_path_str))?; - - // Read a few options from the config file - let context = format!("in config file {}", cfg_path_str); - let port: u16 = conf.parse_field("port", &context)?; - let timeline_id: TimelineId = conf.parse_field("neon.timeline_id", &context)?; - let tenant_id: TenantId = conf.parse_field("neon.tenant_id", &context)?; - - // Read postgres version from PG_VERSION file to determine which postgres version binary to use. - // If it doesn't exist, assume broken data directory and use default pg version. - let pg_version_path = entry.path().join("PG_VERSION"); - - let pg_version_str = - fs::read_to_string(pg_version_path).unwrap_or_else(|_| DEFAULT_PG_VERSION.to_string()); - let pg_version = u32::from_str(&pg_version_str)?; - - // parse recovery_target_lsn and primary_conninfo into Recovery Target, if any - let replication = if let Some(lsn_str) = conf.get("recovery_target_lsn") { - Replication::Static(Lsn::from_str(lsn_str)?) - } else if let Some(slot_name) = conf.get("primary_slot_name") { - let slot_name = slot_name.to_string(); - let prefix = format!("repl_{}_", timeline_id); - assert!(slot_name.starts_with(&prefix)); - Replication::Replica - } else { - Replication::Primary - }; + // Read the endpoint.json file + let conf: EndpointConf = + serde_json::from_slice(&std::fs::read(entry.path().join("endpoint.json"))?)?; // ok now Ok(Endpoint { - address: SocketAddr::new("127.0.0.1".parse().unwrap(), port), + address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.port), name, env: env.clone(), pageserver: Arc::clone(pageserver), - timeline_id, - replication, - tenant_id, - pg_version, + timeline_id: conf.timeline_id, + mode: conf.mode, + tenant_id: conf.tenant_id, + pg_version: conf.pg_version, }) } @@ -323,8 +308,8 @@ impl Endpoint { conf.append_line(""); // Replication-related configurations, such as WAL sending - match &self.replication { - Replication::Primary => { + match &self.mode { + ComputeMode::Primary => { // Configure backpressure // - Replication write lag depends on how fast the walreceiver can process incoming WAL. // This lag determines latency of get_page_at_lsn. Speed of applying WAL is about 10MB/sec, @@ -366,10 +351,10 @@ impl Endpoint { conf.append("synchronous_standby_names", "pageserver"); } } - Replication::Static(lsn) => { + ComputeMode::Static(lsn) => { conf.append("recovery_target_lsn", &lsn.to_string()); } - Replication::Replica => { + ComputeMode::Replica => { assert!(!self.env.safekeepers.is_empty()); // TODO: use future host field from safekeeper spec @@ -409,8 +394,8 @@ impl Endpoint { } fn load_basebackup(&self, auth_token: &Option) -> Result<()> { - let backup_lsn = match &self.replication { - Replication::Primary => { + let backup_lsn = match &self.mode { + ComputeMode::Primary => { if !self.env.safekeepers.is_empty() { // LSN 0 means that it is bootstrap and we need to download just // latest data from the pageserver. That is a bit clumsy but whole bootstrap @@ -426,8 +411,8 @@ impl Endpoint { None } } - Replication::Static(lsn) => Some(*lsn), - Replication::Replica => { + ComputeMode::Static(lsn) => Some(*lsn), + ComputeMode::Replica => { None // Take the latest snapshot available to start with } }; @@ -526,7 +511,7 @@ impl Endpoint { // 3. Load basebackup self.load_basebackup(auth_token)?; - if self.replication != Replication::Primary { + if self.mode != ComputeMode::Primary { File::create(self.pgdata().join("standby.signal"))?; } diff --git a/libs/compute_api/Cargo.toml b/libs/compute_api/Cargo.toml index 533a091207..428d031a93 100644 --- a/libs/compute_api/Cargo.toml +++ b/libs/compute_api/Cargo.toml @@ -11,4 +11,5 @@ serde.workspace = true serde_with.workspace = true serde_json.workspace = true +utils = { path = "../utils" } workspace_hack.workspace = true diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index c409563b56..d181c018b1 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -19,7 +19,7 @@ pub struct ComputeStatusResponse { pub timeline: Option, pub status: ComputeStatus, #[serde(serialize_with = "rfc3339_serialize")] - pub last_active: DateTime, + pub last_active: Option>, pub error: Option, } @@ -29,7 +29,7 @@ pub struct ComputeState { pub status: ComputeStatus, /// Timestamp of the last Postgres activity #[serde(serialize_with = "rfc3339_serialize")] - pub last_active: DateTime, + pub last_active: Option>, pub error: Option, } @@ -54,11 +54,15 @@ pub enum ComputeStatus { Failed, } -fn rfc3339_serialize(x: &DateTime, s: S) -> Result +fn rfc3339_serialize(x: &Option>, s: S) -> Result where S: Serializer, { - x.to_rfc3339().serialize(s) + if let Some(x) = x { + x.to_rfc3339().serialize(s) + } else { + s.serialize_none() + } } /// Response of the /metrics.json API diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index f771910329..87c0edd687 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -3,8 +3,10 @@ //! The spec.json file is used to pass information to 'compute_ctl'. It contains //! all the information needed to start up the right version of PostgreSQL, //! and connect it to the storage nodes. -use serde::Deserialize; +use serde::{Deserialize, Serialize}; +use serde_with::{serde_as, DisplayFromStr}; use std::collections::HashMap; +use utils::lsn::Lsn; /// String type alias representing Postgres identifier and /// intended to be used for DB / role names. @@ -12,6 +14,7 @@ pub type PgIdent = String; /// Cluster spec or configuration represented as an optional number of /// delta operations + final cluster state description. +#[serde_as] #[derive(Clone, Debug, Default, Deserialize)] pub struct ComputeSpec { pub format_version: f32, @@ -24,11 +27,29 @@ pub struct ComputeSpec { pub cluster: Cluster, pub delta_operations: Option>, + #[serde(default)] + pub mode: ComputeMode, + pub storage_auth_token: Option, pub startup_tracing_context: Option>, } +#[serde_as] +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Deserialize, Serialize)] +pub enum ComputeMode { + /// A read-write node + #[default] + Primary, + /// A read-only node, pinned at a particular LSN + Static(#[serde_as(as = "DisplayFromStr")] Lsn), + /// A read-only node that follows the tip of the branch in hot standby mode + /// + /// Future versions may want to distinguish between replicas with hot standby + /// feedback and other kinds of replication configurations. + Replica, +} + #[derive(Clone, Debug, Default, Deserialize)] pub struct Cluster { pub cluster_id: String, diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index f6bf7c6fc2..453c58431a 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -50,11 +50,14 @@ impl QueryError { } } +/// Returns true if the given error is a normal consequence of a network issue, +/// or the client closing the connection. These errors can happen during normal +/// operations, and don't indicate a bug in our code. pub fn is_expected_io_error(e: &io::Error) -> bool { use io::ErrorKind::*; matches!( e.kind(), - ConnectionRefused | ConnectionAborted | ConnectionReset | TimedOut + BrokenPipe | ConnectionRefused | ConnectionAborted | ConnectionReset | TimedOut ) } diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index 88ae41c636..9f3f4dc20d 100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -1,5 +1,4 @@ -use anyhow::*; -use core::time::Duration; +use anyhow::{bail, ensure}; use log::*; use postgres::types::PgLsn; use postgres::Client; @@ -8,7 +7,7 @@ use postgres_ffi::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD}; use std::cmp::Ordering; use std::path::{Path, PathBuf}; use std::process::Command; -use std::time::Instant; +use std::time::{Duration, Instant}; use tempfile::{tempdir, TempDir}; #[derive(Debug, Clone, PartialEq, Eq)] @@ -55,7 +54,7 @@ impl Conf { self.datadir.join("pg_wal") } - fn new_pg_command(&self, command: impl AsRef) -> Result { + fn new_pg_command(&self, command: impl AsRef) -> anyhow::Result { let path = self.pg_bin_dir()?.join(command); ensure!(path.exists(), "Command {:?} does not exist", path); let mut cmd = Command::new(path); @@ -65,7 +64,7 @@ impl Conf { Ok(cmd) } - pub fn initdb(&self) -> Result<()> { + pub fn initdb(&self) -> anyhow::Result<()> { if let Some(parent) = self.datadir.parent() { info!("Pre-creating parent directory {:?}", parent); // Tests may be run concurrently and there may be a race to create `test_output/`. @@ -79,7 +78,7 @@ impl Conf { let output = self .new_pg_command("initdb")? .arg("-D") - .arg(self.datadir.as_os_str()) + .arg(&self.datadir) .args(["-U", "postgres", "--no-instructions", "--no-sync"]) .output()?; debug!("initdb output: {:?}", output); @@ -92,7 +91,7 @@ impl Conf { Ok(()) } - pub fn start_server(&self) -> Result { + pub fn start_server(&self) -> anyhow::Result { info!("Starting Postgres server in {:?}", self.datadir); let unix_socket_dir = tempdir()?; // We need a directory with a short name for Unix socket (up to 108 symbols) let unix_socket_dir_path = unix_socket_dir.path().to_owned(); @@ -100,9 +99,9 @@ impl Conf { .new_pg_command("postgres")? .args(["-c", "listen_addresses="]) .arg("-k") - .arg(unix_socket_dir_path.as_os_str()) + .arg(&unix_socket_dir_path) .arg("-D") - .arg(self.datadir.as_os_str()) + .arg(&self.datadir) .args(REQUIRED_POSTGRES_CONFIG.iter().flat_map(|cfg| ["-c", cfg])) .spawn()?; let server = PostgresServer { @@ -123,7 +122,7 @@ impl Conf { &self, first_segment_name: &str, last_segment_name: &str, - ) -> Result { + ) -> anyhow::Result { let first_segment_file = self.datadir.join(first_segment_name); let last_segment_file = self.datadir.join(last_segment_name); info!( @@ -133,10 +132,7 @@ impl Conf { ); let output = self .new_pg_command("pg_waldump")? - .args([ - &first_segment_file.as_os_str(), - &last_segment_file.as_os_str(), - ]) + .args([&first_segment_file, &last_segment_file]) .output()?; debug!("waldump output: {:?}", output); Ok(output) @@ -144,10 +140,9 @@ impl Conf { } impl PostgresServer { - pub fn connect_with_timeout(&self) -> Result { + pub fn connect_with_timeout(&self) -> anyhow::Result { let retry_until = Instant::now() + *self.client_config.get_connect_timeout().unwrap(); while Instant::now() < retry_until { - use std::result::Result::Ok; if let Ok(client) = self.client_config.connect(postgres::NoTls) { return Ok(client); } @@ -164,7 +159,6 @@ impl PostgresServer { impl Drop for PostgresServer { fn drop(&mut self) { - use std::result::Result::Ok; match self.process.try_wait() { Ok(Some(_)) => return, Ok(None) => { @@ -179,12 +173,12 @@ impl Drop for PostgresServer { } pub trait PostgresClientExt: postgres::GenericClient { - fn pg_current_wal_insert_lsn(&mut self) -> Result { + fn pg_current_wal_insert_lsn(&mut self) -> anyhow::Result { Ok(self .query_one("SELECT pg_current_wal_insert_lsn()", &[])? .get(0)) } - fn pg_current_wal_flush_lsn(&mut self) -> Result { + fn pg_current_wal_flush_lsn(&mut self) -> anyhow::Result { Ok(self .query_one("SELECT pg_current_wal_flush_lsn()", &[])? .get(0)) @@ -193,7 +187,7 @@ pub trait PostgresClientExt: postgres::GenericClient { impl PostgresClientExt for C {} -pub fn ensure_server_config(client: &mut impl postgres::GenericClient) -> Result<()> { +pub fn ensure_server_config(client: &mut impl postgres::GenericClient) -> anyhow::Result<()> { client.execute("create extension if not exists neon_test_utils", &[])?; let wal_keep_size: String = client.query_one("SHOW wal_keep_size", &[])?.get(0); @@ -227,13 +221,13 @@ pub trait Crafter { /// * A vector of some valid "interesting" intermediate LSNs which one may start reading from. /// May include or exclude Lsn(0) and the end-of-wal. /// * The expected end-of-wal LSN. - fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec, PgLsn)>; + fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec, PgLsn)>; } fn craft_internal( client: &mut C, - f: impl Fn(&mut C, PgLsn) -> Result<(Vec, Option)>, -) -> Result<(Vec, PgLsn)> { + f: impl Fn(&mut C, PgLsn) -> anyhow::Result<(Vec, Option)>, +) -> anyhow::Result<(Vec, PgLsn)> { ensure_server_config(client)?; let initial_lsn = client.pg_current_wal_insert_lsn()?; @@ -265,7 +259,7 @@ fn craft_internal( pub struct Simple; impl Crafter for Simple { const NAME: &'static str = "simple"; - fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec, PgLsn)> { + fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec, PgLsn)> { craft_internal(client, |client, _| { client.execute("CREATE table t(x int)", &[])?; Ok((Vec::new(), None)) @@ -276,7 +270,7 @@ impl Crafter for Simple { pub struct LastWalRecordXlogSwitch; impl Crafter for LastWalRecordXlogSwitch { const NAME: &'static str = "last_wal_record_xlog_switch"; - fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec, PgLsn)> { + fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec, PgLsn)> { // Do not use generate_internal because here we end up with flush_lsn exactly on // the segment boundary and insert_lsn after the initial page header, which is unusual. ensure_server_config(client)?; @@ -298,7 +292,7 @@ impl Crafter for LastWalRecordXlogSwitch { pub struct LastWalRecordXlogSwitchEndsOnPageBoundary; impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary { const NAME: &'static str = "last_wal_record_xlog_switch_ends_on_page_boundary"; - fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec, PgLsn)> { + fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec, PgLsn)> { // Do not use generate_internal because here we end up with flush_lsn exactly on // the segment boundary and insert_lsn after the initial page header, which is unusual. ensure_server_config(client)?; @@ -365,7 +359,7 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary { fn craft_single_logical_message( client: &mut impl postgres::GenericClient, transactional: bool, -) -> Result<(Vec, PgLsn)> { +) -> anyhow::Result<(Vec, PgLsn)> { craft_internal(client, |client, initial_lsn| { ensure!( initial_lsn < PgLsn::from(0x0200_0000 - 1024 * 1024), @@ -407,7 +401,7 @@ fn craft_single_logical_message( pub struct WalRecordCrossingSegmentFollowedBySmallOne; impl Crafter for WalRecordCrossingSegmentFollowedBySmallOne { const NAME: &'static str = "wal_record_crossing_segment_followed_by_small_one"; - fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec, PgLsn)> { + fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec, PgLsn)> { craft_single_logical_message(client, true) } } @@ -415,7 +409,7 @@ impl Crafter for WalRecordCrossingSegmentFollowedBySmallOne { pub struct LastWalRecordCrossingSegment; impl Crafter for LastWalRecordCrossingSegment { const NAME: &'static str = "last_wal_record_crossing_segment"; - fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec, PgLsn)> { + fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec, PgLsn)> { craft_single_logical_message(client, false) } } diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index 2143ad2530..8e361b757c 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -613,7 +613,7 @@ pub struct XLogDataBody<'a> { #[derive(Debug)] pub struct WalSndKeepAlive { - pub sent_ptr: u64, + pub wal_end: u64, // current end of WAL on the server pub timestamp: i64, pub request_reply: bool, } @@ -924,7 +924,7 @@ impl<'a> BeMessage<'a> { buf.put_u8(b'd'); write_body(buf, |buf| { buf.put_u8(b'k'); - buf.put_u64(req.sent_ptr); + buf.put_u64(req.wal_end); buf.put_i64(req.timestamp); buf.put_u8(u8::from(req.request_reply)); }); diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index b11aef9892..4bfb5bf994 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -131,7 +131,9 @@ impl RequestCancelled { impl Drop for RequestCancelled { fn drop(&mut self) { - if let Some(span) = self.warn.take() { + if std::thread::panicking() { + // we are unwinding due to panicking, assume we are not dropped for cancellation + } else if let Some(span) = self.warn.take() { // the span has all of the info already, but the outer `.instrument(span)` has already // been dropped, so we need to manually re-enter it for this message. // diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 82701ed4b0..0615d50c1a 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -58,6 +58,8 @@ pub mod pageserver_feedback; pub mod tracing_span_assert; +pub mod rate_limit; + /// use with fail::cfg("$name", "return(2000)") #[macro_export] macro_rules! failpoint_sleep_millis_async { diff --git a/libs/utils/src/rate_limit.rs b/libs/utils/src/rate_limit.rs new file mode 100644 index 0000000000..557955bb88 --- /dev/null +++ b/libs/utils/src/rate_limit.rs @@ -0,0 +1,66 @@ +//! A helper to rate limit operations. + +use std::time::{Duration, Instant}; + +pub struct RateLimit { + last: Option, + interval: Duration, +} + +impl RateLimit { + pub fn new(interval: Duration) -> Self { + Self { + last: None, + interval, + } + } + + /// Call `f` if the rate limit allows. + /// Don't call it otherwise. + pub fn call(&mut self, f: F) { + let now = Instant::now(); + match self.last { + Some(last) if now - last <= self.interval => { + // ratelimit + } + _ => { + self.last = Some(now); + f(); + } + } + } +} + +#[cfg(test)] +mod tests { + use std::sync::atomic::AtomicUsize; + + #[test] + fn basics() { + use super::RateLimit; + use std::sync::atomic::Ordering::Relaxed; + use std::time::Duration; + + let called = AtomicUsize::new(0); + let mut f = RateLimit::new(Duration::from_millis(100)); + + let cl = || { + called.fetch_add(1, Relaxed); + }; + + f.call(cl); + assert_eq!(called.load(Relaxed), 1); + f.call(cl); + assert_eq!(called.load(Relaxed), 1); + f.call(cl); + assert_eq!(called.load(Relaxed), 1); + std::thread::sleep(Duration::from_millis(100)); + f.call(cl); + assert_eq!(called.load(Relaxed), 2); + f.call(cl); + assert_eq!(called.load(Relaxed), 2); + std::thread::sleep(Duration::from_millis(100)); + f.call(cl); + assert_eq!(called.load(Relaxed), 3); + } +} diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index b1251123b2..58ee7367ca 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -691,16 +691,6 @@ pub fn html_response(status: StatusCode, data: String) -> Result, Ok(response) } -// Helper function to standardize the error messages we produce on bad durations -// -// Intended to be used with anyhow's `with_context`, e.g.: -// -// let value = result.with_context(bad_duration("name", &value))?; -// -fn bad_duration<'a>(field_name: &'static str, value: &'a str) -> impl 'a + Fn() -> String { - move || format!("Cannot parse `{field_name}` duration {value:?}") -} - async fn tenant_create_handler(mut request: Request) -> Result, ApiError> { check_permission(&request, None)?; @@ -708,91 +698,7 @@ async fn tenant_create_handler(mut request: Request) -> Result = Lazy::new(|| { .expect("Failed to register tenant_task_events metric") }); +// walreceiver metrics + +pub static WALRECEIVER_STARTED_CONNECTIONS: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_walreceiver_started_connections_total", + "Number of started walreceiver connections" + ) + .expect("failed to define a metric") +}); + +pub static WALRECEIVER_ACTIVE_MANAGERS: Lazy = Lazy::new(|| { + register_int_gauge!( + "pageserver_walreceiver_active_managers", + "Number of active walreceiver managers" + ) + .expect("failed to define a metric") +}); + +pub static WALRECEIVER_SWITCHES: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_walreceiver_switches_total", + "Number of walreceiver manager change_connection calls", + &["reason"] + ) + .expect("failed to define a metric") +}); + +pub static WALRECEIVER_BROKER_UPDATES: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_walreceiver_broker_updates_total", + "Number of received broker updates in walreceiver" + ) + .expect("failed to define a metric") +}); + +pub static WALRECEIVER_CANDIDATES_EVENTS: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_walreceiver_candidates_events_total", + "Number of walreceiver candidate events", + &["event"] + ) + .expect("failed to define a metric") +}); + +pub static WALRECEIVER_CANDIDATES_ADDED: Lazy = + Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["add"])); + +pub static WALRECEIVER_CANDIDATES_REMOVED: Lazy = + Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["remove"])); + // Metrics collected on WAL redo operations // // We collect the time spent in actual WAL redo ('redo'), and time waiting diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 5cfc466111..d69d5e4b45 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -271,7 +271,10 @@ impl UninitializedTimeline<'_> { .await .context("Failed to flush after basebackup import")?; - self.initialize(ctx) + // Initialize without loading the layer map. We started with an empty layer map, and already + // updated it for the layers that we created during the import. + let mut timelines = self.owning_tenant.timelines.lock().unwrap(); + self.initialize_with_lock(ctx, &mut timelines, false, true) } fn raw_timeline(&self) -> anyhow::Result<&Arc> { @@ -2352,6 +2355,8 @@ impl Tenant { ) })?; + // Initialize the timeline without loading the layer map, because we already updated the layer + // map above, when we imported the datadir. let timeline = { let mut timelines = self.timelines.lock().unwrap(); raw_timeline.initialize_with_lock(ctx, &mut timelines, false, true)? diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 34f57840fb..50ce942a09 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -8,6 +8,8 @@ //! We cannot use global or default config instead, because wrong settings //! may lead to a data loss. //! +use anyhow::Context; +use pageserver_api::models::{TenantConfigRequest, TenantCreateRequest}; use serde::{Deserialize, Serialize}; use std::num::NonZeroU64; use std::time::Duration; @@ -280,6 +282,179 @@ impl Default for TenantConf { } } +// Helper function to standardize the error messages we produce on bad durations +// +// Intended to be used with anyhow's `with_context`, e.g.: +// +// let value = result.with_context(bad_duration("name", &value))?; +// +fn bad_duration<'a>(field_name: &'static str, value: &'a str) -> impl 'a + Fn() -> String { + move || format!("Cannot parse `{field_name}` duration {value:?}") +} + +impl TryFrom<&'_ TenantCreateRequest> for TenantConfOpt { + type Error = anyhow::Error; + + fn try_from(request_data: &TenantCreateRequest) -> Result { + let mut tenant_conf = TenantConfOpt::default(); + + if let Some(gc_period) = &request_data.gc_period { + tenant_conf.gc_period = Some( + humantime::parse_duration(gc_period) + .with_context(bad_duration("gc_period", gc_period))?, + ); + } + tenant_conf.gc_horizon = request_data.gc_horizon; + tenant_conf.image_creation_threshold = request_data.image_creation_threshold; + + if let Some(pitr_interval) = &request_data.pitr_interval { + tenant_conf.pitr_interval = Some( + humantime::parse_duration(pitr_interval) + .with_context(bad_duration("pitr_interval", pitr_interval))?, + ); + } + + if let Some(walreceiver_connect_timeout) = &request_data.walreceiver_connect_timeout { + tenant_conf.walreceiver_connect_timeout = Some( + humantime::parse_duration(walreceiver_connect_timeout).with_context( + bad_duration("walreceiver_connect_timeout", walreceiver_connect_timeout), + )?, + ); + } + if let Some(lagging_wal_timeout) = &request_data.lagging_wal_timeout { + tenant_conf.lagging_wal_timeout = Some( + humantime::parse_duration(lagging_wal_timeout) + .with_context(bad_duration("lagging_wal_timeout", lagging_wal_timeout))?, + ); + } + if let Some(max_lsn_wal_lag) = request_data.max_lsn_wal_lag { + tenant_conf.max_lsn_wal_lag = Some(max_lsn_wal_lag); + } + if let Some(trace_read_requests) = request_data.trace_read_requests { + tenant_conf.trace_read_requests = Some(trace_read_requests); + } + + tenant_conf.checkpoint_distance = request_data.checkpoint_distance; + if let Some(checkpoint_timeout) = &request_data.checkpoint_timeout { + tenant_conf.checkpoint_timeout = Some( + humantime::parse_duration(checkpoint_timeout) + .with_context(bad_duration("checkpoint_timeout", checkpoint_timeout))?, + ); + } + + tenant_conf.compaction_target_size = request_data.compaction_target_size; + tenant_conf.compaction_threshold = request_data.compaction_threshold; + + if let Some(compaction_period) = &request_data.compaction_period { + tenant_conf.compaction_period = Some( + humantime::parse_duration(compaction_period) + .with_context(bad_duration("compaction_period", compaction_period))?, + ); + } + + if let Some(eviction_policy) = &request_data.eviction_policy { + tenant_conf.eviction_policy = Some( + serde::Deserialize::deserialize(eviction_policy) + .context("parse field `eviction_policy`")?, + ); + } + + tenant_conf.min_resident_size_override = request_data.min_resident_size_override; + + if let Some(evictions_low_residence_duration_metric_threshold) = + &request_data.evictions_low_residence_duration_metric_threshold + { + tenant_conf.evictions_low_residence_duration_metric_threshold = Some( + humantime::parse_duration(evictions_low_residence_duration_metric_threshold) + .with_context(bad_duration( + "evictions_low_residence_duration_metric_threshold", + evictions_low_residence_duration_metric_threshold, + ))?, + ); + } + + Ok(tenant_conf) + } +} + +impl TryFrom<&'_ TenantConfigRequest> for TenantConfOpt { + type Error = anyhow::Error; + + fn try_from(request_data: &TenantConfigRequest) -> Result { + let mut tenant_conf = TenantConfOpt::default(); + if let Some(gc_period) = &request_data.gc_period { + tenant_conf.gc_period = Some( + humantime::parse_duration(gc_period) + .with_context(bad_duration("gc_period", gc_period))?, + ); + } + tenant_conf.gc_horizon = request_data.gc_horizon; + tenant_conf.image_creation_threshold = request_data.image_creation_threshold; + + if let Some(pitr_interval) = &request_data.pitr_interval { + tenant_conf.pitr_interval = Some( + humantime::parse_duration(pitr_interval) + .with_context(bad_duration("pitr_interval", pitr_interval))?, + ); + } + if let Some(walreceiver_connect_timeout) = &request_data.walreceiver_connect_timeout { + tenant_conf.walreceiver_connect_timeout = Some( + humantime::parse_duration(walreceiver_connect_timeout).with_context( + bad_duration("walreceiver_connect_timeout", walreceiver_connect_timeout), + )?, + ); + } + if let Some(lagging_wal_timeout) = &request_data.lagging_wal_timeout { + tenant_conf.lagging_wal_timeout = Some( + humantime::parse_duration(lagging_wal_timeout) + .with_context(bad_duration("lagging_wal_timeout", lagging_wal_timeout))?, + ); + } + tenant_conf.max_lsn_wal_lag = request_data.max_lsn_wal_lag; + tenant_conf.trace_read_requests = request_data.trace_read_requests; + + tenant_conf.checkpoint_distance = request_data.checkpoint_distance; + if let Some(checkpoint_timeout) = &request_data.checkpoint_timeout { + tenant_conf.checkpoint_timeout = Some( + humantime::parse_duration(checkpoint_timeout) + .with_context(bad_duration("checkpoint_timeout", checkpoint_timeout))?, + ); + } + tenant_conf.compaction_target_size = request_data.compaction_target_size; + tenant_conf.compaction_threshold = request_data.compaction_threshold; + + if let Some(compaction_period) = &request_data.compaction_period { + tenant_conf.compaction_period = Some( + humantime::parse_duration(compaction_period) + .with_context(bad_duration("compaction_period", compaction_period))?, + ); + } + + if let Some(eviction_policy) = &request_data.eviction_policy { + tenant_conf.eviction_policy = Some( + serde::Deserialize::deserialize(eviction_policy) + .context("parse field `eviction_policy`")?, + ); + } + + tenant_conf.min_resident_size_override = request_data.min_resident_size_override; + + if let Some(evictions_low_residence_duration_metric_threshold) = + &request_data.evictions_low_residence_duration_metric_threshold + { + tenant_conf.evictions_low_residence_duration_metric_threshold = Some( + humantime::parse_duration(evictions_low_residence_duration_metric_threshold) + .with_context(bad_duration( + "evictions_low_residence_duration_metric_threshold", + evictions_low_residence_duration_metric_threshold, + ))?, + ); + } + + Ok(tenant_conf) + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 2ee723e7c3..d30d6c5c6e 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -13,9 +13,9 @@ use crate::task_mgr::TaskKind; use crate::walrecord::NeonWalRecord; use anyhow::Result; use bytes::Bytes; -use either::Either; use enum_map::EnumMap; use enumset::EnumSet; +use once_cell::sync::Lazy; use pageserver_api::models::LayerAccessKind; use pageserver_api::models::{ HistoricLayerInfo, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus, @@ -23,8 +23,10 @@ use pageserver_api::models::{ use std::ops::Range; use std::path::PathBuf; use std::sync::{Arc, Mutex}; -use std::time::{SystemTime, UNIX_EPOCH}; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; +use tracing::warn; use utils::history_buffer::HistoryBufferWithDropCounter; +use utils::rate_limit::RateLimit; use utils::{ id::{TenantId, TimelineId}, @@ -37,6 +39,8 @@ pub use image_layer::{ImageLayer, ImageLayerWriter}; pub use inmemory_layer::InMemoryLayer; pub use remote_layer::RemoteLayer; +use super::layer_map::BatchedUpdates; + pub fn range_overlaps(a: &Range, b: &Range) -> bool where T: PartialOrd, @@ -158,41 +162,82 @@ impl LayerAccessStatFullDetails { } impl LayerAccessStats { - pub(crate) fn for_loading_layer(status: LayerResidenceStatus) -> Self { - let new = LayerAccessStats(Mutex::new(LayerAccessStatsLocked::default())); - new.record_residence_event(status, LayerResidenceEventReason::LayerLoad); - new + /// Create an empty stats object. + /// + /// The caller is responsible for recording a residence event + /// using [`record_residence_event`] before calling `latest_activity`. + /// If they don't, [`latest_activity`] will return `None`. + pub(crate) fn empty_will_record_residence_event_later() -> Self { + LayerAccessStats(Mutex::default()) } - pub(crate) fn for_new_layer_file() -> Self { + /// Create an empty stats object and record a [`LayerLoad`] event with the given residence status. + /// + /// See [`record_residence_event`] for why you need to do this while holding the layer map lock. + pub(crate) fn for_loading_layer( + layer_map_lock_held_witness: &BatchedUpdates<'_, L>, + status: LayerResidenceStatus, + ) -> Self + where + L: ?Sized + Layer, + { let new = LayerAccessStats(Mutex::new(LayerAccessStatsLocked::default())); new.record_residence_event( - LayerResidenceStatus::Resident, - LayerResidenceEventReason::LayerCreate, + layer_map_lock_held_witness, + status, + LayerResidenceEventReason::LayerLoad, ); new } /// Creates a clone of `self` and records `new_status` in the clone. - /// The `new_status` is not recorded in `self` - pub(crate) fn clone_for_residence_change( + /// + /// The `new_status` is not recorded in `self`. + /// + /// See [`record_residence_event`] for why you need to do this while holding the layer map lock. + pub(crate) fn clone_for_residence_change( &self, + layer_map_lock_held_witness: &BatchedUpdates<'_, L>, new_status: LayerResidenceStatus, - ) -> LayerAccessStats { + ) -> LayerAccessStats + where + L: ?Sized + Layer, + { let clone = { let inner = self.0.lock().unwrap(); inner.clone() }; let new = LayerAccessStats(Mutex::new(clone)); - new.record_residence_event(new_status, LayerResidenceEventReason::ResidenceChange); + new.record_residence_event( + layer_map_lock_held_witness, + new_status, + LayerResidenceEventReason::ResidenceChange, + ); new } - fn record_residence_event( + /// Record a change in layer residency. + /// + /// Recording the event must happen while holding the layer map lock to + /// ensure that latest-activity-threshold-based layer eviction (eviction_task.rs) + /// can do an "imitate access" to this layer, before it observes `now-latest_activity() > threshold`. + /// + /// If we instead recorded the residence event with a timestamp from before grabbing the layer map lock, + /// the following race could happen: + /// + /// - Compact: Write out an L1 layer from several L0 layers. This records residence event LayerCreate with the current timestamp. + /// - Eviction: imitate access logical size calculation. This accesses the L0 layers because the L1 layer is not yet in the layer map. + /// - Compact: Grab layer map lock, add the new L1 to layer map and remove the L0s, release layer map lock. + /// - Eviction: observes the new L1 layer whose only activity timestamp is the LayerCreate event. + /// + pub(crate) fn record_residence_event( &self, + _layer_map_lock_held_witness: &BatchedUpdates<'_, L>, status: LayerResidenceStatus, reason: LayerResidenceEventReason, - ) { + ) where + L: ?Sized + Layer, + { let mut locked = self.0.lock().unwrap(); locked.iter_mut().for_each(|inner| { inner @@ -255,24 +300,37 @@ impl LayerAccessStats { ret } - fn most_recent_access_or_residence_event( - &self, - ) -> Either { + /// Get the latest access timestamp, falling back to latest residence event. + /// + /// This function can only return `None` if there has not yet been a call to the + /// [`record_residence_event`] method. That would generally be considered an + /// implementation error. This function logs a rate-limited warning in that case. + /// + /// TODO: use type system to avoid the need for `fallback`. + /// The approach in https://github.com/neondatabase/neon/pull/3775 + /// could be used to enforce that a residence event is recorded + /// before a layer is added to the layer map. We could also have + /// a layer wrapper type that holds the LayerAccessStats, and ensure + /// that that type can only be produced by inserting into the layer map. + pub(crate) fn latest_activity(&self) -> Option { let locked = self.0.lock().unwrap(); let inner = &locked.for_eviction_policy; match inner.last_accesses.recent() { - Some(a) => Either::Left(*a), + Some(a) => Some(a.when), None => match inner.last_residence_changes.recent() { - Some(e) => Either::Right(e.clone()), - None => unreachable!("constructors for LayerAccessStats ensure that there's always a residence change event"), - } - } - } - - pub(crate) fn latest_activity(&self) -> SystemTime { - match self.most_recent_access_or_residence_event() { - Either::Left(mra) => mra.when, - Either::Right(re) => re.timestamp, + Some(e) => Some(e.timestamp), + None => { + static WARN_RATE_LIMIT: Lazy> = + Lazy::new(|| Mutex::new((0, RateLimit::new(Duration::from_secs(10))))); + let mut guard = WARN_RATE_LIMIT.lock().unwrap(); + guard.0 += 1; + let occurences = guard.0; + guard.1.call(move || { + warn!(parent: None, occurences, "latest_activity not available, this is an implementation bug, using fallback value"); + }); + None + } + }, } } } diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 98cbcc5f07..ba3ab6dd4c 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -57,7 +57,7 @@ use utils::{ use super::{ DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, LayerFileName, LayerIter, - LayerKeyIter, LayerResidenceStatus, PathOrConf, + LayerKeyIter, PathOrConf, }; /// @@ -637,7 +637,7 @@ impl DeltaLayer { key_range: summary.key_range, lsn_range: summary.lsn_range, file_size: metadata.len(), - access_stats: LayerAccessStats::for_loading_layer(LayerResidenceStatus::Resident), + access_stats: LayerAccessStats::empty_will_record_residence_event_later(), inner: RwLock::new(DeltaLayerInner { loaded: false, file: None, @@ -808,7 +808,7 @@ impl DeltaLayerWriterInner { key_range: self.key_start..key_end, lsn_range: self.lsn_range.clone(), file_size: metadata.len(), - access_stats: LayerAccessStats::for_new_layer_file(), + access_stats: LayerAccessStats::empty_will_record_residence_event_later(), inner: RwLock::new(DeltaLayerInner { loaded: false, file: None, diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index a99b1b491f..d298b3e852 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -53,7 +53,7 @@ use utils::{ }; use super::filename::{ImageFileName, LayerFileName}; -use super::{Layer, LayerAccessStatsReset, LayerIter, LayerResidenceStatus, PathOrConf}; +use super::{Layer, LayerAccessStatsReset, LayerIter, PathOrConf}; /// /// Header stored in the beginning of the file @@ -438,7 +438,7 @@ impl ImageLayer { key_range: summary.key_range, lsn: summary.lsn, file_size: metadata.len(), - access_stats: LayerAccessStats::for_loading_layer(LayerResidenceStatus::Resident), + access_stats: LayerAccessStats::empty_will_record_residence_event_later(), inner: RwLock::new(ImageLayerInner { file: None, loaded: false, @@ -598,7 +598,7 @@ impl ImageLayerWriterInner { key_range: self.key_range.clone(), lsn: self.lsn, file_size: metadata.len(), - access_stats: LayerAccessStats::for_new_layer_file(), + access_stats: LayerAccessStats::empty_will_record_residence_event_later(), inner: RwLock::new(ImageLayerInner { loaded: false, file: None, diff --git a/pageserver/src/tenant/storage_layer/remote_layer.rs b/pageserver/src/tenant/storage_layer/remote_layer.rs index 2eb7eb0cb6..2106587ab2 100644 --- a/pageserver/src/tenant/storage_layer/remote_layer.rs +++ b/pageserver/src/tenant/storage_layer/remote_layer.rs @@ -4,6 +4,7 @@ use crate::config::PageServerConf; use crate::context::RequestContext; use crate::repository::Key; +use crate::tenant::layer_map::BatchedUpdates; use crate::tenant::remote_timeline_client::index::LayerFileMetadata; use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}; use anyhow::{bail, Result}; @@ -246,11 +247,15 @@ impl RemoteLayer { } /// Create a Layer struct representing this layer, after it has been downloaded. - pub fn create_downloaded_layer( + pub fn create_downloaded_layer( &self, + layer_map_lock_held_witness: &BatchedUpdates<'_, L>, conf: &'static PageServerConf, file_size: u64, - ) -> Arc { + ) -> Arc + where + L: ?Sized + Layer, + { if self.is_delta { let fname = DeltaFileName { key_range: self.key_range.clone(), @@ -262,8 +267,10 @@ impl RemoteLayer { self.tenantid, &fname, file_size, - self.access_stats - .clone_for_residence_change(LayerResidenceStatus::Resident), + self.access_stats.clone_for_residence_change( + layer_map_lock_held_witness, + LayerResidenceStatus::Resident, + ), )) } else { let fname = ImageFileName { @@ -276,8 +283,10 @@ impl RemoteLayer { self.tenantid, &fname, file_size, - self.access_stats - .clone_for_residence_change(LayerResidenceStatus::Resident), + self.access_stats.clone_for_residence_change( + layer_map_lock_held_witness, + LayerResidenceStatus::Resident, + ), )) } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 8768841d87..90f2951aef 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -11,7 +11,8 @@ use itertools::Itertools; use once_cell::sync::OnceCell; use pageserver_api::models::{ DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, - DownloadRemoteLayersTaskState, LayerMapInfo, LayerResidenceStatus, TimelineState, + DownloadRemoteLayersTaskState, LayerMapInfo, LayerResidenceEventReason, LayerResidenceStatus, + TimelineState, }; use remote_storage::GenericRemoteStorage; use storage_broker::BrokerClientChannel; @@ -588,15 +589,25 @@ impl Timeline { let _timer = self.metrics.wait_lsn_time_histo.start_timer(); - self.last_record_lsn.wait_for_timeout(lsn, self.conf.wait_lsn_timeout).await - .with_context(|| - format!( - "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}", - lsn, self.get_last_record_lsn(), self.get_disk_consistent_lsn() - ) - )?; - - Ok(()) + match self + .last_record_lsn + .wait_for_timeout(lsn, self.conf.wait_lsn_timeout) + .await + { + Ok(()) => Ok(()), + seqwait_error => { + drop(_timer); + let walreceiver_status = self.walreceiver.status().await; + seqwait_error.with_context(|| format!( + "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}, {}", + lsn, + self.get_last_record_lsn(), + self.get_disk_consistent_lsn(), + walreceiver_status.map(|status| status.to_human_readable_string()) + .unwrap_or_else(|| "WalReceiver status: Not active".to_string()), + )) + } + } } /// Check that it is valid to request operations with that lsn. @@ -1101,7 +1112,7 @@ impl Timeline { &layer_metadata, local_layer .access_stats() - .clone_for_residence_change(LayerResidenceStatus::Evicted), + .clone_for_residence_change(batch_updates, LayerResidenceStatus::Evicted), ), LayerFileName::Delta(delta_name) => RemoteLayer::new_delta( self.tenant_id, @@ -1110,7 +1121,7 @@ impl Timeline { &layer_metadata, local_layer .access_stats() - .clone_for_residence_change(LayerResidenceStatus::Evicted), + .clone_for_residence_change(batch_updates, LayerResidenceStatus::Evicted), ), }); @@ -1479,7 +1490,7 @@ impl Timeline { self.tenant_id, &imgfilename, file_size, - LayerAccessStats::for_loading_layer(LayerResidenceStatus::Resident), + LayerAccessStats::for_loading_layer(&updates, LayerResidenceStatus::Resident), ); trace!("found layer {}", layer.path().display()); @@ -1511,7 +1522,7 @@ impl Timeline { self.tenant_id, &deltafilename, file_size, - LayerAccessStats::for_loading_layer(LayerResidenceStatus::Resident), + LayerAccessStats::for_loading_layer(&updates, LayerResidenceStatus::Resident), ); trace!("found layer {}", layer.path().display()); @@ -1647,7 +1658,10 @@ impl Timeline { self.timeline_id, imgfilename, &remote_layer_metadata, - LayerAccessStats::for_loading_layer(LayerResidenceStatus::Evicted), + LayerAccessStats::for_loading_layer( + &updates, + LayerResidenceStatus::Evicted, + ), ); let remote_layer = Arc::new(remote_layer); @@ -1672,7 +1686,10 @@ impl Timeline { self.timeline_id, deltafilename, &remote_layer_metadata, - LayerAccessStats::for_loading_layer(LayerResidenceStatus::Evicted), + LayerAccessStats::for_loading_layer( + &updates, + LayerResidenceStatus::Evicted, + ), ); let remote_layer = Arc::new(remote_layer); updates.insert_historic(remote_layer); @@ -2719,11 +2736,16 @@ impl Timeline { ])?; // Add it to the layer map - self.layers - .write() - .unwrap() - .batch_update() - .insert_historic(Arc::new(new_delta)); + let l = Arc::new(new_delta); + let mut layers = self.layers.write().unwrap(); + let mut batch_updates = layers.batch_update(); + l.access_stats().record_residence_event( + &batch_updates, + LayerResidenceStatus::Resident, + LayerResidenceEventReason::LayerCreate, + ); + batch_updates.insert_historic(l); + batch_updates.flush(); // update the timeline's physical size let sz = new_delta_path.metadata()?.len(); @@ -2928,7 +2950,13 @@ impl Timeline { self.metrics .resident_physical_size_gauge .add(metadata.len()); - updates.insert_historic(Arc::new(l)); + let l = Arc::new(l); + l.access_stats().record_residence_event( + &updates, + LayerResidenceStatus::Resident, + LayerResidenceEventReason::LayerCreate, + ); + updates.insert_historic(l); } updates.flush(); drop(layers); @@ -3361,6 +3389,11 @@ impl Timeline { new_layer_paths.insert(new_delta_path, LayerFileMetadata::new(metadata.len())); let x: Arc = Arc::new(l); + x.access_stats().record_residence_event( + &updates, + LayerResidenceStatus::Resident, + LayerResidenceEventReason::LayerCreate, + ); updates.insert_historic(x); } @@ -3871,9 +3904,9 @@ impl Timeline { // Download complete. Replace the RemoteLayer with the corresponding // Delta- or ImageLayer in the layer map. - let new_layer = remote_layer.create_downloaded_layer(self_clone.conf, *size); let mut layers = self_clone.layers.write().unwrap(); let mut updates = layers.batch_update(); + let new_layer = remote_layer.create_downloaded_layer(&updates, self_clone.conf, *size); { use crate::tenant::layer_map::Replacement; let l: Arc = remote_layer.clone(); @@ -4145,7 +4178,15 @@ impl Timeline { continue; } - let last_activity_ts = l.access_stats().latest_activity(); + let last_activity_ts = l + .access_stats() + .latest_activity() + .unwrap_or_else(|| { + // We only use this fallback if there's an implementation error. + // `latest_activity` already does rate-limited warn!() log. + debug!(layer=%l.filename().file_name(), "last_activity returns None, using SystemTime::now"); + SystemTime::now() + }); resident_layers.push(LocalLayerInfoForDiskUsageEviction { layer: l, diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index cf799a8808..58321762ea 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -184,7 +184,14 @@ impl Timeline { if hist_layer.is_remote_layer() { continue; } - let last_activity_ts = hist_layer.access_stats().latest_activity(); + + let last_activity_ts = hist_layer.access_stats().latest_activity().unwrap_or_else(|| { + // We only use this fallback if there's an implementation error. + // `latest_activity` already does rate-limited warn!() log. + debug!(layer=%hist_layer.filename().file_name(), "last_activity returns None, using SystemTime::now"); + SystemTime::now() + }); + let no_activity_for = match now.duration_since(last_activity_ts) { Ok(d) => d, Err(_e) => { diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs index 00f446af38..91f7208194 100644 --- a/pageserver/src/tenant/timeline/walreceiver.rs +++ b/pageserver/src/tenant/timeline/walreceiver.rs @@ -38,12 +38,14 @@ use std::sync::{Arc, Weak}; use std::time::Duration; use storage_broker::BrokerClientChannel; use tokio::select; -use tokio::sync::watch; +use tokio::sync::{watch, RwLock}; use tokio_util::sync::CancellationToken; use tracing::*; use utils::id::TenantTimelineId; +use self::connection_manager::ConnectionManagerStatus; + use super::Timeline; #[derive(Clone)] @@ -63,6 +65,7 @@ pub struct WalReceiver { timeline_ref: Weak, conf: WalReceiverConf, started: AtomicBool, + manager_status: Arc>>, } impl WalReceiver { @@ -76,6 +79,7 @@ impl WalReceiver { timeline_ref, conf, started: AtomicBool::new(false), + manager_status: Arc::new(RwLock::new(None)), } } @@ -96,8 +100,8 @@ impl WalReceiver { let timeline_id = timeline.timeline_id; let walreceiver_ctx = ctx.detached_child(TaskKind::WalReceiverManager, DownloadBehavior::Error); - let wal_receiver_conf = self.conf.clone(); + let loop_status = Arc::clone(&self.manager_status); task_mgr::spawn( WALRECEIVER_RUNTIME.handle(), TaskKind::WalReceiverManager, @@ -115,24 +119,28 @@ impl WalReceiver { select! { _ = task_mgr::shutdown_watcher() => { info!("WAL receiver shutdown requested, shutting down"); - connection_manager_state.shutdown().await; - return Ok(()); + break; }, loop_step_result = connection_manager_loop_step( &mut broker_client, &mut connection_manager_state, &walreceiver_ctx, + &loop_status, ) => match loop_step_result { ControlFlow::Continue(()) => continue, ControlFlow::Break(()) => { info!("Connection manager loop ended, shutting down"); - connection_manager_state.shutdown().await; - return Ok(()); + break; } }, } } - }.instrument(info_span!(parent: None, "wal_connection_manager", tenant = %tenant_id, timeline = %timeline_id)) + + connection_manager_state.shutdown().await; + *loop_status.write().await = None; + Ok(()) + } + .instrument(info_span!(parent: None, "wal_connection_manager", tenant = %tenant_id, timeline = %timeline_id)) ); self.started.store(true, atomic::Ordering::Release); @@ -149,6 +157,10 @@ impl WalReceiver { .await; self.started.store(false, atomic::Ordering::Release); } + + pub(super) async fn status(&self) -> Option { + self.manager_status.read().await.clone() + } } /// A handle of an asynchronous task. diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 731c5c4644..9cb17ea799 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -13,6 +13,10 @@ use std::{collections::HashMap, num::NonZeroU64, ops::ControlFlow, sync::Arc, ti use super::{TaskStateUpdate, WalReceiverConf}; use crate::context::{DownloadBehavior, RequestContext}; +use crate::metrics::{ + WALRECEIVER_ACTIVE_MANAGERS, WALRECEIVER_BROKER_UPDATES, WALRECEIVER_CANDIDATES_ADDED, + WALRECEIVER_CANDIDATES_REMOVED, WALRECEIVER_SWITCHES, +}; use crate::task_mgr::TaskKind; use crate::tenant::Timeline; use anyhow::Context; @@ -24,6 +28,7 @@ use storage_broker::proto::SubscribeSafekeeperInfoRequest; use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; use storage_broker::BrokerClientChannel; use storage_broker::Streaming; +use tokio::sync::RwLock; use tokio::{select, sync::watch}; use tracing::*; @@ -43,6 +48,7 @@ pub(super) async fn connection_manager_loop_step( broker_client: &mut BrokerClientChannel, connection_manager_state: &mut ConnectionManagerState, ctx: &RequestContext, + manager_status: &RwLock>, ) -> ControlFlow<(), ()> { let mut timeline_state_updates = connection_manager_state .timeline @@ -56,6 +62,11 @@ pub(super) async fn connection_manager_loop_step( } } + WALRECEIVER_ACTIVE_MANAGERS.inc(); + scopeguard::defer! { + WALRECEIVER_ACTIVE_MANAGERS.dec(); + } + let id = TenantTimelineId { tenant_id: connection_manager_state.timeline.tenant_id, timeline_id: connection_manager_state.timeline.timeline_id, @@ -180,6 +191,7 @@ pub(super) async fn connection_manager_loop_step( .change_connection(new_candidate, ctx) .await } + *manager_status.write().await = Some(connection_manager_state.manager_status()); } } @@ -267,6 +279,78 @@ pub(super) struct ConnectionManagerState { wal_stream_candidates: HashMap, } +/// An information about connection manager's current connection and connection candidates. +#[derive(Debug, Clone)] +pub struct ConnectionManagerStatus { + existing_connection: Option, + wal_stream_candidates: HashMap, +} + +impl ConnectionManagerStatus { + /// Generates a string, describing current connection status in a form, suitable for logging. + pub fn to_human_readable_string(&self) -> String { + let mut resulting_string = "WalReceiver status".to_string(); + match &self.existing_connection { + Some(connection) => { + if connection.has_processed_wal { + resulting_string.push_str(&format!( + " (update {}): streaming WAL from node {}, ", + connection.latest_wal_update.format("%Y-%m-%d %H:%M:%S"), + connection.node, + )); + + match (connection.streaming_lsn, connection.commit_lsn) { + (None, None) => resulting_string.push_str("no streaming data"), + (None, Some(commit_lsn)) => { + resulting_string.push_str(&format!("commit Lsn: {commit_lsn}")) + } + (Some(streaming_lsn), None) => { + resulting_string.push_str(&format!("streaming Lsn: {streaming_lsn}")) + } + (Some(streaming_lsn), Some(commit_lsn)) => resulting_string.push_str( + &format!("commit|streaming Lsn: {commit_lsn}|{streaming_lsn}"), + ), + } + } else if connection.is_connected { + resulting_string.push_str(&format!( + " (update {}): connecting to node {}", + connection + .latest_connection_update + .format("%Y-%m-%d %H:%M:%S"), + connection.node, + )); + } else { + resulting_string.push_str(&format!( + " (update {}): initializing node {} connection", + connection + .latest_connection_update + .format("%Y-%m-%d %H:%M:%S"), + connection.node, + )); + } + } + None => resulting_string.push_str(": disconnected"), + } + + resulting_string.push_str(", safekeeper candidates (id|update_time|commit_lsn): ["); + let mut candidates = self.wal_stream_candidates.iter().peekable(); + while let Some((node_id, candidate_info)) = candidates.next() { + resulting_string.push_str(&format!( + "({}|{}|{})", + node_id, + candidate_info.latest_update.format("%H:%M:%S"), + Lsn(candidate_info.timeline.commit_lsn) + )); + if candidates.peek().is_some() { + resulting_string.push_str(", "); + } + } + resulting_string.push(']'); + + resulting_string + } +} + /// Current connection data. #[derive(Debug)] struct WalConnection { @@ -293,14 +377,14 @@ struct NewCommittedWAL { discovered_at: NaiveDateTime, } -#[derive(Debug)] +#[derive(Debug, Clone, Copy)] struct RetryInfo { next_retry_at: Option, retry_duration_seconds: f64, } /// Data about the timeline to connect to, received from the broker. -#[derive(Debug)] +#[derive(Debug, Clone)] struct BrokerSkTimeline { timeline: SafekeeperTimelineInfo, /// Time at which the data was fetched from the broker last time, to track the stale data. @@ -325,9 +409,14 @@ impl ConnectionManagerState { /// Shuts down the current connection (if any) and immediately starts another one with the given connection string. async fn change_connection(&mut self, new_sk: NewWalConnectionCandidate, ctx: &RequestContext) { + WALRECEIVER_SWITCHES + .with_label_values(&[new_sk.reason.name()]) + .inc(); + self.drop_old_connection(true).await; let id = self.id; + let node_id = new_sk.safekeeper_id; let connect_timeout = self.conf.wal_connect_timeout; let timeline = Arc::clone(&self.timeline); let ctx = ctx.detached_child( @@ -343,12 +432,13 @@ impl ConnectionManagerState { cancellation, connect_timeout, ctx, + node_id, ) .await .context("walreceiver connection handling failure") } .instrument( - info_span!("walreceiver_connection", tenant_id = %id.tenant_id, timeline_id = %id.timeline_id, node_id = %new_sk.safekeeper_id), + info_span!("walreceiver_connection", tenant_id = %id.tenant_id, timeline_id = %id.timeline_id, %node_id), ) }); @@ -364,6 +454,7 @@ impl ConnectionManagerState { latest_wal_update: now, streaming_lsn: None, commit_lsn: None, + node: node_id, }, connection_task: connection_handle, discovered_new_wal: None, @@ -437,6 +528,8 @@ impl ConnectionManagerState { /// Adds another broker timeline into the state, if its more recent than the one already added there for the same key. fn register_timeline_update(&mut self, timeline_update: SafekeeperTimelineInfo) { + WALRECEIVER_BROKER_UPDATES.inc(); + let new_safekeeper_id = NodeId(timeline_update.safekeeper_id); let old_entry = self.wal_stream_candidates.insert( new_safekeeper_id, @@ -448,6 +541,7 @@ impl ConnectionManagerState { if old_entry.is_none() { info!("New SK node was added: {new_safekeeper_id}"); + WALRECEIVER_CANDIDATES_ADDED.inc(); } } @@ -716,6 +810,7 @@ impl ConnectionManagerState { for node_id in node_ids_to_remove { info!("Safekeeper node {node_id} did not send events for over {lagging_wal_timeout:?}, not retrying the connections"); self.wal_connection_retries.remove(&node_id); + WALRECEIVER_CANDIDATES_REMOVED.inc(); } } } @@ -725,6 +820,13 @@ impl ConnectionManagerState { wal_connection.connection_task.shutdown().await; } } + + fn manager_status(&self) -> ConnectionManagerStatus { + ConnectionManagerStatus { + existing_connection: self.wal_connection.as_ref().map(|conn| conn.status), + wal_stream_candidates: self.wal_stream_candidates.clone(), + } + } } #[derive(Debug)] @@ -732,8 +834,6 @@ struct NewWalConnectionCandidate { safekeeper_id: NodeId, wal_source_connconf: PgConnectionConfig, availability_zone: Option, - // This field is used in `derive(Debug)` only. - #[allow(dead_code)] reason: ReconnectReason, } @@ -762,6 +862,18 @@ enum ReconnectReason { }, } +impl ReconnectReason { + fn name(&self) -> &str { + match self { + ReconnectReason::NoExistingConnection => "NoExistingConnection", + ReconnectReason::LaggingWal { .. } => "LaggingWal", + ReconnectReason::SwitchAvailabilityZone => "SwitchAvailabilityZone", + ReconnectReason::NoWalTimeout { .. } => "NoWalTimeout", + ReconnectReason::NoKeepAlives { .. } => "NoKeepAlives", + } + } +} + fn wal_stream_connection_config( TenantTimelineId { tenant_id, @@ -867,6 +979,7 @@ mod tests { latest_wal_update: now, commit_lsn: Some(Lsn(current_lsn)), streaming_lsn: Some(Lsn(current_lsn)), + node: NodeId(1), }; state.conf.max_lsn_wal_lag = NonZeroU64::new(100).unwrap(); @@ -1035,6 +1148,7 @@ mod tests { latest_wal_update: now, commit_lsn: Some(current_lsn), streaming_lsn: Some(current_lsn), + node: connected_sk_id, }; state.wal_connection = Some(WalConnection { @@ -1101,6 +1215,7 @@ mod tests { latest_wal_update: time_over_threshold, commit_lsn: Some(current_lsn), streaming_lsn: Some(current_lsn), + node: NodeId(1), }; state.wal_connection = Some(WalConnection { @@ -1164,6 +1279,7 @@ mod tests { latest_wal_update: time_over_threshold, commit_lsn: Some(current_lsn), streaming_lsn: Some(current_lsn), + node: NodeId(1), }; state.wal_connection = Some(WalConnection { @@ -1261,6 +1377,7 @@ mod tests { latest_wal_update: now, commit_lsn: Some(current_lsn), streaming_lsn: Some(current_lsn), + node: connected_sk_id, }; state.wal_connection = Some(WalConnection { diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index d80c7c5673..1cbed3416c 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -24,8 +24,8 @@ use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, trace, warn}; use super::TaskStateUpdate; -use crate::context::RequestContext; use crate::metrics::LIVE_CONNECTIONS_COUNT; +use crate::{context::RequestContext, metrics::WALRECEIVER_STARTED_CONNECTIONS}; use crate::{ task_mgr, task_mgr::TaskKind, @@ -37,8 +37,8 @@ use crate::{ use postgres_backend::is_expected_io_error; use postgres_connection::PgConnectionConfig; use postgres_ffi::waldecoder::WalStreamDecoder; -use utils::lsn::Lsn; use utils::pageserver_feedback::PageserverFeedback; +use utils::{id::NodeId, lsn::Lsn}; /// Status of the connection. #[derive(Debug, Clone, Copy)] @@ -56,6 +56,8 @@ pub(super) struct WalConnectionStatus { pub streaming_lsn: Option, /// Latest commit_lsn received from the safekeeper. Can be zero if no message has been received yet. pub commit_lsn: Option, + /// The node it is connected to + pub node: NodeId, } /// Open a connection to the given safekeeper and receive WAL, sending back progress @@ -67,7 +69,10 @@ pub(super) async fn handle_walreceiver_connection( cancellation: CancellationToken, connect_timeout: Duration, ctx: RequestContext, + node: NodeId, ) -> anyhow::Result<()> { + WALRECEIVER_STARTED_CONNECTIONS.inc(); + // Connect to the database in replication mode. info!("connecting to {wal_source_connconf:?}"); @@ -100,6 +105,7 @@ pub(super) async fn handle_walreceiver_connection( latest_wal_update: Utc::now().naive_utc(), streaming_lsn: None, commit_lsn: None, + node, }; if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) { warn!("Wal connection event listener dropped right after connection init, aborting the connection: {e}"); @@ -122,7 +128,7 @@ pub(super) async fn handle_walreceiver_connection( false, async move { select! { - connection_result = connection => match connection_result{ + connection_result = connection => match connection_result { Ok(()) => info!("Walreceiver db connection closed"), Err(connection_error) => { if let Err(e) = ignore_expected_errors(connection_error) { diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index cc46fb5a25..1ab2ae668a 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -96,6 +96,8 @@ static shmem_request_hook_type prev_shmem_request_hook; #endif static int lfc_shrinking_factor; /* power of two by which local cache size will be shrinked when lfc_free_space_watermark is reached */ +void FileCacheMonitorMain(Datum main_arg); + static void lfc_shmem_startup(void) { @@ -378,7 +380,6 @@ lfc_evict(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno) { BufferTag tag; FileCacheEntry* entry; - ssize_t rc; bool found; int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1); uint32 hash; diff --git a/poetry.lock b/poetry.lock index 7b368cd3b4..141371c925 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.4.1 and should not be changed by hand. +# This file is automatically @generated by Poetry and should not be changed by hand. [[package]] name = "aiohttp" @@ -968,14 +968,14 @@ testing = ["pre-commit"] [[package]] name = "flask" -version = "2.1.3" +version = "2.2.5" description = "A simple framework for building complex web applications." category = "main" optional = false python-versions = ">=3.7" files = [ - {file = "Flask-2.1.3-py3-none-any.whl", hash = "sha256:9013281a7402ad527f8fd56375164f3aa021ecfaff89bfe3825346c24f87e04c"}, - {file = "Flask-2.1.3.tar.gz", hash = "sha256:15972e5017df0575c3d6c090ba168b6db90259e620ac8d7ea813a396bad5b6cb"}, + {file = "Flask-2.2.5-py3-none-any.whl", hash = "sha256:58107ed83443e86067e41eff4631b058178191a355886f8e479e347fa1285fdf"}, + {file = "Flask-2.2.5.tar.gz", hash = "sha256:edee9b0a7ff26621bd5a8c10ff484ae28737a2410d99b0bb9a6850c7fb977aa0"}, ] [package.dependencies] @@ -983,7 +983,7 @@ click = ">=8.0" importlib-metadata = {version = ">=3.6.0", markers = "python_version < \"3.10\""} itsdangerous = ">=2.0" Jinja2 = ">=3.0" -Werkzeug = ">=2.0" +Werkzeug = ">=2.2.2" [package.extras] async = ["asgiref (>=3.2)"] diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index b6e8497809..393570df6a 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -25,6 +25,7 @@ parking_lot.workspace = true postgres.workspace = true postgres-protocol.workspace = true regex.workspace = true +scopeguard.workspace = true reqwest = { workspace = true, features = ["json"] } serde.workspace = true serde_json.workspace = true diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index 6a98d8fd84..5e25d22ec1 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -14,10 +14,13 @@ use storage_broker::proto::SubscribeSafekeeperInfoRequest; use storage_broker::Request; use std::time::Duration; +use std::time::Instant; use tokio::task::JoinHandle; use tokio::{runtime, time::sleep}; use tracing::*; +use crate::metrics::BROKER_PULLED_UPDATES; +use crate::metrics::BROKER_PUSHED_UPDATES; use crate::GlobalTimelines; use crate::SafeKeeperConf; @@ -49,12 +52,17 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { // is under plain mutex. That's ok, all this code is not performance // sensitive and there is no risk of deadlock as we don't await while // lock is held. + let now = Instant::now(); let mut active_tlis = GlobalTimelines::get_all(); active_tlis.retain(|tli| tli.is_active()); for tli in &active_tlis { let sk_info = tli.get_safekeeper_info(&conf); yield sk_info; + BROKER_PUSHED_UPDATES.inc(); } + let elapsed = now.elapsed(); + // Log duration every second. Should be about 10MB of logs per day. + info!("pushed {} timeline updates to broker in {:?}", active_tlis.len(), elapsed); sleep(push_interval).await; } }; @@ -79,6 +87,10 @@ async fn pull_loop(conf: SafeKeeperConf) -> Result<()> { .context("subscribe_safekeper_info request failed")? .into_inner(); + let ok_counter = BROKER_PULLED_UPDATES.with_label_values(&["ok"]); + let not_found = BROKER_PULLED_UPDATES.with_label_values(&["not_found"]); + let err_counter = BROKER_PULLED_UPDATES.with_label_values(&["error"]); + while let Some(msg) = stream.message().await? { let proto_ttid = msg .tenant_timeline_id @@ -91,7 +103,15 @@ async fn pull_loop(conf: SafeKeeperConf) -> Result<()> { // connection to the broker. // note: there are blocking operations below, but it's considered fine for now - tli.record_safekeeper_info(msg).await? + let res = tli.record_safekeeper_info(msg).await; + if res.is_ok() { + ok_counter.inc(); + } else { + err_counter.inc(); + } + res?; + } else { + not_found.inc(); } } bail!("end of stream"); diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index 2c3d1cea0e..7d25ced449 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -10,7 +10,7 @@ use tracing::{info, info_span, Instrument}; use crate::auth::check_permission; use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage}; -use crate::metrics::TrafficMetrics; +use crate::metrics::{TrafficMetrics, PG_QUERIES_FINISHED, PG_QUERIES_RECEIVED}; use crate::wal_service::ConnectionId; use crate::{GlobalTimelines, SafeKeeperConf}; use postgres_backend::QueryError; @@ -72,6 +72,15 @@ fn parse_cmd(cmd: &str) -> anyhow::Result { } } +fn cmd_to_string(cmd: &SafekeeperPostgresCommand) -> &str { + match cmd { + SafekeeperPostgresCommand::StartWalPush => "START_WAL_PUSH", + SafekeeperPostgresCommand::StartReplication { .. } => "START_REPLICATION", + SafekeeperPostgresCommand::IdentifySystem => "IDENTIFY_SYSTEM", + SafekeeperPostgresCommand::JSONCtrl { .. } => "JSON_CTRL", + } +} + #[async_trait::async_trait] impl postgres_backend::Handler for SafekeeperPostgresHandler @@ -168,6 +177,12 @@ impl postgres_backend::Handler } let cmd = parse_cmd(query_string)?; + let cmd_str = cmd_to_string(&cmd); + + PG_QUERIES_RECEIVED.with_label_values(&[cmd_str]).inc(); + scopeguard::defer! { + PG_QUERIES_FINISHED.with_label_values(&[cmd_str]).inc(); + } info!( "got query {:?} in timeline {:?}", diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index eafee557d7..189af2b044 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -10,7 +10,7 @@ use anyhow::Result; use metrics::{ core::{AtomicU64, Collector, Desc, GenericCounter, GenericGaugeVec, Opts}, proto::MetricFamily, - register_int_counter_vec, Gauge, IntCounterVec, IntGaugeVec, + register_int_counter, register_int_counter_vec, Gauge, IntCounter, IntCounterVec, IntGaugeVec, }; use once_cell::sync::Lazy; @@ -73,6 +73,58 @@ pub static PG_IO_BYTES: Lazy = Lazy::new(|| { ) .expect("Failed to register safekeeper_pg_io_bytes gauge") }); +pub static BROKER_PUSHED_UPDATES: Lazy = Lazy::new(|| { + register_int_counter!( + "safekeeper_broker_pushed_updates_total", + "Number of timeline updates pushed to the broker" + ) + .expect("Failed to register safekeeper_broker_pushed_updates_total counter") +}); +pub static BROKER_PULLED_UPDATES: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "safekeeper_broker_pulled_updates_total", + "Number of timeline updates pulled and processed from the broker", + &["result"] + ) + .expect("Failed to register safekeeper_broker_pulled_updates_total counter") +}); +pub static PG_QUERIES_RECEIVED: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "safekeeper_pg_queries_received_total", + "Number of queries received through pg protocol", + &["query"] + ) + .expect("Failed to register safekeeper_pg_queries_received_total counter") +}); +pub static PG_QUERIES_FINISHED: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "safekeeper_pg_queries_finished_total", + "Number of queries finished through pg protocol", + &["query"] + ) + .expect("Failed to register safekeeper_pg_queries_finished_total counter") +}); +pub static REMOVED_WAL_SEGMENTS: Lazy = Lazy::new(|| { + register_int_counter!( + "safekeeper_removed_wal_segments_total", + "Number of WAL segments removed from the disk" + ) + .expect("Failed to register safekeeper_removed_wal_segments_total counter") +}); +pub static BACKED_UP_SEGMENTS: Lazy = Lazy::new(|| { + register_int_counter!( + "safekeeper_backed_up_segments_total", + "Number of WAL segments backed up to the broker" + ) + .expect("Failed to register safekeeper_backed_up_segments_total counter") +}); +pub static BACKUP_ERRORS: Lazy = Lazy::new(|| { + register_int_counter!( + "safekeeper_backup_errors_total", + "Number of errors during backup" + ) + .expect("Failed to register safekeeper_backup_errors_total counter") +}); pub const LABEL_UNKNOWN: &str = "unknown"; diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 6b303eb0fe..fb420cba64 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -384,6 +384,8 @@ impl SafekeeperPostgresHandler { self.appname.clone(), )); + let commit_lsn_watch_rx = tli.get_commit_lsn_watch_rx(); + // Walproposer gets special handling: safekeeper must give proposer all // local WAL till the end, whether committed or not (walproposer will // hang otherwise). That's because walproposer runs the consensus and @@ -399,7 +401,14 @@ impl SafekeeperPostgresHandler { } else { None }; - let end_pos = stop_pos.unwrap_or(Lsn::INVALID); + + // take the latest commit_lsn if don't have stop_pos + let mut end_pos = stop_pos.unwrap_or(*commit_lsn_watch_rx.borrow()); + + if end_pos < start_pos { + warn!("start_pos {} is ahead of end_pos {}", start_pos, end_pos); + end_pos = start_pos; + } info!( "starting streaming from {:?} till {:?}", @@ -429,7 +438,7 @@ impl SafekeeperPostgresHandler { start_pos, end_pos, stop_pos, - commit_lsn_watch_rx: tli.get_commit_lsn_watch_rx(), + commit_lsn_watch_rx, ws_guard: ws_guard.clone(), wal_reader, send_buf: [0; MAX_SEND_SIZE], @@ -456,6 +465,11 @@ struct WalSender<'a, IO> { // Position since which we are sending next chunk. start_pos: Lsn, // WAL up to this position is known to be locally available. + // Usually this is the same as the latest commit_lsn, but in case of + // walproposer recovery, this is flush_lsn. + // + // We send this LSN to the receiver as wal_end, so that it knows how much + // WAL this safekeeper has. This LSN should be as fresh as possible. end_pos: Lsn, // If present, terminate after reaching this position; used by walproposer // in recovery. @@ -530,10 +544,18 @@ impl WalSender<'_, IO> { /// exit in the meanwhile async fn wait_wal(&mut self) -> Result<(), CopyStreamHandlerEnd> { loop { + self.end_pos = *self.commit_lsn_watch_rx.borrow(); + if self.end_pos > self.start_pos { + // We have something to send. + return Ok(()); + } + + // Wait for WAL to appear, now self.end_pos == self.start_pos. if let Some(lsn) = wait_for_lsn(&mut self.commit_lsn_watch_rx, self.start_pos).await? { self.end_pos = lsn; return Ok(()); } + // Timed out waiting for WAL, check for termination and send KA if let Some(remote_consistent_lsn) = self .ws_guard @@ -551,7 +573,7 @@ impl WalSender<'_, IO> { self.pgb .write_message(&BeMessage::KeepAlive(WalSndKeepAlive { - sent_ptr: self.end_pos.0, + wal_end: self.end_pos.0, timestamp: get_current_timestamp(), request_reply: true, })) @@ -618,11 +640,6 @@ const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(1); /// - Ok(None) if timeout expired; /// - Err in case of error (if watch channel is in trouble, shouldn't happen). async fn wait_for_lsn(rx: &mut Receiver, lsn: Lsn) -> anyhow::Result> { - let commit_lsn: Lsn = *rx.borrow(); - if commit_lsn > lsn { - return Ok(Some(commit_lsn)); - } - let res = timeout(POLL_STATE_TIMEOUT, async move { let mut commit_lsn; loop { diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 163ac99be8..953c7d0022 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -25,6 +25,7 @@ use tracing::*; use utils::{id::TenantTimelineId, lsn::Lsn}; +use crate::metrics::{BACKED_UP_SEGMENTS, BACKUP_ERRORS}; use crate::timeline::{PeerInfo, Timeline}; use crate::{GlobalTimelines, SafeKeeperConf}; @@ -394,7 +395,13 @@ async fn backup_single_segment( ) })?; - backup_object(&segment_file_path, &remote_segment_path, seg.size()).await?; + let res = backup_object(&segment_file_path, &remote_segment_path, seg.size()).await; + if res.is_ok() { + BACKED_UP_SEGMENTS.inc(); + } else { + BACKUP_ERRORS.inc(); + } + res?; debug!("Backup of {} done", segment_file_path.display()); Ok(()) diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 5ef22b2f6a..1b82bd754e 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -27,7 +27,7 @@ use tracing::*; use utils::{id::TenantTimelineId, lsn::Lsn}; -use crate::metrics::{time_io_closure, WalStorageMetrics}; +use crate::metrics::{time_io_closure, WalStorageMetrics, REMOVED_WAL_SEGMENTS}; use crate::safekeeper::SafeKeeperState; use crate::wal_backup::read_object; @@ -455,6 +455,7 @@ fn remove_segments_from_disk( n_removed += 1; min_removed = min(min_removed, segno); max_removed = max(max_removed, segno); + REMOVED_WAL_SEGMENTS.inc(); } } } diff --git a/scripts/pr-comment-test-report.js b/scripts/pr-comment-test-report.js index 8df7248c4e..21386d3898 100644 --- a/scripts/pr-comment-test-report.js +++ b/scripts/pr-comment-test-report.js @@ -37,7 +37,7 @@ module.exports = async ({ github, context, fetch, reports }) => { const {buildType, reportUrl, jsonUrl} = report if (!reportUrl || !jsonUrl) { - console.warn(`"reportUrl" or "jsonUrl" aren't set for ${buildType} build`) + commentBody += `#### ${buildType} build: no tests were run or test report is not available\n` continue } @@ -78,7 +78,7 @@ module.exports = async ({ github, context, fetch, reports }) => { } const totalTestsCount = failedTests.length + passedTests.length + skippedTests.length - commentBody += `#### ${buildType} build: ${totalTestsCount} tests run: ${passedTests.length} passed, ${failedTests.length} failed, ${skippedTests.length} ([full report](${reportUrl}))\n` + commentBody += `#### ${buildType} build: ${totalTestsCount} tests run: ${passedTests.length} passed, ${failedTests.length} failed, ${skippedTests.length} skipped ([full report](${reportUrl}))\n` if (failedTests.length > 0) { commentBody += `Failed tests:\n` for (const test of failedTests) { diff --git a/test_runner/README.md b/test_runner/README.md index 877498bae7..96e74659ce 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -71,7 +71,8 @@ a subdirectory for each version with naming convention `v{PG_VERSION}/`. Inside that dir, a `bin/postgres` binary should be present. `DEFAULT_PG_VERSION`: The version of Postgres to use, This is used to construct full path to the postgres binaries. -Format is 2-digit major version nubmer, i.e. `DEFAULT_PG_VERSION="14"` +Format is 2-digit major version nubmer, i.e. `DEFAULT_PG_VERSION="14"`. Alternatively, +you can use `--pg-version` argument. `TEST_OUTPUT`: Set the directory where test state and test output files should go. `TEST_SHARED_FIXTURES`: Try to re-use a single pageserver for all the tests. diff --git a/test_runner/conftest.py b/test_runner/conftest.py index 75242b84ce..4640861936 100644 --- a/test_runner/conftest.py +++ b/test_runner/conftest.py @@ -1,4 +1,5 @@ pytest_plugins = ( + "fixtures.pg_version", "fixtures.neon_fixtures", "fixtures.benchmark_fixture", "fixtures.pg_stats", diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 79b2e5b290..1a480e1b04 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -45,6 +45,7 @@ from typing_extensions import Literal from fixtures.log_helper import log from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload +from fixtures.pg_version import PgVersion from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import ( ATTACHMENT_NAME_REGEX, @@ -75,7 +76,6 @@ Env = Dict[str, str] DEFAULT_OUTPUT_DIR: str = "test_output" DEFAULT_BRANCH_NAME: str = "main" -DEFAULT_PG_VERSION_DEFAULT: str = "14" BASE_PORT: int = 15000 WORKER_PORT_NUM: int = 1000 @@ -148,18 +148,7 @@ def top_output_dir(base_dir: Path) -> Iterator[Path]: @pytest.fixture(scope="session") -def pg_version() -> Iterator[str]: - if env_default_pg_version := os.environ.get("DEFAULT_PG_VERSION"): - version = env_default_pg_version - else: - version = DEFAULT_PG_VERSION_DEFAULT - - log.info(f"pg_version is {version}") - yield version - - -@pytest.fixture(scope="session") -def versioned_pg_distrib_dir(pg_distrib_dir: Path, pg_version: str) -> Iterator[Path]: +def versioned_pg_distrib_dir(pg_distrib_dir: Path, pg_version: PgVersion) -> Iterator[Path]: versioned_dir = pg_distrib_dir / f"v{pg_version}" psql_bin_path = versioned_dir / "bin/psql" @@ -292,6 +281,12 @@ def port_distributor(worker_base_port: int) -> PortDistributor: return PortDistributor(base_port=worker_base_port, port_number=WORKER_PORT_NUM) +@pytest.fixture(scope="session") +def httpserver_listen_address(port_distributor: PortDistributor): + port = port_distributor.get_port() + return ("localhost", port) + + @pytest.fixture(scope="function") def default_broker( port_distributor: PortDistributor, @@ -586,7 +581,7 @@ class NeonEnvBuilder: mock_s3_server: MockS3Server, neon_binpath: Path, pg_distrib_dir: Path, - pg_version: str, + pg_version: PgVersion, remote_storage: Optional[RemoteStorage] = None, remote_storage_users: RemoteStorageUsers = RemoteStorageUsers.PAGESERVER, pageserver_config_override: Optional[str] = None, @@ -1035,7 +1030,7 @@ def _shared_simple_env( top_output_dir: Path, neon_binpath: Path, pg_distrib_dir: Path, - pg_version: str, + pg_version: PgVersion, ) -> Iterator[NeonEnv]: """ # Internal fixture backing the `neon_simple_env` fixture. If TEST_SHARED_FIXTURES @@ -1092,7 +1087,7 @@ def neon_env_builder( mock_s3_server: MockS3Server, neon_binpath: Path, pg_distrib_dir: Path, - pg_version: str, + pg_version: PgVersion, default_broker: NeonBroker, run_id: uuid.UUID, ) -> Iterator[NeonEnvBuilder]: @@ -1747,7 +1742,7 @@ def append_pageserver_param_overrides( class PgBin: """A helper class for executing postgres binaries""" - def __init__(self, log_dir: Path, pg_distrib_dir: Path, pg_version: str): + def __init__(self, log_dir: Path, pg_distrib_dir: Path, pg_version: PgVersion): self.log_dir = log_dir self.pg_version = pg_version self.pg_bin_path = pg_distrib_dir / f"v{pg_version}" / "bin" @@ -1806,7 +1801,7 @@ class PgBin: @pytest.fixture(scope="function") -def pg_bin(test_output_dir: Path, pg_distrib_dir: Path, pg_version: str) -> PgBin: +def pg_bin(test_output_dir: Path, pg_distrib_dir: Path, pg_version: PgVersion) -> PgBin: return PgBin(test_output_dir, pg_distrib_dir, pg_version) @@ -1894,7 +1889,7 @@ def vanilla_pg( test_output_dir: Path, port_distributor: PortDistributor, pg_distrib_dir: Path, - pg_version: str, + pg_version: PgVersion, ) -> Iterator[VanillaPostgres]: pgdatadir = test_output_dir / "pgdata-vanilla" pg_bin = PgBin(test_output_dir, pg_distrib_dir, pg_version) @@ -1939,7 +1934,7 @@ class RemotePostgres(PgProtocol): @pytest.fixture(scope="function") def remote_pg( - test_output_dir: Path, pg_distrib_dir: Path, pg_version: str + test_output_dir: Path, pg_distrib_dir: Path, pg_version: PgVersion ) -> Iterator[RemotePostgres]: pg_bin = PgBin(test_output_dir, pg_distrib_dir, pg_version) @@ -2623,7 +2618,7 @@ class Safekeeper: @dataclass class SafekeeperTimelineStatus: acceptor_epoch: int - pg_version: int + pg_version: int # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2 flush_lsn: Lsn commit_lsn: Lsn timeline_start_lsn: Lsn @@ -2669,7 +2664,11 @@ class SafekeeperHttpClient(requests.Session): return res_json def timeline_create( - self, tenant_id: TenantId, timeline_id: TimelineId, pg_version: int, commit_lsn: Lsn + self, + tenant_id: TenantId, + timeline_id: TimelineId, + pg_version: int, # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2 + commit_lsn: Lsn, ): body = { "tenant_id": str(tenant_id), diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index cf92aeb6c0..0c4ed60c8d 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -9,6 +9,7 @@ import requests from fixtures.log_helper import log from fixtures.metrics import Metrics, parse_metrics +from fixtures.pg_version import PgVersion from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import Fn @@ -266,19 +267,21 @@ class PageserverHttpClient(requests.Session): def timeline_create( self, + pg_version: PgVersion, tenant_id: TenantId, new_timeline_id: Optional[TimelineId] = None, ancestor_timeline_id: Optional[TimelineId] = None, ancestor_start_lsn: Optional[Lsn] = None, ) -> Dict[Any, Any]: - res = self.post( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline", - json={ - "new_timeline_id": str(new_timeline_id) if new_timeline_id else None, - "ancestor_start_lsn": str(ancestor_start_lsn) if ancestor_start_lsn else None, - "ancestor_timeline_id": str(ancestor_timeline_id) if ancestor_timeline_id else None, - }, - ) + body: Dict[str, Any] = { + "new_timeline_id": str(new_timeline_id) if new_timeline_id else None, + "ancestor_start_lsn": str(ancestor_start_lsn) if ancestor_start_lsn else None, + "ancestor_timeline_id": str(ancestor_timeline_id) if ancestor_timeline_id else None, + } + if pg_version != PgVersion.NOT_SET: + body["pg_version"] = int(pg_version) + + res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline", json=body) self.verbose_error(res) if res.status_code == 409: raise Exception(f"could not create timeline: already exists for id {new_timeline_id}") diff --git a/test_runner/fixtures/pg_version.py b/test_runner/fixtures/pg_version.py new file mode 100644 index 0000000000..904a274197 --- /dev/null +++ b/test_runner/fixtures/pg_version.py @@ -0,0 +1,68 @@ +import enum +import os +from typing import Iterator, Optional + +import pytest +from _pytest.config.argparsing import Parser +from pytest import FixtureRequest + +from fixtures.log_helper import log + +""" +This fixture is used to determine which version of Postgres to use for tests. +""" + + +# Inherit PgVersion from str rather than int to make it easier to pass as a command-line argument +# TODO: use enum.StrEnum for Python >= 3.11 +@enum.unique +class PgVersion(str, enum.Enum): + V14 = "14" + V15 = "15" + # Instead of making version an optional parameter in methods, we can use this fake entry + # to explicitly rely on the default server version (could be different from pg_version fixture value) + NOT_SET = "<-POSTRGRES VERSION IS NOT SET->" + + # Make it less confusing in logs + def __repr__(self) -> str: + return f"'{self.value}'" + + @classmethod + def _missing_(cls, value) -> Optional["PgVersion"]: + known_values = {v.value for _, v in cls.__members__.items()} + + # Allow passing version as a string with "v" prefix (e.g. "v14") + if isinstance(value, str) and value.lower().startswith("v") and value[1:] in known_values: + return cls(value[1:]) + # Allow passing version as an int (e.g. 15 or 150002, both will be converted to PgVersion.V15) + elif isinstance(value, int) and str(value)[:2] in known_values: + return cls(str(value)[:2]) + + # Make mypy happy + # See https://github.com/python/mypy/issues/3974 + return None + + +DEFAULT_VERSION: PgVersion = PgVersion.V14 + + +def pytest_addoption(parser: Parser): + parser.addoption( + "--pg-version", + action="store", + type=PgVersion, + help="Postgres version to use for tests", + ) + + +@pytest.fixture(scope="session") +def pg_version(request: FixtureRequest) -> Iterator[PgVersion]: + if v := request.config.getoption("--pg-version"): + version, source = v, "from --pg-version commad-line argument" + elif v := os.environ.get("DEFAULT_PG_VERSION"): + version, source = PgVersion(v), "from DEFAULT_PG_VERSION environment variable" + else: + version, source = DEFAULT_VERSION, "default verson" + + log.info(f"pg_version is {version} ({source})") + yield version diff --git a/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj b/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj index 91181943d5..bb4427f2c4 100644 --- a/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj +++ b/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj @@ -8,7 +8,7 @@ - + diff --git a/test_runner/pg_clients/java/jdbc/.gitignore b/test_runner/pg_clients/java/jdbc/.gitignore deleted file mode 100644 index 8b13789179..0000000000 --- a/test_runner/pg_clients/java/jdbc/.gitignore +++ /dev/null @@ -1 +0,0 @@ - diff --git a/test_runner/pg_clients/java/jdbc/Dockerfile b/test_runner/pg_clients/java/jdbc/Dockerfile index 0b7d03e636..74eb9bdc32 100644 --- a/test_runner/pg_clients/java/jdbc/Dockerfile +++ b/test_runner/pg_clients/java/jdbc/Dockerfile @@ -1,10 +1,10 @@ -FROM openjdk:17 +FROM openjdk:20 WORKDIR /source COPY . . WORKDIR /app -RUN curl --output postgresql.jar https://jdbc.postgresql.org/download/postgresql-42.5.1.jar && \ +RUN curl --output postgresql.jar https://jdbc.postgresql.org/download/postgresql-42.6.0.jar && \ javac -d /app /source/Example.java CMD ["java", "-cp", "/app/postgresql.jar:.", "Example"] diff --git a/test_runner/pg_clients/python/pg8000/requirements.txt b/test_runner/pg_clients/python/pg8000/requirements.txt index 3dbb98d6a1..7bba8da06d 100644 --- a/test_runner/pg_clients/python/pg8000/requirements.txt +++ b/test_runner/pg_clients/python/pg8000/requirements.txt @@ -1 +1,2 @@ pg8000==1.29.4 +scramp>=1.4.3 diff --git a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock index a0067b183e..30deb3ff20 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock +++ b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock @@ -4,9 +4,9 @@ version = 3 [[package]] name = "async-trait" -version = "0.1.65" +version = "0.1.68" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "095183a3539c7c7649b2beb87c2d3f0591f3a7fed07761cc546d244e27e0238c" +checksum = "b9ccdd8f2a161be9bd5c023df56f1b2a0bd1d83872ae53b71a84a12c9bf6e842" dependencies = [ "proc-macro2", "quote", @@ -21,9 +21,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "base64" -version = "0.13.1" +version = "0.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" +checksum = "a4a4ddaa51a5bc52a6948f74c06d20aaaddb71924eab79b8c97a8c556e942d6a" [[package]] name = "bitflags" @@ -33,9 +33,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "block-buffer" -version = "0.10.3" +version = "0.10.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69cce20737498f97b993470a6e536b8523f0af7892a4f928cceb1ac5e52ebe7e" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" dependencies = [ "generic-array", ] @@ -76,15 +76,15 @@ dependencies = [ [[package]] name = "core-foundation-sys" -version = "0.8.3" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5827cebf4670468b8772dd191856768aedcb1b0278a04f989f7766351917b9dc" +checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" [[package]] name = "cpufeatures" -version = "0.2.5" +version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28d997bd5e24a5928dd43e46dc529867e207907fe0b239c3477d924f7f2ca320" +checksum = "3e4c1eaa2012c47becbbad2ab175484c2a84d1185b566fb2cc5b8707343dfe58" dependencies = [ "libc", ] @@ -112,13 +112,13 @@ dependencies = [ [[package]] name = "errno" -version = "0.2.8" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1" +checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a" dependencies = [ "errno-dragonfly", "libc", - "winapi", + "windows-sys 0.48.0", ] [[package]] @@ -163,9 +163,9 @@ checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" [[package]] name = "futures" -version = "0.3.26" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13e2792b0ff0340399d58445b88fd9770e3489eff258a4cbc1523418f12abf84" +checksum = "23342abe12aba583913b2e62f22225ff9c950774065e4bfb61a19cd9770fec40" dependencies = [ "futures-channel", "futures-core", @@ -178,9 +178,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.26" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e5317663a9089767a1ec00a487df42e0ca174b61b4483213ac24448e4664df5" +checksum = "955518d47e09b25bbebc7a18df10b81f0c766eaf4c4f1cccef2fca5f2a4fb5f2" dependencies = [ "futures-core", "futures-sink", @@ -188,15 +188,15 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.26" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec90ff4d0fe1f57d600049061dc6bb68ed03c7d2fbd697274c41805dcb3f8608" +checksum = "4bca583b7e26f571124fe5b7561d49cb2868d79116cfa0eefce955557c6fee8c" [[package]] name = "futures-executor" -version = "0.3.26" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8de0a35a6ab97ec8869e32a2473f4b1324459e14c29275d14b10cb1fd19b50e" +checksum = "ccecee823288125bd88b4d7f565c9e58e41858e47ab72e8ea2d64e93624386e0" dependencies = [ "futures-core", "futures-task", @@ -205,15 +205,15 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.26" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfb8371b6fb2aeb2d280374607aeabfc99d95c72edfe51692e42d3d7f0d08531" +checksum = "4fff74096e71ed47f8e023204cfd0aa1289cd54ae5430a9523be060cdb849964" [[package]] name = "futures-macro" -version = "0.3.26" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95a73af87da33b5acf53acfebdc339fe592ecf5357ac7c0a7734ab9d8c876a70" +checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" dependencies = [ "proc-macro2", "quote", @@ -222,21 +222,21 @@ dependencies = [ [[package]] name = "futures-sink" -version = "0.3.26" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f310820bb3e8cfd46c80db4d7fb8353e15dfff853a127158425f31e0be6c8364" +checksum = "f43be4fe21a13b9781a69afa4985b0f6ee0e1afab2c6f454a8cf30e2b2237b6e" [[package]] name = "futures-task" -version = "0.3.26" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dcf79a1bf610b10f42aea489289c5a2c478a786509693b80cd39c44ccd936366" +checksum = "76d3d132be6c0e6aa1534069c705a74a5997a356c0dc2f86a47765e5617c5b65" [[package]] name = "futures-util" -version = "0.3.26" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c1d6de3acfef38d2be4b1f543f553131788603495be83da675e180c8d6b7bd1" +checksum = "26b01e40b772d54cf6c6d721c1d1abd0647a0106a12ecaa1c186273392a69533" dependencies = [ "futures-channel", "futures-core", @@ -252,9 +252,9 @@ dependencies = [ [[package]] name = "generic-array" -version = "0.14.6" +version = "0.14.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bff49e947297f3312447abdca79f45f4738097cc82b06e72054d2223f601f1b9" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" dependencies = [ "typenum", "version_check", @@ -262,15 +262,21 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.8" +version = "0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c05aeb6a22b8f62540c194aac980f2115af067bfe15a0734d7277a768d396b31" +checksum = "c85e1d9ab2eadba7e5040d4e09cbd6d072b76a557ad64e797c2cb9d4da21d7e4" dependencies = [ "cfg-if", "libc", "wasi", ] +[[package]] +name = "hermit-abi" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286" + [[package]] name = "hmac" version = "0.12.1" @@ -291,12 +297,13 @@ dependencies = [ [[package]] name = "io-lifetimes" -version = "1.0.5" +version = "1.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1abeb7a0dd0f8181267ff8adc397075586500b81b28a73e8a0208b00fc170fb3" +checksum = "9c66c74d2ae7e79a5a8f7ac924adbe38ee42a859c6539ad869eb51f0b52dc220" dependencies = [ + "hermit-abi", "libc", - "windows-sys 0.45.0", + "windows-sys 0.48.0", ] [[package]] @@ -307,15 +314,15 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" [[package]] name = "libc" -version = "0.2.139" +version = "0.2.142" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "201de327520df007757c1f0adce6e827fe8562fbc28bfd9c15571c66ca1f5f79" +checksum = "6a987beff54b60ffa6d51982e1aa1146bc42f19bd26be28b0586f252fccf5317" [[package]] name = "linux-raw-sys" -version = "0.1.4" +version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4" +checksum = "36eb31c1778188ae1e64398743890d0877fef36d11521ac60406b42016e8c2cf" [[package]] name = "lock_api" @@ -389,9 +396,9 @@ checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3" [[package]] name = "openssl" -version = "0.10.48" +version = "0.10.52" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "518915b97df115dd36109bfa429a48b8f737bd05508cf9588977b599648926d2" +checksum = "01b8574602df80f7b85fdfc5392fa884a4e3b3f4f35402c070ab34c3d3f78d56" dependencies = [ "bitflags", "cfg-if", @@ -404,9 +411,9 @@ dependencies = [ [[package]] name = "openssl-macros" -version = "0.1.0" +version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b501e44f11665960c7e7fcf062c7d96a14ade4aa98116c004b2e37b5be7d736c" +checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", @@ -421,11 +428,10 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "openssl-sys" -version = "0.9.83" +version = "0.9.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "666416d899cf077260dac8698d60a60b435a46d57e82acb1be3d0dad87284e5b" +checksum = "8e17f59264b2809d77ae94f0e1ebabc434773f370d6ca667bd223ea10e06cc7e" dependencies = [ - "autocfg", "cc", "libc", "pkg-config", @@ -450,7 +456,7 @@ checksum = "9069cbb9f99e3a5083476ccb29ceb1de18b9118cafa53e90c9551235de2b9521" dependencies = [ "cfg-if", "libc", - "redox_syscall", + "redox_syscall 0.2.16", "smallvec", "windows-sys 0.45.0", ] @@ -512,9 +518,9 @@ dependencies = [ [[package]] name = "postgres-protocol" -version = "0.6.4" +version = "0.6.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "878c6cbf956e03af9aa8204b407b9cbf47c072164800aa918c516cd4b056c50c" +checksum = "78b7fa9f396f51dffd61546fd8573ee20592287996568e6175ceb0f8699ad75d" dependencies = [ "base64", "byteorder", @@ -530,9 +536,9 @@ dependencies = [ [[package]] name = "postgres-types" -version = "0.2.4" +version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73d946ec7d256b04dfadc4e6a3292324e6f417124750fc5c0950f981b703a0f1" +checksum = "f028f05971fe20f512bcc679e2c10227e57809a3af86a7606304435bc8896cd6" dependencies = [ "bytes", "fallible-iterator", @@ -547,18 +553,18 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" [[package]] name = "proc-macro2" -version = "1.0.51" +version = "1.0.56" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d727cae5b39d21da60fa540906919ad737832fe0b1c165da3a34d6548c849d6" +checksum = "2b63bdb0cd06f1f4dedf69b254734f9b45af66e4a031e42a7480257d9898b435" dependencies = [ "unicode-ident", ] [[package]] name = "quote" -version = "1.0.23" +version = "1.0.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8856d8364d252a14d474036ea1358d63c9e6965c8e5c1885c18f73d70bff9c7b" +checksum = "4424af4bf778aae2051a77b60283332f386554255d722233d09fbfc7e30da2fc" dependencies = [ "proc-macro2", ] @@ -602,6 +608,15 @@ dependencies = [ "bitflags", ] +[[package]] +name = "redox_syscall" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" +dependencies = [ + "bitflags", +] + [[package]] name = "rust-neon-example" version = "0.1.0" @@ -614,16 +629,16 @@ dependencies = [ [[package]] name = "rustix" -version = "0.36.9" +version = "0.37.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd5c6ff11fecd55b40746d1995a02f2eb375bf8c00d192d521ee09f42bef37bc" +checksum = "a0661814f891c57c930a610266415528da53c4933e6dea5fb350cbfe048a9ece" dependencies = [ "bitflags", "errno", "io-lifetimes", "libc", "linux-raw-sys", - "windows-sys 0.45.0", + "windows-sys 0.48.0", ] [[package]] @@ -706,6 +721,16 @@ dependencies = [ "winapi", ] +[[package]] +name = "socket2" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d283f86695ae989d1e18440a943880967156325ba025f05049946bff47bcc2b" +dependencies = [ + "libc", + "windows-sys 0.48.0", +] + [[package]] name = "stringprep" version = "0.1.2" @@ -724,9 +749,9 @@ checksum = "6bdef32e8150c2a081110b42772ffe7d7c9032b606bc226c8260fd97e0976601" [[package]] name = "syn" -version = "1.0.109" +version = "2.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +checksum = "a34fcf3e8b60f57e6a14301a2e916d323af98b0ea63c599441eec8558660c822" dependencies = [ "proc-macro2", "quote", @@ -735,15 +760,15 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.4.0" +version = "3.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af18f7ae1acd354b992402e9ec5864359d693cd8a79dcbef59f76891701c1e95" +checksum = "b9fbec84f381d5795b08656e4912bec604d162bff9291d6189a78f4c8ab87998" dependencies = [ "cfg-if", "fastrand", - "redox_syscall", + "redox_syscall 0.3.5", "rustix", - "windows-sys 0.42.0", + "windows-sys 0.45.0", ] [[package]] @@ -763,26 +788,25 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.26.0" +version = "1.28.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03201d01c3c27a29c8a5cee5b55a93ddae1ccf6f08f65365c2c918f8c1b76f64" +checksum = "c3c786bf8134e5a3a166db9b29ab8f48134739014a3eca7bc6bfa95d673b136f" dependencies = [ "autocfg", "bytes", "libc", - "memchr", "mio", "pin-project-lite", - "socket2", + "socket2 0.4.9", "tokio-macros", - "windows-sys 0.45.0", + "windows-sys 0.48.0", ] [[package]] name = "tokio-macros" -version = "1.8.2" +version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d266c00fde287f55d3f1c3e96c500c362a2b8c695076ec180f27918820bc6df8" +checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e" dependencies = [ "proc-macro2", "quote", @@ -801,9 +825,9 @@ dependencies = [ [[package]] name = "tokio-postgres" -version = "0.7.7" +version = "0.7.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29a12c1b3e0704ae7dfc25562629798b29c72e6b1d0a681b6f29ab4ae5e7f7bf" +checksum = "6e89f6234aa8fd43779746012fcf53603cdb91fdd8399aa0de868c2d56b6dde1" dependencies = [ "async-trait", "byteorder", @@ -818,16 +842,16 @@ dependencies = [ "pin-project-lite", "postgres-protocol", "postgres-types", - "socket2", + "socket2 0.5.2", "tokio", "tokio-util", ] [[package]] name = "tokio-util" -version = "0.7.7" +version = "0.7.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5427d89453009325de0d8f342c9490009f76e999cb7672d77e46267448f7e6b2" +checksum = "806fe8c2c87eccc8b3267cbae29ed3ab2d0bd37fca70ab622e46aaa9375ddb7d" dependencies = [ "bytes", "futures-core", @@ -839,11 +863,10 @@ dependencies = [ [[package]] name = "tracing" -version = "0.1.37" +version = "0.1.38" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ce8c33a8d48bd45d624a6e523445fd21ec13d3653cd51f681abf67418f54eb8" +checksum = "cf9cf6a813d3f40c88b0b6b6f29a5c95c6cdbf97c1f9cc53fb820200f5ad814d" dependencies = [ - "cfg-if", "pin-project-lite", "tracing-core", ] @@ -865,15 +888,15 @@ checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba" [[package]] name = "unicode-bidi" -version = "0.3.10" +version = "0.3.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d54675592c1dbefd78cbd98db9bacd89886e1ca50692a0692baefffdeb92dd58" +checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460" [[package]] name = "unicode-ident" -version = "1.0.7" +version = "1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "775c11906edafc97bc378816b94585fbd9a054eabaf86fdd0ced94af449efab7" +checksum = "e5464a87b239f13a63a501f2701565754bae92d243d4bb7eb12f6d57d2269bf4" [[package]] name = "unicode-normalization" @@ -930,13 +953,13 @@ version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_aarch64_gnullvm 0.42.2", + "windows_aarch64_msvc 0.42.2", + "windows_i686_gnu 0.42.2", + "windows_i686_msvc 0.42.2", + "windows_x86_64_gnu 0.42.2", + "windows_x86_64_gnullvm 0.42.2", + "windows_x86_64_msvc 0.42.2", ] [[package]] @@ -945,62 +968,128 @@ version = "0.45.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0" dependencies = [ - "windows-targets", + "windows-targets 0.42.2", +] + +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets 0.48.0", ] [[package]] name = "windows-targets" -version = "0.42.1" +version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e2522491fbfcd58cc84d47aeb2958948c4b8982e9a2d8a2a35bbaed431390e7" +checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_aarch64_gnullvm 0.42.2", + "windows_aarch64_msvc 0.42.2", + "windows_i686_gnu 0.42.2", + "windows_i686_msvc 0.42.2", + "windows_x86_64_gnu 0.42.2", + "windows_x86_64_gnullvm 0.42.2", + "windows_x86_64_msvc 0.42.2", +] + +[[package]] +name = "windows-targets" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b1eb6f0cd7c80c79759c929114ef071b87354ce476d9d94271031c0497adfd5" +dependencies = [ + "windows_aarch64_gnullvm 0.48.0", + "windows_aarch64_msvc 0.48.0", + "windows_i686_gnu 0.48.0", + "windows_i686_msvc 0.48.0", + "windows_x86_64_gnu 0.48.0", + "windows_x86_64_gnullvm 0.48.0", + "windows_x86_64_msvc 0.48.0", ] [[package]] name = "windows_aarch64_gnullvm" -version = "0.42.1" +version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c9864e83243fdec7fc9c5444389dcbbfd258f745e7853198f365e3c4968a608" +checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8" + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" [[package]] name = "windows_aarch64_msvc" -version = "0.42.1" +version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c8b1b673ffc16c47a9ff48570a9d85e25d265735c503681332589af6253c6c7" +checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" [[package]] name = "windows_i686_gnu" -version = "0.42.1" +version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de3887528ad530ba7bdbb1faa8275ec7a1155a45ffa57c37993960277145d640" +checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f" + +[[package]] +name = "windows_i686_gnu" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" [[package]] name = "windows_i686_msvc" -version = "0.42.1" +version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf4d1122317eddd6ff351aa852118a2418ad4214e6613a50e0191f7004372605" +checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060" + +[[package]] +name = "windows_i686_msvc" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" [[package]] name = "windows_x86_64_gnu" -version = "0.42.1" +version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1040f221285e17ebccbc2591ffdc2d44ee1f9186324dd3e84e99ac68d699c45" +checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" [[package]] name = "windows_x86_64_gnullvm" -version = "0.42.1" +version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "628bfdf232daa22b0d64fdb62b09fcc36bb01f05a3939e20ab73aaf9470d0463" +checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" [[package]] name = "windows_x86_64_msvc" -version = "0.42.1" +version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "447660ad36a13288b1db4d4248e857b510e8c3a225c822ba4fb748c0aafecffd" +checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" diff --git a/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml b/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml index f1b519bdb2..4675ac8a3f 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml +++ b/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml @@ -9,8 +9,8 @@ publish = false [dependencies] native-tls = "0.2.11" postgres-native-tls = "0.5.0" -tokio = { version = "1.26", features=["rt", "macros"] } -tokio-postgres = "0.7.7" +tokio = { version = "1.28", features=["rt", "macros"] } +tokio-postgres = "0.7.8" # This is not part of the main 'neon' workspace diff --git a/test_runner/pg_clients/rust/tokio-postgres/Dockerfile b/test_runner/pg_clients/rust/tokio-postgres/Dockerfile index c3a15b5d85..43fc6f6c92 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Dockerfile +++ b/test_runner/pg_clients/rust/tokio-postgres/Dockerfile @@ -1,4 +1,4 @@ -FROM rust:1.67 +FROM rust:1.69 WORKDIR /source COPY . . diff --git a/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile b/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile index f6a49a2892..9538cf4ed4 100644 --- a/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile +++ b/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile @@ -1,11 +1,11 @@ -FROM swift:5.7 AS build +FROM swift:5.8 AS build RUN apt-get -q update && apt-get -q install -y libssl-dev WORKDIR /source COPY . . RUN swift build --configuration release -FROM swift:5.7 +FROM swift:5.8 WORKDIR /app COPY --from=build /source/.build/release . CMD ["/app/PostgresClientKitExample"] diff --git a/test_runner/pg_clients/swift/PostgresClientKitExample/Package.swift b/test_runner/pg_clients/swift/PostgresClientKitExample/Package.swift index ba666cba06..48320dd023 100644 --- a/test_runner/pg_clients/swift/PostgresClientKitExample/Package.swift +++ b/test_runner/pg_clients/swift/PostgresClientKitExample/Package.swift @@ -1,4 +1,4 @@ -// swift-tools-version:5.7 +// swift-tools-version:5.8 import PackageDescription let package = Package( diff --git a/test_runner/pg_clients/swift/PostgresClientKitExample/Sources/PostgresClientKitExample/main.swift b/test_runner/pg_clients/swift/PostgresClientKitExample/Sources/PostgresClientKitExample/main.swift index e559e9c184..dabca6d431 100644 --- a/test_runner/pg_clients/swift/PostgresClientKitExample/Sources/PostgresClientKitExample/main.swift +++ b/test_runner/pg_clients/swift/PostgresClientKitExample/Sources/PostgresClientKitExample/main.swift @@ -3,21 +3,22 @@ import Foundation import PostgresClientKit do { - var configuration = PostgresClientKit.ConnectionConfiguration() - let env = ProcessInfo.processInfo.environment - if let host = env["NEON_HOST"] { - configuration.host = host - } - if let database = env["NEON_DATABASE"] { - configuration.database = database - } - if let user = env["NEON_USER"] { - configuration.user = user - } - if let password = env["NEON_PASSWORD"] { - configuration.credential = .cleartextPassword(password: password) - } + + var configuration = PostgresClientKit.ConnectionConfiguration() + let host = env["NEON_HOST"] ?? "" + configuration.host = host + configuration.port = 5432 + configuration.database = env["NEON_DATABASE"] ?? "" + configuration.user = env["NEON_USER"] ?? "" + + // PostgresClientKit uses Kitura/BlueSSLService which doesn't support SNI + // PostgresClientKit doesn't support setting connection options, so we use "Workaround D" + // See https://neon.tech/sni + let password = env["NEON_PASSWORD"] ?? "" + let endpoint_id = host.split(separator: ".")[0] + let workaroundD = "project=\(endpoint_id);\(password)" + configuration.credential = .cleartextPassword(password: workaroundD) let connection = try PostgresClientKit.Connection(configuration: configuration) defer { connection.close() } diff --git a/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile b/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile index 629422d220..61e1d1bba6 100644 --- a/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile +++ b/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile @@ -1,10 +1,10 @@ -FROM swift:5.7 AS build +FROM swift:5.8 AS build WORKDIR /source COPY . . RUN swift build --configuration release -FROM swift:5.7 +FROM swift:5.8 WORKDIR /app COPY --from=build /source/.build/release . CMD ["/app/PostgresNIOExample"] diff --git a/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved b/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved index 8246567b24..cc12acda4c 100644 --- a/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved +++ b/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved @@ -5,8 +5,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/vapor/postgres-nio.git", "state" : { - "revision" : "7daf026e145de2c07d6e37f4171b1acb4b5f22b1", - "version" : "1.12.1" + "revision" : "dbf9c2eb596df39cba8ff3f74d74b2e6a31bd937", + "version" : "1.14.1" } }, { @@ -14,8 +14,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/apple/swift-atomics.git", "state" : { - "revision" : "ff3d2212b6b093db7f177d0855adbc4ef9c5f036", - "version" : "1.0.3" + "revision" : "6c89474e62719ddcc1e9614989fff2f68208fe10", + "version" : "1.1.0" } }, { @@ -59,8 +59,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/apple/swift-nio.git", "state" : { - "revision" : "45167b8006448c79dda4b7bd604e07a034c15c49", - "version" : "2.48.0" + "revision" : "d1690f85419fdac8d54e350fb6d2ab9fd95afd75", + "version" : "2.51.1" } }, { @@ -68,8 +68,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/apple/swift-nio-ssl.git", "state" : { - "revision" : "4fb7ead803e38949eb1d6fabb849206a72c580f3", - "version" : "2.23.0" + "revision" : "e866a626e105042a6a72a870c88b4c531ba05f83", + "version" : "2.24.0" } }, { @@ -77,8 +77,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/apple/swift-nio-transport-services.git", "state" : { - "revision" : "c0d9a144cfaec8d3d596aadde3039286a266c15c", - "version" : "1.15.0" + "revision" : "41f4098903878418537020075a4d8a6e20a0b182", + "version" : "1.17.0" } } ], diff --git a/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift b/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift index c64013b9ee..ac32b982e2 100644 --- a/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift +++ b/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift @@ -1,10 +1,10 @@ -// swift-tools-version:5.7 +// swift-tools-version:5.8 import PackageDescription let package = Package( name: "PostgresNIOExample", dependencies: [ - .package(url: "https://github.com/vapor/postgres-nio.git", from: "1.8.0") + .package(url: "https://github.com/vapor/postgres-nio.git", from: "1.14.1") ], targets: [ .executableTarget( diff --git a/test_runner/pg_clients/swift/PostgresNIOExample/Sources/PostgresNIOExample/main.swift b/test_runner/pg_clients/swift/PostgresNIOExample/Sources/PostgresNIOExample/main.swift index 092a0b31f3..ca7e22dab1 100644 --- a/test_runner/pg_clients/swift/PostgresNIOExample/Sources/PostgresNIOExample/main.swift +++ b/test_runner/pg_clients/swift/PostgresNIOExample/Sources/PostgresNIOExample/main.swift @@ -14,15 +14,11 @@ await Task { let sslContext = try! NIOSSLContext(configuration: .makeClientConfiguration()) let config = PostgresConnection.Configuration( - connection: .init( - host: env["NEON_HOST"] ?? "", - port: 5432 - ), - authentication: .init( - username: env["NEON_USER"] ?? "", - database: env["NEON_DATABASE"] ?? "", - password: env["NEON_PASSWORD"] ?? "" - ), + host: env["NEON_HOST"] ?? "", + port: 5432, + username: env["NEON_USER"] ?? "", + password: env["NEON_PASSWORD"] ?? "", + database: env["NEON_DATABASE"] ?? "", tls: .require(sslContext) ) @@ -42,7 +38,7 @@ await Task { try await connection.close() // Shutdown the EventLoopGroup, once all connections are closed. - try eventLoopGroup.syncShutdownGracefully() + try await eventLoopGroup.shutdownGracefully() } catch { print(error) } diff --git a/test_runner/pg_clients/test_pg_clients.py b/test_runner/pg_clients/test_pg_clients.py index 18e3f379da..7c20bac399 100644 --- a/test_runner/pg_clients/test_pg_clients.py +++ b/test_runner/pg_clients/test_pg_clients.py @@ -16,10 +16,9 @@ from fixtures.utils import subprocess_capture "rust/tokio-postgres", "python/asyncpg", "python/pg8000", - pytest.param( - "swift/PostgresClientKitExample", # See https://github.com/neondatabase/neon/pull/2008#discussion_r911896592 - marks=pytest.mark.xfail(reason="Neither SNI nor parameters is supported"), - ), + # PostgresClientKitExample does not support SNI or connection options, so it uses workaround D (https://neon.tech/sni) + # See example itself: test_runner/pg_clients/swift/PostgresClientKitExample/Sources/PostgresClientKitExample/main.swift + "swift/PostgresClientKitExample", "swift/PostgresNIOExample", "typescript/postgresql-client", "typescript/serverless-driver", diff --git a/test_runner/pg_clients/typescript/postgresql-client/Dockerfile b/test_runner/pg_clients/typescript/postgresql-client/Dockerfile index a5ad832a5c..07e98c586b 100644 --- a/test_runner/pg_clients/typescript/postgresql-client/Dockerfile +++ b/test_runner/pg_clients/typescript/postgresql-client/Dockerfile @@ -1,4 +1,4 @@ -FROM node:18 +FROM node:20 WORKDIR /source COPY . . diff --git a/test_runner/pg_clients/typescript/postgresql-client/package-lock.json b/test_runner/pg_clients/typescript/postgresql-client/package-lock.json index 5586fe883e..e4dfd1dd9d 100644 --- a/test_runner/pg_clients/typescript/postgresql-client/package-lock.json +++ b/test_runner/pg_clients/typescript/postgresql-client/package-lock.json @@ -5,7 +5,7 @@ "packages": { "": { "dependencies": { - "postgresql-client": "2.5.2" + "postgresql-client": "2.5.5" } }, "node_modules/debug": { @@ -63,18 +63,18 @@ } }, "node_modules/postgresql-client": { - "version": "2.5.2", - "resolved": "https://registry.npmjs.org/postgresql-client/-/postgresql-client-2.5.2.tgz", - "integrity": "sha512-BaVDEmPlZllcmXBbw48194a6sB1YEe+ACX8c3SfgpGeW9+xJ5vShQ/ruNZtI+nuPW95BjL1WQGaxy+SfxSQgUQ==", + "version": "2.5.5", + "resolved": "https://registry.npmjs.org/postgresql-client/-/postgresql-client-2.5.5.tgz", + "integrity": "sha512-2Mu3i+6NQ9cnkoZNd0XeSZo9WoUpuWf4ZSiCCoDWSj82T93py2/SKXZ1aUaP8mVaU0oKpyyGe0IwLYZ1VHShnA==", "dependencies": { "debug": "^4.3.4", - "doublylinked": "^2.5.1", - "lightning-pool": "^4.2.0", + "doublylinked": "^2.5.2", + "lightning-pool": "^4.2.1", "postgres-bytea": "^3.0.0", - "power-tasks": "^1.5.0", - "putil-merge": "^3.9.0", - "putil-promisify": "^1.8.5", - "putil-varhelpers": "^1.6.4" + "power-tasks": "^1.6.4", + "putil-merge": "^3.10.3", + "putil-promisify": "^1.10.0", + "putil-varhelpers": "^1.6.5" }, "engines": { "node": ">=14.0", @@ -82,9 +82,9 @@ } }, "node_modules/power-tasks": { - "version": "1.6.3", - "resolved": "https://registry.npmjs.org/power-tasks/-/power-tasks-1.6.3.tgz", - "integrity": "sha512-nBqzjbiCxvftEKsJtbEz5ZVKVl6RdwA5I7Ts3Z7DCe3lkvFsv9d8J4qp+b9GbdddsfV1KyIPSqPyLWq2YJQh6g==", + "version": "1.6.4", + "resolved": "https://registry.npmjs.org/power-tasks/-/power-tasks-1.6.4.tgz", + "integrity": "sha512-LX8GGgEIP1N7jsZqlqZ275e6f1Ehq97APCEGj8uVO0NoEoB+77QUX12BFv3LmlNKfq4fIuNSPiHhyHFjqn2gfA==", "dependencies": { "debug": "^4.3.4", "doublylinked": "^2.5.2", @@ -96,9 +96,9 @@ } }, "node_modules/putil-merge": { - "version": "3.10.1", - "resolved": "https://registry.npmjs.org/putil-merge/-/putil-merge-3.10.1.tgz", - "integrity": "sha512-t3cLn14qccFvmb4bYQfNEHoisab//bTjM3lp56Ks8rOsjWF2ssf7Vapg9Lt89GlEawyNdeu+xj5GSrsFqNoCDQ==", + "version": "3.10.3", + "resolved": "https://registry.npmjs.org/putil-merge/-/putil-merge-3.10.3.tgz", + "integrity": "sha512-B18CYi0/SmBYl9+fgowYWkgzJM/8XcLSeafHrFrGzwySQuOzLW0sOGx0CdFVp9zqaxgLctexUdGoSPpm6CPM6A==", "engines": { "node": ">= 10.0" } @@ -112,9 +112,9 @@ } }, "node_modules/putil-varhelpers": { - "version": "1.6.4", - "resolved": "https://registry.npmjs.org/putil-varhelpers/-/putil-varhelpers-1.6.4.tgz", - "integrity": "sha512-nM2nO1HS2yJUyPgz0grd2XZAM0Spr6/tt6F4xXeNDjByV00BV2mq6lZ+sDff8WIfQBI9Hn1Czh93H1xBvKESxw==", + "version": "1.6.5", + "resolved": "https://registry.npmjs.org/putil-varhelpers/-/putil-varhelpers-1.6.5.tgz", + "integrity": "sha512-kyu+lE5xkc65ScgaIi6rNONLXeS7jGBxl1I0rzHVsFGAAQ45D/VkuEev+t09PFB943F+CqdWFLczH6ePk5TPAA==", "engines": { "node": ">= 6.0" } diff --git a/test_runner/pg_clients/typescript/postgresql-client/package.json b/test_runner/pg_clients/typescript/postgresql-client/package.json index 80540dec22..9eaa13437a 100644 --- a/test_runner/pg_clients/typescript/postgresql-client/package.json +++ b/test_runner/pg_clients/typescript/postgresql-client/package.json @@ -1,6 +1,6 @@ { "type": "module", "dependencies": { - "postgresql-client": "2.5.2" + "postgresql-client": "2.5.5" } } diff --git a/test_runner/pg_clients/typescript/serverless-driver/index.js b/test_runner/pg_clients/typescript/serverless-driver/index.js index 91db4037e1..ad9a4c3000 100755 --- a/test_runner/pg_clients/typescript/serverless-driver/index.js +++ b/test_runner/pg_clients/typescript/serverless-driver/index.js @@ -1,8 +1,11 @@ #! /usr/bin/env node -import { Client } from '@neondatabase/serverless' +import ws from 'ws' +import { neonConfig, Client } from '@neondatabase/serverless' (async () => { + neonConfig.webSocketConstructor = ws + const client = new Client({ host: process.env.NEON_HOST, database: process.env.NEON_DATABASE, diff --git a/test_runner/pg_clients/typescript/serverless-driver/package-lock.json b/test_runner/pg_clients/typescript/serverless-driver/package-lock.json index c4d03bffcb..0fb84cf5b7 100644 --- a/test_runner/pg_clients/typescript/serverless-driver/package-lock.json +++ b/test_runner/pg_clients/typescript/serverless-driver/package-lock.json @@ -1,18 +1,99 @@ { "name": "serverless-driver", - "lockfileVersion": 2, + "lockfileVersion": 3, "requires": true, "packages": { "": { "dependencies": { - "@neondatabase/serverless": "^0.2.8", - "ws": "^8.13.0" + "@neondatabase/serverless": "0.4.3", + "ws": "8.13.0" } }, "node_modules/@neondatabase/serverless": { - "version": "0.2.8", - "resolved": "https://registry.npmjs.org/@neondatabase/serverless/-/serverless-0.2.8.tgz", - "integrity": "sha512-+yWjIOJsFnrtt2xvtLVEzWM2lfvemawk/DBg4mD2cZOF/IC6Jn4wEctZyk60TscZMSxfozNkPoxmZvBmNuQ0vA==" + "version": "0.4.3", + "resolved": "https://registry.npmjs.org/@neondatabase/serverless/-/serverless-0.4.3.tgz", + "integrity": "sha512-U8tpuF5f0R5WRsciR7iaJ5S2h54DWa6Z6CEW+J4KgwyvRN3q3qDz0MibdfFXU0WqnRoi/9RSf/2XN4TfeaOCbQ==", + "dependencies": { + "@types/pg": "^8.6.6" + } + }, + "node_modules/@types/node": { + "version": "18.16.3", + "resolved": "https://registry.npmjs.org/@types/node/-/node-18.16.3.tgz", + "integrity": "sha512-OPs5WnnT1xkCBiuQrZA4+YAV4HEJejmHneyraIaxsbev5yCEr6KMwINNFP9wQeFIw8FWcoTqF3vQsa5CDaI+8Q==" + }, + "node_modules/@types/pg": { + "version": "8.6.6", + "resolved": "https://registry.npmjs.org/@types/pg/-/pg-8.6.6.tgz", + "integrity": "sha512-O2xNmXebtwVekJDD+02udOncjVcMZQuTEQEMpKJ0ZRf5E7/9JJX3izhKUcUifBkyKpljyUM6BTgy2trmviKlpw==", + "dependencies": { + "@types/node": "*", + "pg-protocol": "*", + "pg-types": "^2.2.0" + } + }, + "node_modules/pg-int8": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/pg-int8/-/pg-int8-1.0.1.tgz", + "integrity": "sha512-WCtabS6t3c8SkpDBUlb1kjOs7l66xsGdKpIPZsg4wR+B3+u9UAum2odSsF9tnvxg80h4ZxLWMy4pRjOsFIqQpw==", + "engines": { + "node": ">=4.0.0" + } + }, + "node_modules/pg-protocol": { + "version": "1.6.0", + "resolved": "https://registry.npmjs.org/pg-protocol/-/pg-protocol-1.6.0.tgz", + "integrity": "sha512-M+PDm637OY5WM307051+bsDia5Xej6d9IR4GwJse1qA1DIhiKlksvrneZOYQq42OM+spubpcNYEo2FcKQrDk+Q==" + }, + "node_modules/pg-types": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/pg-types/-/pg-types-2.2.0.tgz", + "integrity": "sha512-qTAAlrEsl8s4OiEQY69wDvcMIdQN6wdz5ojQiOy6YRMuynxenON0O5oCpJI6lshc6scgAY8qvJ2On/p+CXY0GA==", + "dependencies": { + "pg-int8": "1.0.1", + "postgres-array": "~2.0.0", + "postgres-bytea": "~1.0.0", + "postgres-date": "~1.0.4", + "postgres-interval": "^1.1.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/postgres-array": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/postgres-array/-/postgres-array-2.0.0.tgz", + "integrity": "sha512-VpZrUqU5A69eQyW2c5CA1jtLecCsN2U/bD6VilrFDWq5+5UIEVO7nazS3TEcHf1zuPYO/sqGvUvW62g86RXZuA==", + "engines": { + "node": ">=4" + } + }, + "node_modules/postgres-bytea": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/postgres-bytea/-/postgres-bytea-1.0.0.tgz", + "integrity": "sha512-xy3pmLuQqRBZBXDULy7KbaitYqLcmxigw14Q5sj8QBVLqEwXfeybIKVWiqAXTlcvdvb0+xkOtDbfQMOf4lST1w==", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/postgres-date": { + "version": "1.0.7", + "resolved": "https://registry.npmjs.org/postgres-date/-/postgres-date-1.0.7.tgz", + "integrity": "sha512-suDmjLVQg78nMK2UZ454hAG+OAW+HQPZ6n++TNDUX+L0+uUlLywnoxJKDou51Zm+zTCjrCl0Nq6J9C5hP9vK/Q==", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/postgres-interval": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/postgres-interval/-/postgres-interval-1.2.0.tgz", + "integrity": "sha512-9ZhXKM/rw350N1ovuWHbGxnGh/SNJ4cnxHiM0rxE4VN41wsg8P8zWn9hv/buK00RP4WvlOyr/RBDiptyxVbkZQ==", + "dependencies": { + "xtend": "^4.0.0" + }, + "engines": { + "node": ">=0.10.0" + } }, "node_modules/ws": { "version": "8.13.0", @@ -33,19 +114,14 @@ "optional": true } } - } - }, - "dependencies": { - "@neondatabase/serverless": { - "version": "0.2.8", - "resolved": "https://registry.npmjs.org/@neondatabase/serverless/-/serverless-0.2.8.tgz", - "integrity": "sha512-+yWjIOJsFnrtt2xvtLVEzWM2lfvemawk/DBg4mD2cZOF/IC6Jn4wEctZyk60TscZMSxfozNkPoxmZvBmNuQ0vA==" }, - "ws": { - "version": "8.13.0", - "resolved": "https://registry.npmjs.org/ws/-/ws-8.13.0.tgz", - "integrity": "sha512-x9vcZYTrFPC7aSIbj7sRCYo7L/Xb8Iy+pW0ng0wt2vCJv7M9HOMy0UoN3rr+IFC7hb7vXoqS+P9ktyLLLhO+LA==", - "requires": {} + "node_modules/xtend": { + "version": "4.0.2", + "resolved": "https://registry.npmjs.org/xtend/-/xtend-4.0.2.tgz", + "integrity": "sha512-LKYU1iAXJXUgAXn9URjiu+MWhyUXHsvfp7mcuYm9dSUKK0/CjtrUwFAxD82/mCWbtLsGjFIad0wIsod4zrTAEQ==", + "engines": { + "node": ">=0.4" + } } } } diff --git a/test_runner/pg_clients/typescript/serverless-driver/package.json b/test_runner/pg_clients/typescript/serverless-driver/package.json index 2b01a22d3e..71ba181afc 100644 --- a/test_runner/pg_clients/typescript/serverless-driver/package.json +++ b/test_runner/pg_clients/typescript/serverless-driver/package.json @@ -1,7 +1,7 @@ { "type": "module", "dependencies": { - "@neondatabase/serverless": "^0.2.8", - "ws": "^8.13.0" + "@neondatabase/serverless": "0.4.3", + "ws": "8.13.0" } } diff --git a/test_runner/regress/test_auth.py b/test_runner/regress/test_auth.py index 3305869dce..3e4a0bfbbb 100644 --- a/test_runner/regress/test_auth.py +++ b/test_runner/regress/test_auth.py @@ -31,11 +31,15 @@ def test_pageserver_auth(neon_env_builder: NeonEnvBuilder): # tenant can create branches tenant_http_client.timeline_create( - tenant_id=env.initial_tenant, ancestor_timeline_id=new_timeline_id + pg_version=env.pg_version, + tenant_id=env.initial_tenant, + ancestor_timeline_id=new_timeline_id, ) # console can create branches for tenant pageserver_http_client.timeline_create( - tenant_id=env.initial_tenant, ancestor_timeline_id=new_timeline_id + pg_version=env.pg_version, + tenant_id=env.initial_tenant, + ancestor_timeline_id=new_timeline_id, ) # fail to create branch using token with different tenant_id @@ -43,7 +47,9 @@ def test_pageserver_auth(neon_env_builder: NeonEnvBuilder): PageserverApiException, match="Forbidden: Tenant id mismatch. Permission denied" ): invalid_tenant_http_client.timeline_create( - tenant_id=env.initial_tenant, ancestor_timeline_id=new_timeline_id + pg_version=env.pg_version, + tenant_id=env.initial_tenant, + ancestor_timeline_id=new_timeline_id, ) # create tenant using management token diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index e262202a73..ff61af8fd3 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -16,6 +16,7 @@ from fixtures.neon_fixtures import ( ) from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload +from fixtures.pg_version import PgVersion from fixtures.types import Lsn from pytest import FixtureRequest @@ -50,7 +51,7 @@ def test_create_snapshot(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, test_o # it creates a new snapshot for releases after we tested the current version against the previous snapshot in `test_backward_compatibility`. # # There's no cleanup here, it allows to adjust the data in `test_backward_compatibility` itself without re-collecting it. - neon_env_builder.pg_version = "14" + neon_env_builder.pg_version = PgVersion.V14 neon_env_builder.num_safekeepers = 3 neon_env_builder.enable_local_fs_remote_storage() neon_env_builder.preserve_database_files = True @@ -98,7 +99,7 @@ def test_backward_compatibility( test_output_dir: Path, neon_binpath: Path, pg_distrib_dir: Path, - pg_version: str, + pg_version: PgVersion, request: FixtureRequest, ): """ @@ -153,7 +154,7 @@ def test_backward_compatibility( def test_forward_compatibility( test_output_dir: Path, port_distributor: PortDistributor, - pg_version: str, + pg_version: PgVersion, request: FixtureRequest, neon_binpath: Path, ): @@ -345,7 +346,7 @@ def check_neon_works( neon_target_binpath: Path, neon_current_binpath: Path, pg_distrib_dir: Path, - pg_version: str, + pg_version: PgVersion, port_distributor: PortDistributor, test_output_dir: Path, pg_bin: PgBin, @@ -404,7 +405,7 @@ def check_neon_works( shutil.rmtree(repo_dir / "local_fs_remote_storage") pageserver_http.timeline_delete(tenant_id, timeline_id) - pageserver_http.timeline_create(tenant_id, timeline_id) + pageserver_http.timeline_create(pg_version, tenant_id, timeline_id) pg_bin.run( ["pg_dumpall", f"--dbname={connstr}", f"--file={test_output_dir / 'dump-from-wal.sql'}"] ) diff --git a/test_runner/regress/test_metric_collection.py b/test_runner/regress/test_metric_collection.py index df542fb84a..1231188896 100644 --- a/test_runner/regress/test_metric_collection.py +++ b/test_runner/regress/test_metric_collection.py @@ -24,13 +24,6 @@ from pytest_httpserver import HTTPServer from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response - -@pytest.fixture(scope="session") -def httpserver_listen_address(port_distributor: PortDistributor): - port = port_distributor.get_port() - return ("localhost", port) - - # ============================================================================== # Storage metrics tests # ============================================================================== diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index e86cd18f58..28732872df 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -8,6 +8,7 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, ) from fixtures.pageserver.http import PageserverHttpClient +from fixtures.pg_version import PgVersion from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import wait_until @@ -61,7 +62,7 @@ def test_pageserver_init_node_id( assert "has node id already, it cannot be overridden" in bad_update.stderr -def check_client(client: PageserverHttpClient, initial_tenant: TenantId): +def check_client(pg_version: PgVersion, client: PageserverHttpClient, initial_tenant: TenantId): client.check_status() # check initial tenant is there @@ -77,7 +78,11 @@ def check_client(client: PageserverHttpClient, initial_tenant: TenantId): # create timeline timeline_id = TimelineId.generate() - client.timeline_create(tenant_id=tenant_id, new_timeline_id=timeline_id) + client.timeline_create( + pg_version=pg_version, + tenant_id=tenant_id, + new_timeline_id=timeline_id, + ) timelines = client.timeline_list(tenant_id) assert len(timelines) > 0 @@ -174,7 +179,7 @@ def test_pageserver_http_get_wal_receiver_success(neon_simple_env: NeonEnv): def test_pageserver_http_api_client(neon_simple_env: NeonEnv): env = neon_simple_env with env.pageserver.http_client() as client: - check_client(client, env.initial_tenant) + check_client(env.pg_version, client, env.initial_tenant) def test_pageserver_http_api_client_auth_enabled(neon_env_builder: NeonEnvBuilder): @@ -184,4 +189,4 @@ def test_pageserver_http_api_client_auth_enabled(neon_env_builder: NeonEnvBuilde pageserver_token = env.auth_keys.generate_pageserver_token() with env.pageserver.http_client(auth_token=pageserver_token) as client: - check_client(client, env.initial_tenant) + check_client(env.pg_version, client, env.initial_tenant) diff --git a/test_runner/regress/test_threshold_based_eviction.py b/test_runner/regress/test_threshold_based_eviction.py new file mode 100644 index 0000000000..c7083d92be --- /dev/null +++ b/test_runner/regress/test_threshold_based_eviction.py @@ -0,0 +1,179 @@ +import time +from dataclasses import dataclass +from typing import List, Set, Tuple + +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + PgBin, + RemoteStorageKind, + last_flush_lsn_upload, +) +from fixtures.pageserver.http import LayerMapInfo +from fixtures.types import TimelineId +from pytest_httpserver import HTTPServer + +# NB: basic config change tests are in test_tenant_conf.py + + +def test_threshold_based_eviction( + request, + httpserver: HTTPServer, + httpserver_listen_address, + pg_bin: PgBin, + neon_env_builder: NeonEnvBuilder, +): + neon_env_builder.enable_remote_storage(RemoteStorageKind.LOCAL_FS, f"{request.node.name}") + + # Start with metrics collection enabled, so that the eviction task + # imitates its accesses. We'll use a non-existent endpoint to make it fail. + # The synthetic size calculation will run regardless. + host, port = httpserver_listen_address + neon_env_builder.pageserver_config_override = f""" + metric_collection_interval="1s" + synthetic_size_calculation_interval="2s" + metric_collection_endpoint="http://{host}:{port}/nonexistent" + """ + metrics_refused_log_line = ".*metrics endpoint refused the sent metrics.*/nonexistent.*" + env = neon_env_builder.init_start() + env.pageserver.allowed_errors.append(metrics_refused_log_line) + + tenant_id, timeline_id = env.initial_tenant, env.initial_timeline + assert isinstance(timeline_id, TimelineId) + + ps_http = env.pageserver.http_client() + assert ps_http.tenant_config(tenant_id).effective_config["eviction_policy"] == { + "kind": "NoEviction" + } + + eviction_threshold = 5 + eviction_period = 1 + ps_http.set_tenant_config( + tenant_id, + { + "eviction_policy": { + "kind": "LayerAccessThreshold", + "threshold": f"{eviction_threshold}s", + "period": f"{eviction_period}s", + }, + }, + ) + assert ps_http.tenant_config(tenant_id).effective_config["eviction_policy"] == { + "kind": "LayerAccessThreshold", + "threshold": f"{eviction_threshold}s", + "period": f"{eviction_period}s", + } + + # restart because changing tenant config is not instant + env.pageserver.stop() + env.pageserver.start() + + assert ps_http.tenant_config(tenant_id).effective_config["eviction_policy"] == { + "kind": "LayerAccessThreshold", + "threshold": f"{eviction_threshold}s", + "period": f"{eviction_period}s", + } + + # create a bunch of L1s, only the least of which will need to be resident + compaction_threshold = 3 # create L1 layers quickly + ps_http.patch_tenant_config_client_side( + tenant_id, + inserts={ + # Disable gc and compaction to avoid on-demand downloads from their side. + # The only on-demand downloads should be from the eviction tasks's "imitate access" functions. + "gc_period": "0s", + "compaction_period": "0s", + # low checkpoint_distance so that pgbench creates many layers + "checkpoint_distance": 1024**2, + # Low compaction target size to create many L1's with tight key ranges. + # This is so that the "imitate access" don't download all the layers. + "compaction_target_size": 1 * 1024**2, # all keys into one L1 + # Turn L0's into L1's fast. + "compaction_threshold": compaction_threshold, + # Prevent compaction from collapsing L1 delta layers into image layers. We want many layers here. + "image_creation_threshold": 100, + # Much larger so that synthetic size caluclation worker, which is part of metric collection, + # computes logical size for initdb_lsn every time, instead of some moving lsn as we insert data. + # This makes the set of downloaded layers predictable, + # thereby allowing the residence statuses to stabilize below. + "gc_horizon": 1024**4, + }, + ) + + # create a bunch of layers + with env.endpoints.create_start("main", tenant_id=tenant_id) as pg: + pg_bin.run(["pgbench", "-i", "-s", "3", pg.connstr()]) + last_flush_lsn_upload(env, pg, tenant_id, timeline_id) + # wrap up and shutdown safekeepers so that no more layers will be created after the final checkpoint + for sk in env.safekeepers: + sk.stop() + ps_http.timeline_checkpoint(tenant_id, timeline_id) + + # wait for evictions and assert that they stabilize + @dataclass + class ByLocalAndRemote: + remote_layers: Set[str] + local_layers: Set[str] + + class MapInfoProjection: + def __init__(self, info: LayerMapInfo): + self.info = info + + def by_local_and_remote(self) -> ByLocalAndRemote: + return ByLocalAndRemote( + remote_layers={ + layer.layer_file_name for layer in self.info.historic_layers if layer.remote + }, + local_layers={ + layer.layer_file_name for layer in self.info.historic_layers if not layer.remote + }, + ) + + def __eq__(self, other): + if not isinstance(other, MapInfoProjection): + return False + return self.by_local_and_remote() == other.by_local_and_remote() + + def __repr__(self) -> str: + out = ["MapInfoProjection:"] + for layer in sorted(self.info.historic_layers, key=lambda layer: layer.layer_file_name): + remote = "R" if layer.remote else "L" + out += [f" {remote} {layer.layer_file_name}"] + return "\n".join(out) + + observation_window = 8 * eviction_threshold + consider_stable_when_no_change_for_seconds = 3 * eviction_threshold + poll_interval = eviction_threshold / 3 + started_waiting_at = time.time() + map_info_changes: List[Tuple[float, MapInfoProjection]] = [] + while time.time() - started_waiting_at < observation_window: + current = ( + time.time(), + MapInfoProjection(ps_http.layer_map_info(tenant_id, timeline_id)), + ) + last = map_info_changes[-1] if map_info_changes else (0, None) + if last[1] is None or current[1] != last[1]: + map_info_changes.append(current) + log.info("change in layer map\n before: %s\n after: %s", last, current) + else: + stable_for = current[0] - last[0] + log.info("residencies stable for %s", stable_for) + if stable_for > consider_stable_when_no_change_for_seconds: + break + time.sleep(poll_interval) + + log.info("len(map_info_changes)=%s", len(map_info_changes)) + + # TODO: can we be more precise here? E.g., require we're stable _within_ X*threshold, + # instead of what we do here, i.e., stable _for at least_ X*threshold toward the end of the observation window + assert ( + stable_for > consider_stable_when_no_change_for_seconds + ), "layer residencies did not become stable within the observation window" + + post = map_info_changes[-1][1].by_local_and_remote() + assert len(post.remote_layers) > 0, "some layers should be evicted once it's stabilized" + assert len(post.local_layers) > 0, "the imitate accesses should keep some layers resident" + + assert env.pageserver.log_contains( + metrics_refused_log_line + ), "ensure the metrics collection worker ran" diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index db278d5646..1460172afe 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -27,6 +27,7 @@ from fixtures.pageserver.utils import ( wait_for_upload_queue_empty, wait_until_tenant_active, ) +from fixtures.pg_version import PgVersion from fixtures.types import TenantId, TimelineId from fixtures.utils import get_timeline_dir_size, wait_until @@ -489,7 +490,7 @@ def test_timeline_size_metrics( test_output_dir: Path, port_distributor: PortDistributor, pg_distrib_dir: Path, - pg_version: str, + pg_version: PgVersion, ): env = neon_simple_env pageserver_http = env.pageserver.http_client() diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index fed5f325ca..2a4141ed30 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -32,6 +32,7 @@ from fixtures.neon_fixtures import ( available_remote_storages, ) from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload +from fixtures.pg_version import PgVersion from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import get_dir_size, query_scalar, start_in_background @@ -582,7 +583,11 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re shutil.copy(f_partial_saved, f_partial_path) # recreate timeline on pageserver from scratch - ps_cli.timeline_create(tenant_id, timeline_id) + ps_cli.timeline_create( + pg_version=PgVersion(pg_version), + tenant_id=tenant_id, + new_timeline_id=timeline_id, + ) wait_lsn_timeout = 60 * 3 started_at = time.time() diff --git a/test_runner/regress/test_wal_receiver.py b/test_runner/regress/test_wal_receiver.py new file mode 100644 index 0000000000..8e4e154be1 --- /dev/null +++ b/test_runner/regress/test_wal_receiver.py @@ -0,0 +1,115 @@ +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder +from fixtures.types import Lsn, TenantId + + +# Checks that pageserver's walreceiver state is printed in the logs during WAL wait timeout. +# Ensures that walreceiver does not run without any data inserted and only starts after the insertion. +def test_pageserver_lsn_wait_error_start(neon_env_builder: NeonEnvBuilder): + # Trigger WAL wait timeout faster + neon_env_builder.pageserver_config_override = "wait_lsn_timeout = '1s'" + env = neon_env_builder.init_start() + env.pageserver.http_client() + + tenant_id, timeline_id = env.neon_cli.create_tenant() + expected_timeout_error = f"Timed out while waiting for WAL record at LSN {future_lsn} to arrive" + env.pageserver.allowed_errors.append(f".*{expected_timeout_error}.*") + + try: + trigger_wait_lsn_timeout(env, tenant_id) + except Exception as e: + exception_string = str(e) + assert expected_timeout_error in exception_string, "Should time out during waiting for WAL" + assert ( + "WalReceiver status: Not active" in exception_string + ), "Walreceiver should not be active before any data writes" + + insert_test_elements(env, tenant_id, start=0, count=1_000) + try: + trigger_wait_lsn_timeout(env, tenant_id) + except Exception as e: + exception_string = str(e) + assert expected_timeout_error in exception_string, "Should time out during waiting for WAL" + assert ( + "WalReceiver status: Not active" not in exception_string + ), "Should not be inactive anymore after INSERTs are made" + assert "WalReceiver status" in exception_string, "But still should have some other status" + + +# Checks that all active safekeepers are shown in pageserver's walreceiver state printed on WAL wait timeout. +# Kills one of the safekeepers and ensures that only the active ones are printed in the state. +def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuilder): + # Trigger WAL wait timeout faster + neon_env_builder.pageserver_config_override = "wait_lsn_timeout = '1s'" + # Have notable SK ids to ensure we check logs for their presence, not some other random numbers + neon_env_builder.safekeepers_id_start = 12345 + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + env.pageserver.http_client() + + tenant_id, timeline_id = env.neon_cli.create_tenant() + + elements_to_insert = 1_000_000 + expected_timeout_error = f"Timed out while waiting for WAL record at LSN {future_lsn} to arrive" + env.pageserver.allowed_errors.append(f".*{expected_timeout_error}.*") + + insert_test_elements(env, tenant_id, start=0, count=elements_to_insert) + + try: + trigger_wait_lsn_timeout(env, tenant_id) + except Exception as e: + exception_string = str(e) + assert expected_timeout_error in exception_string, "Should time out during waiting for WAL" + + for safekeeper in env.safekeepers: + assert ( + str(safekeeper.id) in exception_string + ), f"Should have safekeeper {safekeeper.id} printed in walreceiver state after WAL wait timeout" + + stopped_safekeeper = env.safekeepers[-1] + stopped_safekeeper_id = stopped_safekeeper.id + log.info(f"Stopping safekeeper {stopped_safekeeper.id}") + stopped_safekeeper.stop() + + # Spend some more time inserting, to ensure SKs report updated statuses and walreceiver in PS have time to update its connection stats. + insert_test_elements(env, tenant_id, start=elements_to_insert + 1, count=elements_to_insert) + + try: + trigger_wait_lsn_timeout(env, tenant_id) + except Exception as e: + exception_string = str(e) + assert expected_timeout_error in exception_string, "Should time out during waiting for WAL" + + for safekeeper in env.safekeepers: + if safekeeper.id == stopped_safekeeper_id: + assert ( + str(safekeeper.id) not in exception_string + ), f"Should not have stopped safekeeper {safekeeper.id} printed in walreceiver state after 2nd WAL wait timeout" + else: + assert ( + str(safekeeper.id) in exception_string + ), f"Should have safekeeper {safekeeper.id} printed in walreceiver state after 2nd WAL wait timeout" + + +def insert_test_elements(env: NeonEnv, tenant_id: TenantId, start: int, count: int): + first_element_id = start + last_element_id = first_element_id + count + with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: + with endpoint.cursor() as cur: + cur.execute("CREATE TABLE IF NOT EXISTS t(key serial primary key, value text)") + cur.execute( + f"INSERT INTO t SELECT i, CONCAT('payload_', i) FROM generate_series({first_element_id},{last_element_id}) as i" + ) + + +future_lsn = Lsn("0/FFFFFFFF") + + +def trigger_wait_lsn_timeout(env: NeonEnv, tenant_id: TenantId): + with env.endpoints.create_start( + "main", + tenant_id=tenant_id, + lsn=future_lsn, + ) as endpoint: + with endpoint.cursor() as cur: + cur.execute("SELECT 1")