//! Functions for handling page server configuration options //! //! Configuration options can be set in the pageserver.toml configuration //! file, or on the command line. //! See also `settings.md` for better description on every parameter. use anyhow::{bail, ensure, Context}; use pageserver_api::models::ImageCompressionAlgorithm; use pageserver_api::{ config::{DiskUsageEvictionTaskConfig, MaxVectoredReadBytes}, shard::TenantShardId, }; use remote_storage::{RemotePath, RemoteStorageConfig}; use std::env; use storage_broker::Uri; use utils::logging::SecretString; use utils::postgres_client::PostgresClientProtocol; use once_cell::sync::OnceCell; use reqwest::Url; use std::num::NonZeroUsize; use std::sync::Arc; use std::time::Duration; use camino::{Utf8Path, Utf8PathBuf}; use postgres_backend::AuthType; use utils::{ id::{NodeId, TimelineId}, logging::LogFormat, }; use crate::tenant::storage_layer::inmemory_layer::IndexEntry; use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; use crate::virtual_file; use crate::virtual_file::io_engine; use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME}; /// Global state of pageserver. /// /// It's mostly immutable configuration, but some semaphores and the /// like crept in over time and the name stuck. /// /// Instantiated by deserializing `pageserver.toml` into [`pageserver_api::config::ConfigToml`] /// and passing that to [`PageServerConf::parse_and_validate`]. /// /// # Adding a New Field /// /// 1. Add the field to `pageserver_api::config::ConfigToml`. /// 2. Fix compiler errors (exhaustive destructuring will guide you). /// /// For fields that require additional validation or filling in of defaults at runtime, /// check for examples in the [`PageServerConf::parse_and_validate`] method. #[derive(Debug, Clone, PartialEq, Eq)] pub struct PageServerConf { // Identifier of that particular pageserver so e g safekeepers // can safely distinguish different pageservers pub id: NodeId, /// Example (default): 127.0.0.1:64000 pub listen_pg_addr: String, /// Example (default): 127.0.0.1:9898 pub listen_http_addr: String, /// Current availability zone. Used for traffic metrics. pub availability_zone: Option, // Timeout when waiting for WAL receiver to catch up to an LSN given in a GetPage@LSN call. pub wait_lsn_timeout: Duration, // How long to wait for WAL redo to complete. pub wal_redo_timeout: Duration, pub superuser: String, pub locale: String, pub page_cache_size: usize, pub max_file_descriptors: usize, // Repository directory, relative to current working directory. // Normally, the page server changes the current working directory // to the repository, and 'workdir' is always '.'. But we don't do // that during unit testing, because the current directory is global // to the process but different unit tests work on different // repositories. pub workdir: Utf8PathBuf, pub pg_distrib_dir: Utf8PathBuf, // Authentication /// authentication method for the HTTP mgmt API pub http_auth_type: AuthType, /// authentication method for libpq connections from compute pub pg_auth_type: AuthType, /// Path to a file or directory containing public key(s) for verifying JWT tokens. /// Used for both mgmt and compute auth, if enabled. pub auth_validation_public_key_path: Option, pub remote_storage_config: Option, pub default_tenant_conf: crate::tenant::config::TenantConf, /// Storage broker endpoints to connect to. pub broker_endpoint: Uri, pub broker_keepalive_interval: Duration, pub log_format: LogFormat, /// Number of tenants which will be concurrently loaded from remote storage proactively on startup or attach. /// /// A lower value implicitly deprioritizes loading such tenants, vs. other work in the system. pub concurrent_tenant_warmup: ConfigurableSemaphore, /// Number of concurrent [`Tenant::gather_size_inputs`](crate::tenant::Tenant::gather_size_inputs) allowed. pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore, /// Limit of concurrent [`Tenant::gather_size_inputs`] issued by module `eviction_task`. /// The number of permits is the same as `concurrent_tenant_size_logical_size_queries`. /// See the comment in `eviction_task` for details. /// /// [`Tenant::gather_size_inputs`]: crate::tenant::Tenant::gather_size_inputs pub eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore, // How often to collect metrics and send them to the metrics endpoint. pub metric_collection_interval: Duration, // How often to send unchanged cached metrics to the metrics endpoint. pub metric_collection_endpoint: Option, pub metric_collection_bucket: Option, pub synthetic_size_calculation_interval: Duration, pub disk_usage_based_eviction: Option, pub test_remote_failures: u64, pub ondemand_download_behavior_treat_error_as_warn: bool, /// How long will background tasks be delayed at most after initial load of tenants. /// /// Our largest initialization completions are in the range of 100-200s, so perhaps 10s works /// as we now isolate initial loading, initial logical size calculation and background tasks. /// Smaller nodes will have background tasks "not running" for this long unless every timeline /// has it's initial logical size calculated. Not running background tasks for some seconds is /// not terrible. pub background_task_maximum_delay: Duration, pub control_plane_api: Option, /// JWT token for use with the control plane API. pub control_plane_api_token: Option, pub import_pgdata_upcall_api: Option, pub import_pgdata_upcall_api_token: Option, pub import_pgdata_aws_endpoint_url: Option, /// If true, pageserver will make best-effort to operate without a control plane: only /// for use in major incidents. pub control_plane_emergency_mode: bool, /// How many heatmap uploads may be done concurrency: lower values implicitly deprioritize /// heatmap uploads vs. other remote storage operations. pub heatmap_upload_concurrency: usize, /// How many remote storage downloads may be done for secondary tenants concurrently. Implicitly /// deprioritises secondary downloads vs. remote storage operations for attached tenants. pub secondary_download_concurrency: usize, /// Maximum number of WAL records to be ingested and committed at the same time pub ingest_batch_size: u64, pub virtual_file_io_engine: virtual_file::IoEngineKind, pub max_vectored_read_bytes: MaxVectoredReadBytes, pub image_compression: ImageCompressionAlgorithm, /// Whether to offload archived timelines automatically pub timeline_offloading: bool, /// How many bytes of ephemeral layer content will we allow per kilobyte of RAM. When this /// is exceeded, we start proactively closing ephemeral layers to limit the total amount /// of ephemeral data. /// /// Setting this to zero disables limits on total ephemeral layer size. pub ephemeral_bytes_per_memory_kb: usize, pub l0_flush: crate::l0_flush::L0FlushConfig, /// Direct IO settings pub virtual_file_io_mode: virtual_file::IoMode, /// Optionally disable disk syncs (unsafe!) pub no_sync: bool, pub wal_receiver_protocol: PostgresClientProtocol, pub page_service_pipelining: pageserver_api::config::PageServicePipeliningConfig, } /// Token for authentication to safekeepers /// /// We do not want to store this in a PageServerConf because the latter may be logged /// and/or serialized at a whim, while the token is secret. Currently this token is the /// same for accessing all tenants/timelines, but may become per-tenant/per-timeline in /// the future, more tokens and auth may arrive for storage broker, completely changing the logic. /// Hence, we resort to a global variable for now instead of passing the token from the /// startup code to the connection code through a dozen layers. pub static SAFEKEEPER_AUTH_TOKEN: OnceCell> = OnceCell::new(); impl PageServerConf { // // Repository paths, relative to workdir. // pub fn tenants_path(&self) -> Utf8PathBuf { self.workdir.join(TENANTS_SEGMENT_NAME) } pub fn deletion_prefix(&self) -> Utf8PathBuf { self.workdir.join("deletion") } pub fn metadata_path(&self) -> Utf8PathBuf { self.workdir.join("metadata.json") } pub fn deletion_list_path(&self, sequence: u64) -> Utf8PathBuf { // Encode a version in the filename, so that if we ever switch away from JSON we can // increment this. const VERSION: u8 = 1; self.deletion_prefix() .join(format!("{sequence:016x}-{VERSION:02x}.list")) } pub fn deletion_header_path(&self) -> Utf8PathBuf { // Encode a version in the filename, so that if we ever switch away from JSON we can // increment this. const VERSION: u8 = 1; self.deletion_prefix().join(format!("header-{VERSION:02x}")) } pub fn tenant_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf { self.tenants_path().join(tenant_shard_id.to_string()) } /// Points to a place in pageserver's local directory, /// where certain tenant's LocationConf be stored. pub(crate) fn tenant_location_config_path( &self, tenant_shard_id: &TenantShardId, ) -> Utf8PathBuf { self.tenant_path(tenant_shard_id) .join(TENANT_LOCATION_CONFIG_NAME) } pub(crate) fn tenant_heatmap_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf { self.tenant_path(tenant_shard_id) .join(TENANT_HEATMAP_BASENAME) } pub fn timelines_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf { self.tenant_path(tenant_shard_id) .join(TIMELINES_SEGMENT_NAME) } pub fn timeline_path( &self, tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, ) -> Utf8PathBuf { self.timelines_path(tenant_shard_id) .join(timeline_id.to_string()) } /// Turns storage remote path of a file into its local path. pub fn local_path(&self, remote_path: &RemotePath) -> Utf8PathBuf { remote_path.with_base(&self.workdir) } // // Postgres distribution paths // pub fn pg_distrib_dir(&self, pg_version: u32) -> anyhow::Result { let path = self.pg_distrib_dir.clone(); #[allow(clippy::manual_range_patterns)] match pg_version { 14 | 15 | 16 | 17 => Ok(path.join(format!("v{pg_version}"))), _ => bail!("Unsupported postgres version: {}", pg_version), } } pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result { Ok(self.pg_distrib_dir(pg_version)?.join("bin")) } pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result { Ok(self.pg_distrib_dir(pg_version)?.join("lib")) } /// Parse a configuration file (pageserver.toml) into a PageServerConf struct, /// validating the input and failing on errors. /// /// This leaves any options not present in the file in the built-in defaults. pub fn parse_and_validate( id: NodeId, config_toml: pageserver_api::config::ConfigToml, workdir: &Utf8Path, ) -> anyhow::Result { let pageserver_api::config::ConfigToml { listen_pg_addr, listen_http_addr, availability_zone, wait_lsn_timeout, wal_redo_timeout, superuser, locale, page_cache_size, max_file_descriptors, pg_distrib_dir, http_auth_type, pg_auth_type, auth_validation_public_key_path, remote_storage, broker_endpoint, broker_keepalive_interval, log_format, metric_collection_interval, metric_collection_endpoint, metric_collection_bucket, synthetic_size_calculation_interval, disk_usage_based_eviction, test_remote_failures, ondemand_download_behavior_treat_error_as_warn, background_task_maximum_delay, control_plane_api, control_plane_api_token, control_plane_emergency_mode, import_pgdata_upcall_api, import_pgdata_upcall_api_token, import_pgdata_aws_endpoint_url, heatmap_upload_concurrency, secondary_download_concurrency, ingest_batch_size, max_vectored_read_bytes, image_compression, timeline_offloading, ephemeral_bytes_per_memory_kb, l0_flush, virtual_file_io_mode, concurrent_tenant_warmup, concurrent_tenant_size_logical_size_queries, virtual_file_io_engine, tenant_config, no_sync, wal_receiver_protocol, page_service_pipelining, } = config_toml; let mut conf = PageServerConf { // ------------------------------------------------------------ // fields that are already fully validated by the ConfigToml Deserialize impl // ------------------------------------------------------------ listen_pg_addr, listen_http_addr, availability_zone, wait_lsn_timeout, wal_redo_timeout, superuser, locale, page_cache_size, max_file_descriptors, http_auth_type, pg_auth_type, auth_validation_public_key_path, remote_storage_config: remote_storage, broker_endpoint, broker_keepalive_interval, log_format, metric_collection_interval, metric_collection_endpoint, metric_collection_bucket, synthetic_size_calculation_interval, disk_usage_based_eviction, test_remote_failures, ondemand_download_behavior_treat_error_as_warn, background_task_maximum_delay, control_plane_api, control_plane_emergency_mode, heatmap_upload_concurrency, secondary_download_concurrency, ingest_batch_size, max_vectored_read_bytes, image_compression, timeline_offloading, ephemeral_bytes_per_memory_kb, import_pgdata_upcall_api, import_pgdata_upcall_api_token: import_pgdata_upcall_api_token.map(SecretString::from), import_pgdata_aws_endpoint_url, wal_receiver_protocol, page_service_pipelining, // ------------------------------------------------------------ // fields that require additional validation or custom handling // ------------------------------------------------------------ workdir: workdir.to_owned(), pg_distrib_dir: pg_distrib_dir.unwrap_or_else(|| { std::env::current_dir() .expect("current_dir() failed") .try_into() .expect("current_dir() is not a valid Utf8Path") }), control_plane_api_token: control_plane_api_token.map(SecretString::from), id, default_tenant_conf: tenant_config, concurrent_tenant_warmup: ConfigurableSemaphore::new(concurrent_tenant_warmup), concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::new( concurrent_tenant_size_logical_size_queries, ), eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::new( // re-use `concurrent_tenant_size_logical_size_queries` concurrent_tenant_size_logical_size_queries, ), virtual_file_io_engine: match virtual_file_io_engine { Some(v) => v, None => match crate::virtual_file::io_engine_feature_test() .context("auto-detect virtual_file_io_engine")? { io_engine::FeatureTestResult::PlatformPreferred(v) => v, // make no noise io_engine::FeatureTestResult::Worse { engine, remark } => { // TODO: bubble this up to the caller so we can tracing::warn! it. eprintln!("auto-detected IO engine is not platform-preferred: engine={engine:?} remark={remark:?}"); engine } }, }, l0_flush: l0_flush .map(crate::l0_flush::L0FlushConfig::from) .unwrap_or_default(), virtual_file_io_mode: virtual_file_io_mode.unwrap_or(virtual_file::IoMode::preferred()), no_sync: no_sync.unwrap_or(false), }; // ------------------------------------------------------------ // custom validation code that covers more than one field in isolation // ------------------------------------------------------------ if conf.http_auth_type == AuthType::NeonJWT || conf.pg_auth_type == AuthType::NeonJWT { let auth_validation_public_key_path = conf .auth_validation_public_key_path .get_or_insert_with(|| workdir.join("auth_public_key.pem")); ensure!( auth_validation_public_key_path.exists(), format!( "Can't find auth_validation_public_key at '{auth_validation_public_key_path}'", ) ); } IndexEntry::validate_checkpoint_distance(conf.default_tenant_conf.checkpoint_distance) .map_err(anyhow::Error::msg) .with_context(|| { format!( "effective checkpoint distance is unsupported: {}", conf.default_tenant_conf.checkpoint_distance ) })?; Ok(conf) } #[cfg(test)] pub fn test_repo_dir(test_name: &str) -> Utf8PathBuf { let test_output_dir = std::env::var("TEST_OUTPUT").unwrap_or("../tmp_check".into()); Utf8PathBuf::from(format!("{test_output_dir}/test_{test_name}")) } pub fn dummy_conf(repo_dir: Utf8PathBuf) -> Self { let pg_distrib_dir = Utf8PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../pg_install"); let config_toml = pageserver_api::config::ConfigToml { wait_lsn_timeout: Duration::from_secs(60), wal_redo_timeout: Duration::from_secs(60), pg_distrib_dir: Some(pg_distrib_dir), metric_collection_interval: Duration::from_secs(60), synthetic_size_calculation_interval: Duration::from_secs(60), background_task_maximum_delay: Duration::ZERO, ..Default::default() }; PageServerConf::parse_and_validate(NodeId(0), config_toml, &repo_dir).unwrap() } } #[derive(serde::Deserialize, serde::Serialize)] #[serde(deny_unknown_fields)] pub struct PageserverIdentity { pub id: NodeId, } /// Configurable semaphore permits setting. /// /// Does not allow semaphore permits to be zero, because at runtime initially zero permits and empty /// semaphore cannot be distinguished, leading any feature using these to await forever (or until /// new permits are added). #[derive(Debug, Clone)] pub struct ConfigurableSemaphore { initial_permits: NonZeroUsize, inner: std::sync::Arc, } impl ConfigurableSemaphore { /// Initializse using a non-zero amount of permits. /// /// Require a non-zero initial permits, because using permits == 0 is a crude way to disable a /// feature such as [`Tenant::gather_size_inputs`]. Otherwise any semaphore using future will /// behave like [`futures::future::pending`], just waiting until new permits are added. /// /// [`Tenant::gather_size_inputs`]: crate::tenant::Tenant::gather_size_inputs pub fn new(initial_permits: NonZeroUsize) -> Self { ConfigurableSemaphore { initial_permits, inner: std::sync::Arc::new(tokio::sync::Semaphore::new(initial_permits.get())), } } /// Returns the configured amount of permits. pub fn initial_permits(&self) -> NonZeroUsize { self.initial_permits } } impl PartialEq for ConfigurableSemaphore { fn eq(&self, other: &Self) -> bool { // the number of permits can be increased at runtime, so we cannot really fulfill the // PartialEq value equality otherwise self.initial_permits == other.initial_permits } } impl Eq for ConfigurableSemaphore {} impl ConfigurableSemaphore { pub fn inner(&self) -> &std::sync::Arc { &self.inner } } #[cfg(test)] mod tests { use camino::Utf8PathBuf; use utils::id::NodeId; use super::PageServerConf; #[test] fn test_empty_config_toml_is_valid() { // we use Default impl of everything in this situation let input = r#" "#; let config_toml = toml_edit::de::from_str::(input) .expect("empty config is valid"); let workdir = Utf8PathBuf::from("/nonexistent"); PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir) .expect("parse_and_validate"); } /// If there's a typo in the pageserver config, we'd rather catch that typo /// and fail pageserver startup than silently ignoring the typo, leaving whoever /// made it in the believe that their config change is effective. /// /// The default in serde is to allow unknown fields, so, we rely /// on developer+review discipline to add `deny_unknown_fields` when adding /// new structs to the config, and these tests here as a regression test. /// /// The alternative to all of this would be to allow unknown fields in the config. /// To catch them, we could have a config check tool or mgmt API endpoint that /// compares the effective config with the TOML on disk and makes sure that /// the on-disk TOML is a strict subset of the effective config. mod unknown_fields_handling { macro_rules! test { ($short_name:ident, $input:expr) => { #[test] fn $short_name() { let input = $input; let err = toml_edit::de::from_str::(&input) .expect_err("some_invalid_field is an invalid field"); dbg!(&err); assert!(err.to_string().contains("some_invalid_field")); } }; } use indoc::indoc; test!( toplevel, indoc! {r#" some_invalid_field = 23 "#} ); test!( toplevel_nested, indoc! {r#" [some_invalid_field] foo = 23 "#} ); test!( disk_usage_based_eviction, indoc! {r#" [disk_usage_based_eviction] some_invalid_field = 23 "#} ); test!( tenant_config, indoc! {r#" [tenant_config] some_invalid_field = 23 "#} ); test!( l0_flush, indoc! {r#" [l0_flush] mode = "direct" some_invalid_field = 23 "#} ); // TODO: fix this => https://github.com/neondatabase/neon/issues/8915 // test!( // remote_storage_config, // indoc! {r#" // [remote_storage_config] // local_path = "/nonexistent" // some_invalid_field = 23 // "#} // ); } }