From 850421ec06dae634b762af0d4a38194eba103884 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 5 Sep 2024 14:59:49 +0200 Subject: [PATCH] refactor(pageserver): rely on serde derive for toml deserialization (#7656) This PR simplifies the pageserver configuration parsing as follows: * introduce the `pageserver_api::config::ConfigToml` type * implement `Default` for `ConfigToml` * use serde derive to do the brain-dead leg-work of processing the toml document * use `serde(default)` to fill in default values * in `pageserver` crate: * use `toml_edit` to deserialize the pageserver.toml string into a `ConfigToml` * `PageServerConfig::parse_and_validate` then * consumes the `ConfigToml` * destructures it exhaustively into its constituent fields * constructs the `PageServerConfig` The rules are: * in `ConfigToml`, use `deny_unknown_fields` everywhere * static default values go in `pageserver_api` * if there cannot be a static default value (e.g. which default IO engine to use, because it depends on the runtime), make the field in `ConfigToml` an `Option` * if runtime-augmentation of a value is needed, do that in `parse_and_validate` * a good example is `virtual_file_io_engine` or `l0_flush`, both of which need to execute code to determine the effective value in `PageServerConf` The benefits: * massive amount of brain-dead repetitive code can be deleted * "unused variable" compile-time errors when removing a config value, due to the exhaustive destructuring in `parse_and_validate` * compile-time errors guide you when adding a new config field Drawbacks: * serde derive is sometimes a bit too magical * `deny_unknown_fields` is easy to miss Future Work / Benefits: * make `neon_local` use `pageserver_api` to construct `ConfigToml` and write it to `pageserver.toml` * This provides more type safety / compile-time errors than the current approach. 
### Refs Fixes #3682 ### Future Work * `remote_storage` deser doesn't reject unknown fields https://github.com/neondatabase/neon/issues/8915 * clean up `libs/pageserver_api/src/config.rs` further * break up into multiple files, at least for tenant config * move `models` as appropriate / refine distinction between config and API models / be explicit about when it's the same * use `pub(crate)` visibility on `mod defaults` to detect stale values --- Cargo.lock | 13 + Cargo.toml | 1 + libs/pageserver_api/Cargo.toml | 10 + libs/pageserver_api/src/config.rs | 527 +++++- libs/pageserver_api/src/models.rs | 71 +- libs/remote_storage/src/config.rs | 25 + libs/utils/src/logging.rs | 12 +- pageserver/Cargo.toml | 3 +- pageserver/benches/bench_ingest.rs | 4 +- pageserver/ctl/src/layer_map_analyzer.rs | 3 +- pageserver/ctl/src/layers.rs | 3 +- pageserver/ctl/src/main.rs | 3 +- pageserver/src/bin/pageserver.rs | 31 +- pageserver/src/config.rs | 1539 +++-------------- pageserver/src/disk_usage_eviction_task.rs | 48 +- pageserver/src/http/routes.rs | 4 +- pageserver/src/l0_flush.rs | 14 +- pageserver/src/statvfs.rs | 28 +- pageserver/src/tenant/config.rs | 196 +-- .../src/tenant/storage_layer/delta_layer.rs | 3 +- .../src/tenant/storage_layer/image_layer.rs | 4 +- .../tenant/storage_layer/inmemory_layer.rs | 2 +- pageserver/src/tenant/tasks.rs | 9 +- pageserver/src/tenant/timeline.rs | 2 +- pageserver/src/tenant/timeline/compaction.rs | 42 +- pageserver/src/tenant/vectored_blob_io.rs | 4 - pageserver/src/virtual_file.rs | 2 +- pageserver/src/virtual_file/io_engine.rs | 11 +- test_runner/fixtures/neon_fixtures.py | 22 +- .../regress/test_pageserver_generations.py | 15 +- test_runner/regress/test_timeline_size.py | 6 +- 31 files changed, 1001 insertions(+), 1656 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5af3ef3804..91917d5351 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2727,6 +2727,12 @@ dependencies = [ "hashbrown 0.14.5", ] +[[package]] +name = "indoc" +version = 
"2.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b248f5224d1d606005e02c97f5aa4e88eeb230488bcc03bc9ca4d7991399f2b5" + [[package]] name = "infer" version = "0.2.3" @@ -3701,6 +3707,7 @@ dependencies = [ "humantime", "humantime-serde", "hyper 0.14.26", + "indoc", "itertools 0.10.5", "md5", "metrics", @@ -3766,6 +3773,7 @@ dependencies = [ "bincode", "byteorder", "bytes", + "camino", "chrono", "const_format", "enum-map", @@ -3773,11 +3781,16 @@ dependencies = [ "humantime", "humantime-serde", "itertools 0.10.5", + "nix 0.27.1", + "postgres_backend", "postgres_ffi", "rand 0.8.5", + "remote_storage", + "reqwest 0.12.4", "serde", "serde_json", "serde_with", + "storage_broker", "strum", "strum_macros", "thiserror", diff --git a/Cargo.toml b/Cargo.toml index fa949f9757..4fea3e8d80 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -103,6 +103,7 @@ humantime-serde = "1.1.1" hyper = "0.14" tokio-tungstenite = "0.20.0" indexmap = "2" +indoc = "2" inotify = "0.10.2" ipnet = "2.9.0" itertools = "0.10" diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml index cb28359ac3..8710904cec 100644 --- a/libs/pageserver_api/Cargo.toml +++ b/libs/pageserver_api/Cargo.toml @@ -4,6 +4,10 @@ version = "0.1.0" edition.workspace = true license.workspace = true +[features] +# See pageserver/Cargo.toml +testing = ["dep:nix"] + [dependencies] serde.workspace = true serde_with.workspace = true @@ -23,6 +27,12 @@ thiserror.workspace = true humantime-serde.workspace = true chrono = { workspace = true, features = ["serde"] } itertools.workspace = true +storage_broker.workspace = true +camino = {workspace = true, features = ["serde1"]} +remote_storage.workspace = true +postgres_backend.workspace = true +nix = {workspace = true, optional = true} +reqwest.workspace = true [dev-dependencies] bincode.workspace = true diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index d996a62349..b2662c562a 100644 --- 
a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -1,15 +1,28 @@ -use std::collections::HashMap; - -use const_format::formatcp; +use camino::Utf8PathBuf; #[cfg(test)] mod tests; +use const_format::formatcp; pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000; pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}"); pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898; pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}"); +use postgres_backend::AuthType; +use remote_storage::RemoteStorageConfig; +use serde_with::serde_as; +use std::{ + collections::HashMap, + num::{NonZeroU64, NonZeroUsize}, + str::FromStr, + time::Duration, +}; +use utils::logging::LogFormat; + +use crate::models::ImageCompressionAlgorithm; +use crate::models::LsnLease; + // Certain metadata (e.g. externally-addressable name, AZ) is delivered // as a separate structure. This information is not neeed by the pageserver // itself, it is only used for registering the pageserver with the control @@ -29,3 +42,511 @@ pub struct NodeMetadata { #[serde(flatten)] pub other: HashMap, } + +/// `pageserver.toml` +/// +/// We use serde derive with `#[serde(default)]` to generate a deserializer +/// that fills in the default values for each config field. +/// +/// If there cannot be a static default value because we need to make runtime +/// checks to determine the default, make it an `Option` (which defaults to None). +/// The runtime check should be done in the consuming crate, i.e., `pageserver`. 
+#[serde_as] +#[derive(Clone, Debug, serde::Deserialize, serde::Serialize)] +#[serde(default, deny_unknown_fields)] +pub struct ConfigToml { + // types mapped 1:1 into the runtime PageServerConfig type + pub listen_pg_addr: String, + pub listen_http_addr: String, + pub availability_zone: Option, + #[serde(with = "humantime_serde")] + pub wait_lsn_timeout: Duration, + #[serde(with = "humantime_serde")] + pub wal_redo_timeout: Duration, + pub superuser: String, + pub page_cache_size: usize, + pub max_file_descriptors: usize, + pub pg_distrib_dir: Option, + #[serde_as(as = "serde_with::DisplayFromStr")] + pub http_auth_type: AuthType, + #[serde_as(as = "serde_with::DisplayFromStr")] + pub pg_auth_type: AuthType, + pub auth_validation_public_key_path: Option, + pub remote_storage: Option, + pub tenant_config: TenantConfigToml, + #[serde_as(as = "serde_with::DisplayFromStr")] + pub broker_endpoint: storage_broker::Uri, + #[serde(with = "humantime_serde")] + pub broker_keepalive_interval: Duration, + #[serde_as(as = "serde_with::DisplayFromStr")] + pub log_format: LogFormat, + pub concurrent_tenant_warmup: NonZeroUsize, + pub concurrent_tenant_size_logical_size_queries: NonZeroUsize, + #[serde(with = "humantime_serde")] + pub metric_collection_interval: Duration, + pub metric_collection_endpoint: Option, + pub metric_collection_bucket: Option, + #[serde(with = "humantime_serde")] + pub synthetic_size_calculation_interval: Duration, + pub disk_usage_based_eviction: Option, + pub test_remote_failures: u64, + pub ondemand_download_behavior_treat_error_as_warn: bool, + #[serde(with = "humantime_serde")] + pub background_task_maximum_delay: Duration, + pub control_plane_api: Option, + pub control_plane_api_token: Option, + pub control_plane_emergency_mode: bool, + pub heatmap_upload_concurrency: usize, + pub secondary_download_concurrency: usize, + pub virtual_file_io_engine: Option, + pub ingest_batch_size: u64, + pub max_vectored_read_bytes: MaxVectoredReadBytes, + pub 
image_compression: ImageCompressionAlgorithm, + pub ephemeral_bytes_per_memory_kb: usize, + pub l0_flush: Option, + pub compact_level0_phase1_value_access: CompactL0Phase1ValueAccess, + pub virtual_file_direct_io: crate::models::virtual_file::DirectIoMode, + pub io_buffer_alignment: usize, +} + +#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[serde(deny_unknown_fields)] +pub struct DiskUsageEvictionTaskConfig { + pub max_usage_pct: utils::serde_percent::Percent, + pub min_avail_bytes: u64, + #[serde(with = "humantime_serde")] + pub period: Duration, + #[cfg(feature = "testing")] + pub mock_statvfs: Option, + /// Select sorting for evicted layers + #[serde(default)] + pub eviction_order: EvictionOrder, +} + +pub mod statvfs { + pub mod mock { + #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] + #[serde(tag = "type")] + pub enum Behavior { + Success { + blocksize: u64, + total_blocks: u64, + name_filter: Option, + }, + #[cfg(feature = "testing")] + Failure { mocked_error: MockedError }, + } + + #[cfg(feature = "testing")] + #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] + #[allow(clippy::upper_case_acronyms)] + pub enum MockedError { + EIO, + } + + #[cfg(feature = "testing")] + impl From for nix::Error { + fn from(e: MockedError) -> Self { + match e { + MockedError::EIO => nix::Error::EIO, + } + } + } + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[serde(tag = "type", content = "args")] +pub enum EvictionOrder { + RelativeAccessed { + highest_layer_count_loses_first: bool, + }, +} + +impl Default for EvictionOrder { + fn default() -> Self { + Self::RelativeAccessed { + highest_layer_count_loses_first: true, + } + } +} + +#[derive( + Eq, + PartialEq, + Debug, + Copy, + Clone, + strum_macros::EnumString, + strum_macros::Display, + serde_with::DeserializeFromStr, + serde_with::SerializeDisplay, +)] +#[strum(serialize_all = 
"kebab-case")] +pub enum GetVectoredImpl { + Sequential, + Vectored, +} + +#[derive( + Eq, + PartialEq, + Debug, + Copy, + Clone, + strum_macros::EnumString, + strum_macros::Display, + serde_with::DeserializeFromStr, + serde_with::SerializeDisplay, +)] +#[strum(serialize_all = "kebab-case")] +pub enum GetImpl { + Legacy, + Vectored, +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[serde(transparent)] +pub struct MaxVectoredReadBytes(pub NonZeroUsize); + +#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)] +#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)] +pub enum CompactL0Phase1ValueAccess { + /// The old way. + PageCachedBlobIo, + /// The new way. + StreamingKmerge { + /// If set, we run both the old way and the new way, validate that + /// they are identical (=> [`CompactL0BypassPageCacheValidation`]), + /// and if the validation fails, + /// - in tests: fail them with a panic or + /// - in prod, log a rate-limited warning and use the old way's results. + /// + /// If not set, we only run the new way and trust its results. + validate: Option, + }, +} + +/// See [`CompactL0Phase1ValueAccess::StreamingKmerge`]. +#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)] +#[serde(rename_all = "kebab-case")] +pub enum CompactL0BypassPageCacheValidation { + /// Validate that the series of (key, lsn) pairs are the same. + KeyLsn, + /// Validate that the entire output of old and new way is identical. + KeyLsnValue, +} + +impl Default for CompactL0Phase1ValueAccess { + fn default() -> Self { + CompactL0Phase1ValueAccess::StreamingKmerge { + // TODO(https://github.com/neondatabase/neon/issues/8184): change to None once confident + validate: Some(CompactL0BypassPageCacheValidation::KeyLsnValue), + } + } +} + +/// A tenant's calcuated configuration, which is the result of merging a +/// tenant's TenantConfOpt with the global TenantConf from PageServerConf. 
+/// +/// For storing and transmitting individual tenant's configuration, see +/// TenantConfOpt. +#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[serde(deny_unknown_fields, default)] +pub struct TenantConfigToml { + // Flush out an inmemory layer, if it's holding WAL older than this + // This puts a backstop on how much WAL needs to be re-digested if the + // page server crashes. + // This parameter actually determines L0 layer file size. + pub checkpoint_distance: u64, + // Inmemory layer is also flushed at least once in checkpoint_timeout to + // eventually upload WAL after activity is stopped. + #[serde(with = "humantime_serde")] + pub checkpoint_timeout: Duration, + // Target file size, when creating image and delta layers. + // This parameter determines L1 layer file size. + pub compaction_target_size: u64, + // How often to check if there's compaction work to be done. + // Duration::ZERO means automatic compaction is disabled. + #[serde(with = "humantime_serde")] + pub compaction_period: Duration, + // Level0 delta layer threshold for compaction. + pub compaction_threshold: usize, + pub compaction_algorithm: crate::models::CompactionAlgorithmSettings, + // Determines how much history is retained, to allow + // branching and read replicas at an older point in time. + // The unit is #of bytes of WAL. + // Page versions older than this are garbage collected away. + pub gc_horizon: u64, + // Interval at which garbage collection is triggered. + // Duration::ZERO means automatic GC is disabled + #[serde(with = "humantime_serde")] + pub gc_period: Duration, + // Delta layer churn threshold to create L1 image layers. + pub image_creation_threshold: usize, + // Determines how much history is retained, to allow + // branching and read replicas at an older point in time. + // The unit is time. + // Page versions older than this are garbage collected away. 
+ #[serde(with = "humantime_serde")] + pub pitr_interval: Duration, + /// Maximum amount of time to wait while opening a connection to receive wal, before erroring. + #[serde(with = "humantime_serde")] + pub walreceiver_connect_timeout: Duration, + /// Considers safekeepers stalled after no WAL updates were received longer than this threshold. + /// A stalled safekeeper will be changed to a newer one when it appears. + #[serde(with = "humantime_serde")] + pub lagging_wal_timeout: Duration, + /// Considers safekeepers lagging when their WAL is behind another safekeeper for more than this threshold. + /// A lagging safekeeper will be changed after `lagging_wal_timeout` time elapses since the last WAL update, + /// to avoid eager reconnects. + pub max_lsn_wal_lag: NonZeroU64, + pub eviction_policy: crate::models::EvictionPolicy, + pub min_resident_size_override: Option, + // See the corresponding metric's help string. + #[serde(with = "humantime_serde")] + pub evictions_low_residence_duration_metric_threshold: Duration, + + /// If non-zero, the period between uploads of a heatmap from attached tenants. This + /// may be disabled if a Tenant will not have secondary locations: only secondary + /// locations will use the heatmap uploaded by attached locations. + #[serde(with = "humantime_serde")] + pub heatmap_period: Duration, + + /// If true then SLRU segments are dowloaded on demand, if false SLRU segments are included in basebackup + pub lazy_slru_download: bool, + + pub timeline_get_throttle: crate::models::ThrottleConfig, + + // How much WAL must be ingested before checking again whether a new image layer is required. + // Expresed in multiples of checkpoint distance. + pub image_layer_creation_check_threshold: u8, + + /// Switch to a new aux file policy. Switching this flag requires the user has not written any aux file into + /// the storage before, and this flag cannot be switched back. Otherwise there will be data corruptions. 
+ /// There is a `last_aux_file_policy` flag which gets persisted in `index_part.json` once the first aux + /// file is written. + pub switch_aux_file_policy: crate::models::AuxFilePolicy, + + /// The length for an explicit LSN lease request. + /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval. + #[serde(with = "humantime_serde")] + pub lsn_lease_length: Duration, + + /// The length for an implicit LSN lease granted as part of `get_lsn_by_timestamp` request. + /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval. + #[serde(with = "humantime_serde")] + pub lsn_lease_length_for_ts: Duration, +} + +pub mod defaults { + use crate::models::ImageCompressionAlgorithm; + + pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT; + + pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "300 s"; + pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s"; + + pub const DEFAULT_SUPERUSER: &str = "cloud_admin"; + + pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192; + pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100; + + pub const DEFAULT_LOG_FORMAT: &str = "plain"; + + pub const DEFAULT_CONCURRENT_TENANT_WARMUP: usize = 8; + + pub const DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES: usize = 1; + + pub const DEFAULT_METRIC_COLLECTION_INTERVAL: &str = "10 min"; + pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option = None; + pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min"; + pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s"; + + pub const DEFAULT_HEATMAP_UPLOAD_CONCURRENCY: usize = 8; + pub const DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY: usize = 1; + + pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100; + + pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB + + pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm = + ImageCompressionAlgorithm::Zstd { level: Some(1) }; + + pub const DEFAULT_VALIDATE_VECTORED_GET: bool = false; + + pub const 
DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0; + + pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 512; +} + +impl Default for ConfigToml { + fn default() -> Self { + use defaults::*; + + Self { + listen_pg_addr: (DEFAULT_PG_LISTEN_ADDR.to_string()), + listen_http_addr: (DEFAULT_HTTP_LISTEN_ADDR.to_string()), + availability_zone: (None), + wait_lsn_timeout: (humantime::parse_duration(DEFAULT_WAIT_LSN_TIMEOUT) + .expect("cannot parse default wait lsn timeout")), + wal_redo_timeout: (humantime::parse_duration(DEFAULT_WAL_REDO_TIMEOUT) + .expect("cannot parse default wal redo timeout")), + superuser: (DEFAULT_SUPERUSER.to_string()), + page_cache_size: (DEFAULT_PAGE_CACHE_SIZE), + max_file_descriptors: (DEFAULT_MAX_FILE_DESCRIPTORS), + pg_distrib_dir: None, // Utf8PathBuf::from("./pg_install"), // TODO: formely, this was std::env::current_dir() + http_auth_type: (AuthType::Trust), + pg_auth_type: (AuthType::Trust), + auth_validation_public_key_path: (None), + remote_storage: None, + broker_endpoint: (storage_broker::DEFAULT_ENDPOINT + .parse() + .expect("failed to parse default broker endpoint")), + broker_keepalive_interval: (humantime::parse_duration( + storage_broker::DEFAULT_KEEPALIVE_INTERVAL, + ) + .expect("cannot parse default keepalive interval")), + log_format: (LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()), + + concurrent_tenant_warmup: (NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP) + .expect("Invalid default constant")), + concurrent_tenant_size_logical_size_queries: NonZeroUsize::new(1).unwrap(), + metric_collection_interval: (humantime::parse_duration( + DEFAULT_METRIC_COLLECTION_INTERVAL, + ) + .expect("cannot parse default metric collection interval")), + synthetic_size_calculation_interval: (humantime::parse_duration( + DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL, + ) + .expect("cannot parse default synthetic size calculation interval")), + metric_collection_endpoint: (DEFAULT_METRIC_COLLECTION_ENDPOINT), + + metric_collection_bucket: 
(None), + + disk_usage_based_eviction: (None), + + test_remote_failures: (0), + + ondemand_download_behavior_treat_error_as_warn: (false), + + background_task_maximum_delay: (humantime::parse_duration( + DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY, + ) + .unwrap()), + + control_plane_api: (None), + control_plane_api_token: (None), + control_plane_emergency_mode: (false), + + heatmap_upload_concurrency: (DEFAULT_HEATMAP_UPLOAD_CONCURRENCY), + secondary_download_concurrency: (DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY), + + ingest_batch_size: (DEFAULT_INGEST_BATCH_SIZE), + + virtual_file_io_engine: None, + + max_vectored_read_bytes: (MaxVectoredReadBytes( + NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(), + )), + image_compression: (DEFAULT_IMAGE_COMPRESSION), + ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), + l0_flush: None, + compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(), + virtual_file_direct_io: crate::models::virtual_file::DirectIoMode::default(), + + io_buffer_alignment: DEFAULT_IO_BUFFER_ALIGNMENT, + + tenant_config: TenantConfigToml::default(), + } + } +} + +pub mod tenant_conf_defaults { + + // FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB + // would be more appropriate. But a low value forces the code to be exercised more, + // which is good for now to trigger bugs. + // This parameter actually determines L0 layer file size. + pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024; + pub const DEFAULT_CHECKPOINT_TIMEOUT: &str = "10 m"; + + // FIXME the below configs are only used by legacy algorithm. The new algorithm + // has different parameters. + + // Target file size, when creating image and delta layers. + // This parameter determines L1 layer file size. 
+ pub const DEFAULT_COMPACTION_TARGET_SIZE: u64 = 128 * 1024 * 1024; + + pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s"; + pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10; + pub const DEFAULT_COMPACTION_ALGORITHM: crate::models::CompactionAlgorithm = + crate::models::CompactionAlgorithm::Legacy; + + pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024; + + // Large DEFAULT_GC_PERIOD is fine as long as PITR_INTERVAL is larger. + // If there's a need to decrease this value, first make sure that GC + // doesn't hold a layer map write lock for non-trivial operations. + // Relevant: https://github.com/neondatabase/neon/issues/3394 + pub const DEFAULT_GC_PERIOD: &str = "1 hr"; + pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3; + pub const DEFAULT_PITR_INTERVAL: &str = "7 days"; + pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds"; + pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds"; + // The default limit on WAL lag should be set to avoid causing disconnects under high throughput + // scenarios: since the broker stats are updated ~1/s, a value of 1GiB should be sufficient for + // throughputs up to 1GiB/s per timeline. + pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1024 * 1024 * 1024; + pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour"; + // By default ingest enough WAL for two new L0 layers before checking if new image + // image layers should be created. 
+ pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2; + + pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100; +} + +impl Default for TenantConfigToml { + fn default() -> Self { + use tenant_conf_defaults::*; + Self { + checkpoint_distance: DEFAULT_CHECKPOINT_DISTANCE, + checkpoint_timeout: humantime::parse_duration(DEFAULT_CHECKPOINT_TIMEOUT) + .expect("cannot parse default checkpoint timeout"), + compaction_target_size: DEFAULT_COMPACTION_TARGET_SIZE, + compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD) + .expect("cannot parse default compaction period"), + compaction_threshold: DEFAULT_COMPACTION_THRESHOLD, + compaction_algorithm: crate::models::CompactionAlgorithmSettings { + kind: DEFAULT_COMPACTION_ALGORITHM, + }, + gc_horizon: DEFAULT_GC_HORIZON, + gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD) + .expect("cannot parse default gc period"), + image_creation_threshold: DEFAULT_IMAGE_CREATION_THRESHOLD, + pitr_interval: humantime::parse_duration(DEFAULT_PITR_INTERVAL) + .expect("cannot parse default PITR interval"), + walreceiver_connect_timeout: humantime::parse_duration( + DEFAULT_WALRECEIVER_CONNECT_TIMEOUT, + ) + .expect("cannot parse default walreceiver connect timeout"), + lagging_wal_timeout: humantime::parse_duration(DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT) + .expect("cannot parse default walreceiver lagging wal timeout"), + max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG) + .expect("cannot parse default max walreceiver Lsn wal lag"), + eviction_policy: crate::models::EvictionPolicy::NoEviction, + min_resident_size_override: None, + evictions_low_residence_duration_metric_threshold: humantime::parse_duration( + DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD, + ) + .expect("cannot parse default evictions_low_residence_duration_metric_threshold"), + heatmap_period: Duration::ZERO, + lazy_slru_download: false, + timeline_get_throttle: crate::models::ThrottleConfig::disabled(), + 
image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD, + switch_aux_file_policy: crate::models::AuxFilePolicy::default_tenant_config(), + lsn_lease_length: LsnLease::DEFAULT_LENGTH, + lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS, + } + } +} diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 87e8f8305a..d13d04eb1b 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -6,6 +6,7 @@ pub use utilization::PageserverUtilization; use std::{ collections::HashMap, + fmt::Display, io::{BufRead, Read}, num::{NonZeroU32, NonZeroU64, NonZeroUsize}, str::FromStr, @@ -435,7 +436,9 @@ pub enum CompactionAlgorithm { Tiered, } -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[derive( + Debug, Clone, Copy, PartialEq, Eq, serde_with::DeserializeFromStr, serde_with::SerializeDisplay, +)] pub enum ImageCompressionAlgorithm { // Disabled for writes, support decompressing during read path Disabled, @@ -470,11 +473,33 @@ impl FromStr for ImageCompressionAlgorithm { } } +impl Display for ImageCompressionAlgorithm { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + ImageCompressionAlgorithm::Disabled => write!(f, "disabled"), + ImageCompressionAlgorithm::Zstd { level } => { + if let Some(level) = level { + write!(f, "zstd({})", level) + } else { + write!(f, "zstd") + } + } + } + } +} + #[derive(Eq, PartialEq, Debug, Clone, Serialize, Deserialize)] pub struct CompactionAlgorithmSettings { pub kind: CompactionAlgorithm, } +#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)] +#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)] +pub enum L0FlushConfig { + #[serde(rename_all = "snake_case")] + Direct { max_concurrency: NonZeroUsize }, +} + #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] pub struct EvictionPolicyLayerAccessThreshold { #[serde(with = 
"humantime_serde")] @@ -1656,21 +1681,33 @@ mod tests { #[test] fn test_image_compression_algorithm_parsing() { use ImageCompressionAlgorithm::*; - assert_eq!( - ImageCompressionAlgorithm::from_str("disabled").unwrap(), - Disabled - ); - assert_eq!( - ImageCompressionAlgorithm::from_str("zstd").unwrap(), - Zstd { level: None } - ); - assert_eq!( - ImageCompressionAlgorithm::from_str("zstd(18)").unwrap(), - Zstd { level: Some(18) } - ); - assert_eq!( - ImageCompressionAlgorithm::from_str("zstd(-3)").unwrap(), - Zstd { level: Some(-3) } - ); + let cases = [ + ("disabled", Disabled), + ("zstd", Zstd { level: None }), + ("zstd(18)", Zstd { level: Some(18) }), + ("zstd(-3)", Zstd { level: Some(-3) }), + ]; + + for (display, expected) in cases { + assert_eq!( + ImageCompressionAlgorithm::from_str(display).unwrap(), + expected, + "parsing works" + ); + assert_eq!(format!("{expected}"), display, "Display FromStr roundtrip"); + + let ser = serde_json::to_string(&expected).expect("serialization"); + assert_eq!( + serde_json::from_str::(&ser).unwrap(), + expected, + "serde roundtrip" + ); + + assert_eq!( + serde_json::Value::String(display.to_string()), + serde_json::to_value(expected).unwrap(), + "Display is the serde serialization" + ); + } } } diff --git a/libs/remote_storage/src/config.rs b/libs/remote_storage/src/config.rs index fa3f2cba58..f819a1572a 100644 --- a/libs/remote_storage/src/config.rs +++ b/libs/remote_storage/src/config.rs @@ -235,6 +235,31 @@ timeout = '5s'"; ); } + #[test] + fn test_storage_class_serde_roundtrip() { + let classes = [ + None, + Some(StorageClass::Standard), + Some(StorageClass::IntelligentTiering), + ]; + for class in classes { + #[derive(Serialize, Deserialize)] + struct Wrapper { + #[serde( + deserialize_with = "deserialize_storage_class", + serialize_with = "serialize_storage_class" + )] + class: Option, + } + let wrapped = Wrapper { + class: class.clone(), + }; + let serialized = serde_json::to_string(&wrapped).unwrap(); + let 
deserialized: Wrapper = serde_json::from_str(&serialized).unwrap(); + assert_eq!(class, deserialized.class); + } + } + #[test] fn test_azure_parsing() { let toml = "\ diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index f7b73dc984..71af43a4da 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -5,7 +5,9 @@ use metrics::{IntCounter, IntCounterVec}; use once_cell::sync::Lazy; use strum_macros::{EnumString, EnumVariantNames}; -#[derive(EnumString, EnumVariantNames, Eq, PartialEq, Debug, Clone, Copy)] +#[derive( + EnumString, strum_macros::Display, EnumVariantNames, Eq, PartialEq, Debug, Clone, Copy, +)] #[strum(serialize_all = "snake_case")] pub enum LogFormat { Plain, @@ -274,6 +276,14 @@ impl From for SecretString { } } +impl FromStr for SecretString { + type Err = std::convert::Infallible; + + fn from_str(s: &str) -> Result { + Ok(Self(s.to_string())) + } +} + impl std::fmt::Debug for SecretString { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "[SECRET]") diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 9c02ce3fbc..24373afca3 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -8,7 +8,7 @@ license.workspace = true default = [] # Enables test-only APIs, incuding failpoints. 
In particular, enables the `fail_point!` macro, # which adds some runtime cost to run tests on outage conditions -testing = ["fail/failpoints"] +testing = ["fail/failpoints", "pageserver_api/testing" ] [dependencies] anyhow.workspace = true @@ -101,6 +101,7 @@ procfs.workspace = true criterion.workspace = true hex-literal.workspace = true tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] } +indoc.workspace = true [[bench]] name = "bench_layer_map" diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs index 1be4391d81..72cbb6beab 100644 --- a/pageserver/benches/bench_ingest.rs +++ b/pageserver/benches/bench_ingest.rs @@ -4,7 +4,7 @@ use bytes::Bytes; use camino::Utf8PathBuf; use criterion::{criterion_group, criterion_main, Criterion}; use pageserver::{ - config::{defaults::DEFAULT_IO_BUFFER_ALIGNMENT, PageServerConf}, + config::PageServerConf, context::{DownloadBehavior, RequestContext}, l0_flush::{L0FlushConfig, L0FlushGlobalState}, page_cache, @@ -167,7 +167,7 @@ fn criterion_benchmark(c: &mut Criterion) { virtual_file::init( 16384, virtual_file::io_engine_for_bench(), - DEFAULT_IO_BUFFER_ALIGNMENT, + pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT, ); page_cache::init(conf.page_cache_size); diff --git a/pageserver/ctl/src/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs index 8092c203c3..a07107753e 100644 --- a/pageserver/ctl/src/layer_map_analyzer.rs +++ b/pageserver/ctl/src/layer_map_analyzer.rs @@ -4,7 +4,6 @@ use anyhow::Result; use camino::{Utf8Path, Utf8PathBuf}; -use pageserver::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT; use pageserver::context::{DownloadBehavior, RequestContext}; use pageserver::task_mgr::TaskKind; use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; @@ -148,7 +147,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> { pageserver::virtual_file::init( 10, 
virtual_file::api::IoEngineKind::StdFs, - DEFAULT_IO_BUFFER_ALIGNMENT, + pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT, ); pageserver::page_cache::init(100); diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs index e0f978eaa2..dd753398e2 100644 --- a/pageserver/ctl/src/layers.rs +++ b/pageserver/ctl/src/layers.rs @@ -3,7 +3,6 @@ use std::path::{Path, PathBuf}; use anyhow::Result; use camino::{Utf8Path, Utf8PathBuf}; use clap::Subcommand; -use pageserver::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT; use pageserver::context::{DownloadBehavior, RequestContext}; use pageserver::task_mgr::TaskKind; use pageserver::tenant::block_io::BlockCursor; @@ -194,7 +193,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> { pageserver::virtual_file::init( 10, virtual_file::api::IoEngineKind::StdFs, - DEFAULT_IO_BUFFER_ALIGNMENT, + pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT, ); pageserver::page_cache::init(100); diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index 7a6c7675bb..3b66b0c4aa 100644 --- a/pageserver/ctl/src/main.rs +++ b/pageserver/ctl/src/main.rs @@ -20,14 +20,13 @@ use clap::{Parser, Subcommand}; use index_part::IndexPartCmd; use layers::LayerCmd; use pageserver::{ - config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT, context::{DownloadBehavior, RequestContext}, page_cache, task_mgr::TaskKind, tenant::{dump_layerfile_from_path, metadata::TimelineMetadata}, virtual_file, }; -use pageserver_api::shard::TenantShardId; +use pageserver_api::{config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT, shard::TenantShardId}; use postgres_ffi::ControlFileData; use remote_storage::{RemotePath, RemoteStorageConfig}; use tokio_util::sync::CancellationToken; diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 850bd87b95..2c60e8d7d1 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -5,6 +5,7 @@ use std::env; use std::env::{var, VarError}; 
use std::io::Read; +use std::str::FromStr; use std::sync::Arc; use std::time::Duration; @@ -223,27 +224,15 @@ fn initialize_config( } }; - let config: toml_edit::Document = match std::fs::File::open(cfg_file_path) { - Ok(mut f) => { - let md = f.metadata().context("stat config file")?; - if md.is_file() { - let mut s = String::new(); - f.read_to_string(&mut s).context("read config file")?; - s.parse().context("parse config file toml")? - } else { - anyhow::bail!("directory entry exists but is not a file: {cfg_file_path}"); - } - } - Err(e) => { - anyhow::bail!("open pageserver config: {e}: {cfg_file_path}"); - } - }; - - debug!("Using pageserver toml: {config}"); - - // Construct the runtime representation - let conf = PageServerConf::parse_and_validate(identity.id, &config, workdir) - .context("Failed to parse pageserver configuration")?; + let config_file_contents = + std::fs::read_to_string(cfg_file_path).context("read config file from filesystem")?; + let config_toml = serde_path_to_error::deserialize( + toml_edit::de::Deserializer::from_str(&config_file_contents) + .context("build toml deserializer")?, + ) + .context("deserialize config toml")?; + let conf = PageServerConf::parse_and_validate(identity.id, config_toml, workdir) + .context("runtime-validation of config toml")?; Ok(Box::leak(Box::new(conf))) } diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 9e4530ba3c..c159b66905 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -4,11 +4,13 @@ //! file, or on the command line. //! See also `settings.md` for better description on every parameter. 
-use anyhow::{anyhow, bail, ensure, Context, Result}; -use pageserver_api::{models::ImageCompressionAlgorithm, shard::TenantShardId}; +use anyhow::{bail, ensure, Context}; +use pageserver_api::models::ImageCompressionAlgorithm; +use pageserver_api::{ + config::{DiskUsageEvictionTaskConfig, MaxVectoredReadBytes}, + shard::TenantShardId, +}; use remote_storage::{RemotePath, RemoteStorageConfig}; -use serde::de::IntoDeserializer; -use serde::{self, Deserialize}; use std::env; use storage_broker::Uri; use utils::crashsafe::path_with_suffix_extension; @@ -17,10 +19,8 @@ use utils::logging::SecretString; use once_cell::sync::OnceCell; use reqwest::Url; use std::num::NonZeroUsize; -use std::str::FromStr; use std::sync::Arc; use std::time::Duration; -use toml_edit::{Document, Item}; use camino::{Utf8Path, Utf8PathBuf}; use postgres_backend::AuthType; @@ -29,139 +29,27 @@ use utils::{ logging::LogFormat, }; -use crate::l0_flush::L0FlushConfig; -use crate::tenant::config::TenantConfOpt; use crate::tenant::storage_layer::inmemory_layer::IndexEntry; -use crate::tenant::timeline::compaction::CompactL0Phase1ValueAccess; -use crate::tenant::vectored_blob_io::MaxVectoredReadBytes; use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; -use crate::{disk_usage_eviction_task::DiskUsageEvictionTaskConfig, virtual_file::io_engine}; -use crate::{tenant::config::TenantConf, virtual_file}; +use crate::virtual_file; +use crate::virtual_file::io_engine; use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX}; -use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP; - -use self::defaults::DEFAULT_VIRTUAL_FILE_IO_ENGINE; - -pub mod defaults { - use crate::tenant::config::defaults::*; - use const_format::formatcp; - - pub use pageserver_api::config::{ - DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR, - DEFAULT_PG_LISTEN_PORT, - }; - pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT; - - pub const 
DEFAULT_WAIT_LSN_TIMEOUT: &str = "300 s"; - pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s"; - - pub const DEFAULT_SUPERUSER: &str = "cloud_admin"; - - pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192; - pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100; - - pub const DEFAULT_LOG_FORMAT: &str = "plain"; - - pub const DEFAULT_CONCURRENT_TENANT_WARMUP: usize = 8; - - pub const DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES: usize = - super::ConfigurableSemaphore::DEFAULT_INITIAL.get(); - - pub const DEFAULT_METRIC_COLLECTION_INTERVAL: &str = "10 min"; - pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option = None; - pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min"; - pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s"; - - pub const DEFAULT_HEATMAP_UPLOAD_CONCURRENCY: usize = 8; - pub const DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY: usize = 1; - - pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100; - - #[cfg(target_os = "linux")] - pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "tokio-epoll-uring"; - - #[cfg(not(target_os = "linux"))] - pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "std-fs"; - - pub const DEFAULT_GET_VECTORED_IMPL: &str = "vectored"; - - pub const DEFAULT_GET_IMPL: &str = "vectored"; - - pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB - - pub const DEFAULT_IMAGE_COMPRESSION: &str = "zstd(1)"; - - pub const DEFAULT_VALIDATE_VECTORED_GET: bool = false; - - pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0; - - pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 512; - - /// - /// Default built-in configuration file. 
- /// - pub const DEFAULT_CONFIG_FILE: &str = formatcp!( - r#" -# Initial configuration file created by 'pageserver --init' -#listen_pg_addr = '{DEFAULT_PG_LISTEN_ADDR}' -#listen_http_addr = '{DEFAULT_HTTP_LISTEN_ADDR}' - -#wait_lsn_timeout = '{DEFAULT_WAIT_LSN_TIMEOUT}' -#wal_redo_timeout = '{DEFAULT_WAL_REDO_TIMEOUT}' - -#page_cache_size = {DEFAULT_PAGE_CACHE_SIZE} -#max_file_descriptors = {DEFAULT_MAX_FILE_DESCRIPTORS} - -# initial superuser role name to use when creating a new tenant -#initial_superuser_name = '{DEFAULT_SUPERUSER}' - -#broker_endpoint = '{BROKER_DEFAULT_ENDPOINT}' - -#log_format = '{DEFAULT_LOG_FORMAT}' - -#concurrent_tenant_size_logical_size_queries = '{DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES}' -#concurrent_tenant_warmup = '{DEFAULT_CONCURRENT_TENANT_WARMUP}' - -#metric_collection_interval = '{DEFAULT_METRIC_COLLECTION_INTERVAL}' -#synthetic_size_calculation_interval = '{DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL}' - -#disk_usage_based_eviction = {{ max_usage_pct = .., min_avail_bytes = .., period = "10s"}} - -#background_task_maximum_delay = '{DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY}' - -#ingest_batch_size = {DEFAULT_INGEST_BATCH_SIZE} - -#virtual_file_io_engine = '{DEFAULT_VIRTUAL_FILE_IO_ENGINE}' - -#max_vectored_read_bytes = '{DEFAULT_MAX_VECTORED_READ_BYTES}' - -[tenant_config] -#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes -#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT} -#compaction_target_size = {DEFAULT_COMPACTION_TARGET_SIZE} # in bytes -#compaction_period = '{DEFAULT_COMPACTION_PERIOD}' -#compaction_threshold = {DEFAULT_COMPACTION_THRESHOLD} - -#gc_period = '{DEFAULT_GC_PERIOD}' -#gc_horizon = {DEFAULT_GC_HORIZON} -#image_creation_threshold = {DEFAULT_IMAGE_CREATION_THRESHOLD} -#pitr_interval = '{DEFAULT_PITR_INTERVAL}' - -#min_resident_size_override = .. 
# in bytes -#evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}' - -#heatmap_upload_concurrency = {DEFAULT_HEATMAP_UPLOAD_CONCURRENCY} -#secondary_download_concurrency = {DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY} - -#ephemeral_bytes_per_memory_kb = {DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB} - -#[remote_storage] - -"# - ); -} - +/// Global state of pageserver. +/// +/// It's mostly immutable configuration, but some semaphores and the +/// like crept in over time and the name stuck. +/// +/// Instantiated by deserializing `pageserver.toml` into [`pageserver_api::config::ConfigToml`] +/// and passing that to [`PageServerConf::parse_and_validate`]. +/// +/// # Adding a New Field +/// +/// 1. Add the field to `pageserver_api::config::ConfigToml`. +/// 2. Fix compiler errors (exhaustive destructuring will guide you). +/// +/// For fields that require additional validation or filling in of defaults at runtime, +/// check for examples in the [`PageServerConf::parse_and_validate`] method. #[derive(Debug, Clone, PartialEq, Eq)] pub struct PageServerConf { // Identifier of that particular pageserver so e g safekeepers @@ -207,7 +95,7 @@ pub struct PageServerConf { pub remote_storage_config: Option, - pub default_tenant_conf: TenantConf, + pub default_tenant_conf: crate::tenant::config::TenantConf, /// Storage broker endpoints to connect to. pub broker_endpoint: Uri, @@ -284,11 +172,11 @@ pub struct PageServerConf { /// Setting this to zero disables limits on total ephemeral layer size. pub ephemeral_bytes_per_memory_kb: usize, - pub l0_flush: L0FlushConfig, + pub l0_flush: crate::l0_flush::L0FlushConfig, /// This flag is temporary and will be removed after gradual rollout. /// See . 
- pub compact_level0_phase1_value_access: CompactL0Phase1ValueAccess, + pub compact_level0_phase1_value_access: pageserver_api::config::CompactL0Phase1ValueAccess, /// Direct IO settings pub virtual_file_direct_io: virtual_file::DirectIoMode, @@ -304,472 +192,6 @@ pub struct PageServerConf { /// startup code to the connection code through a dozen layers. pub static SAFEKEEPER_AUTH_TOKEN: OnceCell> = OnceCell::new(); -// use dedicated enum for builder to better indicate the intention -// and avoid possible confusion with nested options -#[derive(Clone, Default)] -pub enum BuilderValue { - Set(T), - #[default] - NotSet, -} - -impl BuilderValue { - pub fn ok_or(&self, field_name: &'static str, default: BuilderValue) -> anyhow::Result { - match self { - Self::Set(v) => Ok(v.clone()), - Self::NotSet => match default { - BuilderValue::Set(v) => Ok(v.clone()), - BuilderValue::NotSet => { - anyhow::bail!("missing config value {field_name:?}") - } - }, - } - } -} - -// needed to simplify config construction -#[derive(Default)] -struct PageServerConfigBuilder { - listen_pg_addr: BuilderValue, - - listen_http_addr: BuilderValue, - - availability_zone: BuilderValue>, - - wait_lsn_timeout: BuilderValue, - wal_redo_timeout: BuilderValue, - - superuser: BuilderValue, - - page_cache_size: BuilderValue, - max_file_descriptors: BuilderValue, - - workdir: BuilderValue, - - pg_distrib_dir: BuilderValue, - - http_auth_type: BuilderValue, - pg_auth_type: BuilderValue, - - // - auth_validation_public_key_path: BuilderValue>, - remote_storage_config: BuilderValue>, - - broker_endpoint: BuilderValue, - broker_keepalive_interval: BuilderValue, - - log_format: BuilderValue, - - concurrent_tenant_warmup: BuilderValue, - concurrent_tenant_size_logical_size_queries: BuilderValue, - - metric_collection_interval: BuilderValue, - metric_collection_endpoint: BuilderValue>, - synthetic_size_calculation_interval: BuilderValue, - metric_collection_bucket: BuilderValue>, - - disk_usage_based_eviction: 
BuilderValue>, - - test_remote_failures: BuilderValue, - - ondemand_download_behavior_treat_error_as_warn: BuilderValue, - - background_task_maximum_delay: BuilderValue, - - control_plane_api: BuilderValue>, - control_plane_api_token: BuilderValue>, - control_plane_emergency_mode: BuilderValue, - - heatmap_upload_concurrency: BuilderValue, - secondary_download_concurrency: BuilderValue, - - ingest_batch_size: BuilderValue, - - virtual_file_io_engine: BuilderValue, - - max_vectored_read_bytes: BuilderValue, - - image_compression: BuilderValue, - - ephemeral_bytes_per_memory_kb: BuilderValue, - - l0_flush: BuilderValue, - - compact_level0_phase1_value_access: BuilderValue, - - virtual_file_direct_io: BuilderValue, - - io_buffer_alignment: BuilderValue, -} - -impl PageServerConfigBuilder { - fn new() -> Self { - Self::default() - } - - #[inline(always)] - fn default_values() -> Self { - use self::BuilderValue::*; - use defaults::*; - Self { - listen_pg_addr: Set(DEFAULT_PG_LISTEN_ADDR.to_string()), - listen_http_addr: Set(DEFAULT_HTTP_LISTEN_ADDR.to_string()), - availability_zone: Set(None), - wait_lsn_timeout: Set(humantime::parse_duration(DEFAULT_WAIT_LSN_TIMEOUT) - .expect("cannot parse default wait lsn timeout")), - wal_redo_timeout: Set(humantime::parse_duration(DEFAULT_WAL_REDO_TIMEOUT) - .expect("cannot parse default wal redo timeout")), - superuser: Set(DEFAULT_SUPERUSER.to_string()), - page_cache_size: Set(DEFAULT_PAGE_CACHE_SIZE), - max_file_descriptors: Set(DEFAULT_MAX_FILE_DESCRIPTORS), - workdir: Set(Utf8PathBuf::new()), - pg_distrib_dir: Set(Utf8PathBuf::from_path_buf( - env::current_dir().expect("cannot access current directory"), - ) - .expect("non-Unicode path") - .join("pg_install")), - http_auth_type: Set(AuthType::Trust), - pg_auth_type: Set(AuthType::Trust), - auth_validation_public_key_path: Set(None), - remote_storage_config: Set(None), - broker_endpoint: Set(storage_broker::DEFAULT_ENDPOINT - .parse() - .expect("failed to parse default broker 
endpoint")), - broker_keepalive_interval: Set(humantime::parse_duration( - storage_broker::DEFAULT_KEEPALIVE_INTERVAL, - ) - .expect("cannot parse default keepalive interval")), - log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()), - - concurrent_tenant_warmup: Set(NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP) - .expect("Invalid default constant")), - concurrent_tenant_size_logical_size_queries: Set( - ConfigurableSemaphore::DEFAULT_INITIAL, - ), - metric_collection_interval: Set(humantime::parse_duration( - DEFAULT_METRIC_COLLECTION_INTERVAL, - ) - .expect("cannot parse default metric collection interval")), - synthetic_size_calculation_interval: Set(humantime::parse_duration( - DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL, - ) - .expect("cannot parse default synthetic size calculation interval")), - metric_collection_endpoint: Set(DEFAULT_METRIC_COLLECTION_ENDPOINT), - - metric_collection_bucket: Set(None), - - disk_usage_based_eviction: Set(None), - - test_remote_failures: Set(0), - - ondemand_download_behavior_treat_error_as_warn: Set(false), - - background_task_maximum_delay: Set(humantime::parse_duration( - DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY, - ) - .unwrap()), - - control_plane_api: Set(None), - control_plane_api_token: Set(None), - control_plane_emergency_mode: Set(false), - - heatmap_upload_concurrency: Set(DEFAULT_HEATMAP_UPLOAD_CONCURRENCY), - secondary_download_concurrency: Set(DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY), - - ingest_batch_size: Set(DEFAULT_INGEST_BATCH_SIZE), - - virtual_file_io_engine: Set(DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap()), - - max_vectored_read_bytes: Set(MaxVectoredReadBytes( - NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(), - )), - image_compression: Set(DEFAULT_IMAGE_COMPRESSION.parse().unwrap()), - ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), - l0_flush: Set(L0FlushConfig::default()), - compact_level0_phase1_value_access: 
Set(CompactL0Phase1ValueAccess::default()), - virtual_file_direct_io: Set(virtual_file::DirectIoMode::default()), - io_buffer_alignment: Set(DEFAULT_IO_BUFFER_ALIGNMENT), - } - } -} - -impl PageServerConfigBuilder { - pub fn listen_pg_addr(&mut self, listen_pg_addr: String) { - self.listen_pg_addr = BuilderValue::Set(listen_pg_addr) - } - - pub fn listen_http_addr(&mut self, listen_http_addr: String) { - self.listen_http_addr = BuilderValue::Set(listen_http_addr) - } - - pub fn availability_zone(&mut self, availability_zone: Option) { - self.availability_zone = BuilderValue::Set(availability_zone) - } - - pub fn wait_lsn_timeout(&mut self, wait_lsn_timeout: Duration) { - self.wait_lsn_timeout = BuilderValue::Set(wait_lsn_timeout) - } - - pub fn wal_redo_timeout(&mut self, wal_redo_timeout: Duration) { - self.wal_redo_timeout = BuilderValue::Set(wal_redo_timeout) - } - - pub fn superuser(&mut self, superuser: String) { - self.superuser = BuilderValue::Set(superuser) - } - - pub fn page_cache_size(&mut self, page_cache_size: usize) { - self.page_cache_size = BuilderValue::Set(page_cache_size) - } - - pub fn max_file_descriptors(&mut self, max_file_descriptors: usize) { - self.max_file_descriptors = BuilderValue::Set(max_file_descriptors) - } - - pub fn workdir(&mut self, workdir: Utf8PathBuf) { - self.workdir = BuilderValue::Set(workdir) - } - - pub fn pg_distrib_dir(&mut self, pg_distrib_dir: Utf8PathBuf) { - self.pg_distrib_dir = BuilderValue::Set(pg_distrib_dir) - } - - pub fn http_auth_type(&mut self, auth_type: AuthType) { - self.http_auth_type = BuilderValue::Set(auth_type) - } - - pub fn pg_auth_type(&mut self, auth_type: AuthType) { - self.pg_auth_type = BuilderValue::Set(auth_type) - } - - pub fn auth_validation_public_key_path( - &mut self, - auth_validation_public_key_path: Option, - ) { - self.auth_validation_public_key_path = BuilderValue::Set(auth_validation_public_key_path) - } - - pub fn remote_storage_config(&mut self, remote_storage_config: Option) 
{ - self.remote_storage_config = BuilderValue::Set(remote_storage_config) - } - - pub fn broker_endpoint(&mut self, broker_endpoint: Uri) { - self.broker_endpoint = BuilderValue::Set(broker_endpoint) - } - - pub fn broker_keepalive_interval(&mut self, broker_keepalive_interval: Duration) { - self.broker_keepalive_interval = BuilderValue::Set(broker_keepalive_interval) - } - - pub fn log_format(&mut self, log_format: LogFormat) { - self.log_format = BuilderValue::Set(log_format) - } - - pub fn concurrent_tenant_warmup(&mut self, u: NonZeroUsize) { - self.concurrent_tenant_warmup = BuilderValue::Set(u); - } - - pub fn concurrent_tenant_size_logical_size_queries(&mut self, u: NonZeroUsize) { - self.concurrent_tenant_size_logical_size_queries = BuilderValue::Set(u); - } - - pub fn metric_collection_interval(&mut self, metric_collection_interval: Duration) { - self.metric_collection_interval = BuilderValue::Set(metric_collection_interval) - } - - pub fn metric_collection_endpoint(&mut self, metric_collection_endpoint: Option) { - self.metric_collection_endpoint = BuilderValue::Set(metric_collection_endpoint) - } - - pub fn metric_collection_bucket( - &mut self, - metric_collection_bucket: Option, - ) { - self.metric_collection_bucket = BuilderValue::Set(metric_collection_bucket) - } - - pub fn synthetic_size_calculation_interval( - &mut self, - synthetic_size_calculation_interval: Duration, - ) { - self.synthetic_size_calculation_interval = - BuilderValue::Set(synthetic_size_calculation_interval) - } - - pub fn test_remote_failures(&mut self, fail_first: u64) { - self.test_remote_failures = BuilderValue::Set(fail_first); - } - - pub fn disk_usage_based_eviction(&mut self, value: Option) { - self.disk_usage_based_eviction = BuilderValue::Set(value); - } - - pub fn ondemand_download_behavior_treat_error_as_warn( - &mut self, - ondemand_download_behavior_treat_error_as_warn: bool, - ) { - self.ondemand_download_behavior_treat_error_as_warn = - 
BuilderValue::Set(ondemand_download_behavior_treat_error_as_warn); - } - - pub fn background_task_maximum_delay(&mut self, delay: Duration) { - self.background_task_maximum_delay = BuilderValue::Set(delay); - } - - pub fn control_plane_api(&mut self, api: Option) { - self.control_plane_api = BuilderValue::Set(api) - } - - pub fn control_plane_api_token(&mut self, token: Option) { - self.control_plane_api_token = BuilderValue::Set(token) - } - - pub fn control_plane_emergency_mode(&mut self, enabled: bool) { - self.control_plane_emergency_mode = BuilderValue::Set(enabled) - } - - pub fn heatmap_upload_concurrency(&mut self, value: usize) { - self.heatmap_upload_concurrency = BuilderValue::Set(value) - } - - pub fn secondary_download_concurrency(&mut self, value: usize) { - self.secondary_download_concurrency = BuilderValue::Set(value) - } - - pub fn ingest_batch_size(&mut self, ingest_batch_size: u64) { - self.ingest_batch_size = BuilderValue::Set(ingest_batch_size) - } - - pub fn virtual_file_io_engine(&mut self, value: virtual_file::IoEngineKind) { - self.virtual_file_io_engine = BuilderValue::Set(value); - } - - pub fn get_max_vectored_read_bytes(&mut self, value: MaxVectoredReadBytes) { - self.max_vectored_read_bytes = BuilderValue::Set(value); - } - - pub fn get_image_compression(&mut self, value: ImageCompressionAlgorithm) { - self.image_compression = BuilderValue::Set(value); - } - - pub fn get_ephemeral_bytes_per_memory_kb(&mut self, value: usize) { - self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value); - } - - pub fn l0_flush(&mut self, value: L0FlushConfig) { - self.l0_flush = BuilderValue::Set(value); - } - - pub fn compact_level0_phase1_value_access(&mut self, value: CompactL0Phase1ValueAccess) { - self.compact_level0_phase1_value_access = BuilderValue::Set(value); - } - - pub fn virtual_file_direct_io(&mut self, value: virtual_file::DirectIoMode) { - self.virtual_file_direct_io = BuilderValue::Set(value); - } - - pub fn 
io_buffer_alignment(&mut self, value: usize) { - self.io_buffer_alignment = BuilderValue::Set(value); - } - - pub fn build(self, id: NodeId) -> anyhow::Result { - let default = Self::default_values(); - - macro_rules! conf { - (USING DEFAULT { $($field:ident,)* } CUSTOM LOGIC { $($custom_field:ident : $custom_value:expr,)* } ) => { - PageServerConf { - $( - $field: self.$field.ok_or(stringify!($field), default.$field)?, - )* - $( - $custom_field: $custom_value, - )* - } - }; - } - - Ok(conf!( - USING DEFAULT - { - listen_pg_addr, - listen_http_addr, - availability_zone, - wait_lsn_timeout, - wal_redo_timeout, - superuser, - page_cache_size, - max_file_descriptors, - workdir, - pg_distrib_dir, - http_auth_type, - pg_auth_type, - auth_validation_public_key_path, - remote_storage_config, - broker_endpoint, - broker_keepalive_interval, - log_format, - metric_collection_interval, - metric_collection_endpoint, - metric_collection_bucket, - synthetic_size_calculation_interval, - disk_usage_based_eviction, - test_remote_failures, - ondemand_download_behavior_treat_error_as_warn, - background_task_maximum_delay, - control_plane_api, - control_plane_api_token, - control_plane_emergency_mode, - heatmap_upload_concurrency, - secondary_download_concurrency, - ingest_batch_size, - max_vectored_read_bytes, - image_compression, - ephemeral_bytes_per_memory_kb, - l0_flush, - compact_level0_phase1_value_access, - virtual_file_direct_io, - io_buffer_alignment, - } - CUSTOM LOGIC - { - id: id, - // TenantConf is handled separately - default_tenant_conf: TenantConf::default(), - concurrent_tenant_warmup: ConfigurableSemaphore::new({ - self - .concurrent_tenant_warmup - .ok_or("concurrent_tenant_warmpup", - default.concurrent_tenant_warmup)? 
- }), - concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::new( - self - .concurrent_tenant_size_logical_size_queries - .ok_or("concurrent_tenant_size_logical_size_queries", - default.concurrent_tenant_size_logical_size_queries.clone())? - ), - eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::new( - // re-use `concurrent_tenant_size_logical_size_queries` - self - .concurrent_tenant_size_logical_size_queries - .ok_or("eviction_task_immitated_concurrent_logical_size_queries", - default.concurrent_tenant_size_logical_size_queries.clone())?, - ), - virtual_file_io_engine: match self.virtual_file_io_engine { - BuilderValue::Set(v) => v, - BuilderValue::NotSet => match crate::virtual_file::io_engine_feature_test().context("auto-detect virtual_file_io_engine")? { - io_engine::FeatureTestResult::PlatformPreferred(v) => v, // make no noise - io_engine::FeatureTestResult::Worse { engine, remark } => { - // TODO: bubble this up to the caller so we can tracing::warn! it. - eprintln!("auto-detected IO engine is not platform-preferred: engine={engine:?} remark={remark:?}"); - engine - } - }, - }, - } - )) - } -} - impl PageServerConf { // // Repository paths, relative to workdir. @@ -878,134 +300,135 @@ impl PageServerConf { /// /// This leaves any options not present in the file in the built-in defaults. 
pub fn parse_and_validate( - node_id: NodeId, - toml: &Document, + id: NodeId, + config_toml: pageserver_api::config::ConfigToml, workdir: &Utf8Path, ) -> anyhow::Result { - let mut builder = PageServerConfigBuilder::new(); - builder.workdir(workdir.to_owned()); + let pageserver_api::config::ConfigToml { + listen_pg_addr, + listen_http_addr, + availability_zone, + wait_lsn_timeout, + wal_redo_timeout, + superuser, + page_cache_size, + max_file_descriptors, + pg_distrib_dir, + http_auth_type, + pg_auth_type, + auth_validation_public_key_path, + remote_storage, + broker_endpoint, + broker_keepalive_interval, + log_format, + metric_collection_interval, + metric_collection_endpoint, + metric_collection_bucket, + synthetic_size_calculation_interval, + disk_usage_based_eviction, + test_remote_failures, + ondemand_download_behavior_treat_error_as_warn, + background_task_maximum_delay, + control_plane_api, + control_plane_api_token, + control_plane_emergency_mode, + heatmap_upload_concurrency, + secondary_download_concurrency, + ingest_batch_size, + max_vectored_read_bytes, + image_compression, + ephemeral_bytes_per_memory_kb, + compact_level0_phase1_value_access, + l0_flush, + virtual_file_direct_io, + concurrent_tenant_warmup, + concurrent_tenant_size_logical_size_queries, + virtual_file_io_engine, + io_buffer_alignment, + tenant_config, + } = config_toml; - let mut t_conf = TenantConfOpt::default(); + let mut conf = PageServerConf { + // ------------------------------------------------------------ + // fields that are already fully validated by the ConfigToml Deserialize impl + // ------------------------------------------------------------ + listen_pg_addr, + listen_http_addr, + availability_zone, + wait_lsn_timeout, + wal_redo_timeout, + superuser, + page_cache_size, + max_file_descriptors, + http_auth_type, + pg_auth_type, + auth_validation_public_key_path, + remote_storage_config: remote_storage, + broker_endpoint, + broker_keepalive_interval, + log_format, + 
metric_collection_interval, + metric_collection_endpoint, + metric_collection_bucket, + synthetic_size_calculation_interval, + disk_usage_based_eviction, + test_remote_failures, + ondemand_download_behavior_treat_error_as_warn, + background_task_maximum_delay, + control_plane_api, + control_plane_emergency_mode, + heatmap_upload_concurrency, + secondary_download_concurrency, + ingest_batch_size, + max_vectored_read_bytes, + image_compression, + ephemeral_bytes_per_memory_kb, + compact_level0_phase1_value_access, + virtual_file_direct_io, + io_buffer_alignment, - for (key, item) in toml.iter() { - match key { - "listen_pg_addr" => builder.listen_pg_addr(parse_toml_string(key, item)?), - "listen_http_addr" => builder.listen_http_addr(parse_toml_string(key, item)?), - "availability_zone" => builder.availability_zone(Some(parse_toml_string(key, item)?)), - "wait_lsn_timeout" => builder.wait_lsn_timeout(parse_toml_duration(key, item)?), - "wal_redo_timeout" => builder.wal_redo_timeout(parse_toml_duration(key, item)?), - "initial_superuser_name" => builder.superuser(parse_toml_string(key, item)?), - "page_cache_size" => builder.page_cache_size(parse_toml_u64(key, item)? as usize), - "max_file_descriptors" => { - builder.max_file_descriptors(parse_toml_u64(key, item)? 
as usize) - } - "pg_distrib_dir" => { - builder.pg_distrib_dir(Utf8PathBuf::from(parse_toml_string(key, item)?)) - } - "auth_validation_public_key_path" => builder.auth_validation_public_key_path(Some( - Utf8PathBuf::from(parse_toml_string(key, item)?), - )), - "http_auth_type" => builder.http_auth_type(parse_toml_from_str(key, item)?), - "pg_auth_type" => builder.pg_auth_type(parse_toml_from_str(key, item)?), - "remote_storage" => { - builder.remote_storage_config(Some(RemoteStorageConfig::from_toml(item).context("remote_storage")?)) - } - "tenant_config" => { - t_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("failed to parse: '{key}'"))?; - } - "broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?), - "broker_keepalive_interval" => builder.broker_keepalive_interval(parse_toml_duration(key, item)?), - "log_format" => builder.log_format( - LogFormat::from_config(&parse_toml_string(key, item)?)? - ), - "concurrent_tenant_warmup" => builder.concurrent_tenant_warmup({ - let input = parse_toml_string(key, item)?; - let permits = input.parse::().context("expected a number of initial permits, not {s:?}")?; - NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")? - }), - "concurrent_tenant_size_logical_size_queries" => builder.concurrent_tenant_size_logical_size_queries({ - let input = parse_toml_string(key, item)?; - let permits = input.parse::().context("expected a number of initial permits, not {s:?}")?; - NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")? 
- }), - "metric_collection_interval" => builder.metric_collection_interval(parse_toml_duration(key, item)?), - "metric_collection_endpoint" => { - let endpoint = parse_toml_string(key, item)?.parse().context("failed to parse metric_collection_endpoint")?; - builder.metric_collection_endpoint(Some(endpoint)); - }, - "metric_collection_bucket" => { - builder.metric_collection_bucket(Some(RemoteStorageConfig::from_toml(item)?)) - } - "synthetic_size_calculation_interval" => - builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?), - "test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?), - "disk_usage_based_eviction" => { - tracing::info!("disk_usage_based_eviction: {:#?}", &item); - builder.disk_usage_based_eviction( - deserialize_from_item("disk_usage_based_eviction", item) - .context("parse disk_usage_based_eviction")? - ) - }, - "ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?), - "background_task_maximum_delay" => builder.background_task_maximum_delay(parse_toml_duration(key, item)?), - "control_plane_api" => { - let parsed = parse_toml_string(key, item)?; - if parsed.is_empty() { - builder.control_plane_api(None) - } else { - builder.control_plane_api(Some(parsed.parse().context("failed to parse control plane URL")?)) + // ------------------------------------------------------------ + // fields that require additional validation or custom handling + // ------------------------------------------------------------ + workdir: workdir.to_owned(), + pg_distrib_dir: pg_distrib_dir.unwrap_or_else(|| { + std::env::current_dir() + .expect("current_dir() failed") + .try_into() + .expect("current_dir() is not a valid Utf8Path") + }), + control_plane_api_token: control_plane_api_token.map(SecretString::from), + id, + default_tenant_conf: tenant_config, + concurrent_tenant_warmup: ConfigurableSemaphore::new(concurrent_tenant_warmup), + 
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::new( + concurrent_tenant_size_logical_size_queries, + ), + eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::new( + // re-use `concurrent_tenant_size_logical_size_queries` + concurrent_tenant_size_logical_size_queries, + ), + virtual_file_io_engine: match virtual_file_io_engine { + Some(v) => v, + None => match crate::virtual_file::io_engine_feature_test() + .context("auto-detect virtual_file_io_engine")? + { + io_engine::FeatureTestResult::PlatformPreferred(v) => v, // make no noise + io_engine::FeatureTestResult::Worse { engine, remark } => { + // TODO: bubble this up to the caller so we can tracing::warn! it. + eprintln!("auto-detected IO engine is not platform-preferred: engine={engine:?} remark={remark:?}"); + engine } }, - "control_plane_api_token" => { - let parsed = parse_toml_string(key, item)?; - if parsed.is_empty() { - builder.control_plane_api_token(None) - } else { - builder.control_plane_api_token(Some(parsed.into())) - } - }, - "control_plane_emergency_mode" => { - builder.control_plane_emergency_mode(parse_toml_bool(key, item)?) - }, - "heatmap_upload_concurrency" => { - builder.heatmap_upload_concurrency(parse_toml_u64(key, item)? as usize) - }, - "secondary_download_concurrency" => { - builder.secondary_download_concurrency(parse_toml_u64(key, item)? as usize) - }, - "ingest_batch_size" => builder.ingest_batch_size(parse_toml_u64(key, item)?), - "virtual_file_io_engine" => { - builder.virtual_file_io_engine(parse_toml_from_str("virtual_file_io_engine", item)?) - } - "max_vectored_read_bytes" => { - let bytes = parse_toml_u64("max_vectored_read_bytes", item)? as usize; - builder.get_max_vectored_read_bytes( - MaxVectoredReadBytes( - NonZeroUsize::new(bytes).expect("Max byte size of vectored read must be greater than 0"))) - } - "image_compression" => { - builder.get_image_compression(parse_toml_from_str("image_compression", item)?) 
- } - "ephemeral_bytes_per_memory_kb" => { - builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize) - } - "l0_flush" => { - builder.l0_flush(utils::toml_edit_ext::deserialize_item(item).context("l0_flush")?) - } - "compact_level0_phase1_value_access" => { - builder.compact_level0_phase1_value_access(utils::toml_edit_ext::deserialize_item(item).context("compact_level0_phase1_value_access")?) - } - "virtual_file_direct_io" => { - builder.virtual_file_direct_io(utils::toml_edit_ext::deserialize_item(item).context("virtual_file_direct_io")?) - } - "io_buffer_alignment" => { - builder.io_buffer_alignment(parse_toml_u64("io_buffer_alignment", item)? as usize) - } - _ => bail!("unrecognized pageserver option '{key}'"), - } - } + }, + l0_flush: l0_flush + .map(crate::l0_flush::L0FlushConfig::from) + .unwrap_or_default(), + }; - let mut conf = builder.build(node_id).context("invalid config")?; + // ------------------------------------------------------------ + // custom validation code that covers more than one field in isolation + // ------------------------------------------------------------ if conf.http_auth_type == AuthType::NeonJWT || conf.pg_auth_type == AuthType::NeonJWT { let auth_validation_public_key_path = conf @@ -1019,10 +442,8 @@ impl PageServerConf { ); } - conf.default_tenant_conf = t_conf.merge(TenantConf::default()); - IndexEntry::validate_checkpoint_distance(conf.default_tenant_conf.checkpoint_distance) - .map_err(|msg| anyhow::anyhow!("{msg}")) + .map_err(anyhow::Error::msg) .with_context(|| { format!( "effective checkpoint distance is unsupported: {}", @@ -1042,130 +463,25 @@ impl PageServerConf { pub fn dummy_conf(repo_dir: Utf8PathBuf) -> Self { let pg_distrib_dir = Utf8PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../pg_install"); - PageServerConf { - id: NodeId(0), + let config_toml = pageserver_api::config::ConfigToml { wait_lsn_timeout: Duration::from_secs(60), wal_redo_timeout: 
Duration::from_secs(60), - page_cache_size: defaults::DEFAULT_PAGE_CACHE_SIZE, - max_file_descriptors: defaults::DEFAULT_MAX_FILE_DESCRIPTORS, - listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(), - listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(), - availability_zone: None, - superuser: "cloud_admin".to_string(), - workdir: repo_dir, - pg_distrib_dir, - http_auth_type: AuthType::Trust, - pg_auth_type: AuthType::Trust, - auth_validation_public_key_path: None, - remote_storage_config: None, - default_tenant_conf: TenantConf::default(), - broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(), - broker_keepalive_interval: Duration::from_secs(5000), - log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(), - concurrent_tenant_warmup: ConfigurableSemaphore::new( - NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP) - .expect("Invalid default constant"), - ), - concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), - eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::default( - ), + pg_distrib_dir: Some(pg_distrib_dir), metric_collection_interval: Duration::from_secs(60), - metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT, - metric_collection_bucket: None, synthetic_size_calculation_interval: Duration::from_secs(60), - disk_usage_based_eviction: None, - test_remote_failures: 0, - ondemand_download_behavior_treat_error_as_warn: false, background_task_maximum_delay: Duration::ZERO, - control_plane_api: None, - control_plane_api_token: None, - control_plane_emergency_mode: false, - heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY, - secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY, - ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE, - virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), - max_vectored_read_bytes: MaxVectoredReadBytes( - 
NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) - .expect("Invalid default constant"), - ), - image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(), - ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, - l0_flush: L0FlushConfig::default(), - compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(), - virtual_file_direct_io: virtual_file::DirectIoMode::default(), - io_buffer_alignment: defaults::DEFAULT_IO_BUFFER_ALIGNMENT, - } + ..Default::default() + }; + PageServerConf::parse_and_validate(NodeId(0), config_toml, &repo_dir).unwrap() } } -#[derive(Deserialize)] +#[derive(serde::Deserialize, serde::Serialize)] #[serde(deny_unknown_fields)] pub struct PageserverIdentity { pub id: NodeId, } -// Helper functions to parse a toml Item - -fn parse_toml_string(name: &str, item: &Item) -> Result { - let s = item - .as_str() - .with_context(|| format!("configure option {name} is not a string"))?; - Ok(s.to_string()) -} - -fn parse_toml_u64(name: &str, item: &Item) -> Result { - // A toml integer is signed, so it cannot represent the full range of an u64. That's OK - // for our use, though. - let i: i64 = item - .as_integer() - .with_context(|| format!("configure option {name} is not an integer"))?; - if i < 0 { - bail!("configure option {name} cannot be negative"); - } - Ok(i as u64) -} - -fn parse_toml_bool(name: &str, item: &Item) -> Result { - item.as_bool() - .with_context(|| format!("configure option {name} is not a bool")) -} - -fn parse_toml_duration(name: &str, item: &Item) -> Result { - let s = item - .as_str() - .with_context(|| format!("configure option {name} is not a string"))?; - - Ok(humantime::parse_duration(s)?) 
-} - -fn parse_toml_from_str(name: &str, item: &Item) -> anyhow::Result -where - T: FromStr, - ::Err: std::fmt::Display, -{ - let v = item - .as_str() - .with_context(|| format!("configure option {name} is not a string"))?; - T::from_str(v).map_err(|e| { - anyhow!( - "Failed to parse string as {parse_type} for configure option {name}: {e}", - parse_type = stringify!(T) - ) - }) -} - -fn deserialize_from_item(name: &str, item: &Item) -> anyhow::Result -where - T: serde::de::DeserializeOwned, -{ - // ValueDeserializer::new is not public, so use the ValueDeserializer's documented way - let deserializer = match item.clone().into_value() { - Ok(value) => value.into_deserializer(), - Err(item) => anyhow::bail!("toml_edit::Item '{item}' is not a toml_edit::Value"), - }; - T::deserialize(deserializer).with_context(|| format!("deserializing item for node {name}")) -} - /// Configurable semaphore permits setting. /// /// Does not allow semaphore permits to be zero, because at runtime initially zero permits and empty @@ -1227,469 +543,108 @@ impl ConfigurableSemaphore { #[cfg(test)] mod tests { - use std::{fs, num::NonZeroU32}; - use camino_tempfile::{tempdir, Utf8TempDir}; - use pageserver_api::models::EvictionPolicy; - use remote_storage::{RemoteStorageKind, S3Config}; - use utils::serde_percent::Percent; + use camino::Utf8PathBuf; + use utils::id::NodeId; - use super::*; - use crate::DEFAULT_PG_VERSION; - - const ALL_BASE_VALUES_TOML: &str = r#" -# Initial configuration file created by 'pageserver --init' - -listen_pg_addr = '127.0.0.1:64000' -listen_http_addr = '127.0.0.1:9898' - -wait_lsn_timeout = '111 s' -wal_redo_timeout = '111 s' - -page_cache_size = 444 -max_file_descriptors = 333 - -# initial superuser role name to use when creating a new tenant -initial_superuser_name = 'zzzz' - -metric_collection_interval = '222 s' -metric_collection_endpoint = 'http://localhost:80/metrics' -synthetic_size_calculation_interval = '333 s' - -log_format = 'json' 
-background_task_maximum_delay = '334 s' - -"#; + use super::PageServerConf; #[test] - fn parse_defaults() -> anyhow::Result<()> { - let tempdir = tempdir()?; - let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?; - let broker_endpoint = storage_broker::DEFAULT_ENDPOINT; - // we have to create dummy values to overcome the validation errors - let config_string = - format!("pg_distrib_dir='{pg_distrib_dir}'\nbroker_endpoint = '{broker_endpoint}'",); - let toml = config_string.parse()?; - - let parsed_config = PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir) - .unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e:?}")); - - assert_eq!( - parsed_config, - PageServerConf { - id: NodeId(10), - listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(), - listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(), - availability_zone: None, - wait_lsn_timeout: humantime::parse_duration(defaults::DEFAULT_WAIT_LSN_TIMEOUT)?, - wal_redo_timeout: humantime::parse_duration(defaults::DEFAULT_WAL_REDO_TIMEOUT)?, - superuser: defaults::DEFAULT_SUPERUSER.to_string(), - page_cache_size: defaults::DEFAULT_PAGE_CACHE_SIZE, - max_file_descriptors: defaults::DEFAULT_MAX_FILE_DESCRIPTORS, - workdir, - pg_distrib_dir, - http_auth_type: AuthType::Trust, - pg_auth_type: AuthType::Trust, - auth_validation_public_key_path: None, - remote_storage_config: None, - default_tenant_conf: TenantConf::default(), - broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(), - broker_keepalive_interval: humantime::parse_duration( - storage_broker::DEFAULT_KEEPALIVE_INTERVAL - )?, - log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(), - concurrent_tenant_warmup: ConfigurableSemaphore::new( - NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP).unwrap() - ), - concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), - eviction_task_immitated_concurrent_logical_size_queries: - 
ConfigurableSemaphore::default(), - metric_collection_interval: humantime::parse_duration( - defaults::DEFAULT_METRIC_COLLECTION_INTERVAL - )?, - metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT, - metric_collection_bucket: None, - synthetic_size_calculation_interval: humantime::parse_duration( - defaults::DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL - )?, - disk_usage_based_eviction: None, - test_remote_failures: 0, - ondemand_download_behavior_treat_error_as_warn: false, - background_task_maximum_delay: humantime::parse_duration( - defaults::DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY - )?, - control_plane_api: None, - control_plane_api_token: None, - control_plane_emergency_mode: false, - heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY, - secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY, - ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE, - virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), - max_vectored_read_bytes: MaxVectoredReadBytes( - NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) - .expect("Invalid default constant") - ), - image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(), - ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, - l0_flush: L0FlushConfig::default(), - compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(), - virtual_file_direct_io: virtual_file::DirectIoMode::default(), - io_buffer_alignment: defaults::DEFAULT_IO_BUFFER_ALIGNMENT, - }, - "Correct defaults should be used when no config values are provided" - ); - - Ok(()) - } - - #[test] - fn parse_basic_config() -> anyhow::Result<()> { - let tempdir = tempdir()?; - let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?; - let broker_endpoint = storage_broker::DEFAULT_ENDPOINT; - - let config_string = format!( - "{ALL_BASE_VALUES_TOML}pg_distrib_dir='{pg_distrib_dir}'\nbroker_endpoint = '{broker_endpoint}'", - 
); - let toml = config_string.parse()?; - - let parsed_config = PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir) - .unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e:?}")); - - assert_eq!( - parsed_config, - PageServerConf { - id: NodeId(10), - listen_pg_addr: "127.0.0.1:64000".to_string(), - listen_http_addr: "127.0.0.1:9898".to_string(), - availability_zone: None, - wait_lsn_timeout: Duration::from_secs(111), - wal_redo_timeout: Duration::from_secs(111), - superuser: "zzzz".to_string(), - page_cache_size: 444, - max_file_descriptors: 333, - workdir, - pg_distrib_dir, - http_auth_type: AuthType::Trust, - pg_auth_type: AuthType::Trust, - auth_validation_public_key_path: None, - remote_storage_config: None, - default_tenant_conf: TenantConf::default(), - broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(), - broker_keepalive_interval: Duration::from_secs(5), - log_format: LogFormat::Json, - concurrent_tenant_warmup: ConfigurableSemaphore::new( - NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP).unwrap() - ), - concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), - eviction_task_immitated_concurrent_logical_size_queries: - ConfigurableSemaphore::default(), - metric_collection_interval: Duration::from_secs(222), - metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?), - metric_collection_bucket: None, - synthetic_size_calculation_interval: Duration::from_secs(333), - disk_usage_based_eviction: None, - test_remote_failures: 0, - ondemand_download_behavior_treat_error_as_warn: false, - background_task_maximum_delay: Duration::from_secs(334), - control_plane_api: None, - control_plane_api_token: None, - control_plane_emergency_mode: false, - heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY, - secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY, - ingest_batch_size: 100, - virtual_file_io_engine: 
DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), - max_vectored_read_bytes: MaxVectoredReadBytes( - NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) - .expect("Invalid default constant") - ), - image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(), - ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, - l0_flush: L0FlushConfig::default(), - compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(), - virtual_file_direct_io: virtual_file::DirectIoMode::default(), - io_buffer_alignment: defaults::DEFAULT_IO_BUFFER_ALIGNMENT, - }, - "Should be able to parse all basic config values correctly" - ); - - Ok(()) - } - - #[test] - fn parse_remote_fs_storage_config() -> anyhow::Result<()> { - let tempdir = tempdir()?; - let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?; - let broker_endpoint = "http://127.0.0.1:7777"; - - let local_storage_path = tempdir.path().join("local_remote_storage"); - - let identical_toml_declarations = &[ - format!( - r#"[remote_storage] -local_path = '{local_storage_path}'"#, - ), - format!("remote_storage={{local_path='{local_storage_path}'}}"), - ]; - - for remote_storage_config_str in identical_toml_declarations { - let config_string = format!( - r#"{ALL_BASE_VALUES_TOML} -pg_distrib_dir='{pg_distrib_dir}' -broker_endpoint = '{broker_endpoint}' - -{remote_storage_config_str}"#, - ); - - let toml = config_string.parse()?; - - let parsed_remote_storage_config = - PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir) - .unwrap_or_else(|e| { - panic!("Failed to parse config '{config_string}', reason: {e:?}") - }) - .remote_storage_config - .expect("Should have remote storage config for the local FS"); - - assert_eq!( - parsed_remote_storage_config, - RemoteStorageConfig { - storage: RemoteStorageKind::LocalFs { local_path: local_storage_path.clone() }, - timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, - }, - "Remote storage config should correctly parse the local 
FS config and fill other storage defaults" - ); - } - Ok(()) - } - - #[test] - fn parse_remote_s3_storage_config() -> anyhow::Result<()> { - let tempdir = tempdir()?; - let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?; - - let bucket_name = "some-sample-bucket".to_string(); - let bucket_region = "eu-north-1".to_string(); - let prefix_in_bucket = "test_prefix".to_string(); - let endpoint = "http://localhost:5000".to_string(); - let max_concurrent_syncs = NonZeroUsize::new(111).unwrap(); - let max_sync_errors = NonZeroU32::new(222).unwrap(); - let s3_concurrency_limit = NonZeroUsize::new(333).unwrap(); - let broker_endpoint = "http://127.0.0.1:7777"; - - let identical_toml_declarations = &[ - format!( - r#"[remote_storage] -max_concurrent_syncs = {max_concurrent_syncs} -max_sync_errors = {max_sync_errors} -bucket_name = '{bucket_name}' -bucket_region = '{bucket_region}' -prefix_in_bucket = '{prefix_in_bucket}' -endpoint = '{endpoint}' -concurrency_limit = {s3_concurrency_limit}"# - ), - format!( - "remote_storage={{max_concurrent_syncs={max_concurrent_syncs}, max_sync_errors={max_sync_errors}, bucket_name='{bucket_name}',\ - bucket_region='{bucket_region}', prefix_in_bucket='{prefix_in_bucket}', endpoint='{endpoint}', concurrency_limit={s3_concurrency_limit}}}", - ), - ]; - - for remote_storage_config_str in identical_toml_declarations { - let config_string = format!( - r#"{ALL_BASE_VALUES_TOML} -pg_distrib_dir='{pg_distrib_dir}' -broker_endpoint = '{broker_endpoint}' - -{remote_storage_config_str}"#, - ); - - let toml = config_string.parse()?; - - let parsed_remote_storage_config = - PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir) - .unwrap_or_else(|e| { - panic!("Failed to parse config '{config_string}', reason: {e:?}") - }) - .remote_storage_config - .expect("Should have remote storage config for S3"); - - assert_eq!( - parsed_remote_storage_config, - RemoteStorageConfig { - storage: RemoteStorageKind::AwsS3(S3Config { - bucket_name: 
bucket_name.clone(), - bucket_region: bucket_region.clone(), - prefix_in_bucket: Some(prefix_in_bucket.clone()), - endpoint: Some(endpoint.clone()), - concurrency_limit: s3_concurrency_limit, - max_keys_per_list_response: None, - upload_storage_class: None, - }), - timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, - }, - "Remote storage config should correctly parse the S3 config" - ); - } - Ok(()) - } - - #[test] - fn parse_incorrect_tenant_config() -> anyhow::Result<()> { - let config_string = r#" - [tenant_config] - checkpoint_distance = -1 # supposed to be an u64 - "# - .to_string(); - - let toml: Document = config_string.parse()?; - let item = toml.get("tenant_config").unwrap(); - let error = TenantConfOpt::try_from(item.to_owned()).unwrap_err(); - - let expected_error_str = "checkpoint_distance: invalid value: integer `-1`, expected u64"; - assert_eq!(error.to_string(), expected_error_str); - - Ok(()) - } - - #[test] - fn parse_override_tenant_config() -> anyhow::Result<()> { - let config_string = r#"tenant_config={ min_resident_size_override = 400 }"#.to_string(); - - let toml: Document = config_string.parse()?; - let item = toml.get("tenant_config").unwrap(); - let conf = TenantConfOpt::try_from(item.to_owned()).unwrap(); - - assert_eq!(conf.min_resident_size_override, Some(400)); - - Ok(()) - } - - #[test] - fn eviction_pageserver_config_parse() -> anyhow::Result<()> { - let tempdir = tempdir()?; - let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?; - - let pageserver_conf_toml = format!( - r#"pg_distrib_dir = "{pg_distrib_dir}" -metric_collection_endpoint = "http://sample.url" -metric_collection_interval = "10min" - -[disk_usage_based_eviction] -max_usage_pct = 80 -min_avail_bytes = 0 -period = "10s" - -[tenant_config] -evictions_low_residence_duration_metric_threshold = "20m" - -[tenant_config.eviction_policy] -kind = "LayerAccessThreshold" -period = "20m" -threshold = "20m" -"#, - ); - let toml: Document = pageserver_conf_toml.parse()?; - let conf = 
PageServerConf::parse_and_validate(NodeId(333), &toml, &workdir)?; - - assert_eq!(conf.pg_distrib_dir, pg_distrib_dir); - assert_eq!( - conf.metric_collection_endpoint, - Some("http://sample.url".parse().unwrap()) - ); - assert_eq!( - conf.metric_collection_interval, - Duration::from_secs(10 * 60) - ); - assert_eq!( - conf.default_tenant_conf - .evictions_low_residence_duration_metric_threshold, - Duration::from_secs(20 * 60) - ); - - // Assert that the node id provided by the indentity file (threaded - // through the call to [`PageServerConf::parse_and_validate`] is - // used. - assert_eq!(conf.id, NodeId(333)); - assert_eq!( - conf.disk_usage_based_eviction, - Some(DiskUsageEvictionTaskConfig { - max_usage_pct: Percent::new(80).unwrap(), - min_avail_bytes: 0, - period: Duration::from_secs(10), - #[cfg(feature = "testing")] - mock_statvfs: None, - eviction_order: Default::default(), - }) - ); - - match &conf.default_tenant_conf.eviction_policy { - EvictionPolicy::LayerAccessThreshold(eviction_threshold) => { - assert_eq!(eviction_threshold.period, Duration::from_secs(20 * 60)); - assert_eq!(eviction_threshold.threshold, Duration::from_secs(20 * 60)); - } - other => unreachable!("Unexpected eviction policy tenant settings: {other:?}"), - } - - Ok(()) - } - - #[test] - fn parse_imitation_only_pageserver_config() { - let tempdir = tempdir().unwrap(); - let (workdir, pg_distrib_dir) = prepare_fs(&tempdir).unwrap(); - - let pageserver_conf_toml = format!( - r#"pg_distrib_dir = "{pg_distrib_dir}" -metric_collection_endpoint = "http://sample.url" -metric_collection_interval = "10min" - -[tenant_config] -evictions_low_residence_duration_metric_threshold = "20m" - -[tenant_config.eviction_policy] -kind = "OnlyImitiate" -period = "20m" -threshold = "20m" -"#, - ); - let toml: Document = pageserver_conf_toml.parse().unwrap(); - let conf = PageServerConf::parse_and_validate(NodeId(222), &toml, &workdir).unwrap(); - - match &conf.default_tenant_conf.eviction_policy { - 
EvictionPolicy::OnlyImitiate(t) => { - assert_eq!(t.period, Duration::from_secs(20 * 60)); - assert_eq!(t.threshold, Duration::from_secs(20 * 60)); - } - other => unreachable!("Unexpected eviction policy tenant settings: {other:?}"), - } - } - - #[test] - fn empty_remote_storage_is_error() { - let tempdir = tempdir().unwrap(); - let (workdir, _) = prepare_fs(&tempdir).unwrap(); + fn test_empty_config_toml_is_valid() { + // we use Default impl of everything in this situation let input = r#" -remote_storage = {} "#; - let doc = toml_edit::Document::from_str(input).unwrap(); - let err = PageServerConf::parse_and_validate(NodeId(222), &doc, &workdir) - .expect_err("empty remote_storage field should fail, don't specify it if you want no remote_storage"); - assert!(format!("{err}").contains("remote_storage"), "{err}"); + let config_toml = toml_edit::de::from_str::(input) + .expect("empty config is valid"); + let workdir = Utf8PathBuf::from("/nonexistent"); + PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir) + .expect("parse_and_validate"); } - fn prepare_fs(tempdir: &Utf8TempDir) -> anyhow::Result<(Utf8PathBuf, Utf8PathBuf)> { - let tempdir_path = tempdir.path(); + /// If there's a typo in the pageserver config, we'd rather catch that typo + /// and fail pageserver startup than silently ignoring the typo, leaving whoever + /// made it in the believe that their config change is effective. + /// + /// The default in serde is to allow unknown fields, so, we rely + /// on developer+review discipline to add `deny_unknown_fields` when adding + /// new structs to the config, and these tests here as a regression test. + /// + /// The alternative to all of this would be to allow unknown fields in the config. + /// To catch them, we could have a config check tool or mgmt API endpoint that + /// compares the effective config with the TOML on disk and makes sure that + /// the on-disk TOML is a strict subset of the effective config. 
+ mod unknown_fields_handling { + macro_rules! test { + ($short_name:ident, $input:expr) => { + #[test] + fn $short_name() { + let input = $input; + let err = toml_edit::de::from_str::(&input) + .expect_err("some_invalid_field is an invalid field"); + dbg!(&err); + assert!(err.to_string().contains("some_invalid_field")); + } + }; + } + use indoc::indoc; - let workdir = tempdir_path.join("workdir"); - fs::create_dir_all(&workdir)?; + test!( + toplevel, + indoc! {r#" + some_invalid_field = 23 + "#} + ); - let pg_distrib_dir = tempdir_path.join("pg_distrib"); - let pg_distrib_dir_versioned = pg_distrib_dir.join(format!("v{DEFAULT_PG_VERSION}")); - fs::create_dir_all(&pg_distrib_dir_versioned)?; - let postgres_bin_dir = pg_distrib_dir_versioned.join("bin"); - fs::create_dir_all(&postgres_bin_dir)?; - fs::write(postgres_bin_dir.join("postgres"), "I'm postgres, trust me")?; + test!( + toplevel_nested, + indoc! {r#" + [some_invalid_field] + foo = 23 + "#} + ); - Ok((workdir, pg_distrib_dir)) + test!( + disk_usage_based_eviction, + indoc! {r#" + [disk_usage_based_eviction] + some_invalid_field = 23 + "#} + ); + + test!( + tenant_config, + indoc! {r#" + [tenant_config] + some_invalid_field = 23 + "#} + ); + + test!( + l0_flush, + indoc! {r#" + [l0_flush] + mode = "direct" + some_invalid_field = 23 + "#} + ); + + // TODO: fix this => https://github.com/neondatabase/neon/issues/8915 + // test!( + // remote_storage_config, + // indoc! {r#" + // [remote_storage_config] + // local_path = "/nonexistent" + // some_invalid_field = 23 + // "#} + // ); + + test!( + compact_level0_phase1_value_access, + indoc! 
{r#" + [compact_level0_phase1_value_access] + mode = "streaming-kmerge" + some_invalid_field = 23 + "#} + ); } } diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 5e4a49bc56..a58fa2c0b1 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -41,19 +41,15 @@ // - The `#[allow(dead_code)]` above various structs are to suppress warnings about only the Debug impl // reading these fields. We use the Debug impl for semi-structured logging, though. -use std::{ - sync::Arc, - time::{Duration, SystemTime}, -}; +use std::{sync::Arc, time::SystemTime}; use anyhow::Context; -use pageserver_api::shard::TenantShardId; +use pageserver_api::{config::DiskUsageEvictionTaskConfig, shard::TenantShardId}; use remote_storage::GenericRemoteStorage; -use serde::{Deserialize, Serialize}; +use serde::Serialize; use tokio::time::Instant; use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, instrument, warn, Instrument}; -use utils::serde_percent::Percent; use utils::{completion, id::TimelineId}; use crate::{ @@ -69,23 +65,9 @@ use crate::{ CancellableTask, DiskUsageEvictionTask, }; -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct DiskUsageEvictionTaskConfig { - pub max_usage_pct: Percent, - pub min_avail_bytes: u64, - #[serde(with = "humantime_serde")] - pub period: Duration, - #[cfg(feature = "testing")] - pub mock_statvfs: Option, - /// Select sorting for evicted layers - #[serde(default)] - pub eviction_order: EvictionOrder, -} - /// Selects the sort order for eviction candidates *after* per tenant `min_resident_size` /// partitioning. 
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] -#[serde(tag = "type", content = "args")] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum EvictionOrder { /// Order the layers to be evicted by how recently they have been accessed relatively within /// the set of resident layers of a tenant. @@ -96,23 +78,22 @@ pub enum EvictionOrder { /// we read tenants is deterministic. If we find the need to use this as `false`, we need /// to ensure nondeterminism by adding in a random number to break the /// `relative_last_activity==0.0` ties. - #[serde(default = "default_highest_layer_count_loses_first")] highest_layer_count_loses_first: bool, }, } -impl Default for EvictionOrder { - fn default() -> Self { - Self::RelativeAccessed { - highest_layer_count_loses_first: true, +impl From for EvictionOrder { + fn from(value: pageserver_api::config::EvictionOrder) -> Self { + match value { + pageserver_api::config::EvictionOrder::RelativeAccessed { + highest_layer_count_loses_first, + } => Self::RelativeAccessed { + highest_layer_count_loses_first, + }, } } } -fn default_highest_layer_count_loses_first() -> bool { - true -} - impl EvictionOrder { fn sort(&self, candidates: &mut [(EvictionPartition, EvictionCandidate)]) { use EvictionOrder::*; @@ -295,7 +276,7 @@ async fn disk_usage_eviction_task_iteration( storage, usage_pre, tenant_manager, - task_config.eviction_order, + task_config.eviction_order.into(), cancel, ) .await; @@ -1257,7 +1238,6 @@ mod filesystem_level_usage { #[test] fn max_usage_pct_pressure() { - use super::EvictionOrder; use super::Usage as _; use std::time::Duration; use utils::serde_percent::Percent; @@ -1269,7 +1249,7 @@ mod filesystem_level_usage { period: Duration::MAX, #[cfg(feature = "testing")] mock_statvfs: None, - eviction_order: EvictionOrder::default(), + eviction_order: pageserver_api::config::EvictionOrder::default(), }, total_bytes: 100_000, avail_bytes: 0, diff --git a/pageserver/src/http/routes.rs 
b/pageserver/src/http/routes.rs index 90ae6c5557..d645f3b7b6 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -2076,7 +2076,7 @@ async fn disk_usage_eviction_run( evict_bytes: u64, #[serde(default)] - eviction_order: crate::disk_usage_eviction_task::EvictionOrder, + eviction_order: pageserver_api::config::EvictionOrder, } #[derive(Debug, Clone, Copy, serde::Serialize)] @@ -2112,7 +2112,7 @@ async fn disk_usage_eviction_run( &state.remote_storage, usage, &state.tenant_manager, - config.eviction_order, + config.eviction_order.into(), &cancel, ) .await; diff --git a/pageserver/src/l0_flush.rs b/pageserver/src/l0_flush.rs index 313a7961a6..491c9fb96c 100644 --- a/pageserver/src/l0_flush.rs +++ b/pageserver/src/l0_flush.rs @@ -1,9 +1,7 @@ use std::{num::NonZeroUsize, sync::Arc}; -#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize)] -#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)] +#[derive(Debug, PartialEq, Eq, Clone)] pub enum L0FlushConfig { - #[serde(rename_all = "snake_case")] Direct { max_concurrency: NonZeroUsize }, } @@ -16,6 +14,16 @@ impl Default for L0FlushConfig { } } +impl From for L0FlushConfig { + fn from(config: pageserver_api::models::L0FlushConfig) -> Self { + match config { + pageserver_api::models::L0FlushConfig::Direct { max_concurrency } => { + Self::Direct { max_concurrency } + } + } + } +} + #[derive(Clone)] pub struct L0FlushGlobalState(Arc); diff --git a/pageserver/src/statvfs.rs b/pageserver/src/statvfs.rs index ede1791afa..5a6f6e5176 100644 --- a/pageserver/src/statvfs.rs +++ b/pageserver/src/statvfs.rs @@ -60,32 +60,7 @@ pub mod mock { use regex::Regex; use tracing::log::info; - #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] - #[serde(tag = "type")] - pub enum Behavior { - Success { - blocksize: u64, - total_blocks: u64, - name_filter: Option, - }, - Failure { - mocked_error: MockedError, - }, - } - - #[derive(Debug, Clone, Copy, PartialEq, Eq, 
serde::Serialize, serde::Deserialize)] - #[allow(clippy::upper_case_acronyms)] - pub enum MockedError { - EIO, - } - - impl From for nix::Error { - fn from(e: MockedError) -> Self { - match e { - MockedError::EIO => nix::Error::EIO, - } - } - } + pub use pageserver_api::config::statvfs::mock::Behavior; pub fn get(tenants_dir: &Utf8Path, behavior: &Behavior) -> nix::Result { info!("running mocked statvfs"); @@ -116,6 +91,7 @@ pub mod mock { block_size: *blocksize, }) } + #[cfg(feature = "testing")] Behavior::Failure { mocked_error } => Err((*mocked_error).into()), } } diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 48ff17db94..7e0344666b 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -9,11 +9,10 @@ //! may lead to a data loss. //! use anyhow::bail; +pub(crate) use pageserver_api::config::TenantConfigToml as TenantConf; use pageserver_api::models::AuxFilePolicy; -use pageserver_api::models::CompactionAlgorithm; use pageserver_api::models::CompactionAlgorithmSettings; use pageserver_api::models::EvictionPolicy; -use pageserver_api::models::LsnLease; use pageserver_api::models::{self, ThrottleConfig}; use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize}; use serde::de::IntoDeserializer; @@ -23,50 +22,6 @@ use std::num::NonZeroU64; use std::time::Duration; use utils::generation::Generation; -pub mod defaults { - - // FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB - // would be more appropriate. But a low value forces the code to be exercised more, - // which is good for now to trigger bugs. - // This parameter actually determines L0 layer file size. - pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024; - pub const DEFAULT_CHECKPOINT_TIMEOUT: &str = "10 m"; - - // FIXME the below configs are only used by legacy algorithm. The new algorithm - // has different parameters. 
- - // Target file size, when creating image and delta layers. - // This parameter determines L1 layer file size. - pub const DEFAULT_COMPACTION_TARGET_SIZE: u64 = 128 * 1024 * 1024; - - pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s"; - pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10; - pub const DEFAULT_COMPACTION_ALGORITHM: super::CompactionAlgorithm = - super::CompactionAlgorithm::Legacy; - - pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024; - - // Large DEFAULT_GC_PERIOD is fine as long as PITR_INTERVAL is larger. - // If there's a need to decrease this value, first make sure that GC - // doesn't hold a layer map write lock for non-trivial operations. - // Relevant: https://github.com/neondatabase/neon/issues/3394 - pub const DEFAULT_GC_PERIOD: &str = "1 hr"; - pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3; - pub const DEFAULT_PITR_INTERVAL: &str = "7 days"; - pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds"; - pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds"; - // The default limit on WAL lag should be set to avoid causing disconnects under high throughput - // scenarios: since the broker stats are updated ~1/s, a value of 1GiB should be sufficient for - // throughputs up to 1GiB/s per timeline. - pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1024 * 1024 * 1024; - pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour"; - // By default ingest enough WAL for two new L0 layers before checking if new image - // image layers should be created. 
- pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2; - - pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100; -} - #[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)] pub(crate) enum AttachmentMode { /// Our generation is current as far as we know, and as far as we know we are the only attached @@ -281,96 +236,20 @@ impl LocationConf { } } -/// A tenant's calcuated configuration, which is the result of merging a -/// tenant's TenantConfOpt with the global TenantConf from PageServerConf. -/// -/// For storing and transmitting individual tenant's configuration, see -/// TenantConfOpt. -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct TenantConf { - // Flush out an inmemory layer, if it's holding WAL older than this - // This puts a backstop on how much WAL needs to be re-digested if the - // page server crashes. - // This parameter actually determines L0 layer file size. - pub checkpoint_distance: u64, - // Inmemory layer is also flushed at least once in checkpoint_timeout to - // eventually upload WAL after activity is stopped. - #[serde(with = "humantime_serde")] - pub checkpoint_timeout: Duration, - // Target file size, when creating image and delta layers. - // This parameter determines L1 layer file size. - pub compaction_target_size: u64, - // How often to check if there's compaction work to be done. - // Duration::ZERO means automatic compaction is disabled. - #[serde(with = "humantime_serde")] - pub compaction_period: Duration, - // Level0 delta layer threshold for compaction. - pub compaction_threshold: usize, - pub compaction_algorithm: CompactionAlgorithmSettings, - // Determines how much history is retained, to allow - // branching and read replicas at an older point in time. - // The unit is #of bytes of WAL. - // Page versions older than this are garbage collected away. - pub gc_horizon: u64, - // Interval at which garbage collection is triggered. 
- // Duration::ZERO means automatic GC is disabled - #[serde(with = "humantime_serde")] - pub gc_period: Duration, - // Delta layer churn threshold to create L1 image layers. - pub image_creation_threshold: usize, - // Determines how much history is retained, to allow - // branching and read replicas at an older point in time. - // The unit is time. - // Page versions older than this are garbage collected away. - #[serde(with = "humantime_serde")] - pub pitr_interval: Duration, - /// Maximum amount of time to wait while opening a connection to receive wal, before erroring. - #[serde(with = "humantime_serde")] - pub walreceiver_connect_timeout: Duration, - /// Considers safekeepers stalled after no WAL updates were received longer than this threshold. - /// A stalled safekeeper will be changed to a newer one when it appears. - #[serde(with = "humantime_serde")] - pub lagging_wal_timeout: Duration, - /// Considers safekeepers lagging when their WAL is behind another safekeeper for more than this threshold. - /// A lagging safekeeper will be changed after `lagging_wal_timeout` time elapses since the last WAL update, - /// to avoid eager reconnects. - pub max_lsn_wal_lag: NonZeroU64, - pub eviction_policy: EvictionPolicy, - pub min_resident_size_override: Option, - // See the corresponding metric's help string. - #[serde(with = "humantime_serde")] - pub evictions_low_residence_duration_metric_threshold: Duration, - - /// If non-zero, the period between uploads of a heatmap from attached tenants. This - /// may be disabled if a Tenant will not have secondary locations: only secondary - /// locations will use the heatmap uploaded by attached locations. 
- #[serde(with = "humantime_serde")] - pub heatmap_period: Duration, - - /// If true then SLRU segments are dowloaded on demand, if false SLRU segments are included in basebackup - pub lazy_slru_download: bool, - - pub timeline_get_throttle: pageserver_api::models::ThrottleConfig, - - // How much WAL must be ingested before checking again whether a new image layer is required. - // Expresed in multiples of checkpoint distance. - pub image_layer_creation_check_threshold: u8, - - /// Switch to a new aux file policy. Switching this flag requires the user has not written any aux file into - /// the storage before, and this flag cannot be switched back. Otherwise there will be data corruptions. - /// There is a `last_aux_file_policy` flag which gets persisted in `index_part.json` once the first aux - /// file is written. - pub switch_aux_file_policy: AuxFilePolicy, - - /// The length for an explicit LSN lease request. - /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval. - #[serde(with = "humantime_serde")] - pub lsn_lease_length: Duration, - - /// The length for an implicit LSN lease granted as part of `get_lsn_by_timestamp` request. - /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval. - #[serde(with = "humantime_serde")] - pub lsn_lease_length_for_ts: Duration, +impl Default for LocationConf { + // TODO: this should be removed once tenant loading can guarantee that we are never + // loading from a directory without a configuration. 
+ // => tech debt since https://github.com/neondatabase/neon/issues/1555 + fn default() -> Self { + Self { + mode: LocationMode::Attached(AttachedLocationConfig { + generation: Generation::none(), + attach_mode: AttachmentMode::Single, + }), + tenant_conf: TenantConfOpt::default(), + shard: ShardIdentity::unsharded(), + } + } } /// Same as TenantConf, but this struct preserves the information about @@ -545,51 +424,6 @@ impl TenantConfOpt { } } -impl Default for TenantConf { - fn default() -> Self { - use defaults::*; - Self { - checkpoint_distance: DEFAULT_CHECKPOINT_DISTANCE, - checkpoint_timeout: humantime::parse_duration(DEFAULT_CHECKPOINT_TIMEOUT) - .expect("cannot parse default checkpoint timeout"), - compaction_target_size: DEFAULT_COMPACTION_TARGET_SIZE, - compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD) - .expect("cannot parse default compaction period"), - compaction_threshold: DEFAULT_COMPACTION_THRESHOLD, - compaction_algorithm: CompactionAlgorithmSettings { - kind: DEFAULT_COMPACTION_ALGORITHM, - }, - gc_horizon: DEFAULT_GC_HORIZON, - gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD) - .expect("cannot parse default gc period"), - image_creation_threshold: DEFAULT_IMAGE_CREATION_THRESHOLD, - pitr_interval: humantime::parse_duration(DEFAULT_PITR_INTERVAL) - .expect("cannot parse default PITR interval"), - walreceiver_connect_timeout: humantime::parse_duration( - DEFAULT_WALRECEIVER_CONNECT_TIMEOUT, - ) - .expect("cannot parse default walreceiver connect timeout"), - lagging_wal_timeout: humantime::parse_duration(DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT) - .expect("cannot parse default walreceiver lagging wal timeout"), - max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG) - .expect("cannot parse default max walreceiver Lsn wal lag"), - eviction_policy: EvictionPolicy::NoEviction, - min_resident_size_override: None, - evictions_low_residence_duration_metric_threshold: humantime::parse_duration( - 
DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD, - ) - .expect("cannot parse default evictions_low_residence_duration_metric_threshold"), - heatmap_period: Duration::ZERO, - lazy_slru_download: false, - timeline_get_throttle: crate::tenant::throttle::Config::disabled(), - image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD, - switch_aux_file_policy: AuxFilePolicy::default_tenant_config(), - lsn_lease_length: LsnLease::DEFAULT_LENGTH, - lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS, - } - } -} - impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt { type Error = anyhow::Error; diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index b8e9a98149..6a2cd94232 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -39,7 +39,7 @@ use crate::tenant::disk_btree::{ use crate::tenant::storage_layer::layer::S3_UPLOAD_LIMIT; use crate::tenant::timeline::GetVectoredError; use crate::tenant::vectored_blob_io::{ - BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, + BlobFlag, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, VectoredReadCoalesceMode, VectoredReadPlanner, }; use crate::tenant::PageReconstructError; @@ -52,6 +52,7 @@ use bytes::BytesMut; use camino::{Utf8Path, Utf8PathBuf}; use futures::StreamExt; use itertools::Itertools; +use pageserver_api::config::MaxVectoredReadBytes; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::ImageCompressionAlgorithm; use pageserver_api::shard::TenantShardId; diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 4a095c564d..77ce1ae670 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -34,8 +34,7 @@ use crate::tenant::disk_btree::{ 
}; use crate::tenant::timeline::GetVectoredError; use crate::tenant::vectored_blob_io::{ - BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, - VectoredReadPlanner, + BlobFlag, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, VectoredReadPlanner, }; use crate::tenant::{PageReconstructError, Timeline}; use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; @@ -46,6 +45,7 @@ use bytes::{Bytes, BytesMut}; use camino::{Utf8Path, Utf8PathBuf}; use hex; use itertools::Itertools; +use pageserver_api::config::MaxVectoredReadBytes; use pageserver_api::keyspace::KeySpace; use pageserver_api::shard::{ShardIdentity, TenantShardId}; use rand::{distributions::Alphanumeric, Rng}; diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 2c19e5b19f..e487bee1f2 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -215,7 +215,7 @@ impl IndexEntry { const _ASSERT_DEFAULT_CHECKPOINT_DISTANCE_IS_VALID: () = { let res = Self::validate_checkpoint_distance( - crate::tenant::config::defaults::DEFAULT_CHECKPOINT_DISTANCE, + pageserver_api::config::tenant_conf_defaults::DEFAULT_CHECKPOINT_DISTANCE, ); if res.is_err() { panic!("default checkpoint distance is valid") diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index f5680ced90..478e9bb4f0 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -10,7 +10,6 @@ use crate::context::{DownloadBehavior, RequestContext}; use crate::metrics::TENANT_TASK_EVENTS; use crate::task_mgr; use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME}; -use crate::tenant::config::defaults::DEFAULT_COMPACTION_PERIOD; use crate::tenant::throttle::Stats; use crate::tenant::timeline::CompactionError; use crate::tenant::{Tenant, TenantState}; @@ -456,9 +455,11 @@ async fn 
ingest_housekeeping_loop(tenant: Arc, cancel: CancellationToken // If compaction period is set to zero (to disable it), then we will use a reasonable default let period = if period == Duration::ZERO { - humantime::Duration::from_str(DEFAULT_COMPACTION_PERIOD) - .unwrap() - .into() + humantime::Duration::from_str( + pageserver_api::config::tenant_conf_defaults::DEFAULT_COMPACTION_PERIOD, + ) + .unwrap() + .into() } else { period }; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 3b8f19a6c0..262dccac7d 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -66,7 +66,6 @@ use std::{ use crate::{ aux_file::AuxFileSizeEstimator, tenant::{ - config::defaults::DEFAULT_PITR_INTERVAL, layer_map::{LayerMap, SearchResult}, metadata::TimelineMetadata, storage_layer::{inmemory_layer::IndexEntry, PersistentLayerDesc}, @@ -102,6 +101,7 @@ use crate::{ pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind}, virtual_file::{MaybeFatalIo, VirtualFile}, }; +use pageserver_api::config::tenant_conf_defaults::DEFAULT_PITR_INTERVAL; use crate::config::PageServerConf; use crate::keyspace::{KeyPartitioning, KeySpace}; diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index aad75ac59c..6b9c8386f7 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -19,6 +19,7 @@ use bytes::Bytes; use enumset::EnumSet; use fail::fail_point; use itertools::Itertools; +use pageserver_api::config::{CompactL0BypassPageCacheValidation, CompactL0Phase1ValueAccess}; use pageserver_api::key::KEY_SIZE; use pageserver_api::keyspace::ShardedRange; use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId}; @@ -29,7 +30,6 @@ use utils::id::TimelineId; use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}; use crate::page_cache; -use crate::tenant::config::defaults::{DEFAULT_CHECKPOINT_DISTANCE, 
DEFAULT_COMPACTION_THRESHOLD}; use crate::tenant::remote_timeline_client::WaitCompletionError; use crate::tenant::storage_layer::merge_iterator::MergeIterator; use crate::tenant::storage_layer::split_writer::{ @@ -43,6 +43,9 @@ use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter}; use crate::tenant::timeline::{Layer, ResidentLayer}; use crate::tenant::DeltaLayer; use crate::virtual_file::{MaybeFatalIo, VirtualFile}; +use pageserver_api::config::tenant_conf_defaults::{ + DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD, +}; use crate::keyspace::KeySpace; use crate::repository::{Key, Value}; @@ -1433,43 +1436,6 @@ impl TryFrom for CompactLevel0Phase1Stats { } } -#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)] -#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)] -pub enum CompactL0Phase1ValueAccess { - /// The old way. - PageCachedBlobIo, - /// The new way. - StreamingKmerge { - /// If set, we run both the old way and the new way, validate that - /// they are identical (=> [`CompactL0BypassPageCacheValidation`]), - /// and if the validation fails, - /// - in tests: fail them with a panic or - /// - in prod, log a rate-limited warning and use the old way's results. - /// - /// If not set, we only run the new way and trust its results. - validate: Option, - }, -} - -/// See [`CompactL0Phase1ValueAccess::StreamingKmerge`]. -#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)] -#[serde(rename_all = "kebab-case")] -pub enum CompactL0BypassPageCacheValidation { - /// Validate that the series of (key, lsn) pairs are the same. - KeyLsn, - /// Validate that the entire output of old and new way is identical. 
- KeyLsnValue, -} - -impl Default for CompactL0Phase1ValueAccess { - fn default() -> Self { - CompactL0Phase1ValueAccess::StreamingKmerge { - // TODO(https://github.com/neondatabase/neon/issues/8184): change to None once confident - validate: Some(CompactL0BypassPageCacheValidation::KeyLsnValue), - } - } -} - impl Timeline { /// Entry point for new tiered compaction algorithm. /// diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index 146bcf0e35..4d51dc442d 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -16,7 +16,6 @@ //! Note that the vectored blob api does *not* go through the page cache. use std::collections::BTreeMap; -use std::num::NonZeroUsize; use bytes::BytesMut; use pageserver_api::key::Key; @@ -29,9 +28,6 @@ use crate::context::RequestContext; use crate::tenant::blob_io::{BYTE_UNCOMPRESSED, BYTE_ZSTD, LEN_COMPRESSION_BIT_MASK}; use crate::virtual_file::{self, VirtualFile}; -#[derive(Copy, Clone, Debug, PartialEq, Eq)] -pub struct MaxVectoredReadBytes(pub NonZeroUsize); - /// Metadata bundled with the start and end offset of a blob. #[derive(Copy, Clone, Debug)] pub struct BlobMeta { diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 97d966e2da..ed6ff86c10 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -10,7 +10,6 @@ //! This is similar to PostgreSQL's virtual file descriptor facility in //! src/backend/storage/file/fd.c //! 
-use crate::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT; use crate::context::RequestContext; use crate::metrics::{StorageIoOperation, STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC}; @@ -19,6 +18,7 @@ use crate::tenant::TENANTS_SEGMENT_NAME; use camino::{Utf8Path, Utf8PathBuf}; use once_cell::sync::OnceCell; use owned_buffers_io::io_buf_ext::FullSlice; +use pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT; use pageserver_api::shard::TenantShardId; use std::fs::File; use std::io::{Error, ErrorKind, Seek, SeekFrom}; diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs index faef1ba9ff..ccde90ee1a 100644 --- a/pageserver/src/virtual_file/io_engine.rs +++ b/pageserver/src/virtual_file/io_engine.rs @@ -84,9 +84,14 @@ pub(crate) fn get() -> IoEngine { } }, Err(std::env::VarError::NotPresent) => { - crate::config::defaults::DEFAULT_VIRTUAL_FILE_IO_ENGINE - .parse() - .unwrap() + #[cfg(target_os = "linux")] + { + IoEngineKind::TokioEpollUring + } + #[cfg(not(target_os = "linux"))] + { + IoEngineKind::StdFs + } } Err(std::env::VarError::NotUnicode(_)) => { panic!("env var {env_var_name} is not unicode"); diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 890538b86a..2df45a7e0e 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -24,7 +24,20 @@ from functools import cached_property, partial from itertools import chain, product from pathlib import Path from types import TracebackType -from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Tuple, Type, Union, cast +from typing import ( + Any, + Callable, + Dict, + Iterable, + Iterator, + List, + Optional, + Tuple, + Type, + TypeVar, + Union, + cast, +) from urllib.parse import quote, urlparse import asyncpg @@ -90,6 +103,8 @@ from fixtures.utils import AuxFileStore as AuxFileStore # reexport from .neon_api import NeonAPI, NeonApiEndpoint +T = TypeVar("T") + """ 
This file contains pytest fixtures. A fixture is a test resource that can be summoned by placing its name in the test's arguments. @@ -2986,16 +3001,17 @@ class NeonPageserver(PgProtocol, LogUtils): def config_toml_path(self) -> Path: return self.workdir / "pageserver.toml" - def edit_config_toml(self, edit_fn: Callable[[Dict[str, Any]], None]): + def edit_config_toml(self, edit_fn: Callable[[Dict[str, Any]], T]) -> T: """ Edit the pageserver's config toml file in place. """ path = self.config_toml_path with open(path, "r") as f: config = toml.load(f) - edit_fn(config) + res = edit_fn(config) with open(path, "w") as f: toml.dump(config, f) + return res def patch_config_toml_nonrecursive(self, patch: Dict[str, Any]) -> Dict[str, Any]: """ diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 73af7950f1..ebf58d2bd1 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -142,11 +142,10 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): # We will start a pageserver with no control_plane_api set, so it won't be able to self-register env.storage_controller.node_register(env.pageserver) - replaced_config = env.pageserver.patch_config_toml_nonrecursive( - { - "control_plane_api": "", - } - ) + def remove_control_plane_api_field(config): + return config.pop("control_plane_api") + + control_plane_api = env.pageserver.edit_config_toml(remove_control_plane_api_field) env.pageserver.start() env.storage_controller.node_configure(env.pageserver.id, {"availability": "Active"}) @@ -179,7 +178,11 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): env.pageserver.stop() # Starting without the override that disabled control_plane_api - env.pageserver.patch_config_toml_nonrecursive(replaced_config) + env.pageserver.patch_config_toml_nonrecursive( + { + "control_plane_api": control_plane_api, + } + ) 
env.pageserver.start() generate_uploads_and_deletions(env, pageserver=env.pageserver, init=False) diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 642b9e449b..9bf5f8680b 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -733,7 +733,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): # We will run with the limit set to 1, so that once we have one tenant stuck # in a pausable failpoint, the rest are prevented from proceeding through warmup. - neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = '1'" + neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = 1" env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() @@ -984,7 +984,7 @@ def test_timeline_logical_size_task_priority(neon_env_builder: NeonEnvBuilder): def test_eager_attach_does_not_queue_up(neon_env_builder: NeonEnvBuilder): - neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = '1'" + neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = 1" env = neon_env_builder.init_start() @@ -1062,7 +1062,7 @@ def test_eager_attach_does_not_queue_up(neon_env_builder: NeonEnvBuilder): @pytest.mark.parametrize("activation_method", ["endpoint", "branch", "delete"]) def test_lazy_attach_activation(neon_env_builder: NeonEnvBuilder, activation_method: str): # env.initial_tenant will take up this permit when attaching with lazy because of a failpoint activated after restart - neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = '1'" + neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = 1" env = neon_env_builder.init_start()