diff --git a/Dockerfile b/Dockerfile index a41598ef72..ace112cccf 100644 --- a/Dockerfile +++ b/Dockerfile @@ -93,13 +93,14 @@ COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/ # By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config. # Now, when `docker run ... pageserver` is run, it can start without errors, yet will have some default dummy values. -RUN mkdir -p /data/.neon/ && chown -R neon:neon /data/.neon/ \ - && /usr/local/bin/pageserver -D /data/.neon/ --init \ - -c "id=1234" \ - -c "broker_endpoint='http://storage_broker:50051'" \ - -c "pg_distrib_dir='/usr/local/'" \ - -c "listen_pg_addr='0.0.0.0:6400'" \ - -c "listen_http_addr='0.0.0.0:9898'" +RUN mkdir -p /data/.neon/ && \ + echo "id=1234" > "/data/.neon/identity.toml" && \ + printf '%s\n' "broker_endpoint='http://storage_broker:50051'" \ + "pg_distrib_dir='/usr/local/'" \ + "listen_pg_addr='0.0.0.0:6400'" \ + "listen_http_addr='0.0.0.0:9898'" \ + > /data/.neon/pageserver.toml && \ + chown -R neon:neon /data/.neon # When running a binary that links with libpq, default to using our most recent postgres version. Binaries # that want a particular postgres version will select it explicitly: this is just a default. 
@@ -110,3 +111,6 @@ VOLUME ["/data"] USER neon EXPOSE 6400 EXPOSE 9898 + +CMD /usr/local/bin/pageserver -D /data/.neon + diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index e3d1d0e110..ba4f98d945 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -25,6 +25,7 @@ use pageserver_client::mgmt_api; use postgres_backend::AuthType; use postgres_connection::{parse_host_port, PgConnectionConfig}; use utils::auth::{Claims, Scope}; +use utils::id::NodeId; use utils::{ id::{TenantId, TimelineId}, lsn::Lsn, @@ -74,6 +75,10 @@ impl PageServerNode { } } + fn pageserver_make_identity_toml(&self, node_id: NodeId) -> toml_edit::Document { + toml_edit::Document::from_str(&format!("id={node_id}")).unwrap() + } + fn pageserver_init_make_toml( &self, conf: NeonLocalInitPageserverConf, @@ -186,6 +191,19 @@ impl PageServerNode { .write_all(config.to_string().as_bytes()) .context("write pageserver toml")?; drop(config_file); + + let identity_file_path = datadir.join("identity.toml"); + let mut identity_file = std::fs::OpenOptions::new() + .create_new(true) + .write(true) + .open(&identity_file_path) + .with_context(|| format!("open identity toml for write: {identity_file_path:?}"))?; + let identity_toml = self.pageserver_make_identity_toml(node_id); + identity_file + .write_all(identity_toml.to_string().as_bytes()) + .context("write identity toml")?; + drop(identity_file); + + // TODO: invoke a TBD config-check command to validate that pageserver will start with the written config // Write metadata file, used by pageserver on startup to register itself with diff --git a/docker-compose/compute_wrapper/shell/compute.sh b/docker-compose/compute_wrapper/shell/compute.sh index f646e36f59..33455e458a 100755 --- a/docker-compose/compute_wrapper/shell/compute.sh +++ b/docker-compose/compute_wrapper/shell/compute.sh @@ -33,7 +33,7 @@ echo $result | jq . 
generate_id timeline_id PARAMS=( - -sb + -sbf -X POST -H "Content-Type: application/json" -d "{\"new_timeline_id\": \"${timeline_id}\", \"pg_version\": ${PG_VERSION}}" diff --git a/docker-compose/docker-compose.yml b/docker-compose/docker-compose.yml index 5503b6611a..6e15fdbe0d 100644 --- a/docker-compose/docker-compose.yml +++ b/docker-compose/docker-compose.yml @@ -31,25 +31,14 @@ services: restart: always image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest} environment: - - BROKER_ENDPOINT='http://storage_broker:50051' - AWS_ACCESS_KEY_ID=minio - AWS_SECRET_ACCESS_KEY=password #- RUST_BACKTRACE=1 ports: #- 6400:6400 # pg protocol handler - 9898:9898 # http endpoints - entrypoint: - - "/bin/sh" - - "-c" - command: - - "/usr/local/bin/pageserver -D /data/.neon/ - -c \"broker_endpoint=$$BROKER_ENDPOINT\" - -c \"listen_pg_addr='0.0.0.0:6400'\" - -c \"listen_http_addr='0.0.0.0:9898'\" - -c \"remote_storage={endpoint='http://minio:9000', - bucket_name='neon', - bucket_region='eu-north-1', - prefix_in_bucket='/pageserver/'}\"" + volumes: + - ./pageserver_config:/data/.neon/ depends_on: - storage_broker - minio_create_buckets diff --git a/docker-compose/pageserver_config/identity.toml b/docker-compose/pageserver_config/identity.toml new file mode 100644 index 0000000000..20121327c7 --- /dev/null +++ b/docker-compose/pageserver_config/identity.toml @@ -0,0 +1 @@ +id=1234 diff --git a/docker-compose/pageserver_config/pageserver.toml b/docker-compose/pageserver_config/pageserver.toml new file mode 100644 index 0000000000..76935453b6 --- /dev/null +++ b/docker-compose/pageserver_config/pageserver.toml @@ -0,0 +1,5 @@ +broker_endpoint='http://storage_broker:50051' +pg_distrib_dir='/usr/local/' +listen_pg_addr='0.0.0.0:6400' +listen_http_addr='0.0.0.0:9898' +remote_storage={ endpoint='http://minio:9000', bucket_name='neon', bucket_region='eu-north-1', prefix_in_bucket='/pageserver' } diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 
978b81d498..db27a77ec6 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -2,17 +2,18 @@ //! Main entry point for the Page Server executable. +use std::env; use std::env::{var, VarError}; use std::io::Read; use std::sync::Arc; use std::time::Duration; -use std::{env, ops::ControlFlow, str::FromStr}; use anyhow::{anyhow, Context}; use camino::Utf8Path; use clap::{Arg, ArgAction, Command}; use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp}; +use pageserver::config::PageserverIdentity; use pageserver::control_plane_client::ControlPlaneClient; use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task}; use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING}; @@ -29,7 +30,7 @@ use tracing::*; use metrics::set_build_info_metric; use pageserver::{ - config::{defaults::*, PageServerConf}, + config::PageServerConf, context::{DownloadBehavior, RequestContext}, deletion_queue::DeletionQueue, http, page_cache, page_service, task_mgr, @@ -88,18 +89,13 @@ fn main() -> anyhow::Result<()> { .with_context(|| format!("Error opening workdir '{workdir}'"))?; let cfg_file_path = workdir.join("pageserver.toml"); + let identity_file_path = workdir.join("identity.toml"); // Set CWD to workdir for non-daemon modes env::set_current_dir(&workdir) .with_context(|| format!("Failed to set application's current dir to '{workdir}'"))?; - let conf = match initialize_config(&cfg_file_path, arg_matches, &workdir)? { - ControlFlow::Continue(conf) => conf, - ControlFlow::Break(()) => { - info!("Pageserver config init successful"); - return Ok(()); - } - }; + let conf = initialize_config(&identity_file_path, &cfg_file_path, &workdir)?; // Initialize logging. 
// @@ -154,70 +150,55 @@ fn main() -> anyhow::Result<()> { } fn initialize_config( + identity_file_path: &Utf8Path, cfg_file_path: &Utf8Path, - arg_matches: clap::ArgMatches, workdir: &Utf8Path, -) -> anyhow::Result<ControlFlow<(), &'static PageServerConf>> { - let init = arg_matches.get_flag("init"); - - let file_contents: Option<toml_edit::Document> = match std::fs::File::open(cfg_file_path) { +) -> anyhow::Result<&'static PageServerConf> { + // The deployment orchestrator writes out an identity file containing the node id + // for all pageservers. This file is the source of truth for the node id. In order + // to allow for rolling back pageserver releases, the node id is also included in + // the pageserver config that the deployment orchestrator writes to disk for the pageserver. + // A rolled back version of the pageserver will get the node id from the pageserver.toml + // config file. + let identity = match std::fs::File::open(identity_file_path) { Ok(mut f) => { - if init { - anyhow::bail!("config file already exists: {cfg_file_path}"); + let md = f.metadata().context("stat identity file")?; + if !md.is_file() { + anyhow::bail!("Pageserver found identity file but it is a dir entry: {identity_file_path}. Aborting start up ..."); } + + let mut s = String::new(); + f.read_to_string(&mut s).context("read identity file")?; + toml_edit::de::from_str::<PageserverIdentity>(&s).context("parse identity file toml")? + } + Err(e) => { + anyhow::bail!("Pageserver could not read identity file: {identity_file_path}: {e}. Aborting start up ..."); + } + }; + + let config: toml_edit::Document = match std::fs::File::open(cfg_file_path) { + Ok(mut f) => { let md = f.metadata().context("stat config file")?; if md.is_file() { let mut s = String::new(); f.read_to_string(&mut s).context("read config file")?; - Some(s.parse().context("parse config file toml")?) + s.parse().context("parse config file toml")? 
} else { anyhow::bail!("directory entry exists but is not a file: {cfg_file_path}"); } } - Err(e) if e.kind() == std::io::ErrorKind::NotFound => None, Err(e) => { anyhow::bail!("open pageserver config: {e}: {cfg_file_path}"); } }; - let mut effective_config = file_contents.unwrap_or_else(|| { - DEFAULT_CONFIG_FILE - .parse() - .expect("unit tests ensure this works") - }); - - // Patch with overrides from the command line - if let Some(values) = arg_matches.get_many::<String>("config-override") { - for option_line in values { - let doc = toml_edit::Document::from_str(option_line).with_context(|| { - format!("Option '{option_line}' could not be parsed as a toml document") - })?; - - for (key, item) in doc.iter() { - effective_config.insert(key, item.clone()); - } - } - } - - debug!("Resulting toml: {effective_config}"); + debug!("Using pageserver toml: {config}"); // Construct the runtime representation - let conf = PageServerConf::parse_and_validate(&effective_config, workdir) + let conf = PageServerConf::parse_and_validate(identity.id, &config, workdir) .context("Failed to parse pageserver configuration")?; - if init { - info!("Writing pageserver config to '{cfg_file_path}'"); - - std::fs::write(cfg_file_path, effective_config.to_string()) - .with_context(|| format!("Failed to write pageserver config to '{cfg_file_path}'"))?; - info!("Config successfully written to '{cfg_file_path}'") - } - - Ok(if init { - ControlFlow::Break(()) - } else { - ControlFlow::Continue(Box::leak(Box::new(conf))) - }) + Ok(Box::leak(Box::new(conf))) } struct WaitForPhaseResult { @@ -710,28 +691,12 @@ fn cli() -> Command { Command::new("Neon page server") .about("Materializes WAL stream to pages and serves them to the postgres") .version(version()) - .arg( - Arg::new("init") - .long("init") - .action(ArgAction::SetTrue) - .help("Initialize pageserver with all given config overrides"), - ) .arg( Arg::new("workdir") .short('D') .long("workdir") .help("Working directory for the pageserver"), 
) - // 
See `settings.md` for more details on the extra configuration patameters pageserver can process - .arg( - Arg::new("config-override") - .long("config-override") - .short('c') - .num_args(1) - .action(ArgAction::Append) - .help("Additional configuration overrides of the ones from the toml config file (or new ones to add there). \ - Any option has to be a valid toml document, example: `-c=\"foo='hey'\"` `-c=\"foo={value=1}\"`"), - ) .arg( Arg::new("enabled-features") .long("enabled-features") diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 6a78d126cf..20e78b1d85 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -7,8 +7,8 @@ use anyhow::{anyhow, bail, ensure, Context, Result}; use pageserver_api::{models::ImageCompressionAlgorithm, shard::TenantShardId}; use remote_storage::{RemotePath, RemoteStorageConfig}; -use serde; use serde::de::IntoDeserializer; +use serde::{self, Deserialize}; use std::env; use storage_broker::Uri; use utils::crashsafe::path_with_suffix_extension; @@ -406,6 +406,13 @@ struct PageServerConfigBuilder { } impl PageServerConfigBuilder { + fn new(node_id: NodeId) -> Self { + let mut this = Self::default(); + this.id(node_id); + + this + } + #[inline(always)] fn default_values() -> Self { use self::BuilderValue::*; @@ -881,8 +888,12 @@ impl PageServerConf { /// validating the input and failing on errors. /// /// This leaves any options not present in the file in the built-in defaults. 
- pub fn parse_and_validate(toml: &Document, workdir: &Utf8Path) -> anyhow::Result { - let mut builder = PageServerConfigBuilder::default(); + pub fn parse_and_validate( + node_id: NodeId, + toml: &Document, + workdir: &Utf8Path, + ) -> anyhow::Result { + let mut builder = PageServerConfigBuilder::new(node_id); builder.workdir(workdir.to_owned()); let mut t_conf = TenantConfOpt::default(); @@ -913,7 +924,8 @@ impl PageServerConf { "tenant_config" => { t_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("failed to parse: '{key}'"))?; } - "id" => builder.id(NodeId(parse_toml_u64(key, item)?)), + "id" => {}, // Ignoring `id` field in pageserver.toml - using identity.toml as the source of truth + // Logging is not set up yet, so we can't do it. "broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?), "broker_keepalive_interval" => builder.broker_keepalive_interval(parse_toml_duration(key, item)?), "log_format" => builder.log_format( @@ -1090,6 +1102,12 @@ impl PageServerConf { } } +#[derive(Deserialize)] +#[serde(deny_unknown_fields)] +pub struct PageserverIdentity { + pub id: NodeId, +} + // Helper functions to parse a toml Item fn parse_toml_string(name: &str, item: &Item) -> Result { @@ -1259,7 +1277,7 @@ background_task_maximum_delay = '334 s' ); let toml = config_string.parse()?; - let parsed_config = PageServerConf::parse_and_validate(&toml, &workdir) + let parsed_config = PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir) .unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e:?}")); assert_eq!( @@ -1341,7 +1359,7 @@ background_task_maximum_delay = '334 s' ); let toml = config_string.parse()?; - let parsed_config = PageServerConf::parse_and_validate(&toml, &workdir) + let parsed_config = PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir) .unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e:?}")); 
assert_eq!( @@ -1431,12 +1449,13 @@ broker_endpoint = '{broker_endpoint}' let toml = config_string.parse()?; - let parsed_remote_storage_config = PageServerConf::parse_and_validate(&toml, &workdir) - .unwrap_or_else(|e| { - panic!("Failed to parse config '{config_string}', reason: {e:?}") - }) - .remote_storage_config - .expect("Should have remote storage config for the local FS"); + let parsed_remote_storage_config = + PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir) + .unwrap_or_else(|e| { + panic!("Failed to parse config '{config_string}', reason: {e:?}") + }) + .remote_storage_config + .expect("Should have remote storage config for the local FS"); assert_eq!( parsed_remote_storage_config, @@ -1492,12 +1511,13 @@ broker_endpoint = '{broker_endpoint}' let toml = config_string.parse()?; - let parsed_remote_storage_config = PageServerConf::parse_and_validate(&toml, &workdir) - .unwrap_or_else(|e| { - panic!("Failed to parse config '{config_string}', reason: {e:?}") - }) - .remote_storage_config - .expect("Should have remote storage config for S3"); + let parsed_remote_storage_config = + PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir) + .unwrap_or_else(|e| { + panic!("Failed to parse config '{config_string}', reason: {e:?}") + }) + .remote_storage_config + .expect("Should have remote storage config for S3"); assert_eq!( parsed_remote_storage_config, @@ -1576,7 +1596,7 @@ threshold = "20m" "#, ); let toml: Document = pageserver_conf_toml.parse()?; - let conf = PageServerConf::parse_and_validate(&toml, &workdir)?; + let conf = PageServerConf::parse_and_validate(NodeId(333), &toml, &workdir)?; assert_eq!(conf.pg_distrib_dir, pg_distrib_dir); assert_eq!( @@ -1592,7 +1612,11 @@ threshold = "20m" .evictions_low_residence_duration_metric_threshold, Duration::from_secs(20 * 60) ); - assert_eq!(conf.id, NodeId(222)); + + // Assert that the node id provided by the identity file (threaded + // through the call to 
[`PageServerConf::parse_and_validate`]) is + // used. + assert_eq!(conf.id, NodeId(333)); assert_eq!( conf.disk_usage_based_eviction, Some(DiskUsageEvictionTaskConfig { @@ -1637,7 +1661,7 @@ threshold = "20m" "#, ); let toml: Document = pageserver_conf_toml.parse().unwrap(); - let conf = PageServerConf::parse_and_validate(&toml, &workdir).unwrap(); + let conf = PageServerConf::parse_and_validate(NodeId(222), &toml, &workdir).unwrap(); match &conf.default_tenant_conf.eviction_policy { EvictionPolicy::OnlyImitiate(t) => { @@ -1656,7 +1680,7 @@ threshold = "20m" remote_storage = {} "#; let doc = toml_edit::Document::from_str(input).unwrap(); - let err = PageServerConf::parse_and_validate(&doc, &workdir) + let err = PageServerConf::parse_and_validate(NodeId(222), &doc, &workdir) .expect_err("empty remote_storage field should fail, don't specify it if you want no remote_storage"); assert!(format!("{err}").contains("remote_storage"), "{err}"); } diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index caeae7fd15..28dbf40bed 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -1,8 +1,5 @@ -import subprocess -from pathlib import Path from typing import Optional -import toml from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.neon_fixtures import ( DEFAULT_BRANCH_NAME, @@ -13,67 +10,6 @@ from fixtures.pageserver.http import PageserverHttpClient from fixtures.utils import wait_until -def test_pageserver_init_node_id(neon_simple_env: NeonEnv, neon_binpath: Path): - """ - NB: The neon_local doesn't use `--init` mode anymore, but our production - deployment still does => https://github.com/neondatabase/aws/pull/1322 - """ - workdir = neon_simple_env.pageserver.workdir - pageserver_config = workdir / "pageserver.toml" - pageserver_bin = neon_binpath / "pageserver" - - def run_pageserver(args): - return subprocess.run( - [str(pageserver_bin), "-D", 
str(workdir), *args], - check=False, - universal_newlines=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - - neon_simple_env.pageserver.stop() - - with open(neon_simple_env.pageserver.config_toml_path, "r") as f: - ps_config = toml.load(f) - - required_config_keys = [ - "pg_distrib_dir", - "listen_pg_addr", - "listen_http_addr", - "pg_auth_type", - "http_auth_type", - # TODO: only needed for NEON_PAGESERVER_PANIC_ON_UNSPECIFIED_COMPACTION_ALGORITHM in https://github.com/neondatabase/neon/pull/7748 - # "tenant_config", - ] - required_config_overrides = [ - f"--config-override={toml.dumps({k: ps_config[k]})}" for k in required_config_keys - ] - - pageserver_config.unlink() - - bad_init = run_pageserver(["--init", *required_config_overrides]) - assert ( - bad_init.returncode == 1 - ), "pageserver should not be able to init new config without the node id" - assert 'missing config value "id"' in bad_init.stderr - assert not pageserver_config.exists(), "config file should not be created after init error" - - good_init_cmd = [ - "--init", - f"--config-override=id={ps_config['id']}", - *required_config_overrides, - ] - completed_init = run_pageserver(good_init_cmd) - assert ( - completed_init.returncode == 0 - ), "pageserver should be able to create a new config with the node id given" - assert pageserver_config.exists(), "config file should be created successfully" - - bad_reinit = run_pageserver(good_init_cmd) - assert bad_reinit.returncode == 1, "pageserver refuses to init if already exists" - assert "config file already exists" in bad_reinit.stderr - - def check_client(env: NeonEnv, client: PageserverHttpClient): pg_version = env.pg_version initial_tenant = env.initial_tenant