From 02c1c351dc4fe7ffb99d9b3e69e10c837f6548f3 Mon Sep 17 00:00:00 2001
From: Kirill Bulatov
Date: Tue, 13 Dec 2022 15:42:59 +0200
Subject: [PATCH] Create initial timeline without remote storage (#3077)

Removes the race during pageserver initial timeline creation that led to
partial layer uploads. This race is only reproducible in test code, since we
do not create initial timelines in the cloud (yet, at least), but it is still
nice to remove the non-deterministic behavior.
---
 control_plane/src/background_process.rs |  12 +-
 control_plane/src/bin/neon_local.rs     |   2 +-
 control_plane/src/pageserver.rs         | 147 +++++++++++++++++-------
 libs/remote_storage/src/lib.rs          |   9 +-
 pageserver/src/config.rs                |   2 +-
 safekeeper/src/bin/safekeeper.rs        |   9 +-
 6 files changed, 126 insertions(+), 55 deletions(-)

diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs
index 1a5ac1e2fe..8909e27c94 100644
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -51,21 +51,21 @@ pub enum InitialPidFile<'t> {
 }
 
 /// Start a background child process using the parameters given.
-pub fn start_process<
-    F,
-    S: AsRef<OsStr>,
-    EI: IntoIterator<Item = (String, String)>, // Not generic AsRef, otherwise empty `envs` prevents type inference
->(
+pub fn start_process<F, AI, A, EI>(
     process_name: &str,
     datadir: &Path,
     command: &Path,
-    args: &[S],
+    args: AI,
     envs: EI,
     initial_pid_file: InitialPidFile,
     process_status_check: F,
 ) -> anyhow::Result<Child>
 where
     F: Fn() -> anyhow::Result<bool>,
+    AI: IntoIterator<Item = A>,
+    A: AsRef<OsStr>,
+    // Not generic AsRef, otherwise empty `envs` prevents type inference
+    EI: IntoIterator<Item = (String, String)>,
 {
     let log_path = datadir.join(format!("{process_name}.log"));
     let process_log_file = fs::OpenOptions::new()
diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index 6f059d535e..f0c3b983f0 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -341,7 +341,7 @@ fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> {
         .get_many::<String>("pageserver-config-override")
         .into_iter()
         .flatten()
-        .map(|s| s.as_str())
+        .map(String::as_str)
         .collect()
 }
 
diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index 51e540e39c..3575e75db9 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -1,9 +1,10 @@
+use std::borrow::Cow;
 use std::collections::HashMap;
 use std::fs::File;
 use std::io::{BufReader, Write};
 use std::num::NonZeroU64;
-use std::path::{Path, PathBuf};
-use std::process::Child;
+use std::path::PathBuf;
+use std::process::{Child, Command};
 use std::{io, result};
 
 use anyhow::{bail, ensure, Context};
@@ -129,6 +130,8 @@ impl PageServerNode {
         overrides
     }
 
+    /// Initializes a pageserver node by creating its config with the overrides provided,
+    /// and creating an initial tenant and timeline afterwards.
     pub fn initialize(
         &self,
         create_tenant: Option<TenantId>,
         initial_timeline_id: Option<TimelineId>,
         config_overrides: &[&str],
         pg_version: u32,
     ) -> anyhow::Result<TimelineId> {
+        // First, run `pageserver --init` and wait for it to write a config into FS and exit.
+        self.pageserver_init(config_overrides).with_context(|| {
+            format!(
+                "Failed to run init for pageserver node {}",
+                self.env.pageserver.id,
+            )
+        })?;
+
+        // Then, briefly start it fully to run HTTP commands on it,
+        // to create the initial tenant and timeline.
+        // We disable remote storage, since we stop the pageserver right after the timeline creation,
+        // hence most of the uploads will either be aborted or not started: there is no point in starting them at all.
+        let disabled_remote_storage_override = "remote_storage={}";
         let mut pageserver_process = self
-            .start_node(config_overrides, &self.env.base_data_dir, true)
+            .start_node(
+                &[disabled_remote_storage_override],
+                // Previous overrides will be taken from the config created before; don't overwrite them.
+                false,
+            )
             .with_context(|| {
                 format!(
-                    "Failed to start a process for pageserver {}",
+                    "Failed to start a process for pageserver node {}",
                     self.env.pageserver.id,
                 )
             })?;
@@ -201,55 +221,73 @@ impl PageServerNode {
     }
 
     pub fn start(&self, config_overrides: &[&str]) -> anyhow::Result<Child> {
-        self.start_node(config_overrides, &self.repo_path(), false)
+        self.start_node(config_overrides, false)
     }
 
-    fn start_node(
-        &self,
-        config_overrides: &[&str],
-        datadir: &Path,
-        update_config: bool,
-    ) -> anyhow::Result<Child> {
-        let mut overrides = self.neon_local_overrides();
-        overrides.extend(config_overrides.iter().map(|&c| c.to_owned()));
-
-        print!(
-            "Starting pageserver at '{}' in '{}'",
+    fn pageserver_init(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
+        let datadir = self.repo_path();
+        let node_id = self.env.pageserver.id;
+        println!(
+            "Initializing pageserver node {} at '{}' in {:?}",
+            node_id,
             self.pg_connection_config.raw_address(),
-            datadir.display()
+            datadir
         );
         io::stdout().flush()?;
 
-        let mut args = vec![
-            "-D",
-            datadir.to_str().with_context(|| {
-                format!("Datadir path {datadir:?} cannot be represented as a unicode string")
-            })?,
-        ];
+        let datadir_path_str = datadir.to_str().with_context(|| {
+            format!("Cannot start pageserver node {node_id} in path that has no string representation: {datadir:?}")
+        })?;
+        let mut args = self.pageserver_basic_args(config_overrides, datadir_path_str);
+        args.push(Cow::Borrowed("--init"));
+        let init_output = Command::new(&self.env.pageserver_bin())
+            .args(args.iter().map(Cow::as_ref))
+            .envs(self.pageserver_env_variables()?)
+            .output()
+            .with_context(|| format!("Failed to run pageserver init for node {node_id}"))?;
+
+        anyhow::ensure!(
+            init_output.status.success(),
+            "Pageserver init for node {} did not finish successfully, stdout: {}, stderr: {}",
+            node_id,
+            String::from_utf8_lossy(&init_output.stdout),
+            String::from_utf8_lossy(&init_output.stderr),
+        );
+
+        Ok(())
+    }
+
+    fn start_node(&self, config_overrides: &[&str], update_config: bool) -> anyhow::Result<Child> {
+        let mut overrides = self.neon_local_overrides();
+        overrides.extend(config_overrides.iter().map(|&c| c.to_owned()));
+
+        let datadir = self.repo_path();
+        print!(
+            "Starting pageserver node {} at '{}' in {:?}",
+            self.env.pageserver.id,
+            self.pg_connection_config.raw_address(),
+            datadir
+        );
+        io::stdout().flush()?;
+
+        let datadir_path_str = datadir.to_str().with_context(|| {
+            format!(
+                "Cannot start pageserver node {} in path that has no string representation: {:?}",
+                self.env.pageserver.id, datadir,
+            )
+        })?;
+        let mut args = self.pageserver_basic_args(config_overrides, datadir_path_str);
 
         if update_config {
-            args.push("--update-config");
+            args.push(Cow::Borrowed("--update-config"));
         }
-        for config_override in &overrides {
-            args.extend(["-c", config_override]);
-        }
-
-        let envs = if self.env.pageserver.auth_type != AuthType::Trust {
-            // Generate a token to connect from the pageserver to a safekeeper
-            let token = self
-                .env
-                .generate_auth_token(&Claims::new(None, Scope::SafekeeperData))?;
-            vec![("ZENITH_AUTH_TOKEN".to_owned(), token)]
-        } else {
-            vec![]
-        };
 
         background_process::start_process(
             "pageserver",
-            datadir,
+            &datadir,
             &self.env.pageserver_bin(),
-            &args,
-            envs,
+            args.iter().map(Cow::as_ref),
+            self.pageserver_env_variables()?,
             background_process::InitialPidFile::Expect(&self.pid_file()),
             || match self.check_status() {
                 Ok(()) => Ok(true),
@@ -259,6 +297,35 @@
         )
     }
 
+    fn pageserver_basic_args<'a>(
+        &self,
+        config_overrides: &'a [&'a str],
+        datadir_path_str: &'a str,
+    ) -> Vec<Cow<'a, str>> {
+        let mut args = vec![Cow::Borrowed("-D"), Cow::Borrowed(datadir_path_str)];
+
+        let mut overrides = self.neon_local_overrides();
+        overrides.extend(config_overrides.iter().map(|&c| c.to_owned()));
+        for config_override in overrides {
+            args.push(Cow::Borrowed("-c"));
+            args.push(Cow::Owned(config_override));
+        }
+
+        args
+    }
+
+    fn pageserver_env_variables(&self) -> anyhow::Result<Vec<(String, String)>> {
+        Ok(if self.env.pageserver.auth_type != AuthType::Trust {
+            // Generate a token to connect from the pageserver to a safekeeper
+            let token = self
+                .env
+                .generate_auth_token(&Claims::new(None, Scope::SafekeeperData))?;
+            vec![("ZENITH_AUTH_TOKEN".to_owned(), token)]
+        } else {
+            Vec::new()
+        })
+    }
+
     ///
     /// Stop the server.
     ///
diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs
index 3bbffd6941..28858fcbab 100644
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -272,7 +272,7 @@ impl Debug for S3Config {
 }
 
 impl RemoteStorageConfig {
-    pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result<Self> {
+    pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result<Option<Self>> {
         let local_path = toml.get("local_path");
         let bucket_name = toml.get("bucket_name");
         let bucket_region = toml.get("bucket_region");
@@ -296,7 +296,8 @@ impl RemoteStorageConfig {
             .context("Failed to parse 'concurrency_limit' as a positive integer")?;
 
         let storage = match (local_path, bucket_name, bucket_region) {
-            (None, None, None) => bail!("no 'local_path' nor 'bucket_name' option"),
+            // neither 'local_path' nor 'bucket_name' option is provided, so consider remote storage disabled
+            (None, None, None) => return Ok(None),
             (_, Some(_), None) => {
                 bail!("'bucket_region' option is mandatory if 'bucket_name' is given ")
             }
@@ -322,11 +323,11 @@ impl RemoteStorageConfig {
             (Some(_), Some(_), _) => bail!("local_path and bucket_name are mutually exclusive"),
         };
 
-        Ok(RemoteStorageConfig {
+        Ok(Some(RemoteStorageConfig {
             max_concurrent_syncs,
             max_sync_errors,
             storage,
-        })
+        }))
     }
 }
 
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index c07907a1c9..48e9f32276 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -524,7 +524,7 @@ impl PageServerConf {
             )),
             "auth_type" => builder.auth_type(parse_toml_from_str(key, item)?),
             "remote_storage" => {
-                builder.remote_storage_config(Some(RemoteStorageConfig::from_toml(item)?))
+                builder.remote_storage_config(RemoteStorageConfig::from_toml(item)?)
             }
             "tenant_config" => {
                 t_conf = Self::parse_toml_tenant_conf(item)?;
diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs
index 92cd5db203..cab5053b5b 100644
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -318,12 +318,15 @@ fn set_id(workdir: &Path, given_id: Option<NodeId>) -> Result<NodeId> {
 }
 
 // Parse RemoteStorage from TOML table.
-fn parse_remote_storage(storage_conf: &str) -> Result<RemoteStorageConfig> {
+fn parse_remote_storage(storage_conf: &str) -> anyhow::Result<RemoteStorageConfig> {
     // funny toml doesn't consider plain inline table as valid document, so wrap in a key to parse
-    let storage_conf_toml = format!("remote_storage = {}", storage_conf);
+    let storage_conf_toml = format!("remote_storage = {storage_conf}");
     let parsed_toml = storage_conf_toml.parse::<toml_edit::Document>()?; // parse
     let (_, storage_conf_parsed_toml) = parsed_toml.iter().next().unwrap(); // and strip key off again
-    RemoteStorageConfig::from_toml(storage_conf_parsed_toml)
+    RemoteStorageConfig::from_toml(storage_conf_parsed_toml).and_then(|parsed_config| {
+        // XXX: Don't print the original toml here, as it might contain sensitive data
+        parsed_config.context("Remote storage toml parsed to no remote storage config")
+    })
 }
 
 #[test]
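
Not part of the patch above: a minimal, standalone sketch of the parsing contract the diff introduces around `RemoteStorageConfig::from_toml` — an empty `remote_storage={}` override (the one `neon_local` now passes while creating the initial tenant and timeline) parses to `Ok(None)`, i.e. "remote storage disabled", while a populated table still parses to a config and invalid combinations still fail. The `StorageKind` enum and `parse_storage` function are hypothetical stand-ins, not the real neon types, and the example assumes the `anyhow` crate plus the same pre-0.22 `toml_edit` API the patch itself uses (`Document`, `Item::get`, `Item::as_str`).

use anyhow::{bail, Context};

// Hypothetical, simplified stand-in for the real RemoteStorageConfig.
#[derive(Debug)]
enum StorageKind {
    LocalFs { local_path: String },
    S3 { bucket_name: String, bucket_region: String },
}

// Mirrors the patched contract: an empty `remote_storage = {}` table means
// "remote storage disabled" and yields Ok(None) instead of an error.
fn parse_storage(toml: &toml_edit::Item) -> anyhow::Result<Option<StorageKind>> {
    let get_str = |key: &str| {
        toml.get(key)
            .and_then(|item| item.as_str())
            .map(str::to_owned)
    };

    let storage = match (
        get_str("local_path"),
        get_str("bucket_name"),
        get_str("bucket_region"),
    ) {
        // No options at all: remote storage is considered disabled, not a config error.
        (None, None, None) => return Ok(None),
        (Some(local_path), None, None) => StorageKind::LocalFs { local_path },
        (None, Some(bucket_name), Some(bucket_region)) => StorageKind::S3 {
            bucket_name,
            bucket_region,
        },
        (None, Some(_), None) => bail!("'bucket_region' is mandatory if 'bucket_name' is given"),
        _ => bail!("unsupported combination of remote storage options"),
    };
    Ok(Some(storage))
}

fn main() -> anyhow::Result<()> {
    // Same wrapping trick as in safekeeper.rs: a bare inline table is not a valid
    // top-level TOML document, so wrap it in a key and strip the key off again.
    for conf in ["{}", "{local_path='/tmp/pageserver_remote_storage'}"] {
        let doc = format!("remote_storage = {conf}").parse::<toml_edit::Document>()?;
        let (_, item) = doc.iter().next().context("empty toml document")?;
        println!("{conf} -> {:?}", parse_storage(item)?);
    }
    Ok(())
}

Returning Ok(None) instead of bailing lets each caller decide what absence means: the pageserver config simply leaves remote storage disabled, while the safekeeper, which only parses this toml when the option was explicitly given, turns None back into an error with .context().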