Merge pull request #8468 from neondatabase/rc/2024-07-23-manual

Storage release 2024-07-23

We did not deploy yesterday's
* https://github.com/neondatabase/neon/pull/8451
because of CICD troubles with pre-prod.

Also, it was missing

* https://github.com/neondatabase/neon/pull/7766

which is low-risk and unblocks more cleanup work that would otherwise have to wait until after next week's release.

So, this PR cherry-picks #7766 and creates a new storage release.

Compute will release separately later this week.

Back pointer to Slack thread: https://neondb.slack.com/archives/C03H1K0PGKH/p1721650191019099
This commit is contained in:
Christian Schwarz
2024-07-24 12:02:14 +02:00
committed by GitHub
9 changed files with 117 additions and 175 deletions

View File

@@ -93,13 +93,14 @@ COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/
# By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config.
# Now, when `docker run ... pageserver` is run, it can start without errors, yet will have some default dummy values.
RUN mkdir -p /data/.neon/ && chown -R neon:neon /data/.neon/ \
&& /usr/local/bin/pageserver -D /data/.neon/ --init \
-c "id=1234" \
-c "broker_endpoint='http://storage_broker:50051'" \
-c "pg_distrib_dir='/usr/local/'" \
-c "listen_pg_addr='0.0.0.0:6400'" \
-c "listen_http_addr='0.0.0.0:9898'"
RUN mkdir -p /data/.neon/ && \
echo "id=1234" > "/data/.neon/identity.toml" && \
echo "broker_endpoint='http://storage_broker:50051'\n" \
"pg_distrib_dir='/usr/local/'\n" \
"listen_pg_addr='0.0.0.0:6400'\n" \
"listen_http_addr='0.0.0.0:9898'\n" \
> /data/.neon/pageserver.toml && \
chown -R neon:neon /data/.neon
# When running a binary that links with libpq, default to using our most recent postgres version. Binaries
# that want a particular postgres version will select it explicitly: this is just a default.
@@ -110,3 +111,6 @@ VOLUME ["/data"]
USER neon
EXPOSE 6400
EXPOSE 9898
CMD /usr/local/bin/pageserver -D /data/.neon

View File

@@ -25,6 +25,7 @@ use pageserver_client::mgmt_api;
use postgres_backend::AuthType;
use postgres_connection::{parse_host_port, PgConnectionConfig};
use utils::auth::{Claims, Scope};
use utils::id::NodeId;
use utils::{
id::{TenantId, TimelineId},
lsn::Lsn,
@@ -74,6 +75,10 @@ impl PageServerNode {
}
}
fn pageserver_make_identity_toml(&self, node_id: NodeId) -> toml_edit::Document {
toml_edit::Document::from_str(&format!("id={node_id}")).unwrap()
}
fn pageserver_init_make_toml(
&self,
conf: NeonLocalInitPageserverConf,
@@ -186,6 +191,19 @@ impl PageServerNode {
.write_all(config.to_string().as_bytes())
.context("write pageserver toml")?;
drop(config_file);
let identity_file_path = datadir.join("identity.toml");
let mut identity_file = std::fs::OpenOptions::new()
.create_new(true)
.write(true)
.open(identity_file_path)
.with_context(|| format!("open identity toml for write: {config_file_path:?}"))?;
let identity_toml = self.pageserver_make_identity_toml(node_id);
identity_file
.write_all(identity_toml.to_string().as_bytes())
.context("write identity toml")?;
drop(identity_toml);
// TODO: invoke a TBD config-check command to validate that pageserver will start with the written config
// Write metadata file, used by pageserver on startup to register itself with

View File

@@ -33,7 +33,7 @@ echo $result | jq .
generate_id timeline_id
PARAMS=(
-sb
-sbf
-X POST
-H "Content-Type: application/json"
-d "{\"new_timeline_id\": \"${timeline_id}\", \"pg_version\": ${PG_VERSION}}"

View File

@@ -31,25 +31,14 @@ services:
restart: always
image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest}
environment:
- BROKER_ENDPOINT='http://storage_broker:50051'
- AWS_ACCESS_KEY_ID=minio
- AWS_SECRET_ACCESS_KEY=password
#- RUST_BACKTRACE=1
ports:
#- 6400:6400 # pg protocol handler
- 9898:9898 # http endpoints
entrypoint:
- "/bin/sh"
- "-c"
command:
- "/usr/local/bin/pageserver -D /data/.neon/
-c \"broker_endpoint=$$BROKER_ENDPOINT\"
-c \"listen_pg_addr='0.0.0.0:6400'\"
-c \"listen_http_addr='0.0.0.0:9898'\"
-c \"remote_storage={endpoint='http://minio:9000',
bucket_name='neon',
bucket_region='eu-north-1',
prefix_in_bucket='/pageserver/'}\""
volumes:
- ./pageserver_config:/data/.neon/
depends_on:
- storage_broker
- minio_create_buckets

View File

@@ -0,0 +1 @@
id=1234

View File

@@ -0,0 +1,5 @@
broker_endpoint='http://storage_broker:50051'
pg_distrib_dir='/usr/local/'
listen_pg_addr='0.0.0.0:6400'
listen_http_addr='0.0.0.0:9898'
remote_storage={ endpoint='http://minio:9000', bucket_name='neon', bucket_region='eu-north-1', prefix_in_bucket='/pageserver' }

View File

@@ -2,17 +2,18 @@
//! Main entry point for the Page Server executable.
use std::env;
use std::env::{var, VarError};
use std::io::Read;
use std::sync::Arc;
use std::time::Duration;
use std::{env, ops::ControlFlow, str::FromStr};
use anyhow::{anyhow, Context};
use camino::Utf8Path;
use clap::{Arg, ArgAction, Command};
use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp};
use pageserver::config::PageserverIdentity;
use pageserver::control_plane_client::ControlPlaneClient;
use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
@@ -25,7 +26,7 @@ use tracing::*;
use metrics::set_build_info_metric;
use pageserver::{
config::{defaults::*, PageServerConf},
config::PageServerConf,
context::{DownloadBehavior, RequestContext},
deletion_queue::DeletionQueue,
http, page_cache, page_service, task_mgr,
@@ -84,18 +85,13 @@ fn main() -> anyhow::Result<()> {
.with_context(|| format!("Error opening workdir '{workdir}'"))?;
let cfg_file_path = workdir.join("pageserver.toml");
let identity_file_path = workdir.join("identity.toml");
// Set CWD to workdir for non-daemon modes
env::set_current_dir(&workdir)
.with_context(|| format!("Failed to set application's current dir to '{workdir}'"))?;
let conf = match initialize_config(&cfg_file_path, arg_matches, &workdir)? {
ControlFlow::Continue(conf) => conf,
ControlFlow::Break(()) => {
info!("Pageserver config init successful");
return Ok(());
}
};
let conf = initialize_config(&identity_file_path, &cfg_file_path, &workdir)?;
// Initialize logging.
//
@@ -150,70 +146,55 @@ fn main() -> anyhow::Result<()> {
}
fn initialize_config(
identity_file_path: &Utf8Path,
cfg_file_path: &Utf8Path,
arg_matches: clap::ArgMatches,
workdir: &Utf8Path,
) -> anyhow::Result<ControlFlow<(), &'static PageServerConf>> {
let init = arg_matches.get_flag("init");
let file_contents: Option<toml_edit::Document> = match std::fs::File::open(cfg_file_path) {
) -> anyhow::Result<&'static PageServerConf> {
// The deployment orchestrator writes out an indentity file containing the node id
// for all pageservers. This file is the source of truth for the node id. In order
// to allow for rolling back pageserver releases, the node id is also included in
// the pageserver config that the deployment orchestrator writes to disk for the pageserver.
// A rolled back version of the pageserver will get the node id from the pageserver.toml
// config file.
let identity = match std::fs::File::open(identity_file_path) {
Ok(mut f) => {
if init {
anyhow::bail!("config file already exists: {cfg_file_path}");
let md = f.metadata().context("stat config file")?;
if !md.is_file() {
anyhow::bail!("Pageserver found identity file but it is a dir entry: {identity_file_path}. Aborting start up ...");
}
let mut s = String::new();
f.read_to_string(&mut s).context("read identity file")?;
toml_edit::de::from_str::<PageserverIdentity>(&s)?
}
Err(e) => {
anyhow::bail!("Pageserver could not read identity file: {identity_file_path}: {e}. Aborting start up ...");
}
};
let config: toml_edit::Document = match std::fs::File::open(cfg_file_path) {
Ok(mut f) => {
let md = f.metadata().context("stat config file")?;
if md.is_file() {
let mut s = String::new();
f.read_to_string(&mut s).context("read config file")?;
Some(s.parse().context("parse config file toml")?)
s.parse().context("parse config file toml")?
} else {
anyhow::bail!("directory entry exists but is not a file: {cfg_file_path}");
}
}
Err(e) if e.kind() == std::io::ErrorKind::NotFound => None,
Err(e) => {
anyhow::bail!("open pageserver config: {e}: {cfg_file_path}");
}
};
let mut effective_config = file_contents.unwrap_or_else(|| {
DEFAULT_CONFIG_FILE
.parse()
.expect("unit tests ensure this works")
});
// Patch with overrides from the command line
if let Some(values) = arg_matches.get_many::<String>("config-override") {
for option_line in values {
let doc = toml_edit::Document::from_str(option_line).with_context(|| {
format!("Option '{option_line}' could not be parsed as a toml document")
})?;
for (key, item) in doc.iter() {
effective_config.insert(key, item.clone());
}
}
}
debug!("Resulting toml: {effective_config}");
debug!("Using pageserver toml: {config}");
// Construct the runtime representation
let conf = PageServerConf::parse_and_validate(&effective_config, workdir)
let conf = PageServerConf::parse_and_validate(identity.id, &config, workdir)
.context("Failed to parse pageserver configuration")?;
if init {
info!("Writing pageserver config to '{cfg_file_path}'");
std::fs::write(cfg_file_path, effective_config.to_string())
.with_context(|| format!("Failed to write pageserver config to '{cfg_file_path}'"))?;
info!("Config successfully written to '{cfg_file_path}'")
}
Ok(if init {
ControlFlow::Break(())
} else {
ControlFlow::Continue(Box::leak(Box::new(conf)))
})
Ok(Box::leak(Box::new(conf)))
}
struct WaitForPhaseResult<F: std::future::Future + Unpin> {
@@ -734,28 +715,12 @@ fn cli() -> Command {
Command::new("Neon page server")
.about("Materializes WAL stream to pages and serves them to the postgres")
.version(version())
.arg(
Arg::new("init")
.long("init")
.action(ArgAction::SetTrue)
.help("Initialize pageserver with all given config overrides"),
)
.arg(
Arg::new("workdir")
.short('D')
.long("workdir")
.help("Working directory for the pageserver"),
)
// See `settings.md` for more details on the extra configuration patameters pageserver can process
.arg(
Arg::new("config-override")
.long("config-override")
.short('c')
.num_args(1)
.action(ArgAction::Append)
.help("Additional configuration overrides of the ones from the toml config file (or new ones to add there). \
Any option has to be a valid toml document, example: `-c=\"foo='hey'\"` `-c=\"foo={value=1}\"`"),
)
.arg(
Arg::new("enabled-features")
.long("enabled-features")

View File

@@ -7,8 +7,8 @@
use anyhow::{anyhow, bail, ensure, Context, Result};
use pageserver_api::{models::ImageCompressionAlgorithm, shard::TenantShardId};
use remote_storage::{RemotePath, RemoteStorageConfig};
use serde;
use serde::de::IntoDeserializer;
use serde::{self, Deserialize};
use std::env;
use storage_broker::Uri;
use utils::crashsafe::path_with_suffix_extension;
@@ -406,6 +406,13 @@ struct PageServerConfigBuilder {
}
impl PageServerConfigBuilder {
fn new(node_id: NodeId) -> Self {
let mut this = Self::default();
this.id(node_id);
this
}
#[inline(always)]
fn default_values() -> Self {
use self::BuilderValue::*;
@@ -881,8 +888,12 @@ impl PageServerConf {
/// validating the input and failing on errors.
///
/// This leaves any options not present in the file in the built-in defaults.
pub fn parse_and_validate(toml: &Document, workdir: &Utf8Path) -> anyhow::Result<Self> {
let mut builder = PageServerConfigBuilder::default();
pub fn parse_and_validate(
node_id: NodeId,
toml: &Document,
workdir: &Utf8Path,
) -> anyhow::Result<Self> {
let mut builder = PageServerConfigBuilder::new(node_id);
builder.workdir(workdir.to_owned());
let mut t_conf = TenantConfOpt::default();
@@ -913,7 +924,8 @@ impl PageServerConf {
"tenant_config" => {
t_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("failed to parse: '{key}'"))?;
}
"id" => builder.id(NodeId(parse_toml_u64(key, item)?)),
"id" => {}, // Ignoring `id` field in pageserver.toml - using identity.toml as the source of truth
// Logging is not set up yet, so we can't do it.
"broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?),
"broker_keepalive_interval" => builder.broker_keepalive_interval(parse_toml_duration(key, item)?),
"log_format" => builder.log_format(
@@ -1090,6 +1102,12 @@ impl PageServerConf {
}
}
#[derive(Deserialize)]
#[serde(deny_unknown_fields)]
pub struct PageserverIdentity {
pub id: NodeId,
}
// Helper functions to parse a toml Item
fn parse_toml_string(name: &str, item: &Item) -> Result<String> {
@@ -1259,7 +1277,7 @@ background_task_maximum_delay = '334 s'
);
let toml = config_string.parse()?;
let parsed_config = PageServerConf::parse_and_validate(&toml, &workdir)
let parsed_config = PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir)
.unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e:?}"));
assert_eq!(
@@ -1341,7 +1359,7 @@ background_task_maximum_delay = '334 s'
);
let toml = config_string.parse()?;
let parsed_config = PageServerConf::parse_and_validate(&toml, &workdir)
let parsed_config = PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir)
.unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e:?}"));
assert_eq!(
@@ -1431,12 +1449,13 @@ broker_endpoint = '{broker_endpoint}'
let toml = config_string.parse()?;
let parsed_remote_storage_config = PageServerConf::parse_and_validate(&toml, &workdir)
.unwrap_or_else(|e| {
panic!("Failed to parse config '{config_string}', reason: {e:?}")
})
.remote_storage_config
.expect("Should have remote storage config for the local FS");
let parsed_remote_storage_config =
PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir)
.unwrap_or_else(|e| {
panic!("Failed to parse config '{config_string}', reason: {e:?}")
})
.remote_storage_config
.expect("Should have remote storage config for the local FS");
assert_eq!(
parsed_remote_storage_config,
@@ -1492,12 +1511,13 @@ broker_endpoint = '{broker_endpoint}'
let toml = config_string.parse()?;
let parsed_remote_storage_config = PageServerConf::parse_and_validate(&toml, &workdir)
.unwrap_or_else(|e| {
panic!("Failed to parse config '{config_string}', reason: {e:?}")
})
.remote_storage_config
.expect("Should have remote storage config for S3");
let parsed_remote_storage_config =
PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir)
.unwrap_or_else(|e| {
panic!("Failed to parse config '{config_string}', reason: {e:?}")
})
.remote_storage_config
.expect("Should have remote storage config for S3");
assert_eq!(
parsed_remote_storage_config,
@@ -1576,7 +1596,7 @@ threshold = "20m"
"#,
);
let toml: Document = pageserver_conf_toml.parse()?;
let conf = PageServerConf::parse_and_validate(&toml, &workdir)?;
let conf = PageServerConf::parse_and_validate(NodeId(333), &toml, &workdir)?;
assert_eq!(conf.pg_distrib_dir, pg_distrib_dir);
assert_eq!(
@@ -1592,7 +1612,11 @@ threshold = "20m"
.evictions_low_residence_duration_metric_threshold,
Duration::from_secs(20 * 60)
);
assert_eq!(conf.id, NodeId(222));
// Assert that the node id provided by the indentity file (threaded
// through the call to [`PageServerConf::parse_and_validate`] is
// used.
assert_eq!(conf.id, NodeId(333));
assert_eq!(
conf.disk_usage_based_eviction,
Some(DiskUsageEvictionTaskConfig {
@@ -1637,7 +1661,7 @@ threshold = "20m"
"#,
);
let toml: Document = pageserver_conf_toml.parse().unwrap();
let conf = PageServerConf::parse_and_validate(&toml, &workdir).unwrap();
let conf = PageServerConf::parse_and_validate(NodeId(222), &toml, &workdir).unwrap();
match &conf.default_tenant_conf.eviction_policy {
EvictionPolicy::OnlyImitiate(t) => {
@@ -1656,7 +1680,7 @@ threshold = "20m"
remote_storage = {}
"#;
let doc = toml_edit::Document::from_str(input).unwrap();
let err = PageServerConf::parse_and_validate(&doc, &workdir)
let err = PageServerConf::parse_and_validate(NodeId(222), &doc, &workdir)
.expect_err("empty remote_storage field should fail, don't specify it if you want no remote_storage");
assert!(format!("{err}").contains("remote_storage"), "{err}");
}

View File

@@ -1,8 +1,5 @@
import subprocess
from pathlib import Path
from typing import Optional
import toml
from fixtures.common_types import Lsn, TenantId, TimelineId
from fixtures.neon_fixtures import (
DEFAULT_BRANCH_NAME,
@@ -13,67 +10,6 @@ from fixtures.pageserver.http import PageserverHttpClient
from fixtures.utils import wait_until
def test_pageserver_init_node_id(neon_simple_env: NeonEnv, neon_binpath: Path):
"""
NB: The neon_local doesn't use `--init` mode anymore, but our production
deployment still does => https://github.com/neondatabase/aws/pull/1322
"""
workdir = neon_simple_env.pageserver.workdir
pageserver_config = workdir / "pageserver.toml"
pageserver_bin = neon_binpath / "pageserver"
def run_pageserver(args):
return subprocess.run(
[str(pageserver_bin), "-D", str(workdir), *args],
check=False,
universal_newlines=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
neon_simple_env.pageserver.stop()
with open(neon_simple_env.pageserver.config_toml_path, "r") as f:
ps_config = toml.load(f)
required_config_keys = [
"pg_distrib_dir",
"listen_pg_addr",
"listen_http_addr",
"pg_auth_type",
"http_auth_type",
# TODO: only needed for NEON_PAGESERVER_PANIC_ON_UNSPECIFIED_COMPACTION_ALGORITHM in https://github.com/neondatabase/neon/pull/7748
# "tenant_config",
]
required_config_overrides = [
f"--config-override={toml.dumps({k: ps_config[k]})}" for k in required_config_keys
]
pageserver_config.unlink()
bad_init = run_pageserver(["--init", *required_config_overrides])
assert (
bad_init.returncode == 1
), "pageserver should not be able to init new config without the node id"
assert 'missing config value "id"' in bad_init.stderr
assert not pageserver_config.exists(), "config file should not be created after init error"
good_init_cmd = [
"--init",
f"--config-override=id={ps_config['id']}",
*required_config_overrides,
]
completed_init = run_pageserver(good_init_cmd)
assert (
completed_init.returncode == 0
), "pageserver should be able to create a new config with the node id given"
assert pageserver_config.exists(), "config file should be created successfully"
bad_reinit = run_pageserver(good_init_cmd)
assert bad_reinit.returncode == 1, "pageserver refuses to init if already exists"
assert "config file already exists" in bad_reinit.stderr
def check_client(env: NeonEnv, client: PageserverHttpClient):
pg_version = env.pg_version
initial_tenant = env.initial_tenant