//! Code to manage compute endpoints
//!
//! In the local test environment, the data for each endpoint is stored in
//!
//! ```text
//!   .neon/endpoints/<endpoint id>
//! ```
//!
//! Some basic information about the endpoint, like the tenant and timeline IDs,
//! is stored in the `endpoint.json` file. The `endpoint.json` file is created
//! when the endpoint is created, and doesn't change afterwards.
//!
//! The endpoint is managed by the `compute_ctl` binary. When an endpoint is
//! started, we launch `compute_ctl`. It synchronizes the safekeepers, downloads
//! the basebackup from the pageserver to initialize the data directory, and
//! finally launches the PostgreSQL process. It watches the PostgreSQL process
//! until it exits.
//!
//! When an endpoint is created, a `postgresql.conf` file is also created in
//! the endpoint's directory. The file can be modified before starting PostgreSQL.
//! However, the `postgresql.conf` file in the endpoint directory is not used directly
//! by PostgreSQL. It is passed to `compute_ctl`, and `compute_ctl` writes another
//! copy of it in the data directory.
//!
//! Directory contents:
//!
//! ```text
//! .neon/endpoints/main/
//!     compute.log          - log output of `compute_ctl` and `postgres`
//!     endpoint.json        - serialized `EndpointConf` struct
//!     postgresql.conf      - postgresql settings
//!     config.json          - passed to `compute_ctl`
//!     pgdata/
//!         postgresql.conf  - copy of postgresql.conf created by `compute_ctl`
//!         zenith.signal
//!         <other PostgreSQL files>
//! ```
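//!
//! For illustration, a pretty-printed `endpoint.json` (the serialized
//! `EndpointConf` struct below) looks roughly like this. All values here are
//! placeholders, not output captured from a real endpoint:
//!
//! ```text
//! {
//!   "endpoint_id": "main",
//!   "tenant_id": "<tenant id>",
//!   "timeline_id": "<timeline id>",
//!   "mode": "Primary",
//!   "pg_port": 55432,
//!   "external_http_port": 55433,
//!   "internal_http_port": 55434,
//!   ...
//! }
//! ```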

use std::collections::BTreeMap;
use std::fmt::Display;
use std::net::{IpAddr, Ipv4Addr, SocketAddr, TcpStream};
use std::path::PathBuf;
use std::process::Command;
use std::str::FromStr;
use std::sync::Arc;
use std::time::{Duration, Instant};

use anyhow::{Context, Result, anyhow, bail};
use base64::Engine;
use base64::prelude::BASE64_URL_SAFE_NO_PAD;
use compute_api::requests::{
    COMPUTE_AUDIENCE, ComputeClaims, ComputeClaimsScope, ConfigurationRequest,
};
use compute_api::responses::{
    ComputeConfig, ComputeCtlConfig, ComputeStatus, ComputeStatusResponse, TerminateResponse,
    TlsConfig,
};
use compute_api::spec::{
    Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PageserverProtocol,
    PgIdent, RemoteExtSpec, Role,
};
use jsonwebtoken::jwk::{
    AlgorithmParameters, CommonParameters, EllipticCurve, Jwk, JwkSet, KeyAlgorithm, KeyOperations,
    OctetKeyPairParameters, OctetKeyPairType, PublicKeyUse,
};
use nix::sys::signal::{Signal, kill};
use pageserver_api::shard::ShardStripeSize;
use pem::Pem;
use reqwest::header::CONTENT_TYPE;
use safekeeper_api::PgMajorVersion;
use safekeeper_api::membership::SafekeeperGeneration;
use serde::{Deserialize, Serialize};
use sha2::{Digest, Sha256};
use spki::der::Decode;
use spki::{SubjectPublicKeyInfo, SubjectPublicKeyInfoRef};
use tracing::debug;
use url::Host;
use utils::id::{NodeId, TenantId, TimelineId};

use crate::local_env::LocalEnv;
use crate::postgresql_conf::PostgresConf;

// contents of an endpoint.json file
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
pub struct EndpointConf {
    endpoint_id: String,
    tenant_id: TenantId,
    timeline_id: TimelineId,
    mode: ComputeMode,
    pg_port: u16,
    external_http_port: u16,
    internal_http_port: u16,
    pg_version: PgMajorVersion,
    grpc: bool,
    skip_pg_catalog_updates: bool,
    reconfigure_concurrency: usize,
    drop_subscriptions_before_start: bool,
    features: Vec<ComputeFeature>,
    cluster: Option<Cluster>,
    compute_ctl_config: ComputeCtlConfig,
}

//
// ComputeControlPlane
//
pub struct ComputeControlPlane {
    base_port: u16,

    // endpoint ID is the key
    pub endpoints: BTreeMap<String, Arc<Endpoint>>,

    env: LocalEnv,
}

impl ComputeControlPlane {
    // Load current endpoints from the endpoints/ subdirectories
    pub fn load(env: LocalEnv) -> Result<ComputeControlPlane> {
        let mut endpoints = BTreeMap::default();
        for endpoint_dir in std::fs::read_dir(env.endpoints_path())
            .with_context(|| format!("failed to list {}", env.endpoints_path().display()))?
        {
            let ep_res = Endpoint::from_dir_entry(endpoint_dir?, &env);
            let ep = match ep_res {
                Ok(ep) => ep,
                Err(e) => match e.downcast::<std::io::Error>() {
                    Ok(e) => {
                        // A parallel task could delete an endpoint while we have just scanned the directory
                        if e.kind() == std::io::ErrorKind::NotFound {
                            continue;
                        } else {
                            Err(e)?
                        }
                    }
                    Err(e) => Err(e)?,
                },
            };
            endpoints.insert(ep.endpoint_id.clone(), Arc::new(ep));
        }

        Ok(ComputeControlPlane {
            base_port: 55431,
            endpoints,
            env,
        })
    }
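
    /// Pick a port for a new endpoint: one above the highest port already
    /// claimed by any existing endpoint (considering both the Postgres and
    /// external HTTP ports), or one above `base_port` if there are none.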
    fn get_port(&mut self) -> u16 {
        1 + self
            .endpoints
            .values()
            .map(|ep| std::cmp::max(ep.pg_address.port(), ep.external_http_address.port()))
            .max()
            .unwrap_or(self.base_port)
    }

    /// Create a JSON Web Key Set. This ideally matches the way we create a JWKS
    /// from the production control plane.
    fn create_jwks_from_pem(pem: &Pem) -> Result<JwkSet> {
        let spki: SubjectPublicKeyInfoRef = SubjectPublicKeyInfo::from_der(pem.contents())?;
        let public_key = spki.subject_public_key.raw_bytes();

        let mut hasher = Sha256::new();
        hasher.update(public_key);
        let key_hash = hasher.finalize();

        Ok(JwkSet {
            keys: vec![Jwk {
                common: CommonParameters {
                    public_key_use: Some(PublicKeyUse::Signature),
                    key_operations: Some(vec![KeyOperations::Verify]),
                    key_algorithm: Some(KeyAlgorithm::EdDSA),
                    key_id: Some(BASE64_URL_SAFE_NO_PAD.encode(key_hash)),
                    x509_url: None::<String>,
                    x509_chain: None::<Vec<String>>,
                    x509_sha1_fingerprint: None::<String>,
                    x509_sha256_fingerprint: None::<String>,
                },
                algorithm: AlgorithmParameters::OctetKeyPair(OctetKeyPairParameters {
                    key_type: OctetKeyPairType::OctetKeyPair,
                    curve: EllipticCurve::Ed25519,
                    x: BASE64_URL_SAFE_NO_PAD.encode(public_key),
                }),
            }],
        })
    }
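
    /// Create a new endpoint and register it in `self.endpoints`. Ports not
    /// given explicitly are allocated via `get_port`. This writes the
    /// `endpoint.json` and `postgresql.conf` files into the new endpoint
    /// directory; it does not start the endpoint.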
    #[allow(clippy::too_many_arguments)]
    pub fn new_endpoint(
        &mut self,
        endpoint_id: &str,
        tenant_id: TenantId,
        timeline_id: TimelineId,
        pg_port: Option<u16>,
        external_http_port: Option<u16>,
        internal_http_port: Option<u16>,
        pg_version: PgMajorVersion,
        mode: ComputeMode,
        grpc: bool,
        skip_pg_catalog_updates: bool,
        drop_subscriptions_before_start: bool,
    ) -> Result<Arc<Endpoint>> {
        let pg_port = pg_port.unwrap_or_else(|| self.get_port());
        let external_http_port = external_http_port.unwrap_or_else(|| self.get_port() + 1);
        let internal_http_port = internal_http_port.unwrap_or_else(|| external_http_port + 1);
        let compute_ctl_config = ComputeCtlConfig {
            jwks: Self::create_jwks_from_pem(&self.env.read_public_key()?)?,
            tls: None::<TlsConfig>,
        };
        let ep = Arc::new(Endpoint {
            endpoint_id: endpoint_id.to_owned(),
            pg_address: SocketAddr::new(IpAddr::from(Ipv4Addr::LOCALHOST), pg_port),
            external_http_address: SocketAddr::new(
                IpAddr::from(Ipv4Addr::UNSPECIFIED),
                external_http_port,
            ),
            internal_http_address: SocketAddr::new(
                IpAddr::from(Ipv4Addr::LOCALHOST),
                internal_http_port,
            ),
            env: self.env.clone(),
            timeline_id,
            mode,
            tenant_id,
            pg_version,
            // We don't set up roles and databases in the spec locally, so we don't need to
            // do catalog updates. Catalog updates also include availability-check data
            // creation. Yet, we have tests that check that the size and db dump
            // before and after start are the same. So, skip catalog updates;
            // with this we basically test the case of waking up an idle compute, where
            // we also skip catalog updates in the cloud.
            skip_pg_catalog_updates,
            drop_subscriptions_before_start,
            grpc,
            reconfigure_concurrency: 1,
            features: vec![],
            cluster: None,
            compute_ctl_config: compute_ctl_config.clone(),
        });

        ep.create_endpoint_dir()?;
        std::fs::write(
            ep.endpoint_path().join("endpoint.json"),
            serde_json::to_string_pretty(&EndpointConf {
                endpoint_id: endpoint_id.to_string(),
                tenant_id,
                timeline_id,
                mode,
                external_http_port,
                internal_http_port,
                pg_port,
                pg_version,
                grpc,
                skip_pg_catalog_updates,
                drop_subscriptions_before_start,
                reconfigure_concurrency: 1,
                features: vec![],
                cluster: None,
                compute_ctl_config,
            })?,
        )?;
        std::fs::write(
            ep.endpoint_path().join("postgresql.conf"),
            ep.setup_pg_conf()?.to_string(),
        )?;

        self.endpoints
            .insert(ep.endpoint_id.clone(), Arc::clone(&ep));

        Ok(ep)
    }
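
    /// Check that starting an endpoint in `mode` would not conflict with an
    /// existing endpoint: only one non-stopped primary per (tenant, timeline)
    /// is allowed. Other modes never conflict.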
    pub fn check_conflicting_endpoints(
        &self,
        mode: ComputeMode,
        tenant_id: TenantId,
        timeline_id: TimelineId,
    ) -> Result<()> {
        if matches!(mode, ComputeMode::Primary) {
            // This check is not complete: two concurrent attempts at creating
            // a primary could both read the state before checking it here.
            // But it's better than nothing.
            let mut duplicates = self.endpoints.iter().filter(|(_k, v)| {
                v.tenant_id == tenant_id
                    && v.timeline_id == timeline_id
                    && v.mode == mode
                    && v.status() != EndpointStatus::Stopped
            });

            if let Some((key, _)) = duplicates.next() {
                bail!(
                    "attempting to create a duplicate primary endpoint on tenant {tenant_id}, timeline {timeline_id}: endpoint {key:?} exists already. please don't do this, it is not supported."
                );
            }
        }
        Ok(())
    }
}

///////////////////////////////////////////////////////////////////////////////

pub struct Endpoint {
    /// used as the directory name
    endpoint_id: String,
    pub tenant_id: TenantId,
    pub timeline_id: TimelineId,
    pub mode: ComputeMode,
    /// If true, the endpoint should use gRPC to communicate with Pageservers.
    pub grpc: bool,

    // port and address of the Postgres server and `compute_ctl`'s HTTP APIs
    pub pg_address: SocketAddr,
    pub external_http_address: SocketAddr,
    pub internal_http_address: SocketAddr,

    // postgres major version in the format: 14, 15, etc.
    pg_version: PgMajorVersion,

    // These are not part of the endpoint as such, but the environment
    // the endpoint runs in.
    pub env: LocalEnv,

    // Optimizations
    skip_pg_catalog_updates: bool,

    drop_subscriptions_before_start: bool,
    reconfigure_concurrency: usize,
    // Feature flags
    features: Vec<ComputeFeature>,
    // Cluster settings
    cluster: Option<Cluster>,

    /// The compute_ctl config for the endpoint's compute.
    compute_ctl_config: ComputeCtlConfig,
}

#[derive(PartialEq, Eq)]
pub enum EndpointStatus {
    Running,
    Stopped,
    Crashed,
    RunningNoPidfile,
}

impl Display for EndpointStatus {
    fn fmt(&self, writer: &mut std::fmt::Formatter) -> std::fmt::Result {
        writer.write_str(match self {
            Self::Running => "running",
            Self::Stopped => "stopped",
            Self::Crashed => "crashed",
            Self::RunningNoPidfile => "running, no pidfile",
        })
    }
}

#[derive(Default, Clone, Copy, clap::ValueEnum)]
pub enum EndpointTerminateMode {
    #[default]
    /// Use pg_ctl stop -m fast
    Fast,
    /// Use pg_ctl stop -m immediate
    Immediate,
    /// Use /terminate?mode=immediate
    ImmediateTerminate,
}

impl std::fmt::Display for EndpointTerminateMode {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(match &self {
            EndpointTerminateMode::Fast => "fast",
            EndpointTerminateMode::Immediate => "immediate",
            EndpointTerminateMode::ImmediateTerminate => "immediate-terminate",
        })
    }
}

pub struct EndpointStartArgs {
    pub auth_token: Option<String>,
    pub endpoint_storage_token: String,
    pub endpoint_storage_addr: String,
    pub safekeepers_generation: Option<SafekeeperGeneration>,
    pub safekeepers: Vec<NodeId>,
    pub pageservers: Vec<(PageserverProtocol, Host, u16)>,
    pub remote_ext_base_url: Option<String>,
    pub shard_stripe_size: usize,
    pub create_test_user: bool,
    pub start_timeout: Duration,
    pub autoprewarm: bool,
    pub offload_lfc_interval_seconds: Option<std::num::NonZeroU64>,
    pub dev: bool,
}

impl Endpoint {
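    /// Reconstruct an endpoint from an existing `endpoints/<endpoint_id>`
    /// directory on disk, reading its `endpoint.json` file.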
    fn from_dir_entry(entry: std::fs::DirEntry, env: &LocalEnv) -> Result<Endpoint> {
        if !entry.file_type()?.is_dir() {
            anyhow::bail!(
                "Endpoint::from_dir_entry failed: '{}' is not a directory",
                entry.path().display()
            );
        }

        // parse data directory name
        let fname = entry.file_name();
        let endpoint_id = fname.to_str().unwrap().to_string();

        // Read the endpoint.json file
        let conf: EndpointConf =
            serde_json::from_slice(&std::fs::read(entry.path().join("endpoint.json"))?)?;

        debug!("serialized endpoint conf: {:?}", conf);

        Ok(Endpoint {
            pg_address: SocketAddr::new(IpAddr::from(Ipv4Addr::LOCALHOST), conf.pg_port),
            external_http_address: SocketAddr::new(
                IpAddr::from(Ipv4Addr::UNSPECIFIED),
                conf.external_http_port,
            ),
            internal_http_address: SocketAddr::new(
                IpAddr::from(Ipv4Addr::LOCALHOST),
                conf.internal_http_port,
            ),
            endpoint_id,
            env: env.clone(),
            timeline_id: conf.timeline_id,
            mode: conf.mode,
            tenant_id: conf.tenant_id,
            pg_version: conf.pg_version,
            grpc: conf.grpc,
            skip_pg_catalog_updates: conf.skip_pg_catalog_updates,
            reconfigure_concurrency: conf.reconfigure_concurrency,
            drop_subscriptions_before_start: conf.drop_subscriptions_before_start,
            features: conf.features,
            cluster: conf.cluster,
            compute_ctl_config: conf.compute_ctl_config,
        })
    }

    fn create_endpoint_dir(&self) -> Result<()> {
        std::fs::create_dir_all(self.endpoint_path()).with_context(|| {
            format!(
                "could not create endpoint directory {}",
                self.endpoint_path().display()
            )
        })
    }

    // Generate postgresql.conf with default configuration
    fn setup_pg_conf(&self) -> Result<PostgresConf> {
        let mut conf = PostgresConf::new();
        conf.append("max_wal_senders", "10");
        conf.append("wal_log_hints", "off");
        conf.append("max_replication_slots", "10");
        conf.append("hot_standby", "on");
        // Set to 1MB to both exercise getPage requests/LFC, and still have enough room for
        // Postgres to operate. Anything smaller might not be enough for Postgres under load,
        // and can cause errors like 'no unpinned buffers available', see
        // <https://github.com/neondatabase/neon/issues/9956>
        conf.append("shared_buffers", "1MB");
        // Postgres defaults to effective_io_concurrency=1, which does not exercise the pageserver's
        // batching logic. Set this to 2 so that we exercise the code a bit without letting
        // individual tests do a lot of concurrent work on underpowered test machines
        conf.append("effective_io_concurrency", "2");
        conf.append("fsync", "off");
        conf.append("max_connections", "100");
        conf.append("wal_level", "logical");
        // wal_sender_timeout is the maximum time to wait for WAL replication.
        // It also defines how often the walreceiver will send a feedback message to the wal sender.
        conf.append("wal_sender_timeout", "5s");
        conf.append("listen_addresses", &self.pg_address.ip().to_string());
        conf.append("port", &self.pg_address.port().to_string());
        conf.append("wal_keep_size", "0");
        // walproposer panics when basebackup is invalid, it is pointless to restart in this case.
        conf.append("restart_after_crash", "off");

        // Load the 'neon' extension
        conf.append("shared_preload_libraries", "neon");

        conf.append_line("");
        // Replication-related configurations, such as WAL sending
        match &self.mode {
            ComputeMode::Primary => {
                // Configure backpressure
                // - Replication write lag depends on how fast the walreceiver can process incoming WAL.
                //   This lag determines latency of get_page_at_lsn. Speed of applying WAL is about 10MB/sec,
                //   so to avoid expiration of 1 minute timeout, this lag should not be larger than 600MB.
                //   Actually latency should be much smaller (better if < 1sec). But we assume that recently
                //   updated pages are not requested from the pageserver.
                // - Replication flush lag depends on speed of persisting data by checkpointer (creation of
                //   delta/image layers) and advancing disk_consistent_lsn. Safekeepers are able to
                //   remove/archive WAL only beyond disk_consistent_lsn. Too large a lag can cause long
                //   recovery time (in case of pageserver crash) and disk space overflow at safekeepers.
                // - Replication apply lag depends on speed of uploading changes to S3 by uploader thread.
                //   To be able to restore database in case of pageserver node crash, safekeeper should not
                //   remove WAL beyond this point. Too large a lag can cause space exhaustion in safekeepers
                //   (if they are not able to upload WAL to S3).
                conf.append("max_replication_write_lag", "15MB");
                conf.append("max_replication_flush_lag", "10GB");

                if !self.env.safekeepers.is_empty() {
                    // Configure Postgres to connect to the safekeepers
                    conf.append("synchronous_standby_names", "walproposer");

                    let safekeepers = self
                        .env
                        .safekeepers
                        .iter()
                        .map(|sk| format!("localhost:{}", sk.get_compute_port()))
                        .collect::<Vec<String>>()
                        .join(",");
                    conf.append("neon.safekeepers", &safekeepers);
                } else {
                    // We only use a setup without safekeepers for tests,
                    // and don't care about data durability on pageserver,
                    // so set a more relaxed synchronous_commit.
                    conf.append("synchronous_commit", "remote_write");

                    // Configure the node to stream WAL directly to the pageserver
                    // This isn't really a supported configuration, but can be useful for
                    // testing.
                    conf.append("synchronous_standby_names", "pageserver");
                }
            }
            ComputeMode::Static(lsn) => {
                conf.append("recovery_target_lsn", &lsn.to_string());
            }
            ComputeMode::Replica => {
                assert!(!self.env.safekeepers.is_empty());

                // TODO: use future host field from safekeeper spec
                // Pass the list of safekeepers to the replica so that it can connect to any of them,
                // whichever is available.
                let sk_ports = self
                    .env
                    .safekeepers
                    .iter()
                    .map(|x| x.get_compute_port().to_string())
                    .collect::<Vec<_>>()
                    .join(",");
                let sk_hosts = vec!["localhost"; self.env.safekeepers.len()].join(",");

                let connstr = format!(
                    "host={} port={} options='-c timeline_id={} tenant_id={}' application_name=replica replication=true",
                    sk_hosts,
                    sk_ports,
                    &self.timeline_id.to_string(),
                    &self.tenant_id.to_string(),
                );

                let slot_name = format!("repl_{}_", self.timeline_id);
                conf.append("primary_conninfo", connstr.as_str());
                conf.append("primary_slot_name", slot_name.as_str());
                conf.append("hot_standby", "on");
                // prefetching of blocks referenced in WAL doesn't make sense for us
                // Neon hot standby ignores pages that are not in the shared_buffers
                if self.pg_version >= PgMajorVersion::PG15 {
                    conf.append("recovery_prefetch", "off");
                }
            }
        }

        Ok(conf)
    }

    pub fn endpoint_path(&self) -> PathBuf {
        self.env.endpoints_path().join(&self.endpoint_id)
    }

    pub fn pgdata(&self) -> PathBuf {
        self.endpoint_path().join("pgdata")
    }
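
    /// Determine the endpoint's status from two observations: whether a
    /// `postmaster.pid` file exists in the data directory, and whether we can
    /// open a TCP connection to the Postgres port.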
    pub fn status(&self) -> EndpointStatus {
        let timeout = Duration::from_millis(300);
        let has_pidfile = self.pgdata().join("postmaster.pid").exists();
        let can_connect = TcpStream::connect_timeout(&self.pg_address, timeout).is_ok();

        match (has_pidfile, can_connect) {
            (true, true) => EndpointStatus::Running,
            (false, false) => EndpointStatus::Stopped,
            (true, false) => EndpointStatus::Crashed,
            (false, true) => EndpointStatus::RunningNoPidfile,
        }
    }
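
    /// Run `pg_ctl` against this endpoint's data directory with the given
    /// extra arguments, waiting (`-w`) for the command to complete.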
    fn pg_ctl(&self, args: &[&str], auth_token: &Option<String>) -> Result<()> {
        let pg_ctl_path = self.env.pg_bin_dir(self.pg_version)?.join("pg_ctl");
        let mut cmd = Command::new(&pg_ctl_path);
        cmd.args(
            [
                &[
                    "-D",
                    self.pgdata().to_str().unwrap(),
                    "-w", // wait till pg_ctl actually does what was asked
                ],
                args,
            ]
            .concat(),
        )
        .env_clear()
        .env(
            "LD_LIBRARY_PATH",
            self.env.pg_lib_dir(self.pg_version)?.to_str().unwrap(),
        )
        .env(
            "DYLD_LIBRARY_PATH",
            self.env.pg_lib_dir(self.pg_version)?.to_str().unwrap(),
        );

        // Pass authentication token used for the connections to pageserver and safekeepers
        if let Some(token) = auth_token {
            cmd.env("NEON_AUTH_TOKEN", token);
        }

        let pg_ctl = cmd
            .output()
            .context(format!("{} failed", pg_ctl_path.display()))?;
        if !pg_ctl.status.success() {
            anyhow::bail!(
                "pg_ctl failed, exit code: {}, stdout: {}, stderr: {}",
                pg_ctl.status,
                String::from_utf8_lossy(&pg_ctl.stdout),
                String::from_utf8_lossy(&pg_ctl.stderr),
            );
        }

        Ok(())
    }
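
    /// Wait for the `compute_ctl` process recorded in `compute_ctl.pid` to
    /// exit, optionally nudging it with SIGTERM first.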
    fn wait_for_compute_ctl_to_exit(&self, send_sigterm: bool) -> Result<()> {
        // TODO use background_process::stop_process instead: https://github.com/neondatabase/neon/pull/6482
        let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
        let pid: u32 = std::fs::read_to_string(pidfile_path)?.parse()?;
        let pid = nix::unistd::Pid::from_raw(pid as i32);
        if send_sigterm {
            kill(pid, Signal::SIGTERM).ok();
        }
        crate::background_process::wait_until_stopped("compute_ctl", pid)?;
        Ok(())
    }

    fn read_postgresql_conf(&self) -> Result<String> {
        // Slurp the endpoints/<endpoint id>/postgresql.conf file into
        // memory. We will include it in the spec file that we pass to
        // `compute_ctl`, and `compute_ctl` will write it to the postgresql.conf
        // in the data directory.
        let postgresql_conf_path = self.endpoint_path().join("postgresql.conf");
        match std::fs::read(&postgresql_conf_path) {
            Ok(content) => Ok(String::from_utf8(content)?),
            Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok("".to_string()),
            Err(e) => Err(anyhow::Error::new(e).context(format!(
                "failed to read config file in {}",
                postgresql_conf_path.to_str().unwrap()
            ))),
        }
    }
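
    /// Join the pageservers into the comma-separated connection string list
    /// used in the compute spec; each entry has the form
    /// `{scheme}://no_user@{host}:{port}`.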
    fn build_pageserver_connstr(pageservers: &[(PageserverProtocol, Host, u16)]) -> String {
        pageservers
            .iter()
            .map(|(scheme, host, port)| format!("{scheme}://no_user@{host}:{port}"))
            .collect::<Vec<_>>()
            .join(",")
    }

    /// Map safekeeper IDs to the actual connection strings.
    fn build_safekeepers_connstrs(&self, sk_ids: Vec<NodeId>) -> Result<Vec<String>> {
        let mut safekeeper_connstrings = Vec::new();
        if self.mode == ComputeMode::Primary {
            for sk_id in sk_ids {
                let sk = self
                    .env
                    .safekeepers
                    .iter()
                    .find(|node| node.id == sk_id)
                    .ok_or_else(|| anyhow!("safekeeper {sk_id} does not exist"))?;
                safekeeper_connstrings.push(format!("127.0.0.1:{}", sk.get_compute_port()));
            }
        }
        Ok(safekeeper_connstrings)
    }

    /// Generate a JWT with the correct claims.
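    ///
    /// Admin-scoped tokens carry the compute audience and no `compute_id`
    /// claim; tokens of any other scope are bound to this endpoint's
    /// `compute_id`.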
    pub fn generate_jwt(&self, scope: Option<ComputeClaimsScope>) -> Result<String> {
        self.env.generate_auth_token(&ComputeClaims {
            audience: match scope {
                Some(ComputeClaimsScope::Admin) => Some(vec![COMPUTE_AUDIENCE.to_owned()]),
                _ => None,
            },
            compute_id: match scope {
                Some(ComputeClaimsScope::Admin) => None,
                _ => Some(self.endpoint_id.clone()),
            },
            scope,
        })
    }
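
    /// Start the endpoint: write a fresh `config.json` from the endpoint's
    /// state and the given args, launch `compute_ctl`, and poll its /status
    /// API until the compute reports Running (or fail after
    /// `args.start_timeout`). Any stale `pgdata` from a previous run is
    /// removed first.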
    pub async fn start(&self, args: EndpointStartArgs) -> Result<()> {
        if self.status() == EndpointStatus::Running {
            anyhow::bail!("The endpoint is already running");
        }

        let postgresql_conf = self.read_postgresql_conf()?;

        // We always start the compute node from scratch, so if the Postgres
        // data dir exists from a previous launch, remove it first.
        if self.pgdata().exists() {
            std::fs::remove_dir_all(self.pgdata())?;
        }

        let pageserver_connstring = Self::build_pageserver_connstr(&args.pageservers);
        assert!(!pageserver_connstring.is_empty());

        let safekeeper_connstrings = self.build_safekeepers_connstrs(args.safekeepers)?;

        // Check for the file remote_extensions_spec.json;
        // if it is present, read it and pass it to compute_ctl
        let remote_extensions_spec_path = self.endpoint_path().join("remote_extensions_spec.json");
        let remote_extensions_spec = std::fs::File::open(remote_extensions_spec_path);
        let remote_extensions: Option<RemoteExtSpec>;

        if let Ok(spec_file) = remote_extensions_spec {
            remote_extensions = serde_json::from_reader(spec_file).ok();
        } else {
            remote_extensions = None;
        };

        // Create config file
        let config = {
            let mut spec = ComputeSpec {
                skip_pg_catalog_updates: self.skip_pg_catalog_updates,
                format_version: 1.0,
                operation_uuid: None,
                features: self.features.clone(),
                swap_size_bytes: None,
                disk_quota_bytes: None,
                disable_lfc_resizing: None,
                cluster: Cluster {
                    cluster_id: None, // project ID: not used
                    name: None,       // project name: not used
                    state: None,
                    roles: if args.create_test_user {
                        vec![Role {
                            name: PgIdent::from_str("test").unwrap(),
                            encrypted_password: None,
                            options: None,
                        }]
                    } else {
                        Vec::new()
                    },
                    databases: if args.create_test_user {
                        vec![Database {
                            name: PgIdent::from_str("neondb").unwrap(),
                            owner: PgIdent::from_str("test").unwrap(),
                            options: None,
                            restrict_conn: false,
                            invalid: false,
                        }]
                    } else {
                        Vec::new()
                    },
                    settings: None,
                    postgresql_conf: Some(postgresql_conf.clone()),
                },
                delta_operations: None,
                tenant_id: Some(self.tenant_id),
                timeline_id: Some(self.timeline_id),
                project_id: None,
                branch_id: None,
                endpoint_id: Some(self.endpoint_id.clone()),
                mode: self.mode,
                pageserver_connstring: Some(pageserver_connstring),
                safekeepers_generation: args.safekeepers_generation.map(|g| g.into_inner()),
                safekeeper_connstrings,
                storage_auth_token: args.auth_token.clone(),
                remote_extensions,
                pgbouncer_settings: None,
                shard_stripe_size: Some(args.shard_stripe_size),
                local_proxy_config: None,
                reconfigure_concurrency: self.reconfigure_concurrency,
                drop_subscriptions_before_start: self.drop_subscriptions_before_start,
                audit_log_level: ComputeAudit::Disabled,
                logs_export_host: None::<String>,
                endpoint_storage_addr: Some(args.endpoint_storage_addr),
                endpoint_storage_token: Some(args.endpoint_storage_token),
                autoprewarm: args.autoprewarm,
                offload_lfc_interval_seconds: args.offload_lfc_interval_seconds,
                suspend_timeout_seconds: -1, // Only used in neon_local.
            };

            // This strange code is needed to support respec() in tests
            if self.cluster.is_some() {
                debug!("Cluster is already set in the endpoint spec, using it");
                spec.cluster = self.cluster.clone().unwrap();

                debug!("spec.cluster {:?}", spec.cluster);

                // fill missing fields again
                if args.create_test_user {
                    spec.cluster.roles.push(Role {
                        name: PgIdent::from_str("test").unwrap(),
                        encrypted_password: None,
                        options: None,
                    });
                    spec.cluster.databases.push(Database {
                        name: PgIdent::from_str("neondb").unwrap(),
                        owner: PgIdent::from_str("test").unwrap(),
                        options: None,
                        restrict_conn: false,
                        invalid: false,
                    });
                }
                spec.cluster.postgresql_conf = Some(postgresql_conf);
            }

            ComputeConfig {
                spec: Some(spec),
                compute_ctl_config: self.compute_ctl_config.clone(),
            }
        };

        let config_path = self.endpoint_path().join("config.json");
        std::fs::write(config_path, serde_json::to_string_pretty(&config)?)?;

        // Open log file. We'll redirect the stdout and stderr of `compute_ctl` to it.
        let logfile = std::fs::OpenOptions::new()
            .create(true)
            .append(true)
            .open(self.endpoint_path().join("compute.log"))?;

        // Launch compute_ctl
        let conn_str = self.connstr("cloud_admin", "postgres");
        println!("Starting postgres node at '{conn_str}'");
        if args.create_test_user {
            let conn_str = self.connstr("test", "neondb");
            println!("Also at '{conn_str}'");
        }
        let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl"));
        cmd.args([
            "--external-http-port",
            &self.external_http_address.port().to_string(),
        ])
        .args([
            "--internal-http-port",
            &self.internal_http_address.port().to_string(),
        ])
        .args(["--pgdata", self.pgdata().to_str().unwrap()])
        .args(["--connstr", &conn_str])
        .arg("--config")
        .arg(self.endpoint_path().join("config.json").as_os_str())
        .args([
            "--pgbin",
            self.env
                .pg_bin_dir(self.pg_version)?
                .join("postgres")
                .to_str()
                .unwrap(),
        ])
        // TODO: It would be nice if we generated compute IDs with the same
        // algorithm as the real control plane.
        .args(["--compute-id", &self.endpoint_id])
        .stdin(std::process::Stdio::null())
        .stderr(logfile.try_clone()?)
        .stdout(logfile);

        if let Some(remote_ext_base_url) = args.remote_ext_base_url {
            cmd.args(["--remote-ext-base-url", &remote_ext_base_url]);
        }

        if args.dev {
            cmd.arg("--dev");
        }

        let child = cmd.spawn()?;
        // Set up a scopeguard to kill & wait for the child in case we panic or bail below
        let child = scopeguard::guard(child, |mut child| {
            println!("SIGKILL & wait the started process");
            (|| {
                // TODO: use another signal that can be caught by the child so it can clean up any children it spawned
                child.kill().context("SIGKILL child")?;
                child.wait().context("wait() for child process")?;
                anyhow::Ok(())
            })()
            .with_context(|| format!("scopeguard kill&wait child {child:?}"))
            .unwrap();
        });

        // Write down the pid so we can wait for it when we want to stop
        // TODO use background_process::start_process instead: https://github.com/neondatabase/neon/pull/6482
        let pid = child.id();
        let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
        std::fs::write(pidfile_path, pid.to_string())?;

        // Wait for it to start
        const ATTEMPT_INTERVAL: Duration = Duration::from_millis(100);
        let start_at = Instant::now();
        loop {
            match self.get_status().await {
                Ok(state) => {
                    match state.status {
                        ComputeStatus::Init => {
                            let timeout = args.start_timeout;
                            if Instant::now().duration_since(start_at) > timeout {
                                bail!(
                                    "compute startup timed out {:?}; still in Init state",
                                    timeout
                                );
                            }
                            // keep retrying
                        }
                        ComputeStatus::Running => {
                            // All good!
                            break;
                        }
                        ComputeStatus::Failed => {
                            bail!(
                                "compute startup failed: {}",
                                state
                                    .error
                                    .as_deref()
                                    .unwrap_or("<no error from compute_ctl>")
                            );
                        }
                        ComputeStatus::Empty
                        | ComputeStatus::ConfigurationPending
                        | ComputeStatus::Configuration
                        | ComputeStatus::TerminationPending { .. }
                        | ComputeStatus::Terminated => {
                            bail!("unexpected compute status: {:?}", state.status)
                        }
                    }
                }
                Err(e) => {
                    if Instant::now().duration_since(start_at) > args.start_timeout {
                        return Err(e).context(format!(
                            "timed out {:?} waiting to connect to compute_ctl HTTP",
                            args.start_timeout
                        ));
                    }
                }
            }
            tokio::time::sleep(ATTEMPT_INTERVAL).await;
        }

        // Disarm the scopeguard and let the child outlive this function (and the neon_local invocation)
        drop(scopeguard::ScopeGuard::into_inner(child));

        Ok(())
    }

    // Call the /status HTTP API
    pub async fn get_status(&self) -> Result<ComputeStatusResponse> {
        let client = reqwest::Client::new();

        let response = client
            .request(
                reqwest::Method::GET,
                format!(
                    "http://{}:{}/status",
                    self.external_http_address.ip(),
                    self.external_http_address.port()
                ),
            )
            .bearer_auth(self.generate_jwt(None::<ComputeClaimsScope>)?)
            .send()
            .await?;

        // Interpret the response
        let status = response.status();
        if !(status.is_client_error() || status.is_server_error()) {
            Ok(response.json().await?)
        } else {
            // reqwest does not export its error construction utility functions, so let's craft the message ourselves
            let url = response.url().to_owned();
            let msg = match response.text().await {
                Ok(err_body) => format!("Error: {err_body}"),
                Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
            };
            Err(anyhow::anyhow!(msg))
        }
    }
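
    /// Send an updated spec to `compute_ctl`'s /configure API, based on the
    /// endpoint's existing `config.json`. Parameters that are `None` leave
    /// the corresponding part of the spec unchanged.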
    pub async fn reconfigure(
        &self,
        pageservers: Option<Vec<(PageserverProtocol, Host, u16)>>,
        stripe_size: Option<ShardStripeSize>,
        safekeepers: Option<Vec<NodeId>>,
        safekeeper_generation: Option<SafekeeperGeneration>,
    ) -> Result<()> {
        let (mut spec, compute_ctl_config) = {
            let config_path = self.endpoint_path().join("config.json");
            let file = std::fs::File::open(config_path)?;
            let config: ComputeConfig = serde_json::from_reader(file)?;

            (config.spec.unwrap(), config.compute_ctl_config)
        };

        let postgresql_conf = self.read_postgresql_conf()?;
        spec.cluster.postgresql_conf = Some(postgresql_conf);

        // If pageservers are not specified, don't change them.
        if let Some(pageservers) = pageservers {
            anyhow::ensure!(!pageservers.is_empty(), "no pageservers provided");

            let pageserver_connstr = Self::build_pageserver_connstr(&pageservers);
            spec.pageserver_connstring = Some(pageserver_connstr);
            if stripe_size.is_some() {
                spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize);
            }
        }

        // If safekeepers are not specified, don't change them.
        if let Some(safekeepers) = safekeepers {
            let safekeeper_connstrings = self.build_safekeepers_connstrs(safekeepers)?;
            spec.safekeeper_connstrings = safekeeper_connstrings;
            if let Some(g) = safekeeper_generation {
                spec.safekeepers_generation = Some(g.into_inner());
            }
        }

        let client = reqwest::Client::builder()
            .timeout(Duration::from_secs(120))
            .build()
            .unwrap();
        let response = client
            .post(format!(
                "http://{}:{}/configure",
                self.external_http_address.ip(),
                self.external_http_address.port()
            ))
            .header(CONTENT_TYPE.as_str(), "application/json")
            .bearer_auth(self.generate_jwt(None::<ComputeClaimsScope>)?)
            .body(
                serde_json::to_string(&ConfigurationRequest {
                    spec,
                    compute_ctl_config,
                })
                .unwrap(),
            )
            .send()
            .await?;

        let status = response.status();
        if !(status.is_client_error() || status.is_server_error()) {
            Ok(())
        } else {
            let url = response.url().to_owned();
            let msg = match response.text().await {
                Ok(err_body) => format!("Error: {err_body}"),
                Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
            };
            Err(anyhow::anyhow!(msg))
        }
    }

    pub async fn reconfigure_pageservers(
        &self,
        pageservers: Vec<(PageserverProtocol, Host, u16)>,
        stripe_size: Option<ShardStripeSize>,
    ) -> Result<()> {
        self.reconfigure(Some(pageservers), stripe_size, None, None)
            .await
    }

    pub async fn reconfigure_safekeepers(
        &self,
        safekeepers: Vec<NodeId>,
        generation: SafekeeperGeneration,
    ) -> Result<()> {
        self.reconfigure(None, None, Some(safekeepers), Some(generation))
            .await
    }

    pub async fn stop(
        &self,
        mode: EndpointTerminateMode,
        destroy: bool,
    ) -> Result<TerminateResponse> {
        // pg_ctl stop is fast but doesn't allow us to collect the LSN. /terminate is
        // slow, and test runs time out. Solution: a special mode "immediate-terminate"
        // which uses /terminate.
        let response = if let EndpointTerminateMode::ImmediateTerminate = mode {
            let ip = self.external_http_address.ip();
            let port = self.external_http_address.port();
            let url = format!("http://{ip}:{port}/terminate?mode=immediate");
            let token = self.generate_jwt(Some(ComputeClaimsScope::Admin))?;
            let request = reqwest::Client::new().post(url).bearer_auth(token);
            let response = request.send().await.context("/terminate")?;
            let text = response.text().await.context("/terminate result")?;
            serde_json::from_str(&text).with_context(|| format!("deserializing {text}"))?
        } else {
            self.pg_ctl(&["-m", &mode.to_string(), "stop"], &None)?;
            TerminateResponse { lsn: None }
        };

        // Also wait for the compute_ctl process to die. It might have some
        // cleanup work to do after postgres stops, like syncing safekeepers,
        // etc.
        //
        // If destroying, or if the stop mode is immediate, send it SIGTERM before
        // waiting. Sometimes we do *not* want this cleanup: tests intentionally
        // stop endpoints while a majority of safekeepers is down, so sync-safekeepers
        // would hang otherwise. This could be a separate flag though.
        let send_sigterm = destroy || !matches!(mode, EndpointTerminateMode::Fast);
        self.wait_for_compute_ctl_to_exit(send_sigterm)?;
        if destroy {
            println!(
                "Destroying postgres data directory '{}'",
                self.pgdata().to_str().unwrap()
            );
            std::fs::remove_dir_all(self.endpoint_path())?;
        }
        Ok(response)
    }
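
    /// Connection URL for connecting to this endpoint's Postgres as `user`,
    /// e.g. `postgresql://cloud_admin@127.0.0.1:55432/postgres` (the port
    /// here is illustrative).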
    pub fn connstr(&self, user: &str, db_name: &str) -> String {
        format!(
            "postgresql://{}@{}:{}/{}",
            user,
            self.pg_address.ip(),
            self.pg_address.port(),
            db_name
        )
    }
}