mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-10 06:52:55 +00:00
## Problem We currently offload LFC state unconditionally, which can cause problems. Imagine a situation: 1. Endpoint started with `autoprewarm: true`. 2. While prewarming has not yet completed, we upload the new incomplete state. 3. Compute gets interrupted and restarts. 4. We start again and try to prewarm with the state from 2. instead of the previous complete state. During the orchestrated prewarming, it's probably not a big issue, but it's still better not to interfere with the prewarm process. ## Summary of changes Do not offload LFC state if we are currently prewarming or if any issue occurred. While on it, also introduce the `Skipped` LFC prewarm status, which is used when the corresponding LFC state is not present in the endpoint storage. It's primarily needed to distinguish the first compute start for a particular endpoint, as it's completely valid not to have LFC state yet.
331 lines
11 KiB
Rust
331 lines
11 KiB
Rust
//! Structs representing the JSON formats used in the compute_ctl's HTTP API.
|
|
|
|
use std::fmt::Display;
|
|
|
|
use chrono::{DateTime, Utc};
|
|
use jsonwebtoken::jwk::JwkSet;
|
|
use serde::{Deserialize, Serialize, Serializer};
|
|
|
|
use crate::privilege::Privilege;
|
|
use crate::spec::{ComputeSpec, Database, ExtVersion, PgIdent, Role};
|
|
|
|
#[derive(Serialize, Debug, Deserialize)]
|
|
pub struct GenericAPIError {
|
|
pub error: String,
|
|
}
|
|
|
|
/// All configuration parameters necessary for a compute. When
|
|
/// [`ComputeConfig::spec`] is provided, it means that the compute is attached
|
|
/// to a tenant. [`ComputeConfig::compute_ctl_config`] will always be provided
|
|
/// and contains parameters necessary for operating `compute_ctl` independently
|
|
/// of whether a tenant is attached to the compute or not.
|
|
///
|
|
/// This also happens to be the body of `compute_ctl`'s /configure request.
|
|
#[derive(Debug, Deserialize, Serialize)]
|
|
pub struct ComputeConfig {
|
|
/// The compute spec
|
|
pub spec: Option<ComputeSpec>,
|
|
|
|
/// The compute_ctl configuration
|
|
#[allow(dead_code)]
|
|
pub compute_ctl_config: ComputeCtlConfig,
|
|
}
|
|
|
|
impl From<ControlPlaneConfigResponse> for ComputeConfig {
|
|
fn from(value: ControlPlaneConfigResponse) -> Self {
|
|
Self {
|
|
spec: value.spec,
|
|
compute_ctl_config: value.compute_ctl_config,
|
|
}
|
|
}
|
|
}
|
|
|
|
#[derive(Debug, Clone, Serialize)]
|
|
pub struct ExtensionInstallResponse {
|
|
pub extension: PgIdent,
|
|
pub version: ExtVersion,
|
|
}
|
|
|
|
/// Status of the LFC prewarm process. The same state machine is reused for
|
|
/// both autoprewarm (prewarm after compute/Postgres start using the previously
|
|
/// stored LFC state) and explicit prewarming via API.
|
|
#[derive(Serialize, Default, Debug, Clone, PartialEq)]
|
|
#[serde(tag = "status", rename_all = "snake_case")]
|
|
pub enum LfcPrewarmState {
|
|
/// Default value when compute boots up.
|
|
#[default]
|
|
NotPrewarmed,
|
|
/// Prewarming thread is active and loading pages into LFC.
|
|
Prewarming,
|
|
/// We found requested LFC state in the endpoint storage and
|
|
/// completed prewarming successfully.
|
|
Completed,
|
|
/// Unexpected error happened during prewarming. Note, `Not Found 404`
|
|
/// response from the endpoint storage is explicitly excluded here
|
|
/// because it can normally happen on the first compute start,
|
|
/// since LFC state is not available yet.
|
|
Failed { error: String },
|
|
/// We tried to fetch the corresponding LFC state from the endpoint storage,
|
|
/// but received `Not Found 404`. This should normally happen only during the
|
|
/// first endpoint start after creation with `autoprewarm: true`.
|
|
///
|
|
/// During the orchestrated prewarm via API, when a caller explicitly
|
|
/// provides the LFC state key to prewarm from, it's the caller responsibility
|
|
/// to handle this status as an error state in this case.
|
|
Skipped,
|
|
}
|
|
|
|
impl Display for LfcPrewarmState {
|
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
match self {
|
|
LfcPrewarmState::NotPrewarmed => f.write_str("NotPrewarmed"),
|
|
LfcPrewarmState::Prewarming => f.write_str("Prewarming"),
|
|
LfcPrewarmState::Completed => f.write_str("Completed"),
|
|
LfcPrewarmState::Skipped => f.write_str("Skipped"),
|
|
LfcPrewarmState::Failed { error } => write!(f, "Error({error})"),
|
|
}
|
|
}
|
|
}
|
|
|
|
#[derive(Serialize, Default, Debug, Clone, PartialEq)]
|
|
#[serde(tag = "status", rename_all = "snake_case")]
|
|
pub enum LfcOffloadState {
|
|
#[default]
|
|
NotOffloaded,
|
|
Offloading,
|
|
Completed,
|
|
Failed {
|
|
error: String,
|
|
},
|
|
}
|
|
|
|
#[derive(Serialize, Debug, Clone, PartialEq)]
|
|
#[serde(tag = "status", rename_all = "snake_case")]
|
|
/// Response of /promote
|
|
pub enum PromoteState {
|
|
NotPromoted,
|
|
Completed,
|
|
Failed { error: String },
|
|
}
|
|
|
|
#[derive(Deserialize, Serialize, Default, Debug, Clone)]
|
|
#[serde(rename_all = "snake_case")]
|
|
/// Result of /safekeepers_lsn
|
|
pub struct SafekeepersLsn {
|
|
pub safekeepers: String,
|
|
pub wal_flush_lsn: utils::lsn::Lsn,
|
|
}
|
|
|
|
/// Response of the /status API
|
|
#[derive(Serialize, Debug, Deserialize)]
|
|
#[serde(rename_all = "snake_case")]
|
|
pub struct ComputeStatusResponse {
|
|
pub start_time: DateTime<Utc>,
|
|
pub tenant: Option<String>,
|
|
pub timeline: Option<String>,
|
|
pub status: ComputeStatus,
|
|
#[serde(serialize_with = "rfc3339_serialize")]
|
|
pub last_active: Option<DateTime<Utc>>,
|
|
pub error: Option<String>,
|
|
}
|
|
|
|
#[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq, Default)]
|
|
#[serde(rename_all = "snake_case")]
|
|
pub enum TerminateMode {
|
|
#[default]
|
|
/// wait 30s till returning from /terminate to allow control plane to get the error
|
|
Fast,
|
|
/// return from /terminate immediately as soon as all components are terminated
|
|
Immediate,
|
|
}
|
|
|
|
impl From<TerminateMode> for ComputeStatus {
|
|
fn from(mode: TerminateMode) -> Self {
|
|
match mode {
|
|
TerminateMode::Fast => ComputeStatus::TerminationPendingFast,
|
|
TerminateMode::Immediate => ComputeStatus::TerminationPendingImmediate,
|
|
}
|
|
}
|
|
}
|
|
|
|
#[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq)]
|
|
#[serde(rename_all = "snake_case")]
|
|
pub enum ComputeStatus {
|
|
// Spec wasn't provided at start, waiting for it to be
|
|
// provided by control-plane.
|
|
Empty,
|
|
// Compute configuration was requested.
|
|
ConfigurationPending,
|
|
// Compute node has spec and initial startup and
|
|
// configuration is in progress.
|
|
Init,
|
|
// Compute is configured and running.
|
|
Running,
|
|
// New spec is being applied.
|
|
Configuration,
|
|
// Either startup or configuration failed,
|
|
// compute will exit soon or is waiting for
|
|
// control-plane to terminate it.
|
|
Failed,
|
|
// Termination requested
|
|
TerminationPendingFast,
|
|
// Termination requested, without waiting 30s before returning from /terminate
|
|
TerminationPendingImmediate,
|
|
// Terminated Postgres
|
|
Terminated,
|
|
}
|
|
|
|
#[derive(Deserialize, Serialize)]
|
|
pub struct TerminateResponse {
|
|
pub lsn: Option<utils::lsn::Lsn>,
|
|
}
|
|
|
|
impl Display for ComputeStatus {
|
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
match self {
|
|
ComputeStatus::Empty => f.write_str("empty"),
|
|
ComputeStatus::ConfigurationPending => f.write_str("configuration-pending"),
|
|
ComputeStatus::Init => f.write_str("init"),
|
|
ComputeStatus::Running => f.write_str("running"),
|
|
ComputeStatus::Configuration => f.write_str("configuration"),
|
|
ComputeStatus::Failed => f.write_str("failed"),
|
|
ComputeStatus::TerminationPendingFast => f.write_str("termination-pending-fast"),
|
|
ComputeStatus::TerminationPendingImmediate => {
|
|
f.write_str("termination-pending-immediate")
|
|
}
|
|
ComputeStatus::Terminated => f.write_str("terminated"),
|
|
}
|
|
}
|
|
}
|
|
|
|
pub fn rfc3339_serialize<S>(x: &Option<DateTime<Utc>>, s: S) -> Result<S::Ok, S::Error>
|
|
where
|
|
S: Serializer,
|
|
{
|
|
if let Some(x) = x {
|
|
x.to_rfc3339().serialize(s)
|
|
} else {
|
|
s.serialize_none()
|
|
}
|
|
}
|
|
|
|
/// Response of the /metrics.json API
|
|
#[derive(Clone, Debug, Default, Serialize)]
|
|
pub struct ComputeMetrics {
|
|
/// Time spent waiting in pool
|
|
pub wait_for_spec_ms: u64,
|
|
|
|
/// Time spent checking if safekeepers are synced
|
|
pub sync_sk_check_ms: u64,
|
|
|
|
/// Time spent syncing safekeepers (walproposer.c).
|
|
/// In most cases this should be zero.
|
|
pub sync_safekeepers_ms: u64,
|
|
|
|
/// Time it took to establish a pg connection to the pageserver.
|
|
/// This is two roundtrips, so it's a good proxy for compute-pageserver
|
|
/// latency. The latency is usually 0.2ms, but it's not safe to assume
|
|
/// that.
|
|
pub pageserver_connect_micros: u64,
|
|
|
|
/// Time to get basebackup from pageserver and write it to disk.
|
|
pub basebackup_ms: u64,
|
|
|
|
/// Compressed size of basebackup received.
|
|
pub basebackup_bytes: u64,
|
|
|
|
/// Time spent starting potgres. This includes initialization of shared
|
|
/// buffers, preloading extensions, and other pg operations.
|
|
pub start_postgres_ms: u64,
|
|
|
|
/// Time spent applying pg catalog updates that were made in the console
|
|
/// UI. This should be 0 when startup time matters, since cplane tries
|
|
/// to do these updates eagerly, and passes the skip_pg_catalog_updates
|
|
/// when it's safe to skip this step.
|
|
pub config_ms: u64,
|
|
|
|
/// Total time, from when we receive the spec to when we're ready to take
|
|
/// pg connections.
|
|
pub total_startup_ms: u64,
|
|
pub load_ext_ms: u64,
|
|
pub num_ext_downloaded: u64,
|
|
pub largest_ext_size: u64, // these are measured in bytes
|
|
pub total_ext_download_size: u64,
|
|
}
|
|
|
|
#[derive(Clone, Debug, Default, Serialize)]
|
|
pub struct CatalogObjects {
|
|
pub roles: Vec<Role>,
|
|
pub databases: Vec<Database>,
|
|
}
|
|
|
|
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
|
|
pub struct ComputeCtlConfig {
|
|
/// Set of JSON web keys that the compute can use to authenticate
|
|
/// communication from the control plane.
|
|
pub jwks: JwkSet,
|
|
pub tls: Option<TlsConfig>,
|
|
}
|
|
|
|
impl Default for ComputeCtlConfig {
|
|
fn default() -> Self {
|
|
Self {
|
|
jwks: JwkSet {
|
|
keys: Vec::default(),
|
|
},
|
|
tls: None,
|
|
}
|
|
}
|
|
}
|
|
|
|
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
|
|
pub struct TlsConfig {
|
|
pub key_path: String,
|
|
pub cert_path: String,
|
|
}
|
|
|
|
/// Response of the `/computes/{compute_id}/spec` control-plane API.
|
|
#[derive(Deserialize, Debug)]
|
|
pub struct ControlPlaneConfigResponse {
|
|
pub spec: Option<ComputeSpec>,
|
|
pub status: ControlPlaneComputeStatus,
|
|
pub compute_ctl_config: ComputeCtlConfig,
|
|
}
|
|
|
|
#[derive(Deserialize, Clone, Copy, Debug, PartialEq, Eq)]
|
|
#[serde(rename_all = "snake_case")]
|
|
pub enum ControlPlaneComputeStatus {
|
|
// Compute is known to control-plane, but it's not
|
|
// yet attached to any timeline / endpoint.
|
|
Empty,
|
|
// Compute is attached to some timeline / endpoint and
|
|
// should be able to start with provided spec.
|
|
Attached,
|
|
}
|
|
|
|
#[derive(Clone, Debug, Default, Serialize)]
|
|
pub struct InstalledExtension {
|
|
pub extname: String,
|
|
pub version: String,
|
|
pub n_databases: u32, // Number of databases using this extension
|
|
pub owned_by_superuser: String,
|
|
}
|
|
|
|
#[derive(Clone, Debug, Default, Serialize)]
|
|
pub struct InstalledExtensions {
|
|
pub extensions: Vec<InstalledExtension>,
|
|
}
|
|
|
|
#[derive(Clone, Debug, Default, Serialize)]
|
|
pub struct ExtensionInstallResult {
|
|
pub extension: PgIdent,
|
|
pub version: ExtVersion,
|
|
}
|
|
#[derive(Clone, Debug, Default, Serialize)]
|
|
pub struct SetRoleGrantsResponse {
|
|
pub database: PgIdent,
|
|
pub schema: PgIdent,
|
|
pub privileges: Vec<Privilege>,
|
|
pub role: PgIdent,
|
|
}
|