[compute_ctl] Improve 'empty' compute startup sequence (#4034)

Do several attempts to get spec from the control-plane and retry network
errors and all reasonable HTTP response codes. Do not hang waiting for
spec without confirmation from the control-plane that compute is known
and is in the `Empty` state.

Adjust the way we track `total_startup_ms` metric, it should be
calculated since the moment we received spec, not from the moment
`compute_ctl` started. Also introduce a new `wait_for_spec_ms` metric
to track the time spent sleeping and waiting for spec to be delivered
from control-plane.

Part of neondatabase/cloud#3533
This commit is contained in:
Alexey Kondratov
2023-04-21 11:10:48 +02:00
committed by GitHub
parent 02b28ae0b1
commit 7ba5c286b7
6 changed files with 141 additions and 26 deletions

View File

@@ -73,7 +73,7 @@ fn main() -> Result<()> {
// Try to use just 'postgres' if no path is provided
let pgbin = matches.get_one::<String>("pgbin").unwrap();
let mut spec = None;
let spec;
let mut live_config_allowed = false;
match spec_json {
// First, try to get cluster spec from the cli argument
@@ -89,9 +89,13 @@ fn main() -> Result<()> {
} else if let Some(id) = compute_id {
if let Some(cp_base) = control_plane_uri {
live_config_allowed = true;
if let Ok(s) = get_spec_from_control_plane(cp_base, id) {
spec = Some(s);
}
spec = match get_spec_from_control_plane(cp_base, id) {
Ok(s) => s,
Err(e) => {
error!("cannot get response from control plane: {}", e);
panic!("neither spec nor confirmation that compute is in the Empty state was received");
}
};
} else {
panic!("must specify both --control-plane-uri and --compute-id or none");
}
@@ -114,7 +118,6 @@ fn main() -> Result<()> {
spec_set = false;
}
let compute_node = ComputeNode {
start_time: Utc::now(),
connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?,
pgdata: pgdata.to_string(),
pgbin: pgbin.to_string(),
@@ -147,6 +150,17 @@ fn main() -> Result<()> {
let mut state = compute.state.lock().unwrap();
let pspec = state.pspec.as_ref().expect("spec must be set");
let startup_tracing_context = pspec.spec.startup_tracing_context.clone();
// Record for how long we slept waiting for the spec.
state.metrics.wait_for_spec_ms = Utc::now()
.signed_duration_since(state.start_time)
.to_std()
.unwrap()
.as_millis() as u64;
// Reset start time to the actual start of the configuration, so that
// total startup time was properly measured at the end.
state.start_time = Utc::now();
state.status = ComputeStatus::Init;
compute.state_changed.notify_all();
drop(state);

View File

@@ -38,7 +38,6 @@ use crate::spec::*;
/// Compute node info shared across several `compute_ctl` threads.
pub struct ComputeNode {
pub start_time: DateTime<Utc>,
// Url type maintains proper escaping
pub connstr: url::Url,
pub pgdata: String,
@@ -66,6 +65,7 @@ pub struct ComputeNode {
#[derive(Clone, Debug)]
pub struct ComputeState {
pub start_time: DateTime<Utc>,
pub status: ComputeStatus,
/// Timestamp of the last Postgres activity
pub last_active: DateTime<Utc>,
@@ -77,6 +77,7 @@ pub struct ComputeState {
impl ComputeState {
pub fn new() -> Self {
Self {
start_time: Utc::now(),
status: ComputeStatus::Empty,
last_active: Utc::now(),
error: None,
@@ -425,7 +426,7 @@ impl ComputeNode {
.unwrap()
.as_millis() as u64;
state.metrics.total_startup_ms = startup_end_time
.signed_duration_since(self.start_time)
.signed_duration_since(compute_state.start_time)
.to_std()
.unwrap()
.as_millis() as u64;

View File

@@ -18,6 +18,7 @@ use tracing_utils::http::OtelName;
fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse {
ComputeStatusResponse {
start_time: state.start_time,
tenant: state
.pspec
.as_ref()

View File

@@ -152,11 +152,14 @@ components:
type: object
description: Compute startup metrics.
required:
- wait_for_spec_ms
- sync_safekeepers_ms
- basebackup_ms
- config_ms
- total_startup_ms
properties:
wait_for_spec_ms:
type: integer
sync_safekeepers_ms:
type: integer
basebackup_ms:
@@ -181,6 +184,13 @@ components:
- status
- last_active
properties:
start_time:
type: string
description: |
Time when compute was started. If initially compute was started in the `empty`
state and then provided with valid spec, `start_time` will be reset to the
moment, when spec was received.
example: "2022-10-12T07:20:50.52Z"
status:
$ref: '#/components/schemas/ComputeStatus'
last_active:

View File

@@ -4,42 +4,117 @@ use std::str::FromStr;
use anyhow::{anyhow, bail, Result};
use postgres::config::Config;
use postgres::{Client, NoTls};
use tracing::{info, info_span, instrument, span_enabled, warn, Level};
use reqwest::StatusCode;
use tracing::{error, info, info_span, instrument, span_enabled, warn, Level};
use crate::config;
use crate::params::PG_HBA_ALL_MD5;
use crate::pg_helpers::*;
use compute_api::responses::ControlPlaneSpecResponse;
use compute_api::responses::{ControlPlaneComputeStatus, ControlPlaneSpecResponse};
use compute_api::spec::{ComputeSpec, Database, PgIdent, Role};
// Do control plane request and return response if any. In case of error it
// returns a bool flag indicating whether it makes sense to retry the request
// and a string with error message.
fn do_control_plane_request(
uri: &str,
jwt: &str,
) -> Result<ControlPlaneSpecResponse, (bool, String)> {
let resp = reqwest::blocking::Client::new()
.get(uri)
.header("Authorization", jwt)
.send()
.map_err(|e| {
(
true,
format!("could not perform spec request to control plane: {}", e),
)
})?;
match resp.status() {
StatusCode::OK => match resp.json::<ControlPlaneSpecResponse>() {
Ok(spec_resp) => Ok(spec_resp),
Err(e) => Err((
true,
format!("could not deserialize control plane response: {}", e),
)),
},
StatusCode::SERVICE_UNAVAILABLE => {
Err((true, "control plane is temporarily unavailable".to_string()))
}
StatusCode::BAD_GATEWAY => {
// We have a problem with intermittent 502 errors now
// https://github.com/neondatabase/cloud/issues/2353
// It's fine to retry GET request in this case.
Err((true, "control plane request failed with 502".to_string()))
}
// Another code, likely 500 or 404, means that compute is unknown to the control plane
// or some internal failure happened. Doesn't make much sense to retry in this case.
_ => Err((
false,
format!(
"unexpected control plane response status code: {}",
resp.status()
),
)),
}
}
/// Request spec from the control-plane by compute_id. If `NEON_CONSOLE_JWT`
/// env variable is set, it will be used for authorization.
pub fn get_spec_from_control_plane(base_uri: &str, compute_id: &str) -> Result<ComputeSpec> {
pub fn get_spec_from_control_plane(
base_uri: &str,
compute_id: &str,
) -> Result<Option<ComputeSpec>> {
let cp_uri = format!("{base_uri}/management/api/v2/computes/{compute_id}/spec");
let jwt: String = match std::env::var("NEON_CONSOLE_JWT") {
let jwt: String = match std::env::var("NEON_CONTROL_PLANE_TOKEN") {
Ok(v) => v,
Err(_) => "".to_string(),
};
let mut attempt = 1;
let mut spec: Result<Option<ComputeSpec>> = Ok(None);
info!("getting spec from control plane: {}", cp_uri);
// TODO: check the response. We should distinguish cases when it's
// - network error, then retry
// - no spec for compute yet, then wait
// - compute id is unknown or any other error, then bail out
let resp: ControlPlaneSpecResponse = reqwest::blocking::Client::new()
.get(cp_uri)
.header("Authorization", jwt)
.send()
.map_err(|e| anyhow!("could not send spec request to control plane: {}", e))?
.json()
.map_err(|e| anyhow!("could not get compute spec from control plane: {}", e))?;
// Do 3 attempts to get spec from the control plane using the following logic:
// - network error -> then retry
// - compute id is unknown or any other error -> bail out
// - no spec for compute yet (Empty state) -> return Ok(None)
// - got spec -> return Ok(Some(spec))
while attempt < 4 {
spec = match do_control_plane_request(&cp_uri, &jwt) {
Ok(spec_resp) => match spec_resp.status {
ControlPlaneComputeStatus::Empty => Ok(None),
ControlPlaneComputeStatus::Attached => {
if let Some(spec) = spec_resp.spec {
Ok(Some(spec))
} else {
bail!("compute is attached, but spec is empty")
}
}
},
Err((retry, msg)) => {
if retry {
Err(anyhow!(msg))
} else {
bail!(msg);
}
}
};
if let Some(spec) = resp.spec {
Ok(spec)
} else {
bail!("could not get compute spec from control plane")
if let Err(e) = &spec {
error!("attempt {} to get spec failed with: {}", attempt, e);
} else {
return spec;
}
attempt += 1;
std::thread::sleep(std::time::Duration::from_millis(100));
}
// All attempts failed, return error.
spec
}
/// It takes cluster specification and does the following:

View File

@@ -14,6 +14,7 @@ pub struct GenericAPIError {
#[derive(Serialize, Debug)]
#[serde(rename_all = "snake_case")]
pub struct ComputeStatusResponse {
pub start_time: DateTime<Utc>,
pub tenant: Option<String>,
pub timeline: Option<String>,
pub status: ComputeStatus,
@@ -63,6 +64,7 @@ where
/// Response of the /metrics.json API
#[derive(Clone, Debug, Default, Serialize)]
pub struct ComputeMetrics {
pub wait_for_spec_ms: u64,
pub sync_safekeepers_ms: u64,
pub basebackup_ms: u64,
pub config_ms: u64,
@@ -75,4 +77,16 @@ pub struct ComputeMetrics {
#[derive(Deserialize, Debug)]
pub struct ControlPlaneSpecResponse {
pub spec: Option<ComputeSpec>,
pub status: ControlPlaneComputeStatus,
}
#[derive(Deserialize, Clone, Copy, Debug, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum ControlPlaneComputeStatus {
// Compute is known to control-plane, but it's not
// yet attached to any timeline / endpoint.
Empty,
// Compute is attached to some timeline / endpoint and
// should be able to start with provided spec.
Attached,
}