mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-30 03:20:36 +00:00
Merge branch 'main' into arpad/endpoint_azure
This commit is contained in:
@@ -33,6 +33,7 @@ remote_storage.workspace = true
|
||||
postgres_backend.workspace = true
|
||||
nix = {workspace = true, optional = true}
|
||||
reqwest.workspace = true
|
||||
rand.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
bincode.workspace = true
|
||||
|
||||
@@ -97,6 +97,15 @@ pub struct ConfigToml {
|
||||
pub control_plane_api: Option<reqwest::Url>,
|
||||
pub control_plane_api_token: Option<String>,
|
||||
pub control_plane_emergency_mode: bool,
|
||||
/// Unstable feature: subject to change or removal without notice.
|
||||
/// See <https://github.com/neondatabase/neon/pull/9218>.
|
||||
pub import_pgdata_upcall_api: Option<reqwest::Url>,
|
||||
/// Unstable feature: subject to change or removal without notice.
|
||||
/// See <https://github.com/neondatabase/neon/pull/9218>.
|
||||
pub import_pgdata_upcall_api_token: Option<String>,
|
||||
/// Unstable feature: subject to change or removal without notice.
|
||||
/// See <https://github.com/neondatabase/neon/pull/9218>.
|
||||
pub import_pgdata_aws_endpoint_url: Option<reqwest::Url>,
|
||||
pub heatmap_upload_concurrency: usize,
|
||||
pub secondary_download_concurrency: usize,
|
||||
pub virtual_file_io_engine: Option<crate::models::virtual_file::IoEngineKind>,
|
||||
@@ -386,6 +395,10 @@ impl Default for ConfigToml {
|
||||
control_plane_api_token: (None),
|
||||
control_plane_emergency_mode: (false),
|
||||
|
||||
import_pgdata_upcall_api: (None),
|
||||
import_pgdata_upcall_api_token: (None),
|
||||
import_pgdata_aws_endpoint_url: (None),
|
||||
|
||||
heatmap_upload_concurrency: (DEFAULT_HEATMAP_UPLOAD_CONCURRENCY),
|
||||
secondary_download_concurrency: (DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY),
|
||||
|
||||
|
||||
@@ -48,7 +48,7 @@ pub struct ShardedRange<'a> {
|
||||
|
||||
// Calculate the size of a range within the blocks of the same relation, or spanning only the
|
||||
// top page in the previous relation's space.
|
||||
fn contiguous_range_len(range: &Range<Key>) -> u32 {
|
||||
pub fn contiguous_range_len(range: &Range<Key>) -> u32 {
|
||||
debug_assert!(is_contiguous_range(range));
|
||||
if range.start.field6 == 0xffffffff {
|
||||
range.end.field6 + 1
|
||||
@@ -67,7 +67,7 @@ fn contiguous_range_len(range: &Range<Key>) -> u32 {
|
||||
/// This matters, because:
|
||||
/// - Within such ranges, keys are used contiguously. Outside such ranges it is sparse.
|
||||
/// - Within such ranges, we may calculate distances using simple subtraction of field6.
|
||||
fn is_contiguous_range(range: &Range<Key>) -> bool {
|
||||
pub fn is_contiguous_range(range: &Range<Key>) -> bool {
|
||||
range.start.field1 == range.end.field1
|
||||
&& range.start.field2 == range.end.field2
|
||||
&& range.start.field3 == range.end.field3
|
||||
|
||||
@@ -2,6 +2,8 @@ pub mod detach_ancestor;
|
||||
pub mod partitioning;
|
||||
pub mod utilization;
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
use camino::Utf8PathBuf;
|
||||
pub use utilization::PageserverUtilization;
|
||||
|
||||
use std::{
|
||||
@@ -227,6 +229,9 @@ pub enum TimelineCreateRequestMode {
|
||||
// we continue to accept it by having it here.
|
||||
pg_version: Option<u32>,
|
||||
},
|
||||
ImportPgdata {
|
||||
import_pgdata: TimelineCreateRequestModeImportPgdata,
|
||||
},
|
||||
// NB: Bootstrap is all-optional, and thus the serde(untagged) will cause serde to stop at Bootstrap.
|
||||
// (serde picks the first matching enum variant, in declaration order).
|
||||
Bootstrap {
|
||||
@@ -236,6 +241,42 @@ pub enum TimelineCreateRequestMode {
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
pub struct TimelineCreateRequestModeImportPgdata {
|
||||
pub location: ImportPgdataLocation,
|
||||
pub idempotency_key: ImportPgdataIdempotencyKey,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone, Debug)]
|
||||
pub enum ImportPgdataLocation {
|
||||
#[cfg(feature = "testing")]
|
||||
LocalFs { path: Utf8PathBuf },
|
||||
AwsS3 {
|
||||
region: String,
|
||||
bucket: String,
|
||||
/// A better name for this would be `prefix`; changing requires coordination with cplane.
|
||||
/// See <https://github.com/neondatabase/cloud/issues/20646>.
|
||||
key: String,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
#[serde(transparent)]
|
||||
pub struct ImportPgdataIdempotencyKey(pub String);
|
||||
|
||||
impl ImportPgdataIdempotencyKey {
|
||||
pub fn random() -> Self {
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
Self(
|
||||
rand::thread_rng()
|
||||
.sample_iter(&Alphanumeric)
|
||||
.take(20)
|
||||
.map(char::from)
|
||||
.collect(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
pub struct LsnLeaseRequest {
|
||||
pub lsn: Lsn,
|
||||
|
||||
12
libs/postgres_initdb/Cargo.toml
Normal file
12
libs/postgres_initdb/Cargo.toml
Normal file
@@ -0,0 +1,12 @@
|
||||
[package]
|
||||
name = "postgres_initdb"
|
||||
version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
tokio.workspace = true
|
||||
camino.workspace = true
|
||||
thiserror.workspace = true
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
103
libs/postgres_initdb/src/lib.rs
Normal file
103
libs/postgres_initdb/src/lib.rs
Normal file
@@ -0,0 +1,103 @@
|
||||
//! The canonical way we run `initdb` in Neon.
|
||||
//!
|
||||
//! initdb has implicit defaults that are dependent on the environment, e.g., locales & collations.
|
||||
//!
|
||||
//! This module's job is to eliminate the environment-dependence as much as possible.
|
||||
|
||||
use std::fmt;
|
||||
|
||||
use camino::Utf8Path;
|
||||
|
||||
pub struct RunInitdbArgs<'a> {
|
||||
pub superuser: &'a str,
|
||||
pub locale: &'a str,
|
||||
pub initdb_bin: &'a Utf8Path,
|
||||
pub pg_version: u32,
|
||||
pub library_search_path: &'a Utf8Path,
|
||||
pub pgdata: &'a Utf8Path,
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub enum Error {
|
||||
Spawn(std::io::Error),
|
||||
Failed {
|
||||
status: std::process::ExitStatus,
|
||||
stderr: Vec<u8>,
|
||||
},
|
||||
WaitOutput(std::io::Error),
|
||||
Other(anyhow::Error),
|
||||
}
|
||||
|
||||
impl fmt::Display for Error {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
match self {
|
||||
Error::Spawn(e) => write!(f, "Error spawning command: {:?}", e),
|
||||
Error::Failed { status, stderr } => write!(
|
||||
f,
|
||||
"Command failed with status {:?}: {}",
|
||||
status,
|
||||
String::from_utf8_lossy(stderr)
|
||||
),
|
||||
Error::WaitOutput(e) => write!(f, "Error waiting for command output: {:?}", e),
|
||||
Error::Other(e) => write!(f, "Error: {:?}", e),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn do_run_initdb(args: RunInitdbArgs<'_>) -> Result<(), Error> {
|
||||
let RunInitdbArgs {
|
||||
superuser,
|
||||
locale,
|
||||
initdb_bin: initdb_bin_path,
|
||||
pg_version,
|
||||
library_search_path,
|
||||
pgdata,
|
||||
} = args;
|
||||
let mut initdb_command = tokio::process::Command::new(initdb_bin_path);
|
||||
initdb_command
|
||||
.args(["--pgdata", pgdata.as_ref()])
|
||||
.args(["--username", superuser])
|
||||
.args(["--encoding", "utf8"])
|
||||
.args(["--locale", locale])
|
||||
.arg("--no-instructions")
|
||||
.arg("--no-sync")
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", library_search_path)
|
||||
.env("DYLD_LIBRARY_PATH", library_search_path)
|
||||
.stdin(std::process::Stdio::null())
|
||||
// stdout invocation produces the same output every time, we don't need it
|
||||
.stdout(std::process::Stdio::null())
|
||||
// we would be interested in the stderr output, if there was any
|
||||
.stderr(std::process::Stdio::piped());
|
||||
|
||||
// Before version 14, only the libc provide was available.
|
||||
if pg_version > 14 {
|
||||
// Version 17 brought with it a builtin locale provider which only provides
|
||||
// C and C.UTF-8. While being safer for collation purposes since it is
|
||||
// guaranteed to be consistent throughout a major release, it is also more
|
||||
// performant.
|
||||
let locale_provider = if pg_version >= 17 { "builtin" } else { "libc" };
|
||||
|
||||
initdb_command.args(["--locale-provider", locale_provider]);
|
||||
}
|
||||
|
||||
let initdb_proc = initdb_command.spawn().map_err(Error::Spawn)?;
|
||||
|
||||
// Ideally we'd select here with the cancellation token, but the problem is that
|
||||
// we can't safely terminate initdb: it launches processes of its own, and killing
|
||||
// initdb doesn't kill them. After we return from this function, we want the target
|
||||
// directory to be able to be cleaned up.
|
||||
// See https://github.com/neondatabase/neon/issues/6385
|
||||
let initdb_output = initdb_proc
|
||||
.wait_with_output()
|
||||
.await
|
||||
.map_err(Error::WaitOutput)?;
|
||||
if !initdb_output.status.success() {
|
||||
return Err(Error::Failed {
|
||||
status: initdb_output.status,
|
||||
stderr: initdb_output.stderr,
|
||||
});
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -184,9 +184,8 @@ pub struct CancelKeyData {
|
||||
|
||||
impl fmt::Display for CancelKeyData {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
// TODO: this is producing strange results, with 0xffffffff........ always in the logs.
|
||||
let hi = (self.backend_pid as u64) << 32;
|
||||
let lo = self.cancel_key as u64;
|
||||
let lo = (self.cancel_key as u64) & 0xffffffff;
|
||||
let id = hi | lo;
|
||||
|
||||
// This format is more compact and might work better for logs.
|
||||
@@ -1047,4 +1046,13 @@ mod tests {
|
||||
let data = [0, 0, 0, 7, 0, 0, 0, 0];
|
||||
FeStartupPacket::parse(&mut BytesMut::from_iter(data)).unwrap_err();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn cancel_key_data() {
|
||||
let key = CancelKeyData {
|
||||
backend_pid: -1817212860,
|
||||
cancel_key: -1183897012,
|
||||
};
|
||||
assert_eq!(format!("{key}"), "CancelKeyData(93af8844b96f2a4c)");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -25,6 +25,7 @@ use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerCl
|
||||
use bytes::Bytes;
|
||||
use futures::future::Either;
|
||||
use futures::stream::Stream;
|
||||
use futures::FutureExt;
|
||||
use futures_util::StreamExt;
|
||||
use futures_util::TryStreamExt;
|
||||
use http_types::{StatusCode, Url};
|
||||
@@ -32,6 +33,7 @@ use scopeguard::ScopeGuard;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::debug;
|
||||
use utils::backoff;
|
||||
use utils::backoff::exponential_backoff_duration_seconds;
|
||||
|
||||
use crate::metrics::{start_measuring_requests, AttemptOutcome, RequestKind};
|
||||
use crate::{
|
||||
@@ -311,40 +313,59 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
|
||||
let mut next_marker = None;
|
||||
|
||||
let mut timeout_try_cnt = 1;
|
||||
|
||||
'outer: loop {
|
||||
let mut builder = builder.clone();
|
||||
if let Some(marker) = next_marker.clone() {
|
||||
builder = builder.marker(marker);
|
||||
}
|
||||
let response = builder.into_stream();
|
||||
let response = response.into_stream().map_err(to_download_error);
|
||||
let response = tokio_stream::StreamExt::timeout(response, self.timeout);
|
||||
let response = response.map(|res| match res {
|
||||
Ok(res) => res,
|
||||
Err(_elapsed) => Err(DownloadError::Timeout),
|
||||
// Azure Blob Rust SDK does not expose the list blob API directly. Users have to use
|
||||
// their pageable iterator wrapper that returns all keys as a stream. We want to have
|
||||
// full control of paging, and therefore we only take the first item from the stream.
|
||||
let mut response_stream = builder.into_stream();
|
||||
let response = response_stream.next();
|
||||
// Timeout mechanism: Azure client will sometimes stuck on a request, but retrying that request
|
||||
// would immediately succeed. Therefore, we use exponential backoff timeout to retry the request.
|
||||
// (Usually, exponential backoff is used to determine the sleep time between two retries.) We
|
||||
// start with 10.0 second timeout, and double the timeout for each failure, up to 5 failures.
|
||||
// timeout = min(5 * (1.0+1.0)^n, self.timeout).
|
||||
let this_timeout = (5.0 * exponential_backoff_duration_seconds(timeout_try_cnt, 1.0, self.timeout.as_secs_f64())).min(self.timeout.as_secs_f64());
|
||||
let response = tokio::time::timeout(Duration::from_secs_f64(this_timeout), response);
|
||||
let response = response.map(|res| {
|
||||
match res {
|
||||
Ok(Some(Ok(res))) => Ok(Some(res)),
|
||||
Ok(Some(Err(e))) => Err(to_download_error(e)),
|
||||
Ok(None) => Ok(None),
|
||||
Err(_elasped) => Err(DownloadError::Timeout),
|
||||
}
|
||||
});
|
||||
|
||||
let mut response = std::pin::pin!(response);
|
||||
|
||||
let mut max_keys = max_keys.map(|mk| mk.get());
|
||||
let next_item = tokio::select! {
|
||||
op = response.next() => Ok(op),
|
||||
op = response => op,
|
||||
_ = cancel.cancelled() => Err(DownloadError::Cancelled),
|
||||
}?;
|
||||
};
|
||||
|
||||
if let Err(DownloadError::Timeout) = &next_item {
|
||||
timeout_try_cnt += 1;
|
||||
if timeout_try_cnt <= 5 {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
let next_item = next_item?;
|
||||
|
||||
if timeout_try_cnt >= 2 {
|
||||
tracing::warn!("Azure Blob Storage list timed out and succeeded after {} tries", timeout_try_cnt);
|
||||
}
|
||||
timeout_try_cnt = 1;
|
||||
|
||||
let Some(entry) = next_item else {
|
||||
// The list is complete, so yield it.
|
||||
break;
|
||||
};
|
||||
|
||||
let mut res = Listing::default();
|
||||
let entry = match entry {
|
||||
Ok(entry) => entry,
|
||||
Err(e) => {
|
||||
// The error is potentially retryable, so we must rewind the loop after yielding.
|
||||
yield Err(e);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
next_marker = entry.continuation();
|
||||
let prefix_iter = entry
|
||||
.blobs
|
||||
@@ -360,7 +381,7 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
last_modified: k.properties.last_modified.into(),
|
||||
size: k.properties.content_length,
|
||||
}
|
||||
);
|
||||
);
|
||||
|
||||
for key in blob_iter {
|
||||
res.keys.push(key);
|
||||
|
||||
@@ -360,7 +360,12 @@ impl RemoteStorage for LocalFs {
|
||||
let mut objects = Vec::with_capacity(keys.len());
|
||||
for key in keys {
|
||||
let path = key.with_base(&self.storage_root);
|
||||
let metadata = file_metadata(&path).await?;
|
||||
let metadata = file_metadata(&path).await;
|
||||
if let Err(DownloadError::NotFound) = metadata {
|
||||
// Race: if the file is deleted between listing and metadata check, ignore it.
|
||||
continue;
|
||||
}
|
||||
let metadata = metadata?;
|
||||
if metadata.is_dir() {
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -29,6 +29,7 @@ jsonwebtoken.workspace = true
|
||||
nix.workspace = true
|
||||
once_cell.workspace = true
|
||||
pin-project-lite.workspace = true
|
||||
pprof.workspace = true
|
||||
regex.workspace = true
|
||||
routerify.workspace = true
|
||||
serde.workspace = true
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
use crate::auth::{AuthError, Claims, SwappableJwtAuth};
|
||||
use crate::http::error::{api_error_handler, route_error_handler, ApiError};
|
||||
use anyhow::Context;
|
||||
use hyper::header::{HeaderName, AUTHORIZATION};
|
||||
use crate::http::request::{get_query_param, parse_query_param};
|
||||
use anyhow::{anyhow, Context};
|
||||
use hyper::header::{HeaderName, AUTHORIZATION, CONTENT_DISPOSITION};
|
||||
use hyper::http::HeaderValue;
|
||||
use hyper::Method;
|
||||
use hyper::{header::CONTENT_TYPE, Body, Request, Response};
|
||||
@@ -12,11 +13,13 @@ use routerify::{Middleware, RequestInfo, Router, RouterBuilder};
|
||||
use tracing::{debug, info, info_span, warn, Instrument};
|
||||
|
||||
use std::future::Future;
|
||||
use std::io::Write as _;
|
||||
use std::str::FromStr;
|
||||
use std::time::Duration;
|
||||
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use std::io::Write as _;
|
||||
use tokio::sync::mpsc;
|
||||
use pprof::protos::Message as _;
|
||||
use tokio::sync::{mpsc, Mutex};
|
||||
use tokio_stream::wrappers::ReceiverStream;
|
||||
|
||||
static SERVE_METRICS_COUNT: Lazy<IntCounter> = Lazy::new(|| {
|
||||
@@ -328,6 +331,82 @@ pub async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<
|
||||
Ok(response)
|
||||
}
|
||||
|
||||
/// Generates CPU profiles.
|
||||
pub async fn profile_cpu_handler(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
enum Format {
|
||||
Pprof,
|
||||
Svg,
|
||||
}
|
||||
|
||||
// Parameters.
|
||||
let format = match get_query_param(&req, "format")?.as_deref() {
|
||||
None => Format::Pprof,
|
||||
Some("pprof") => Format::Pprof,
|
||||
Some("svg") => Format::Svg,
|
||||
Some(format) => return Err(ApiError::BadRequest(anyhow!("invalid format {format}"))),
|
||||
};
|
||||
let seconds = match parse_query_param(&req, "seconds")? {
|
||||
None => 5,
|
||||
Some(seconds @ 1..=30) => seconds,
|
||||
Some(_) => return Err(ApiError::BadRequest(anyhow!("duration must be 1-30 secs"))),
|
||||
};
|
||||
let frequency_hz = match parse_query_param(&req, "frequency")? {
|
||||
None => 99,
|
||||
Some(1001..) => return Err(ApiError::BadRequest(anyhow!("frequency must be <=1000 Hz"))),
|
||||
Some(frequency) => frequency,
|
||||
};
|
||||
|
||||
// Only allow one profiler at a time.
|
||||
static PROFILE_LOCK: Lazy<Mutex<()>> = Lazy::new(|| Mutex::new(()));
|
||||
let _lock = PROFILE_LOCK
|
||||
.try_lock()
|
||||
.map_err(|_| ApiError::Conflict("profiler already running".into()))?;
|
||||
|
||||
// Take the profile.
|
||||
let report = tokio::task::spawn_blocking(move || {
|
||||
let guard = pprof::ProfilerGuardBuilder::default()
|
||||
.frequency(frequency_hz)
|
||||
.blocklist(&["libc", "libgcc", "pthread", "vdso"])
|
||||
.build()?;
|
||||
std::thread::sleep(Duration::from_secs(seconds));
|
||||
guard.report().build()
|
||||
})
|
||||
.await
|
||||
.map_err(|join_err| ApiError::InternalServerError(join_err.into()))?
|
||||
.map_err(|pprof_err| ApiError::InternalServerError(pprof_err.into()))?;
|
||||
|
||||
// Return the report in the requested format.
|
||||
match format {
|
||||
Format::Pprof => {
|
||||
let mut body = Vec::new();
|
||||
report
|
||||
.pprof()
|
||||
.map_err(|err| ApiError::InternalServerError(err.into()))?
|
||||
.write_to_vec(&mut body)
|
||||
.map_err(|err| ApiError::InternalServerError(err.into()))?;
|
||||
|
||||
Response::builder()
|
||||
.status(200)
|
||||
.header(CONTENT_TYPE, "application/octet-stream")
|
||||
.header(CONTENT_DISPOSITION, "attachment; filename=\"profile.pb\"")
|
||||
.body(Body::from(body))
|
||||
.map_err(|err| ApiError::InternalServerError(err.into()))
|
||||
}
|
||||
|
||||
Format::Svg => {
|
||||
let mut body = Vec::new();
|
||||
report
|
||||
.flamegraph(&mut body)
|
||||
.map_err(|err| ApiError::InternalServerError(err.into()))?;
|
||||
Response::builder()
|
||||
.status(200)
|
||||
.header(CONTENT_TYPE, "image/svg+xml")
|
||||
.body(Body::from(body))
|
||||
.map_err(|err| ApiError::InternalServerError(err.into()))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn add_request_id_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
|
||||
) -> Middleware<B, ApiError> {
|
||||
Middleware::pre(move |req| async move {
|
||||
|
||||
@@ -30,7 +30,7 @@ pub fn parse_request_param<T: FromStr>(
|
||||
}
|
||||
}
|
||||
|
||||
fn get_query_param<'a>(
|
||||
pub fn get_query_param<'a>(
|
||||
request: &'a Request<Body>,
|
||||
param_name: &str,
|
||||
) -> Result<Option<Cow<'a, str>>, ApiError> {
|
||||
|
||||
@@ -83,7 +83,9 @@ where
|
||||
}
|
||||
wake_these.push(self.heap.pop().unwrap().wake_channel);
|
||||
}
|
||||
self.update_status();
|
||||
if !wake_these.is_empty() {
|
||||
self.update_status();
|
||||
}
|
||||
wake_these
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user