Compare commits

..

3 Commits

Author           SHA1        Message                   Date
Conrad Ludgate   a65a5c372b  storage broker            2024-01-18 14:43:29 +00:00
Conrad Ludgate   2cf85471f5  proxy                     2024-01-18 14:11:35 +00:00
Conrad Ludgate   665f4ff4b5  move to hyper 1.0 mostly  2024-01-18 12:44:50 +00:00
62 changed files with 1124 additions and 1353 deletions

Cargo.lock (generated, 499 changes): diff suppressed because it is too large.

View File

@@ -51,7 +51,7 @@ async-trait = "0.1"
aws-config = { version = "1.0", default-features = false, features=["rustls"] }
aws-sdk-s3 = "1.0"
aws-smithy-async = { version = "1.0", default-features = false, features=["rt-tokio"] }
aws-smithy-types = "1.0"
aws-smithy-types = { version = "1.1.2", features = ["http-body-1-x"] }
aws-credential-types = "1.0"
axum = { version = "0.6.20", features = ["ws"] }
base64 = "0.13.0"
@@ -89,8 +89,12 @@ hostname = "0.3.1"
http-types = { version = "2", default-features = false }
humantime = "2.1"
humantime-serde = "1.1.1"
hyper = "0.14"
hyper-tungstenite = "0.11"
hyper = "1.0.0"
hyper-util = "0.1.0"
http = "1"
http-body = "1"
http-body-util = "0.1"
hyper-tungstenite = "0.13.0"
inotify = "0.10.2"
ipnet = "2.9.0"
itertools = "0.10"
@@ -113,7 +117,7 @@ parquet_derive = "49.0.0"
pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
pin-project-lite = "0.2"
prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
prost = "0.11"
prost = "0.12"
rand = "0.8"
redis = { version = "0.24.0", features = ["tokio-rustls-comp", "keep-alive"] }
regex = "1.10.2"
@@ -121,7 +125,7 @@ reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"
reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_19"] }
reqwest-middleware = "0.2.0"
reqwest-retry = "0.2.2"
routerify = "3"
routerify = { git = "https://github.com/conradludgate/routerify", branch = "hyper1" }
rpds = "0.13"
rustc-hash = "1.1.0"
rustls = "0.21"
@@ -149,7 +153,7 @@ tar = "0.4"
task-local-extensions = "0.1.4"
test-context = "0.1"
thiserror = "1.0"
tls-listener = { version = "0.7", features = ["rustls", "hyper-h1"] }
tls-listener = { version = "0.9", features = ["rustls", "tokio-net"] }
tokio = { version = "1.17", features = ["macros"] }
tokio-io-timeout = "1.2.0"
tokio-postgres-rustls = "0.10.0"
@@ -159,7 +163,7 @@ tokio-tar = "0.3"
tokio-util = { version = "0.7.10", features = ["io", "rt"] }
toml = "0.7"
toml_edit = "0.19"
tonic = {version = "0.9", features = ["tls", "tls-roots"]}
tonic = {version = "0.10", features = ["tls", "tls-roots"]}
tracing = "0.1"
tracing-error = "0.2.0"
tracing-opentelemetry = "0.19.0"
@@ -211,7 +215,7 @@ criterion = "0.5.1"
rcgen = "0.11"
rstest = "0.18"
camino-tempfile = "1.0.2"
tonic-build = "0.9"
tonic-build = "0.10.2"
[patch.crates-io]

View File

@@ -883,10 +883,8 @@ FROM debian:bullseye-slim
RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
echo "postgres:test_console_pass" | chpasswd && \
mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \
mkdir /var/db/postgres/pgbouncer && \
chown -R postgres:postgres /var/db/postgres && \
chmod 0750 /var/db/postgres/compute && \
chmod 0750 /var/db/postgres/pgbouncer && \
echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig && \
# create folder for file cache
mkdir -p -m 777 /neon/cache

View File

@@ -12,7 +12,11 @@ cfg-if.workspace = true
clap.workspace = true
flate2.workspace = true
futures.workspace = true
hyper = { workspace = true, features = ["full"] }
hyper = { workspace = true, features = ["server"] }
hyper-util = { workspace = true, features = ["tokio", "server", "server-auto"] }
http = { workspace = true, features = [] }
http-body = { workspace = true, features = [] }
http-body-util = { workspace = true, features = [] }
nix.workspace = true
notify.workspace = true
num_cpus.workspace = true

View File

@@ -32,6 +32,8 @@
//! -S /var/db/postgres/specs/current.json \
//! -b /usr/local/bin/postgres \
//! -r http://pg-ext-s3-gateway \
//! --pgbouncer-connstr 'host=localhost port=6432 dbname=pgbouncer user=cloud_admin sslmode=disable'
//! --pgbouncer-ini-path /etc/pgbouncer.ini \
//! ```
//!
use std::collections::HashMap;
@@ -110,6 +112,9 @@ fn main() -> Result<()> {
let spec_json = matches.get_one::<String>("spec");
let spec_path = matches.get_one::<String>("spec-path");
let pgbouncer_connstr = matches.get_one::<String>("pgbouncer-connstr");
let pgbouncer_ini_path = matches.get_one::<String>("pgbouncer-ini-path");
// Extract OpenTelemetry context for the startup actions from the
// TRACEPARENT and TRACESTATE env variables, and attach it to the current
// tracing context.
@@ -220,6 +225,8 @@ fn main() -> Result<()> {
ext_remote_storage: ext_remote_storage.map(|s| s.to_string()),
ext_download_progress: RwLock::new(HashMap::new()),
build_tag,
pgbouncer_connstr: pgbouncer_connstr.map(|s| s.to_string()),
pgbouncer_ini_path: pgbouncer_ini_path.map(|s| s.to_string()),
};
let compute = Arc::new(compute_node);
@@ -516,6 +523,23 @@ fn cli() -> clap::Command {
)
.value_name("FILECACHE_CONNSTR"),
)
.arg(
Arg::new("pgbouncer-connstr")
.long("pgbouncer-connstr")
.default_value(
"host=localhost port=6432 dbname=pgbouncer user=cloud_admin sslmode=disable",
)
.value_name("PGBOUNCER_CONNSTR"),
)
.arg(
Arg::new("pgbouncer-ini-path")
.long("pgbouncer-ini-path")
// Note: this doesn't match current path for pgbouncer.ini.
// Until we fix it, we need to pass the path explicitly
// or this will effectively be a no-op.
.default_value("/etc/pgbouncer.ini")
.value_name("PGBOUNCER_INI_PATH"),
)
}
/// When compute_ctl is killed, send also termination signal to sync-safekeepers

View File

@@ -71,6 +71,10 @@ pub struct ComputeNode {
// key: ext_archive_name, value: started download time, download_completed?
pub ext_download_progress: RwLock<HashMap<String, (DateTime<Utc>, bool)>>,
pub build_tag: String,
// connection string to pgbouncer to change settings
pub pgbouncer_connstr: Option<String>,
// path to pgbouncer.ini to change settings
pub pgbouncer_ini_path: Option<String>,
}
// store some metrics about download size that might impact startup time
@@ -765,8 +769,8 @@ impl ComputeNode {
pub fn reconfigure(&self) -> Result<()> {
let spec = self.state.lock().unwrap().pspec.clone().unwrap().spec;
if let Some(ref pgbouncer_settings) = spec.pgbouncer_settings {
info!("tuning pgbouncer");
if let Some(connstr) = &self.pgbouncer_connstr {
info!("tuning pgbouncer with connstr: {:?}", connstr);
let rt = tokio::runtime::Builder::new_current_thread()
.enable_all()
@@ -775,9 +779,15 @@ impl ComputeNode {
// Spawn a thread to do the tuning,
// so that we don't block the main thread that starts Postgres.
let pgbouncer_settings = pgbouncer_settings.clone();
let pgbouncer_settings = spec.pgbouncer_settings.clone();
let connstr_clone = connstr.clone();
let pgbouncer_ini_path = self.pgbouncer_ini_path.clone();
let _handle = thread::spawn(move || {
let res = rt.block_on(tune_pgbouncer(pgbouncer_settings));
let res = rt.block_on(tune_pgbouncer(
pgbouncer_settings,
&connstr_clone,
pgbouncer_ini_path,
));
if let Err(err) = res {
error!("error while tuning pgbouncer: {err:?}");
}
@@ -842,8 +852,8 @@ impl ComputeNode {
);
// tune pgbouncer
if let Some(pgbouncer_settings) = &pspec.spec.pgbouncer_settings {
info!("tuning pgbouncer");
if let Some(connstr) = &self.pgbouncer_connstr {
info!("tuning pgbouncer with connstr: {:?}", connstr);
let rt = tokio::runtime::Builder::new_current_thread()
.enable_all()
@@ -852,9 +862,15 @@ impl ComputeNode {
// Spawn a thread to do the tuning,
// so that we don't block the main thread that starts Postgres.
let pgbouncer_settings = pgbouncer_settings.clone();
let pgbouncer_settings = pspec.spec.pgbouncer_settings.clone();
let connstr_clone = connstr.clone();
let pgbouncer_ini_path = self.pgbouncer_ini_path.clone();
let _handle = thread::spawn(move || {
let res = rt.block_on(tune_pgbouncer(pgbouncer_settings));
let res = rt.block_on(tune_pgbouncer(
pgbouncer_settings,
&connstr_clone,
pgbouncer_ini_path,
));
if let Err(err) = res {
error!("error while tuning pgbouncer: {err:?}");
}
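
The reconfigure and startup paths above both spawn pgbouncer tuning on a dedicated thread with its own current-thread runtime, so the main thread that starts Postgres is never blocked. A minimal sketch of that pattern (assumes tokio and anyhow; tune() is a stand-in for tune_pgbouncer):

use std::thread;

async fn tune() -> anyhow::Result<()> {
    // stand-in for tune_pgbouncer(settings, connstr, ini_path)
    Ok(())
}

fn spawn_tuning() -> anyhow::Result<()> {
    // Build a small single-threaded runtime and hand it to a background
    // thread; block_on drives the async tuning to completion there.
    let rt = tokio::runtime::Builder::new_current_thread()
        .enable_all()
        .build()?;
    let _handle = thread::spawn(move || {
        if let Err(err) = rt.block_on(tune()) {
            eprintln!("error while tuning pgbouncer: {err:?}");
        }
    });
    Ok(())
}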

View File

@@ -6,14 +6,22 @@ use std::sync::Arc;
use std::thread;
use crate::compute::{ComputeNode, ComputeState, ParsedSpec};
use bytes::Bytes;
use compute_api::requests::ConfigurationRequest;
use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIError};
use anyhow::Result;
use hyper::service::{make_service_fn, service_fn};
use hyper::{Body, Method, Request, Response, Server, StatusCode};
use http_body_util::BodyExt;
use http_body_util::Full;
use hyper::body::Incoming;
use hyper::service::service_fn;
use hyper::{Method, Request, Response, StatusCode};
use hyper_util::rt::TokioExecutor;
use hyper_util::rt::TokioIo;
use hyper_util::server::conn;
use num_cpus;
use serde_json;
use tokio::net::TcpListener;
use tokio::task;
use tracing::{error, info, warn};
use tracing_utils::http::OtelName;
@@ -36,7 +44,7 @@ fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse {
}
// Service function to handle all available routes.
async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body> {
async fn routes(req: Request<Incoming>, compute: &Arc<ComputeNode>) -> Response<Full<Bytes>> {
//
// NOTE: The URI path is currently included in traces. That's OK because
// it doesn't contain any variable parts or sensitive information. But
@@ -48,7 +56,7 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
info!("serving /status GET request");
let state = compute.state.lock().unwrap();
let status_response = status_response_from_state(&state);
Response::new(Body::from(serde_json::to_string(&status_response).unwrap()))
Response::new(Full::from(serde_json::to_string(&status_response).unwrap()))
}
// Startup metrics in JSON format. Keep /metrics reserved for a possible
@@ -56,7 +64,7 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
(&Method::GET, "/metrics.json") => {
info!("serving /metrics.json GET request");
let metrics = compute.state.lock().unwrap().metrics.clone();
Response::new(Body::from(serde_json::to_string(&metrics).unwrap()))
Response::new(Full::from(serde_json::to_string(&metrics).unwrap()))
}
// Collect Postgres current usage insights
@@ -66,11 +74,11 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
if status != ComputeStatus::Running {
let msg = format!("compute is not running, current status: {:?}", status);
error!(msg);
return Response::new(Body::from(msg));
return Response::new(Full::from(msg));
}
let insights = compute.collect_insights().await;
Response::new(Body::from(insights))
Response::new(Full::from(insights))
}
(&Method::POST, "/check_writability") => {
@@ -82,15 +90,15 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
status
);
error!(msg);
return Response::new(Body::from(msg));
return Response::new(Full::from(msg));
}
let res = crate::checker::check_writability(compute).await;
match res {
Ok(_) => Response::new(Body::from("true")),
Ok(_) => Response::new(Full::from("true")),
Err(e) => {
error!("check_writability failed: {}", e);
Response::new(Body::from(e.to_string()))
Response::new(Full::from(e.to_string()))
}
}
}
@@ -98,7 +106,7 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
(&Method::GET, "/info") => {
let num_cpus = num_cpus::get_physical();
info!("serving /info GET request. num_cpus: {}", num_cpus);
Response::new(Body::from(
Response::new(Full::from(
serde_json::json!({
"num_cpus": num_cpus,
})
@@ -115,7 +123,7 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
(&Method::POST, "/configure") => {
info!("serving /configure POST request");
match handle_configure_request(req, compute).await {
Ok(msg) => Response::new(Body::from(msg)),
Ok(msg) => Response::new(Full::from(msg)),
Err((msg, code)) => {
error!("error handling /configure request: {msg}");
render_json_error(&msg, code)
@@ -132,7 +140,7 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
// if no remote storage is configured
if compute.ext_remote_storage.is_none() {
info!("no extensions remote storage configured");
let mut resp = Response::new(Body::from("no remote storage configured"));
let mut resp = Response::new(Full::from("no remote storage configured"));
*resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
return resp;
}
@@ -143,7 +151,7 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
if params == "is_library=true" {
is_library = true;
} else {
let mut resp = Response::new(Body::from("Wrong request parameters"));
let mut resp = Response::new(Full::from("Wrong request parameters"));
*resp.status_mut() = StatusCode::BAD_REQUEST;
return resp;
}
@@ -165,7 +173,7 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
Some(r) => r,
None => {
info!("no remote extensions spec was provided");
let mut resp = Response::new(Body::from("no remote storage configured"));
let mut resp = Response::new(Full::from("no remote storage configured"));
*resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
return resp;
}
@@ -182,10 +190,10 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
match ext {
Ok((ext_name, ext_path)) => {
match compute.download_extension(ext_name, ext_path).await {
Ok(_) => Response::new(Body::from("OK")),
Ok(_) => Response::new(Full::from("OK")),
Err(e) => {
error!("extension download failed: {}", e);
let mut resp = Response::new(Body::from(e.to_string()));
let mut resp = Response::new(Full::from(e.to_string()));
*resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
resp
}
@@ -193,7 +201,7 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
}
Err(e) => {
warn!("extension download failed to find extension: {}", e);
let mut resp = Response::new(Body::from("failed to find file"));
let mut resp = Response::new(Full::from("failed to find file"));
*resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
resp
}
@@ -202,7 +210,7 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
// Return the `404 Not Found` for any other routes.
_ => {
let mut not_found = Response::new(Body::from("404 Not Found"));
let mut not_found = Response::new(Full::from("404 Not Found"));
*not_found.status_mut() = StatusCode::NOT_FOUND;
not_found
}
@@ -210,7 +218,7 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
}
async fn handle_configure_request(
req: Request<Body>,
req: Request<Incoming>,
compute: &Arc<ComputeNode>,
) -> Result<String, (String, StatusCode)> {
if !compute.live_config_allowed {
@@ -220,7 +228,7 @@ async fn handle_configure_request(
));
}
let body_bytes = hyper::body::to_bytes(req.into_body()).await.unwrap();
let body_bytes = req.into_body().collect().await.unwrap().to_bytes();
let spec_raw = String::from_utf8(body_bytes.to_vec()).unwrap();
if let Ok(request) = serde_json::from_str::<ConfigurationRequest>(&spec_raw) {
let spec = request.spec;
@@ -287,13 +295,13 @@ async fn handle_configure_request(
}
}
fn render_json_error(e: &str, status: StatusCode) -> Response<Body> {
fn render_json_error(e: &str, status: StatusCode) -> Response<Full<Bytes>> {
let error = GenericAPIError {
error: e.to_string(),
};
Response::builder()
.status(status)
.body(Body::from(serde_json::to_string(&error).unwrap()))
.body(Full::from(serde_json::to_string(&error).unwrap()))
.unwrap()
}
@@ -304,35 +312,43 @@ async fn serve(port: u16, state: Arc<ComputeNode>) {
// see e.g. https://github.com/rust-lang/rust/pull/34440
let addr = SocketAddr::new(IpAddr::from(Ipv6Addr::UNSPECIFIED), port);
let make_service = make_service_fn(move |_conn| {
let service = service_fn(move |req: Request<Incoming>| {
let state = state.clone();
async move {
Ok::<_, Infallible>(service_fn(move |req: Request<Body>| {
let state = state.clone();
async move {
Ok::<_, Infallible>(
// NOTE: We include the URI path in the string. It
// doesn't contain any variable parts or sensitive
// information in this API.
tracing_utils::http::tracing_handler(
req,
|req| routes(req, &state),
OtelName::UriPath,
)
.await,
)
}
}))
Ok::<_, Infallible>(
// NOTE: We include the URI path in the string. It
// doesn't contain any variable parts or sensitive
// information in this API.
tracing_utils::http::tracing_handler(
req,
|req| routes(req, &state),
OtelName::UriPath,
)
.await,
)
}
});
info!("starting HTTP server on {}", addr);
let server = Server::bind(&addr).serve(make_service);
// Run this server forever
if let Err(e) = server.await {
error!("server error: {}", e);
let listener = TcpListener::bind(addr).await.unwrap();
loop {
let (stream, _) = match listener.accept().await {
Ok(r) => r,
Err(e) => {
error!("server error: {}", e);
return;
}
};
let io = TokioIo::new(stream);
let service = service.clone();
tokio::task::spawn(async move {
let builder = conn::auto::Builder::new(TokioExecutor::new());
let res = builder.serve_connection(io, service).await;
if let Err(err) = res {
println!("Error serving connection: {:?}", err);
}
});
}
}
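
For reference, the hyper 0.14 Server::bind(..).serve(make_service_fn(..)) call above is replaced by an explicit accept loop: hyper 1.0 drops the built-in server, so each connection is wrapped in hyper-util's TokioIo adapter and driven by conn::auto::Builder, which serves either HTTP/1 or HTTP/2. A minimal, self-contained sketch of that pattern (the hello handler and address are illustrative, not part of this diff):

use std::convert::Infallible;
use std::net::SocketAddr;

use bytes::Bytes;
use http_body_util::Full;
use hyper::body::Incoming;
use hyper::service::service_fn;
use hyper::{Request, Response};
use hyper_util::rt::{TokioExecutor, TokioIo};
use hyper_util::server::conn::auto;
use tokio::net::TcpListener;

async fn hello(_req: Request<Incoming>) -> Result<Response<Full<Bytes>>, Infallible> {
    Ok(Response::new(Full::from("hello")))
}

#[tokio::main]
async fn main() -> std::io::Result<()> {
    let addr = SocketAddr::from(([127, 0, 0, 1], 3000));
    let listener = TcpListener::bind(addr).await?;
    loop {
        let (stream, _) = listener.accept().await?;
        // TokioIo bridges tokio's AsyncRead/AsyncWrite to hyper's I/O traits.
        let io = TokioIo::new(stream);
        tokio::spawn(async move {
            // auto::Builder negotiates HTTP/1 or HTTP/2 on each connection.
            let builder = auto::Builder::new(TokioExecutor::new());
            if let Err(err) = builder.serve_connection(io, service_fn(hello)).await {
                eprintln!("error serving connection: {err:?}");
            }
        });
    }
}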

View File

@@ -366,7 +366,7 @@ pub fn create_pgdata(pgdata: &str) -> Result<()> {
}
/// Update pgbouncer.ini with provided options
fn update_pgbouncer_ini(
pub fn update_pgbouncer_ini(
pgbouncer_config: HashMap<String, String>,
pgbouncer_ini_path: &str,
) -> Result<()> {
@@ -375,10 +375,6 @@ fn update_pgbouncer_ini(
for (option_name, value) in pgbouncer_config.iter() {
section.insert(option_name, value);
debug!(
"Updating pgbouncer.ini with new values {}={}",
option_name, value
);
}
conf.write_to_file(pgbouncer_ini_path)?;
@@ -388,79 +384,48 @@ fn update_pgbouncer_ini(
/// Tune pgbouncer.
/// 1. Apply new config using pgbouncer admin console
/// 2. Add new values to pgbouncer.ini to preserve them after restart
pub async fn tune_pgbouncer(pgbouncer_config: HashMap<String, String>) -> Result<()> {
let pgbouncer_connstr = if std::env::var_os("AUTOSCALING").is_some() {
// for VMs use pgbouncer specific way to connect to
// pgbouncer admin console without password
// when pgbouncer is running under the same user.
"host=/tmp port=6432 dbname=pgbouncer user=pgbouncer".to_string()
} else {
// for k8s use normal connection string with password
// to connect to pgbouncer admin console
let mut pgbouncer_connstr =
"host=localhost port=6432 dbname=pgbouncer user=postgres sslmode=disable".to_string();
if let Ok(pass) = std::env::var("PGBOUNCER_PASSWORD") {
pgbouncer_connstr.push_str(format!(" password={}", pass).as_str());
}
pgbouncer_connstr
};
info!(
"Connecting to pgbouncer with connection string: {}",
pgbouncer_connstr
);
// connect to pgbouncer, retrying several times
// because pgbouncer may not be ready yet
let mut retries = 3;
let client = loop {
match tokio_postgres::connect(&pgbouncer_connstr, NoTls).await {
Ok((client, connection)) => {
tokio::spawn(async move {
if let Err(e) = connection.await {
eprintln!("connection error: {}", e);
}
});
break client;
pub async fn tune_pgbouncer(
pgbouncer_settings: Option<HashMap<String, String>>,
pgbouncer_connstr: &str,
pgbouncer_ini_path: Option<String>,
) -> Result<()> {
if let Some(pgbouncer_config) = pgbouncer_settings {
// Apply new config
let connect_result = tokio_postgres::connect(pgbouncer_connstr, NoTls).await;
let (client, connection) = connect_result.unwrap();
tokio::spawn(async move {
if let Err(e) = connection.await {
eprintln!("connection error: {}", e);
}
Err(e) => {
if retries == 0 {
return Err(e.into());
}
error!("Failed to connect to pgbouncer: pgbouncer_connstr {}", e);
retries -= 1;
tokio::time::sleep(Duration::from_secs(1)).await;
}
}
};
});
// Apply new config
for (option_name, value) in pgbouncer_config.iter() {
let query = format!("SET {}={}", option_name, value);
// keep this log line for debugging purposes
info!("Applying pgbouncer setting change: {}", query);
if let Err(err) = client.simple_query(&query).await {
// Don't fail on error, just print it into log
error!(
"Failed to apply pgbouncer setting change: {}, {}",
query, err
for (option_name, value) in pgbouncer_config.iter() {
info!(
"Applying pgbouncer setting change: {} = {}",
option_name, value
);
};
}
let query = format!("SET {} = {}", option_name, value);
// save values to pgbouncer.ini
// so that they are preserved after pgbouncer restart
let pgbouncer_ini_path = if std::env::var_os("AUTOSCALING").is_some() {
// in VMs we use /etc/pgbouncer.ini
"/etc/pgbouncer.ini".to_string()
} else {
// in pods we use /var/db/postgres/pgbouncer/pgbouncer.ini
// this is a shared volume between pgbouncer and postgres containers
// FIXME: fix permissions for this file
"/var/db/postgres/pgbouncer/pgbouncer.ini".to_string()
};
update_pgbouncer_ini(pgbouncer_config, &pgbouncer_ini_path)?;
let result = client.simple_query(&query).await;
info!("Applying pgbouncer setting change: {}", query);
info!("pgbouncer setting change result: {:?}", result);
if let Err(err) = result {
// Don't fail on error, just print it into log
error!(
"Failed to apply pgbouncer setting change: {}, {}",
query, err
);
};
}
// save values to pgbouncer.ini
// so that they are preserved after pgbouncer restart
if let Some(pgbouncer_ini_path) = pgbouncer_ini_path {
update_pgbouncer_ini(pgbouncer_config, &pgbouncer_ini_path)?;
}
}
Ok(())
}

View File

@@ -1,6 +1,5 @@
use crate::{background_process, local_env::LocalEnv};
use camino::Utf8PathBuf;
use hyper::Method;
use pageserver_api::{
models::{ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo},
shard::TenantShardId,
@@ -8,6 +7,7 @@ use pageserver_api::{
use pageserver_client::mgmt_api::ResponseErrorMessageExt;
use postgres_backend::AuthType;
use postgres_connection::parse_host_port;
use reqwest::Method;
use serde::{de::DeserializeOwned, Deserialize, Serialize};
use std::{path::PathBuf, process::Child, str::FromStr};
use tracing::instrument;
@@ -278,7 +278,7 @@ impl AttachmentService {
/// Simple HTTP request wrapper for calling into attachment service
async fn dispatch<RQ, RS>(
&self,
method: hyper::Method,
method: reqwest::Method,
path: String,
body: Option<RQ>,
) -> anyhow::Result<RS>

View File

@@ -15,7 +15,11 @@ aws-sdk-s3.workspace = true
aws-credential-types.workspace = true
bytes.workspace = true
camino.workspace = true
hyper = { workspace = true, features = ["stream"] }
hyper = { workspace = true, features = [] }
hyper-util = { workspace = true, features = [] }
http = { workspace = true, features = [] }
http-body = { workspace = true, features = [] }
http-body-util = { workspace = true, features = [] }
futures.workspace = true
serde.workspace = true
serde_json.workspace = true

View File

@@ -36,7 +36,9 @@ use aws_smithy_types::body::SdkBody;
use aws_smithy_types::byte_stream::ByteStream;
use bytes::Bytes;
use futures::stream::Stream;
use hyper::Body;
use futures_util::TryStreamExt;
use http_body::Frame;
use http_body_util::StreamBody;
use scopeguard::ScopeGuard;
use super::StorageMetadata;
@@ -469,8 +471,8 @@ impl RemoteStorage for S3Bucket {
let started_at = start_measuring_requests(kind);
let body = Body::wrap_stream(from);
let bytes_stream = ByteStream::new(SdkBody::from_body_0_4(body));
let body = StreamBody::new(from.map_ok(Frame::data));
let bytes_stream = ByteStream::new(SdkBody::from_body_1_x(body));
let res = self
.client

View File

@@ -1,7 +1,8 @@
//! Tracing wrapper for Hyper HTTP server
use hyper::body::Body;
use hyper::HeaderMap;
use hyper::{Body, Request, Response};
use hyper::{Request, Response};
use std::future::Future;
use tracing::Instrument;
use tracing_opentelemetry::OpenTelemetrySpanExt;
@@ -35,14 +36,14 @@ pub enum OtelName<'a> {
/// instrumentation libraries at:
/// <https://opentelemetry.io/registry/?language=rust&component=instrumentation>
/// If a Hyper crate appears, consider switching to that.
pub async fn tracing_handler<F, R>(
req: Request<Body>,
pub async fn tracing_handler<B1: Body, B2: Body, F, R>(
req: Request<B1>,
handler: F,
otel_name: OtelName<'_>,
) -> Response<Body>
) -> Response<B2>
where
F: Fn(Request<Body>) -> R,
R: Future<Output = Response<Body>>,
F: Fn(Request<B1>) -> R,
R: Future<Output = Response<B2>>,
{
// Create a tracing span, with context propagated from the incoming
// request if any.

View File

@@ -22,6 +22,7 @@ chrono.workspace = true
heapless.workspace = true
hex = { workspace = true, features = ["serde"] }
hyper = { workspace = true, features = ["full"] }
http-body-util = { workspace = true, features = [] }
fail.workspace = true
futures = { workspace = true}
jsonwebtoken.workspace = true

View File

@@ -4,7 +4,10 @@ use crate::http::{
error::ApiError,
json::{json_request, json_response},
};
use hyper::{Body, Request, Response, StatusCode};
use bytes::Bytes;
use http_body_util::Full;
use hyper::{Request, Response, StatusCode};
use routerify::Body;
use serde::{Deserialize, Serialize};
use tokio_util::sync::CancellationToken;
use tracing::*;
@@ -151,7 +154,7 @@ pub struct FailpointConfig {
pub async fn failpoints_handler(
mut request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
) -> Result<Response<Full<Bytes>>, ApiError> {
if !fail::has_failpoints() {
return Err(ApiError::BadRequest(anyhow::anyhow!(
"Cannot manage failpoints because storage was compiled without failpoints support"

View File

@@ -4,11 +4,11 @@ use anyhow::Context;
use hyper::header::{HeaderName, AUTHORIZATION};
use hyper::http::HeaderValue;
use hyper::Method;
use hyper::{header::CONTENT_TYPE, Body, Request, Response};
use hyper::{header::CONTENT_TYPE, Request, Response};
use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder};
use once_cell::sync::Lazy;
use routerify::ext::RequestExt;
use routerify::{Middleware, RequestInfo, Router, RouterBuilder};
use routerify::{Body, Middleware, RequestInfo, Router, RouterBuilder};
use tracing::{self, debug, info, info_span, warn, Instrument};
use std::future::Future;
@@ -238,7 +238,7 @@ async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body
let (tx, rx) = mpsc::channel(1);
let body = Body::wrap_stream(ReceiverStream::new(rx));
let body = Body::from_stream(ReceiverStream::new(rx));
let mut writer = ChannelWriter::new(128 * 1024, tx);
@@ -284,7 +284,7 @@ async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body
Ok(response)
}
pub fn add_request_id_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
pub fn add_request_id_middleware<B: hyper::body::Body + Send + Sync + 'static>(
) -> Middleware<B, ApiError> {
Middleware::pre(move |req| async move {
let request_id = match req.headers().get(&X_REQUEST_ID_HEADER) {
@@ -317,7 +317,7 @@ async fn add_request_id_header_to_response(
Ok(res)
}
pub fn make_router() -> RouterBuilder<hyper::Body, ApiError> {
pub fn make_router() -> RouterBuilder<routerify::Body, ApiError> {
Router::builder()
.middleware(add_request_id_middleware())
.middleware(Middleware::post_with_info(
@@ -328,11 +328,11 @@ pub fn make_router() -> RouterBuilder<hyper::Body, ApiError> {
}
pub fn attach_openapi_ui(
router_builder: RouterBuilder<hyper::Body, ApiError>,
router_builder: RouterBuilder<routerify::Body, ApiError>,
spec: &'static [u8],
spec_mount_path: &'static str,
ui_mount_path: &'static str,
) -> RouterBuilder<hyper::Body, ApiError> {
) -> RouterBuilder<routerify::Body, ApiError> {
router_builder
.get(spec_mount_path,
move |r| request_span(r, move |_| async move {
@@ -388,7 +388,7 @@ fn parse_token(header_value: &str) -> Result<&str, ApiError> {
Ok(token)
}
pub fn auth_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
pub fn auth_middleware<B: hyper::body::Body + Send + Sync + 'static>(
provide_auth: fn(&Request<Body>) -> Option<&SwappableJwtAuth>,
) -> Middleware<B, ApiError> {
Middleware::pre(move |req| async move {
@@ -423,7 +423,7 @@ pub fn add_response_header_middleware<B>(
value: &str,
) -> anyhow::Result<Middleware<B, ApiError>>
where
B: hyper::body::HttpBody + Send + Sync + 'static,
B: hyper::body::Body + Send + Sync + 'static,
{
let name =
HeaderName::from_str(header).with_context(|| format!("invalid header name: {header}"))?;
@@ -464,7 +464,6 @@ pub fn check_permission_with(
#[cfg(test)]
mod tests {
use super::*;
use futures::future::poll_fn;
use hyper::service::Service;
use routerify::RequestServiceBuilder;
use std::net::{IpAddr, SocketAddr};
@@ -473,16 +472,13 @@ mod tests {
async fn test_request_id_returned() {
let builder = RequestServiceBuilder::new(make_router().build().unwrap()).unwrap();
let remote_addr = SocketAddr::new(IpAddr::from_str("127.0.0.1").unwrap(), 80);
let mut service = builder.build(remote_addr);
if let Err(e) = poll_fn(|ctx| service.poll_ready(ctx)).await {
panic!("request service is not ready: {:?}", e);
}
let service = builder.build(remote_addr);
let mut req: Request<Body> = Request::default();
req.headers_mut()
.append(&X_REQUEST_ID_HEADER, HeaderValue::from_str("42").unwrap());
let resp: Response<hyper::body::Body> = service.call(req).await.unwrap();
let resp: Response<Body> = service.call(req).await.unwrap();
let header_val = resp.headers().get(&X_REQUEST_ID_HEADER).unwrap();
@@ -493,13 +489,10 @@ mod tests {
async fn test_request_id_empty() {
let builder = RequestServiceBuilder::new(make_router().build().unwrap()).unwrap();
let remote_addr = SocketAddr::new(IpAddr::from_str("127.0.0.1").unwrap(), 80);
let mut service = builder.build(remote_addr);
if let Err(e) = poll_fn(|ctx| service.poll_ready(ctx)).await {
panic!("request service is not ready: {:?}", e);
}
let service = builder.build(remote_addr);
let req: Request<Body> = Request::default();
let resp: Response<hyper::body::Body> = service.call(req).await.unwrap();
let resp: Response<Body> = service.call(req).await.unwrap();
let header_val = resp.headers().get(&X_REQUEST_ID_HEADER);

View File

@@ -1,4 +1,5 @@
use hyper::{header, Body, Response, StatusCode};
use hyper::{header, Response, StatusCode};
use routerify::Body;
use serde::{Deserialize, Serialize};
use std::borrow::Cow;
use std::error::Error as StdError;

View File

@@ -1,6 +1,8 @@
use anyhow::Context;
use bytes::Buf;
use hyper::{header, Body, Request, Response, StatusCode};
use anyhow::{anyhow, Context};
use bytes::{Buf, Bytes};
use http_body_util::{BodyExt, Full};
use hyper::{header, Request, Response, StatusCode};
use routerify::Body;
use serde::{Deserialize, Serialize};
use super::error::ApiError;
@@ -18,10 +20,14 @@ pub async fn json_request<T: for<'de> Deserialize<'de>>(
pub async fn json_request_or_empty_body<T: for<'de> Deserialize<'de>>(
request: &mut Request<Body>,
) -> Result<Option<T>, ApiError> {
let body = hyper::body::aggregate(request.body_mut())
let body = request
.body_mut()
.collect()
.await
.map_err(|e| anyhow!(e))
.context("Failed to read request body")
.map_err(ApiError::BadRequest)?;
.map_err(ApiError::BadRequest)?
.aggregate();
if body.remaining() == 0 {
return Ok(None);
}
@@ -35,17 +41,24 @@ pub async fn json_request_or_empty_body<T: for<'de> Deserialize<'de>>(
.map_err(ApiError::BadRequest)
}
pub fn json_response<T: Serialize>(
pub fn json_response_body<T: Serialize>(
status: StatusCode,
data: T,
) -> Result<Response<Body>, ApiError> {
json_response(status, data).map(|r| r.map(Body::new))
}
pub fn json_response<T: Serialize>(
status: StatusCode,
data: T,
) -> Result<Response<Full<Bytes>>, ApiError> {
let json = serde_json::to_string(&data)
.context("Failed to serialize JSON response")
.map_err(ApiError::InternalServerError)?;
let response = Response::builder()
.status(status)
.header(header::CONTENT_TYPE, "application/json")
.body(Body::from(json))
.body(Full::from(json))
.map_err(|e| ApiError::InternalServerError(e.into()))?;
Ok(response)
}
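
The same collect-then-flatten shape replaces hyper::body::aggregate and hyper::body::to_bytes throughout, since hyper 1.0 no longer ships those helpers; body handling lives in http-body-util. A minimal generic sketch of the to_bytes variant (read_json is an illustrative helper, not part of this diff; assumes http-body, http-body-util, serde_json and anyhow):

use http_body_util::BodyExt;
use serde::de::DeserializeOwned;

// Drain any http-body 1.x body fully, then decode the bytes as JSON.
async fn read_json<T, B>(body: B) -> anyhow::Result<T>
where
    T: DeserializeOwned,
    B: http_body::Body,
    B::Error: std::error::Error + Send + Sync + 'static,
{
    let bytes = body.collect().await?.to_bytes();
    Ok(serde_json::from_slice(&bytes)?)
}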

View File

@@ -5,4 +5,4 @@ pub mod request;
/// Current fast way to apply simple http routing in various Neon binaries.
/// Re-exported for sake of uniform approach, that could be later replaced with better alternatives, if needed.
pub use routerify::{ext::RequestExt, RouterBuilder, RouterService};
pub use routerify::{ext::RequestExt, Body, RequestServiceBuilder, RouterBuilder};

View File

@@ -3,8 +3,9 @@ use std::{borrow::Cow, str::FromStr};
use super::error::ApiError;
use anyhow::anyhow;
use hyper::{body::HttpBody, Body, Request};
use routerify::ext::RequestExt;
use http_body_util::BodyExt;
use hyper::Request;
use routerify::{ext::RequestExt, Body};
pub fn get_request_param<'a>(
request: &'a Request<Body>,
@@ -75,7 +76,7 @@ pub fn parse_query_param<E: fmt::Display, T: FromStr<Err = E>>(
}
pub async fn ensure_no_body(request: &mut Request<Body>) -> Result<(), ApiError> {
match request.body_mut().data().await {
match request.body_mut().frame().await {
Some(_) => Err(ApiError::BadRequest(anyhow!("Unexpected request body"))),
None => Ok(()),
}

View File

@@ -108,32 +108,9 @@ pub struct RelTagBlockNo {
}
impl PagestreamClient {
pub async fn shutdown(self) {
let Self {
copy_both,
cancel_on_client_drop: cancel_conn_task,
conn_task,
} = self;
// The `copy_both` contains internal channel sender, the receiver of which is polled by `conn_task`.
// When `conn_task` observes the sender has been dropped, it sends a `FeMessage::CopyFail` into the connection.
// (see https://github.com/neondatabase/rust-postgres/blob/2005bf79573b8add5cf205b52a2b208e356cc8b0/tokio-postgres/src/copy_both.rs#L56).
//
// If we drop(copy_both) first, but then immediately drop the `cancel_on_client_drop`,
// the CopyFail message only makes it to the socket sometimes (i.e., it's a race).
//
// Further, the pageserver makes a lot of noise when it receives CopyFail.
// Computes don't send it in practice, they just hard-close the connection.
//
// So, let's behave like the computes and suppress the CopyFail as follows:
// kill the socket first, then drop copy_both.
//
// See also: https://www.postgresql.org/docs/current/protocol-flow.html#PROTOCOL-COPY
//
// NB: page_service doesn't have a use case to exit the `pagestream` mode currently.
// => https://github.com/neondatabase/neon/issues/6390
let _ = cancel_conn_task.unwrap();
conn_task.await.unwrap();
drop(copy_both);
pub async fn shutdown(mut self) {
let _ = self.cancel_on_client_drop.take();
self.conn_task.await.unwrap();
}
pub async fn getpage(

View File

@@ -404,27 +404,23 @@ async fn client(
.await
.unwrap();
let do_requests = async {
start_work_barrier.wait().await;
while let Some(req) = work.recv().await {
let start = Instant::now();
client
.getpage(req)
.await
.with_context(|| format!("getpage for {timeline}"))
.unwrap();
let elapsed = start.elapsed();
live_stats.inc();
STATS.with(|stats| {
stats.borrow().lock().unwrap().observe(elapsed).unwrap();
});
}
};
tokio::select! {
res = do_requests => { res },
_ = cancel.cancelled() => {
client.shutdown().await;
return;
}
start_work_barrier.wait().await;
while let Some(req) =
tokio::select! { work = work.recv() => { work } , _ = cancel.cancelled() => { return; } }
{
let start = Instant::now();
let res = tokio::select! {
res = client.getpage(req) => { res },
_ = cancel.cancelled() => { return; }
};
res.with_context(|| format!("getpage for {timeline}"))
.unwrap();
let elapsed = start.elapsed();
live_stats.inc();
STATS.with(|stats| {
stats.borrow().lock().unwrap().observe(elapsed).unwrap();
});
}
}
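
The rewrite above trades the single outer tokio::select! for per-await races against the cancellation token, so the client loop can stop promptly both while waiting for new work and in the middle of a request. A stripped-down sketch of that shape (worker and handle are illustrative names; assumes tokio and tokio-util):

use tokio::sync::mpsc;
use tokio_util::sync::CancellationToken;

async fn handle(_req: u32) {
    // stand-in for client.getpage(req)
}

async fn worker(mut work: mpsc::Receiver<u32>, cancel: CancellationToken) {
    // Waiting for the next unit of work races against cancellation...
    while let Some(req) = tokio::select! {
        item = work.recv() => item,
        _ = cancel.cancelled() => return,
    } {
        // ...and so does each request itself.
        tokio::select! {
            _ = handle(req) => {}
            _ = cancel.cancelled() => return,
        }
    }
}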

View File

@@ -35,7 +35,6 @@ fn main() {
logging::Output::Stderr,
)
.unwrap();
logging::replace_panic_hook_with_tracing_panic_hook().forget();
let args = Args::parse();
match args {

View File

@@ -386,56 +386,39 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
// If we get far enough in the list that we start to evict layers that are below
// the tenant's min-resident-size threshold, print a warning, and memorize the disk
// usage at that point, in 'usage_planned_min_resident_size_respecting'.
let mut warned = None;
let mut usage_planned = usage_pre;
let mut evicted_amount = 0;
let selection = select_victims(&candidates, usage_pre);
let mut candidates = candidates;
let selection = if matches!(eviction_order, EvictionOrder::RelativeAccessed { .. }) {
// we currently have the layers ordered by AbsoluteAccessed so that we can get the summary
// for comparison here. this is a temporary measure to develop alternatives.
use std::fmt::Write;
let mut summary_buf = String::with_capacity(256);
{
let absolute_summary = candidates
.iter()
.take(selection.amount)
.map(|(_, candidate)| candidate)
.collect::<summary::EvictionSummary>();
write!(summary_buf, "{absolute_summary}").expect("string grows");
info!("absolute accessed selection summary: {summary_buf}");
for (i, (partition, candidate)) in candidates.iter().enumerate() {
if !usage_planned.has_pressure() {
debug!(
no_candidates_evicted = i,
"took enough candidates for pressure to be relieved"
);
break;
}
candidates.sort_unstable_by_key(|(partition, candidate)| {
(*partition, candidate.relative_last_activity)
});
let selection = select_victims(&candidates, usage_pre);
{
summary_buf.clear();
let relative_summary = candidates
.iter()
.take(selection.amount)
.map(|(_, candidate)| candidate)
.collect::<summary::EvictionSummary>();
write!(summary_buf, "{relative_summary}").expect("string grows");
info!("relative accessed selection summary: {summary_buf}");
if partition == &MinResidentSizePartition::Below && warned.is_none() {
warn!(?usage_pre, ?usage_planned, candidate_no=i, "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy");
warned = Some(usage_planned);
}
selection
} else {
selection
usage_planned.add_available_bytes(candidate.layer.get_file_size());
evicted_amount += 1;
}
let usage_planned = match warned {
Some(respecting_tenant_min_resident_size) => PlannedUsage {
respecting_tenant_min_resident_size,
fallback_to_global_lru: Some(usage_planned),
},
None => PlannedUsage {
respecting_tenant_min_resident_size: usage_planned,
fallback_to_global_lru: None,
},
};
let (evicted_amount, usage_planned) = selection.into_amount_and_planned();
debug!(?usage_planned, "usage planned");
// phase2: evict layers
@@ -927,80 +910,22 @@ async fn collect_eviction_candidates(
debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
"as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first");
// always behave as if AbsoluteAccessed was selected. if RelativeAccessed is in use, we
// will sort later by candidate.relative_last_activity to get compare evictions.
candidates
.sort_unstable_by_key(|(partition, candidate)| (*partition, candidate.last_activity_ts));
match eviction_order {
EvictionOrder::AbsoluteAccessed => {
candidates.sort_unstable_by_key(|(partition, candidate)| {
(*partition, candidate.last_activity_ts)
});
}
EvictionOrder::RelativeAccessed { .. } => {
candidates.sort_unstable_by_key(|(partition, candidate)| {
(*partition, candidate.relative_last_activity)
});
}
}
Ok(EvictionCandidates::Finished(candidates))
}
/// Given a pre-sorted vec of all layers in the system, select the first N which are enough to
/// relieve pressure.
///
/// Returns the amount of candidates selected, with the planned usage.
fn select_victims<U: Usage>(
candidates: &[(MinResidentSizePartition, EvictionCandidate)],
usage_pre: U,
) -> VictimSelection<U> {
let mut usage_when_switched = None;
let mut usage_planned = usage_pre;
let mut evicted_amount = 0;
for (i, (partition, candidate)) in candidates.iter().enumerate() {
if !usage_planned.has_pressure() {
break;
}
if partition == &MinResidentSizePartition::Below && usage_when_switched.is_none() {
usage_when_switched = Some((usage_planned, i));
}
usage_planned.add_available_bytes(candidate.layer.get_file_size());
evicted_amount += 1;
}
VictimSelection {
amount: evicted_amount,
usage_pre,
usage_when_switched,
usage_planned,
}
}
struct VictimSelection<U> {
amount: usize,
usage_pre: U,
usage_when_switched: Option<(U, usize)>,
usage_planned: U,
}
impl<U: Usage> VictimSelection<U> {
fn into_amount_and_planned(self) -> (usize, PlannedUsage<U>) {
debug!(
evicted_amount=%self.amount,
"took enough candidates for pressure to be relieved"
);
if let Some((usage_planned, candidate_no)) = self.usage_when_switched.as_ref() {
warn!(usage_pre=?self.usage_pre, ?usage_planned, candidate_no, "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy");
}
let planned = match self.usage_when_switched {
Some((respecting_tenant_min_resident_size, _)) => PlannedUsage {
respecting_tenant_min_resident_size,
fallback_to_global_lru: Some(self.usage_planned),
},
None => PlannedUsage {
respecting_tenant_min_resident_size: self.usage_planned,
fallback_to_global_lru: None,
},
};
(self.amount, planned)
}
}
struct TimelineKey(Arc<Timeline>);
impl PartialEq for TimelineKey {
@@ -1085,137 +1010,6 @@ pub(crate) mod finite_f32 {
}
}
mod summary {
use super::finite_f32::FiniteF32;
use super::{EvictionCandidate, LayerCount};
use pageserver_api::shard::TenantShardId;
use std::collections::{BTreeMap, HashMap};
use std::time::SystemTime;
#[derive(Debug, Default)]
pub(super) struct EvictionSummary {
evicted_per_tenant: HashMap<TenantShardId, LayerCount>,
total: LayerCount,
last_absolute: Option<SystemTime>,
last_relative: Option<FiniteF32>,
}
impl<'a> FromIterator<&'a EvictionCandidate> for EvictionSummary {
fn from_iter<T: IntoIterator<Item = &'a EvictionCandidate>>(iter: T) -> Self {
let mut summary = EvictionSummary::default();
for item in iter {
let counts = summary
.evicted_per_tenant
.entry(*item.layer.get_tenant_shard_id())
.or_default();
let sz = item.layer.get_file_size();
counts.file_sizes += sz;
counts.count += 1;
summary.total.file_sizes += sz;
summary.total.count += 1;
summary.last_absolute = Some(item.last_activity_ts);
summary.last_relative = Some(item.relative_last_activity);
}
summary
}
}
struct SiBytesAmount(u64);
impl std::fmt::Display for SiBytesAmount {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
if self.0 < 1024 {
return write!(f, "{}B", self.0);
}
let mut tmp = self.0;
let mut ch = 0;
let suffixes = b"KMGTPE";
while tmp > 1024 * 1024 && ch < suffixes.len() - 1 {
tmp /= 1024;
ch += 1;
}
let ch = suffixes[ch] as char;
write!(f, "{:.1}{ch}iB", tmp as f64 / 1024.0)
}
}
impl std::fmt::Display for EvictionSummary {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
// wasteful, but it's for testing
let mut sorted: BTreeMap<usize, Vec<(TenantShardId, u64)>> = BTreeMap::new();
for (tenant_shard_id, count) in &self.evicted_per_tenant {
sorted
.entry(count.count)
.or_default()
.push((*tenant_shard_id, count.file_sizes));
}
let total_file_sizes = SiBytesAmount(self.total.file_sizes);
writeln!(
f,
"selected {} layers of {total_file_sizes} up to ({:?}, {:.2?}):",
self.total.count, self.last_absolute, self.last_relative,
)?;
for (count, per_tenant) in sorted.iter().rev().take(10) {
write!(f, "- {count} layers: ")?;
if per_tenant.len() < 3 {
for (i, (tenant_shard_id, bytes)) in per_tenant.iter().enumerate() {
if i > 0 {
write!(f, ", ")?;
}
let bytes = SiBytesAmount(*bytes);
write!(f, "{tenant_shard_id} ({bytes})")?;
}
} else {
let num_tenants = per_tenant.len();
let total_bytes = per_tenant.iter().map(|(_id, bytes)| bytes).sum::<u64>();
let total_bytes = SiBytesAmount(total_bytes);
let layers = num_tenants * count;
write!(
f,
"{num_tenants} tenants {total_bytes} in total {layers} layers",
)?;
}
writeln!(f)?;
}
if sorted.len() > 10 {
let (rem_count, rem_bytes) = sorted
.iter()
.rev()
.map(|(count, per_tenant)| {
(
count,
per_tenant.iter().map(|(_id, bytes)| bytes).sum::<u64>(),
)
})
.fold((0, 0), |acc, next| (acc.0 + next.0, acc.1 + next.1));
let rem_bytes = SiBytesAmount(rem_bytes);
writeln!(f, "- rest of tenants ({}) not shown ({rem_count} layers or {:.1}%, {rem_bytes} or {:.1}% bytes)", sorted.len() - 10, 100.0 * rem_count as f64 / self.total.count as f64, 100.0 * rem_bytes.0 as f64 / self.total.file_sizes as f64)?;
}
Ok(())
}
}
}
mod filesystem_level_usage {
use anyhow::Context;
use camino::Utf8Path;

View File

@@ -12,7 +12,7 @@ use futures::TryFutureExt;
use humantime::format_rfc3339;
use hyper::header;
use hyper::StatusCode;
use hyper::{Body, Request, Response, Uri};
use hyper::{Request, Response, Uri};
use metrics::launch_timestamp::LaunchTimestamp;
use pageserver_api::models::LocationConfigListResponse;
use pageserver_api::models::ShardParameters;
@@ -32,6 +32,7 @@ use utils::failpoint_support::failpoints_handler;
use utils::http::endpoint::request_span;
use utils::http::json::json_request_or_empty_body;
use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
use utils::http::Body;
use crate::context::{DownloadBehavior, RequestContext};
use crate::deletion_queue::DeletionQueueClient;
@@ -64,7 +65,7 @@ use utils::{
http::{
endpoint::{self, attach_openapi_ui, auth_middleware, check_permission_with},
error::{ApiError, HttpErrorBody},
json::{json_request, json_response},
json::{json_request, json_response_body as json_response},
request::parse_request_param,
RequestExt, RouterBuilder,
},
@@ -1571,7 +1572,7 @@ async fn getpage_at_lsn_handler(
Response::builder()
.status(StatusCode::OK)
.header(header::CONTENT_TYPE, "application/octet-stream")
.body(hyper::Body::from(page))
.body(Body::from(page))
.unwrap(),
)
}
@@ -1868,7 +1869,7 @@ pub fn make_router(
state: Arc<State>,
launch_ts: &'static LaunchTimestamp,
auth: Option<Arc<SwappableJwtAuth>>,
) -> anyhow::Result<RouterBuilder<hyper::Body, ApiError>> {
) -> anyhow::Result<RouterBuilder<Body, ApiError>> {
let spec = include_bytes!("openapi_spec.yml");
let mut router = attach_openapi_ui(endpoint::make_router(), spec, "/swagger.yml", "/v1/doc");
if auth.is_some() {

View File

@@ -384,17 +384,11 @@ impl PageServerHandler {
}
}
/// Future that completes when we need to shut down the connection.
/// Analogous to calling cancelled() on a Timeline's cancellation token: waits for cancellation.
///
/// Reasons for need to shut down are:
/// - any of the timelines we hold GateGuards for in `shard_timelines` is cancelled
/// - task_mgr requests shutdown of the connection
///
/// The need to check for `task_mgr` cancellation arises mainly from `handle_pagerequests`
/// where, at first, `shard_timelines` is empty, see <https://github.com/neondatabase/neon/pull/6388>
///
/// NB: keep in sync with [`Self::is_connection_cancelled`]
async fn await_connection_cancelled(&self) {
/// We use many Timeline objects, and hold GateGuards on all of them. We must therefore respect
/// all of their cancellation tokens.
async fn timeline_cancelled(&self) {
// A short wait before we expend the cycles to walk our timeline map. This avoids incurring
// that cost every time we check for cancellation.
tokio::time::sleep(Duration::from_millis(10)).await;
@@ -410,19 +404,14 @@ impl PageServerHandler {
.map(|ht| ht.timeline.cancel.cancelled())
.collect::<FuturesUnordered<_>>();
tokio::select! {
_ = task_mgr::shutdown_watcher() => { }
_ = futs.next() => {}
}
futs.next().await;
}
/// Checking variant of [`Self::await_connection_cancelled`].
fn is_connection_cancelled(&self) -> bool {
task_mgr::is_shutdown_requested()
|| self
.shard_timelines
.values()
.any(|ht| ht.timeline.cancel.is_cancelled() || ht.timeline.is_stopping())
/// Analogous to calling is_cancelled() on a Timeline's cancellation token
fn timeline_is_cancelled(&self) -> bool {
self.shard_timelines
.values()
.any(|ht| ht.timeline.cancel.is_cancelled() || ht.timeline.is_stopping())
}
/// This function always respects cancellation of any timeline in `[Self::shard_timelines]`. Pass in
@@ -443,7 +432,7 @@ impl PageServerHandler {
flush_r = pgb.flush() => {
Ok(flush_r?)
},
_ = self.await_connection_cancelled() => {
_ = self.timeline_cancelled() => {
Err(QueryError::Shutdown)
}
_ = cancel.cancelled() => {
@@ -560,7 +549,7 @@ impl PageServerHandler {
let msg = tokio::select! {
biased;
_ = self.await_connection_cancelled() => {
_ = self.timeline_cancelled() => {
// We were requested to shut down.
info!("shutdown request received in page handler");
return Err(QueryError::Shutdown)
@@ -643,7 +632,7 @@ impl PageServerHandler {
span.in_scope(|| info!("handler requested reconnect: {reason}"));
return Err(QueryError::Reconnect);
}
Err(e) if self.is_connection_cancelled() => {
Err(e) if self.timeline_is_cancelled() => {
// This branch accommodates code within request handlers that returns an anyhow::Error instead of a clean
// shutdown error, this may be buried inside a PageReconstructError::Other for example.
//

View File

@@ -308,13 +308,13 @@ lfc_change_limit_hook(int newval, void *extra)
Assert(victim->access_count == 0);
#ifdef FALLOC_FL_PUNCH_HOLE
if (fallocate(lfc_desc, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, (off_t) victim->offset * BLOCKS_PER_CHUNK * BLCKSZ, BLOCKS_PER_CHUNK * BLCKSZ) < 0)
neon_log(LOG, "Failed to punch hole in file: %m");
elog(LOG, "Failed to punch hole in file: %m");
#endif
hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
lfc_ctl->used -= 1;
}
lfc_ctl->limit = new_size;
neon_log(DEBUG1, "set local file cache limit to %d", new_size);
elog(DEBUG1, "set local file cache limit to %d", new_size);
LWLockRelease(lfc_lock);
}
@@ -327,7 +327,7 @@ lfc_init(void)
* shared_preload_libraries.
*/
if (!process_shared_preload_libraries_in_progress)
neon_log(ERROR, "Neon module should be loaded via shared_preload_libraries");
elog(ERROR, "Neon module should be loaded via shared_preload_libraries");
DefineCustomIntVariable("neon.max_file_cache_size",
@@ -643,7 +643,7 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void
Assert(victim->access_count == 0);
entry->offset = victim->offset; /* grab victim's chunk */
hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
neon_log(DEBUG2, "Swap file cache page");
elog(DEBUG2, "Swap file cache page");
}
else
{
@@ -846,10 +846,10 @@ local_cache_pages(PG_FUNCTION_ARGS)
* wrong) function definition though.
*/
if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
neon_log(ERROR, "return type must be a row type");
elog(ERROR, "return type must be a row type");
if (expected_tupledesc->natts != NUM_LOCALCACHE_PAGES_ELEM)
neon_log(ERROR, "incorrect number of output arguments");
elog(ERROR, "incorrect number of output arguments");
/* Construct a tuple descriptor for the result rows. */
tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts);

View File

@@ -990,7 +990,7 @@ nm_pack_request(NeonRequest *msg)
case T_NeonErrorResponse:
case T_NeonDbSizeResponse:
default:
neon_log(ERROR, "unexpected neon message tag 0x%02x", msg->tag);
elog(ERROR, "unexpected neon message tag 0x%02x", msg->tag);
break;
}
return s;
@@ -1085,7 +1085,7 @@ nm_unpack_response(StringInfo s)
case T_NeonGetPageRequest:
case T_NeonDbSizeRequest:
default:
neon_log(ERROR, "unexpected neon message tag 0x%02x", tag);
elog(ERROR, "unexpected neon message tag 0x%02x", tag);
break;
}
@@ -1277,7 +1277,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
XLogFlush(recptr);
lsn = recptr;
ereport(SmgrTrace,
(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u was force logged. Evicted at lsn=%X/%X",
(errmsg("Page %u of relation %u/%u/%u.%u was force logged. Evicted at lsn=%X/%X",
blocknum,
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum, LSN_FORMAT_ARGS(lsn))));
@@ -1305,7 +1305,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
if (PageIsNew((Page) buffer))
{
ereport(SmgrTrace,
(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is all-zeros",
(errmsg("Page %u of relation %u/%u/%u.%u is all-zeros",
blocknum,
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum)));
@@ -1313,7 +1313,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
else if (PageIsEmptyHeapPage((Page) buffer))
{
ereport(SmgrTrace,
(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is an empty heap page with no LSN",
(errmsg("Page %u of relation %u/%u/%u.%u is an empty heap page with no LSN",
blocknum,
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum)));
@@ -1321,7 +1321,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
else
{
ereport(PANIC,
(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is evicted with zero LSN",
(errmsg("Page %u of relation %u/%u/%u.%u is evicted with zero LSN",
blocknum,
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum)));
@@ -1330,7 +1330,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
else
{
ereport(SmgrTrace,
(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is already wal logged at lsn=%X/%X",
(errmsg("Page %u of relation %u/%u/%u.%u is already wal logged at lsn=%X/%X",
blocknum,
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum, LSN_FORMAT_ARGS(lsn))));
@@ -1430,7 +1430,7 @@ neon_get_request_lsn(bool *latest, NRelFileInfo rinfo, ForkNumber forknum, Block
lsn = GetLastWrittenLSN(rinfo, forknum, blkno);
lsn = nm_adjust_lsn(lsn);
neon_log(DEBUG1, "neon_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ",
elog(DEBUG1, "neon_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ",
(uint32) ((lsn) >> 32), (uint32) (lsn));
}
else
@@ -1445,7 +1445,7 @@ neon_get_request_lsn(bool *latest, NRelFileInfo rinfo, ForkNumber forknum, Block
*latest = true;
lsn = GetLastWrittenLSN(rinfo, forknum, blkno);
Assert(lsn != InvalidXLogRecPtr);
neon_log(DEBUG1, "neon_get_request_lsn GetLastWrittenLSN lsn %X/%X ",
elog(DEBUG1, "neon_get_request_lsn GetLastWrittenLSN lsn %X/%X ",
(uint32) ((lsn) >> 32), (uint32) (lsn));
lsn = nm_adjust_lsn(lsn);
@@ -1465,7 +1465,7 @@ neon_get_request_lsn(bool *latest, NRelFileInfo rinfo, ForkNumber forknum, Block
#endif
if (lsn > flushlsn)
{
neon_log(DEBUG5, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X",
elog(DEBUG5, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X",
(uint32) (lsn >> 32), (uint32) lsn,
(uint32) (flushlsn >> 32), (uint32) flushlsn);
XLogFlush(lsn);
@@ -1509,7 +1509,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
return mdexists(reln, forkNum);
default:
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
if (get_cached_relsize(InfoFromSMgrRel(reln), forkNum, &n_blocks))
@@ -1561,7 +1561,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
case T_NeonErrorResponse:
ereport(ERROR,
(errcode(ERRCODE_IO_ERROR),
errmsg(NEON_TAG "could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X",
errmsg("could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X",
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forkNum,
(uint32) (request_lsn >> 32), (uint32) request_lsn),
@@ -1570,7 +1570,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
break;
default:
neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
}
pfree(resp);
return exists;
@@ -1587,7 +1587,7 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
switch (reln->smgr_relpersistence)
{
case 0:
neon_log(ERROR, "cannot call smgrcreate() on rel with unknown persistence");
elog(ERROR, "cannot call smgrcreate() on rel with unknown persistence");
case RELPERSISTENCE_PERMANENT:
break;
@@ -1598,10 +1598,10 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
return;
default:
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
neon_log(SmgrTrace, "Create relation %u/%u/%u.%u",
elog(SmgrTrace, "Create relation %u/%u/%u.%u",
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forkNum);
@@ -1696,7 +1696,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
switch (reln->smgr_relpersistence)
{
case 0:
neon_log(ERROR, "cannot call smgrextend() on rel with unknown persistence");
elog(ERROR, "cannot call smgrextend() on rel with unknown persistence");
case RELPERSISTENCE_PERMANENT:
break;
@@ -1707,7 +1707,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
return;
default:
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
/*
@@ -1745,7 +1745,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blkno + 1);
lsn = PageGetLSN((Page) buffer);
neon_log(SmgrTrace, "smgrextend called for %u/%u/%u.%u blk %u, page LSN: %X/%08X",
elog(SmgrTrace, "smgrextend called for %u/%u/%u.%u blk %u, page LSN: %X/%08X",
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forkNum, blkno,
(uint32) (lsn >> 32), (uint32) lsn);
@@ -1785,7 +1785,7 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
switch (reln->smgr_relpersistence)
{
case 0:
neon_log(ERROR, "cannot call smgrextend() on rel with unknown persistence");
elog(ERROR, "cannot call smgrextend() on rel with unknown persistence");
case RELPERSISTENCE_PERMANENT:
break;
@@ -1796,7 +1796,7 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
return;
default:
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
if (max_cluster_size > 0 &&
@@ -1808,7 +1808,7 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
if (current_size >= ((uint64) max_cluster_size) * 1024 * 1024)
ereport(ERROR,
(errcode(ERRCODE_DISK_FULL),
errmsg("could not extend file because project size limit (%d MB) has been exceeded",
errmsg("could not extend file because cluster size limit (%d MB) has been exceeded",
max_cluster_size),
errhint("This limit is defined by neon.max_cluster_size GUC")));
}
@@ -1821,7 +1821,7 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
if ((uint64) blocknum + nblocks >= (uint64) InvalidBlockNumber)
ereport(ERROR,
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
errmsg(NEON_TAG "cannot extend file \"%s\" beyond %u blocks",
errmsg("cannot extend file \"%s\" beyond %u blocks",
relpath(reln->smgr_rlocator, forkNum),
InvalidBlockNumber)));
@@ -1882,7 +1882,7 @@ neon_open(SMgrRelation reln)
mdopen(reln);
/* no work */
neon_log(SmgrTrace, "open noop");
elog(SmgrTrace, "[NEON_SMGR] open noop");
}
/*
@@ -1919,7 +1919,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
return mdprefetch(reln, forknum, blocknum);
default:
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
if (lfc_cache_contains(InfoFromSMgrRel(reln), forknum, blocknum))
@@ -1964,11 +1964,11 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum,
return;
default:
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
/* not implemented */
neon_log(SmgrTrace, "writeback noop");
elog(SmgrTrace, "[NEON_SMGR] writeback noop");
#ifdef DEBUG_COMPARE_LOCAL
if (IS_LOCAL_REL(reln))
@@ -2098,7 +2098,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
case T_NeonErrorResponse:
ereport(ERROR,
(errcode(ERRCODE_IO_ERROR),
errmsg(NEON_TAG "could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
errmsg("could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
blkno,
RelFileInfoFmt(rinfo),
forkNum,
@@ -2107,7 +2107,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
((NeonErrorResponse *) resp)->message)));
break;
default:
neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
}
/* buffer was used, clean up for later reuse */
@@ -2131,7 +2131,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
switch (reln->smgr_relpersistence)
{
case 0:
neon_log(ERROR, "cannot call smgrread() on rel with unknown persistence");
elog(ERROR, "cannot call smgrread() on rel with unknown persistence");
case RELPERSISTENCE_PERMANENT:
break;
@@ -2142,7 +2142,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
return;
default:
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
/* Try to read from local file cache */
@@ -2170,7 +2170,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
{
if (!PageIsNew((Page) pageserver_masked))
{
neon_log(PANIC, "page is new in MD but not in Page Server at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n",
elog(PANIC, "page is new in MD but not in Page Server at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n",
blkno,
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forkNum,
@@ -2180,7 +2180,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
}
else if (PageIsNew((Page) buffer))
{
neon_log(PANIC, "page is new in Page Server but not in MD at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n",
elog(PANIC, "page is new in Page Server but not in MD at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n",
blkno,
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forkNum,
@@ -2195,7 +2195,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0)
{
neon_log(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n",
elog(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n",
blkno,
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forkNum,
@@ -2214,7 +2214,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0)
{
neon_log(PANIC, "btree buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n",
elog(PANIC, "btree buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n",
blkno,
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forkNum,
@@ -2294,13 +2294,13 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
return;
default:
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
neon_wallog_page(reln, forknum, blocknum, buffer, false);
lsn = PageGetLSN((Page) buffer);
neon_log(SmgrTrace, "smgrwrite called for %u/%u/%u.%u blk %u, page LSN: %X/%08X",
elog(SmgrTrace, "smgrwrite called for %u/%u/%u.%u blk %u, page LSN: %X/%08X",
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum, blocknum,
(uint32) (lsn >> 32), (uint32) lsn);
@@ -2327,7 +2327,7 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
switch (reln->smgr_relpersistence)
{
case 0:
neon_log(ERROR, "cannot call smgrnblocks() on rel with unknown persistence");
elog(ERROR, "cannot call smgrnblocks() on rel with unknown persistence");
break;
case RELPERSISTENCE_PERMANENT:
@@ -2338,12 +2338,12 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
return mdnblocks(reln, forknum);
default:
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
if (get_cached_relsize(InfoFromSMgrRel(reln), forknum, &n_blocks))
{
neon_log(SmgrTrace, "cached nblocks for %u/%u/%u.%u: %u blocks",
elog(SmgrTrace, "cached nblocks for %u/%u/%u.%u: %u blocks",
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum, n_blocks);
return n_blocks;
@@ -2371,7 +2371,7 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
case T_NeonErrorResponse:
ereport(ERROR,
(errcode(ERRCODE_IO_ERROR),
errmsg(NEON_TAG "could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X",
errmsg("could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X",
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum,
(uint32) (request_lsn >> 32), (uint32) request_lsn),
@@ -2380,11 +2380,11 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
break;
default:
neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
}
update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks);
neon_log(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks",
elog(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks",
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum,
(uint32) (request_lsn >> 32), (uint32) request_lsn,
@@ -2427,7 +2427,7 @@ neon_dbsize(Oid dbNode)
case T_NeonErrorResponse:
ereport(ERROR,
(errcode(ERRCODE_IO_ERROR),
errmsg(NEON_TAG "could not read db size of db %u from page server at lsn %X/%08X",
errmsg("could not read db size of db %u from page server at lsn %X/%08X",
dbNode,
(uint32) (request_lsn >> 32), (uint32) request_lsn),
errdetail("page server returned error: %s",
@@ -2435,10 +2435,10 @@ neon_dbsize(Oid dbNode)
break;
default:
neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
}
neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes",
elog(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes",
dbNode,
(uint32) (request_lsn >> 32), (uint32) request_lsn,
db_size);
@@ -2458,7 +2458,7 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
switch (reln->smgr_relpersistence)
{
case 0:
neon_log(ERROR, "cannot call smgrtruncate() on rel with unknown persistence");
elog(ERROR, "cannot call smgrtruncate() on rel with unknown persistence");
break;
case RELPERSISTENCE_PERMANENT:
@@ -2470,7 +2470,7 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
return;
default:
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
set_cached_relsize(InfoFromSMgrRel(reln), forknum, nblocks);
@@ -2526,7 +2526,7 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum)
switch (reln->smgr_relpersistence)
{
case 0:
neon_log(ERROR, "cannot call smgrimmedsync() on rel with unknown persistence");
elog(ERROR, "cannot call smgrimmedsync() on rel with unknown persistence");
break;
case RELPERSISTENCE_PERMANENT:
@@ -2538,10 +2538,10 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum)
return;
default:
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
neon_log(SmgrTrace, "[NEON_SMGR] immedsync noop");
elog(SmgrTrace, "[NEON_SMGR] immedsync noop");
#ifdef DEBUG_COMPARE_LOCAL
if (IS_LOCAL_REL(reln))
@@ -2566,17 +2566,17 @@ neon_start_unlogged_build(SMgrRelation reln)
* progress at a time. That's enough for the current usage.
*/
if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS)
neon_log(ERROR, "unlogged relation build is already in progress");
elog(ERROR, "unlogged relation build is already in progress");
Assert(unlogged_build_rel == NULL);
ereport(SmgrTrace,
(errmsg(NEON_TAG "starting unlogged build of relation %u/%u/%u",
(errmsg("starting unlogged build of relation %u/%u/%u",
RelFileInfoFmt(InfoFromSMgrRel(reln)))));
switch (reln->smgr_relpersistence)
{
case 0:
neon_log(ERROR, "cannot call smgr_start_unlogged_build() on rel with unknown persistence");
elog(ERROR, "cannot call smgr_start_unlogged_build() on rel with unknown persistence");
break;
case RELPERSISTENCE_PERMANENT:
@@ -2589,11 +2589,11 @@ neon_start_unlogged_build(SMgrRelation reln)
return;
default:
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
if (smgrnblocks(reln, MAIN_FORKNUM) != 0)
neon_log(ERROR, "cannot perform unlogged index build, index is not empty ");
elog(ERROR, "cannot perform unlogged index build, index is not empty ");
unlogged_build_rel = reln;
unlogged_build_phase = UNLOGGED_BUILD_PHASE_1;
@@ -2620,7 +2620,7 @@ neon_finish_unlogged_build_phase_1(SMgrRelation reln)
Assert(unlogged_build_rel == reln);
ereport(SmgrTrace,
(errmsg(NEON_TAG "finishing phase 1 of unlogged build of relation %u/%u/%u",
(errmsg("finishing phase 1 of unlogged build of relation %u/%u/%u",
RelFileInfoFmt(InfoFromSMgrRel(reln)))));
if (unlogged_build_phase == UNLOGGED_BUILD_NOT_PERMANENT)
@@ -2649,7 +2649,7 @@ neon_end_unlogged_build(SMgrRelation reln)
Assert(unlogged_build_rel == reln);
ereport(SmgrTrace,
(errmsg(NEON_TAG "ending unlogged build of relation %u/%u/%u",
(errmsg("ending unlogged build of relation %u/%u/%u",
RelFileInfoFmt(InfoFromNInfoB(rinfob)))));
if (unlogged_build_phase != UNLOGGED_BUILD_NOT_PERMANENT)
@@ -2664,7 +2664,7 @@ neon_end_unlogged_build(SMgrRelation reln)
rinfob = InfoBFromSMgrRel(reln);
for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
{
neon_log(SmgrTrace, "forgetting cached relsize for %u/%u/%u.%u",
elog(SmgrTrace, "forgetting cached relsize for %u/%u/%u.%u",
RelFileInfoFmt(InfoFromNInfoB(rinfob)),
forknum);
@@ -2707,7 +2707,7 @@ AtEOXact_neon(XactEvent event, void *arg)
unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
ereport(ERROR,
(errcode(ERRCODE_INTERNAL_ERROR),
(errmsg(NEON_TAG "unlogged index build was not properly finished"))));
(errmsg("unlogged index build was not properly finished"))));
}
break;
}
@@ -2806,14 +2806,14 @@ neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
set_cached_relsize(rinfo, forknum, relsize);
SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum);
neon_log(SmgrTrace, "Set length to %d", relsize);
elog(SmgrTrace, "Set length to %d", relsize);
}
}
#define FSM_TREE_DEPTH ((SlotsPerFSMPage >= 1626) ? 3 : 4)
/*
* TODO: May be it is better to make correspondent function from freespace.c public?
* TODO: May be it is better to make correspondent fgunctio from freespace.c public?
*/
static BlockNumber
get_fsm_physical_block(BlockNumber heapblk)
@@ -2894,7 +2894,7 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
#if PG_VERSION_NUM < 150000
if (!XLogRecGetBlockTag(record, block_id, &rinfo, &forknum, &blkno))
neon_log(PANIC, "failed to locate backup block with ID %d", block_id);
elog(PANIC, "failed to locate backup block with ID %d", block_id);
#else
XLogRecGetBlockTag(record, block_id, &rinfo, &forknum, &blkno);
#endif

View File

@@ -959,8 +959,8 @@ DetermineEpochStartLsn(WalProposer *wp)
}
/*
* If propEpochStartLsn is 0, it means flushLsn is 0 everywhere, we are bootstrapping
* and nothing was committed yet. Start streaming then from the basebackup LSN.
* If propEpochStartLsn is 0 everywhere, we are bootstrapping -- nothing
* was committed yet. Start streaming then from the basebackup LSN.
*/
if (wp->propEpochStartLsn == InvalidXLogRecPtr && !wp->config->syncSafekeepers)
{
@@ -973,13 +973,12 @@ DetermineEpochStartLsn(WalProposer *wp)
}
/*
* Safekeepers are setting truncateLsn after timelineStartLsn is known, so it
* should never be zero at this point, if we know timelineStartLsn.
*
* timelineStartLsn can be zero only on the first syncSafekeepers run.
* If propEpochStartLsn is not 0, at least one msg with WAL was sent to
* some connected safekeeper; it must have carried truncateLsn pointing to
* the first record.
*/
Assert((wp->truncateLsn != InvalidXLogRecPtr) ||
(wp->config->syncSafekeepers && wp->truncateLsn == wp->timelineStartLsn));
(wp->config->syncSafekeepers && wp->truncateLsn == wp->propEpochStartLsn));
/*
* We will be generating WAL since propEpochStartLsn, so we should set

View File

@@ -28,7 +28,11 @@ hmac.workspace = true
hostname.workspace = true
humantime.workspace = true
hyper-tungstenite.workspace = true
hyper.workspace = true
hyper = { workspace = true, features = ["server"] }
hyper-util = { workspace = true, features = ["tokio", "server", "server-auto"] }
http = { workspace = true, features = [] }
http-body = { workspace = true, features = [] }
http-body-util = { workspace = true, features = [] }
ipnet.workspace = true
itertools.workspace = true
md5.workspace = true
@@ -89,4 +93,3 @@ camino-tempfile.workspace = true
rcgen.workspace = true
rstest.workspace = true
tokio-postgres-rustls.workspace = true
walkdir.workspace = true

View File

@@ -32,7 +32,6 @@ pub struct RequestMonitoring {
user: Option<SmolStr>,
application: Option<SmolStr>,
error_kind: Option<ErrorKind>,
success: bool,
// extra
// This sender is here to keep the request monitoring channel open while requests are taking place.
@@ -60,7 +59,6 @@ impl RequestMonitoring {
user: None,
application: None,
error_kind: None,
success: false,
sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()),
latency_timer: LatencyTimer::new(protocol),
@@ -98,10 +96,6 @@ impl RequestMonitoring {
self.user = Some(user);
}
pub fn set_success(&mut self) {
self.success = true;
}
pub fn log(&mut self) {
if let Some(tx) = self.sender.take() {
let _: Result<(), _> = tx.send(self.clone());

View File

@@ -1,8 +1,7 @@
use std::{sync::Arc, time::SystemTime};
use std::sync::Arc;
use anyhow::Context;
use bytes::BytesMut;
use chrono::{Datelike, Timelike};
use futures::{Stream, StreamExt};
use parquet::{
basic::Compression,
@@ -87,12 +86,6 @@ struct RequestData {
project: Option<String>,
branch: Option<String>,
error: Option<&'static str>,
/// Success is counted if we form a HTTP response with sql rows inside
/// Or if we make it to proxy_pass
success: bool,
/// Tracks time from session start (HTTP request/libpq TCP handshake)
/// Through to success/failure
duration_us: u64,
}
impl From<RequestMonitoring> for RequestData {
@@ -109,11 +102,6 @@ impl From<RequestMonitoring> for RequestData {
protocol: value.protocol,
region: value.region,
error: value.error_kind.as_ref().map(|e| e.to_str()),
success: value.success,
duration_us: SystemTime::from(value.first_packet)
.elapsed()
.unwrap_or_default()
.as_micros() as u64, // 584 millenia... good enough
}
}
}
@@ -278,13 +266,7 @@ async fn upload_parquet(
let compression = len as f64 / len_uncompressed as f64;
let size = data.len();
let now = chrono::Utc::now();
let id = uuid::Uuid::new_v7(uuid::Timestamp::from_unix(
uuid::NoContext,
// we won't be running this in 1970. this cast is ok
now.timestamp() as u64,
now.timestamp_subsec_nanos(),
));
let id = uuid::Uuid::now_v7();
info!(
%id,
@@ -292,14 +274,7 @@ async fn upload_parquet(
size, compression, "uploading request parquet file"
);
let year = now.year();
let month = now.month();
let day = now.day();
let hour = now.hour();
// segment files by time for S3 performance
let path = RemotePath::from_string(&format!(
"{year:04}/{month:02}/{day:02}/{hour:02}/requests_{id}.parquet"
))?;
let path = RemotePath::from_string(&format!("requests_{id}.parquet"))?;
backoff::retry(
|| async {
let stream = futures::stream::once(futures::future::ready(Ok(data.clone())));
@@ -357,7 +332,6 @@ mod tests {
DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT,
};
use tokio::{sync::mpsc, time};
use walkdir::WalkDir;
use super::{worker_inner, ParquetConfig, ParquetUploadArgs, RequestData};
@@ -446,8 +420,6 @@ mod tests {
protocol: ["tcp", "ws", "http"][rng.gen_range(0..3)],
region: "us-east-1",
error: None,
success: rng.gen(),
duration_us: rng.gen_range(0..30_000_000),
}
}
@@ -470,11 +442,9 @@ mod tests {
worker_inner(storage, rx, config).await.unwrap();
let mut files = WalkDir::new(tmpdir.as_std_path())
.into_iter()
.filter_map(|entry| entry.ok())
.filter(|entry| entry.file_type().is_file())
.map(|entry| entry.path().to_path_buf())
let mut files = std::fs::read_dir(tmpdir.as_std_path())
.unwrap()
.map(|entry| entry.unwrap().path())
.collect_vec();
files.sort();
@@ -515,15 +485,15 @@ mod tests {
assert_eq!(
file_stats,
[
(1087635, 3, 6000),
(1087288, 3, 6000),
(1087444, 3, 6000),
(1087572, 3, 6000),
(1087468, 3, 6000),
(1087500, 3, 6000),
(1087533, 3, 6000),
(1087566, 3, 6000),
(362671, 1, 2000)
(1029153, 3, 6000),
(1029075, 3, 6000),
(1029216, 3, 6000),
(1029129, 3, 6000),
(1029250, 3, 6000),
(1029017, 3, 6000),
(1029175, 3, 6000),
(1029247, 3, 6000),
(343124, 1, 2000)
],
);
@@ -553,11 +523,11 @@ mod tests {
assert_eq!(
file_stats,
[
(1028637, 5, 10000),
(1031969, 5, 10000),
(1019900, 5, 10000),
(1020365, 5, 10000),
(1025010, 5, 10000)
(1166201, 6, 12000),
(1163577, 6, 12000),
(1164641, 6, 12000),
(1168772, 6, 12000),
(196761, 1, 2000)
],
);
@@ -589,11 +559,11 @@ mod tests {
assert_eq!(
file_stats,
[
(1210770, 6, 12000),
(1211036, 6, 12000),
(1210990, 6, 12000),
(1210861, 6, 12000),
(202073, 1, 2000)
(1144934, 6, 12000),
(1144941, 6, 12000),
(1144735, 6, 12000),
(1144936, 6, 12000),
(191035, 1, 2000)
],
);
@@ -618,15 +588,15 @@ mod tests {
assert_eq!(
file_stats,
[
(1087635, 3, 6000),
(1087288, 3, 6000),
(1087444, 3, 6000),
(1087572, 3, 6000),
(1087468, 3, 6000),
(1087500, 3, 6000),
(1087533, 3, 6000),
(1087566, 3, 6000),
(362671, 1, 2000)
(1029153, 3, 6000),
(1029075, 3, 6000),
(1029216, 3, 6000),
(1029129, 3, 6000),
(1029250, 3, 6000),
(1029017, 3, 6000),
(1029175, 3, 6000),
(1029247, 3, 6000),
(343124, 1, 2000)
],
);
@@ -663,7 +633,7 @@ mod tests {
// files are smaller than the size threshold, but they took too long to fill so were flushed early
assert_eq!(
file_stats,
[(545264, 2, 3001), (545025, 2, 3000), (544857, 2, 2999)],
[(515807, 2, 3001), (515585, 2, 3000), (515425, 2, 2999)],
);
tmpdir.close().unwrap();

View File

@@ -4,14 +4,12 @@
pub mod health_server;
use std::{sync::Arc, time::Duration};
use std::time::Duration;
use futures::FutureExt;
pub use reqwest::{Request, Response, StatusCode};
pub use reqwest_middleware::{ClientWithMiddleware, Error};
pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
use tokio::time::Instant;
use tracing::trace;
use crate::{metrics::CONSOLE_REQUEST_LATENCY, rate_limiter, url::ApiUrl};
use reqwest_middleware::RequestBuilder;
@@ -21,8 +19,6 @@ use reqwest_middleware::RequestBuilder;
/// We deliberately don't want to replace this with a public static.
pub fn new_client(rate_limiter_config: rate_limiter::RateLimiterConfig) -> ClientWithMiddleware {
let client = reqwest::ClientBuilder::new()
.dns_resolver(Arc::new(GaiResolver::default()))
.connection_verbose(true)
.build()
.expect("Failed to create http client");
@@ -34,8 +30,6 @@ pub fn new_client(rate_limiter_config: rate_limiter::RateLimiterConfig) -> Clien
pub fn new_client_with_timeout(default_timout: Duration) -> ClientWithMiddleware {
let timeout_client = reqwest::ClientBuilder::new()
.dns_resolver(Arc::new(GaiResolver::default()))
.connection_verbose(true)
.timeout(default_timout)
.build()
.expect("Failed to create http client with timeout");
@@ -100,37 +94,6 @@ impl Endpoint {
}
}
/// https://docs.rs/reqwest/0.11.18/src/reqwest/dns/gai.rs.html
use hyper::{
client::connect::dns::{GaiResolver as HyperGaiResolver, Name},
service::Service,
};
use reqwest::dns::{Addrs, Resolve, Resolving};
#[derive(Debug)]
pub struct GaiResolver(HyperGaiResolver);
impl Default for GaiResolver {
fn default() -> Self {
Self(HyperGaiResolver::new())
}
}
impl Resolve for GaiResolver {
fn resolve(&self, name: Name) -> Resolving {
let this = &mut self.0.clone();
let start = Instant::now();
Box::pin(
Service::<Name>::call(this, name.clone()).map(move |result| {
let resolve_duration = start.elapsed();
trace!(duration = ?resolve_duration, addr = %name, "resolve host complete");
result
.map(|addrs| -> Addrs { Box::new(addrs) })
.map_err(|err| -> Box<dyn std::error::Error + Send + Sync> { Box::new(err) })
}),
)
}
}
#[cfg(test)]
mod tests {
use super::*;

View File

@@ -1,14 +1,21 @@
use anyhow::{anyhow, bail};
use hyper::{Body, Request, Response, StatusCode};
use anyhow::anyhow;
use http::{Request, Response};
use hyper::StatusCode;
use hyper_util::{
rt::{TokioExecutor, TokioIo},
server::conn,
};
use std::{convert::Infallible, net::TcpListener};
use tracing::info;
use utils::http::{endpoint, error::ApiError, json::json_response, RouterBuilder, RouterService};
use utils::http::{
endpoint, error::ApiError, json::json_response, Body, RequestServiceBuilder, RouterBuilder,
};
async fn status_handler(_: Request<Body>) -> Result<Response<Body>, ApiError> {
json_response(StatusCode::OK, "")
json_response(StatusCode::OK, "").map(|req| req.map(Body::new))
}
fn make_router() -> RouterBuilder<hyper::Body, ApiError> {
fn make_router() -> RouterBuilder<Body, ApiError> {
endpoint::make_router().get("/v1/status", status_handler)
}
@@ -17,11 +24,20 @@ pub async fn task_main(http_listener: TcpListener) -> anyhow::Result<Infallible>
info!("http has shut down");
}
let service = || RouterService::new(make_router().build()?);
let router = make_router().build().map_err(|e| anyhow!(e))?;
let builder = RequestServiceBuilder::new(router).map_err(|e| anyhow!(e))?;
let listener = tokio::net::TcpListener::from_std(http_listener)?;
hyper::Server::from_tcp(http_listener)?
.serve(service().map_err(|e| anyhow!(e))?)
.await?;
bail!("hyper server without shutdown handling cannot shutdown successfully");
loop {
let (stream, remote_addr) = listener.accept().await.unwrap();
let io = TokioIo::new(stream);
let service = builder.build(remote_addr);
tokio::task::spawn(async move {
let builder = conn::auto::Builder::new(TokioExecutor::new());
let res = builder.serve_connection(io, service).await;
if let Err(err) = res {
println!("Error serving connection: {:?}", err);
}
});
}
}
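Reviewer note, not part of the change: the loop above replaces the old hyper::Server::from_tcp(...).serve(...) call, and the same accept-and-serve shape recurs in the files below. Distilled into a standalone sketch for reference (assuming hyper 1.0 and hyper-util 0.1 with the tokio/server/server-auto features the Cargo.toml hunks enable; the hello handler and address are illustrative, not from this PR):

use std::convert::Infallible;

use http_body_util::Full;
use hyper::{body::{Bytes, Incoming}, service::service_fn, Request, Response};
use hyper_util::{rt::{TokioExecutor, TokioIo}, server::conn};
use tokio::net::TcpListener;

async fn hello(_req: Request<Incoming>) -> Result<Response<Full<Bytes>>, Infallible> {
    Ok(Response::new(Full::new(Bytes::from_static(b"ok"))))
}

#[tokio::main]
async fn main() -> std::io::Result<()> {
    let listener = TcpListener::bind("127.0.0.1:3000").await?;
    loop {
        // hyper 1.0 no longer ships an accept loop: each connection is
        // accepted and served explicitly.
        let (stream, _addr) = listener.accept().await?;
        let io = TokioIo::new(stream);
        tokio::spawn(async move {
            // auto::Builder negotiates HTTP/1 vs HTTP/2 per connection.
            let builder = conn::auto::Builder::new(TokioExecutor::new());
            if let Err(err) = builder.serve_connection(io, service_fn(hello)).await {
                eprintln!("error serving connection: {err:?}");
            }
        });
    }
}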

View File

@@ -10,13 +10,15 @@ use std::{
};
use bytes::{Buf, BytesMut};
use hyper::server::conn::{AddrIncoming, AddrStream};
use pin_project_lite::pin_project;
use tls_listener::AsyncAccept;
use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, ReadBuf};
use tokio::{
io::{AsyncRead, AsyncReadExt, AsyncWrite, ReadBuf},
net::{TcpListener, TcpStream},
};
pub struct ProxyProtocolAccept {
pub incoming: AddrIncoming,
pub incoming: TcpListener,
}
pin_project! {
@@ -327,20 +329,18 @@ impl<T: AsyncRead> AsyncRead for WithClientIp<T> {
}
impl AsyncAccept for ProxyProtocolAccept {
type Connection = WithClientIp<AddrStream>;
type Connection = WithClientIp<TcpStream>;
type Address = SocketAddr;
type Error = io::Error;
fn poll_accept(
mut self: Pin<&mut Self>,
cx: &mut Context<'_>,
) -> Poll<Option<Result<Self::Connection, Self::Error>>> {
let conn = ready!(Pin::new(&mut self.incoming).poll_accept(cx)?);
let Some(conn) = conn else {
return Poll::Ready(None);
};
Poll::Ready(Some(Ok(WithClientIp::new(conn))))
) -> Poll<Result<(Self::Connection, Self::Address), Self::Error>> {
Pin::new(&mut self.incoming)
.poll_accept(cx)
.map_ok(|(c, a)| (WithClientIp::new(c), a))
}
}

View File

@@ -356,7 +356,6 @@ pub async fn proxy_pass(
compute: impl AsyncRead + AsyncWrite + Unpin,
aux: MetricsAuxInfo,
) -> anyhow::Result<()> {
ctx.set_success();
ctx.log();
let usage = USAGE_METRICS.register(Ids {

View File

@@ -7,8 +7,8 @@ use crate::{
proxy::retry::{retry_after, ShouldRetry},
};
use async_trait::async_trait;
use hyper::StatusCode;
use pq_proto::StartupMessageParams;
use reqwest::StatusCode;
use std::ops::ControlFlow;
use tokio::time;
use tracing::{error, info, warn};

View File

@@ -46,11 +46,14 @@ enum Notification {
}
#[derive(Clone, Debug, Deserialize, Eq, PartialEq)]
struct AllowedIpsUpdate {
#[serde(rename = "project")]
project_id: SmolStr,
}
#[derive(Clone, Debug, Deserialize, Eq, PartialEq)]
struct PasswordUpdate {
#[serde(rename = "project")]
project_id: SmolStr,
#[serde(rename = "role")]
role_name: SmolStr,
}
fn deserialize_json_string<'de, D, T>(deserializer: D) -> Result<T, D::Error>
@@ -148,7 +151,7 @@ mod tests {
#[test]
fn parse_allowed_ips() -> anyhow::Result<()> {
let project_id = "new_project".to_string();
let data = format!("{{\"project_id\": \"{project_id}\"}}");
let data = format!("{{\"project\": \"{project_id}\"}}");
let text = json!({
"type": "message",
"topic": "/allowed_ips_updated",
@@ -174,7 +177,7 @@ mod tests {
fn parse_password_updated() -> anyhow::Result<()> {
let project_id = "new_project".to_string();
let role_name = "new_role".to_string();
let data = format!("{{\"project_id\": \"{project_id}\", \"role_name\": \"{role_name}\"}}");
let data = format!("{{\"project\": \"{project_id}\", \"role\": \"{role_name}\"}}");
let text = json!({
"type": "message",
"topic": "/password_updated",

View File

@@ -6,41 +6,54 @@ mod conn_pool;
mod sql_over_http;
mod websocket;
use bytes::Bytes;
pub use conn_pool::GlobalConnPoolOptions;
use anyhow::bail;
use http_body_util::Full;
use hyper::body::Incoming;
use hyper::StatusCode;
use hyper_util::rt::{TokioExecutor, TokioIo};
use hyper_util::server::conn;
use metrics::IntCounterPairGuard;
use rand::rngs::StdRng;
use rand::SeedableRng;
pub use reqwest_middleware::{ClientWithMiddleware, Error};
pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
use tokio::io::{AsyncRead, AsyncWrite};
use tokio::select;
use tokio_rustls::TlsAcceptor;
use tokio_util::task::TaskTracker;
use crate::config::TlsConfig;
use crate::context::RequestMonitoring;
use crate::metrics::NUM_CLIENT_CONNECTION_GAUGE;
use crate::protocol2::{ProxyProtocolAccept, WithClientIp};
use crate::protocol2::ProxyProtocolAccept;
use crate::rate_limiter::EndpointRateLimiter;
use crate::{cancellation::CancelMap, config::ProxyConfig};
use futures::StreamExt;
use hyper::{
server::{
accept,
conn::{AddrIncoming, AddrStream},
},
Body, Method, Request, Response,
};
use hyper::{Method, Request, Response};
use std::net::IpAddr;
use std::task::Poll;
use std::{future::ready, sync::Arc};
use tls_listener::TlsListener;
use std::pin::pin;
use std::sync::Arc;
use tls_listener::{AsyncTls, TlsListener};
use tokio::net::TcpListener;
use tokio_util::sync::CancellationToken;
use tracing::{error, info, info_span, warn, Instrument};
use utils::http::{error::ApiError, json::json_response};
#[derive(Clone)]
struct Tls(TlsAcceptor);
impl<C: AsyncRead + AsyncWrite + Unpin> AsyncTls<C> for Tls {
type Stream = tokio_rustls::server::TlsStream<C>;
type Error = std::io::Error;
type AcceptFuture = tokio_rustls::Accept<C>;
fn accept(&self, conn: C) -> Self::AcceptFuture {
tokio_rustls::TlsAcceptor::accept(&self.0, conn)
}
}
pub async fn task_main(
config: &'static ProxyConfig,
ws_listener: TcpListener,
@@ -79,42 +92,52 @@ pub async fn task_main(
};
let tls_acceptor: tokio_rustls::TlsAcceptor = tls_config.to_server_config().into();
let mut addr_incoming = AddrIncoming::from_listener(ws_listener)?;
let _ = addr_incoming.set_nodelay(true);
// let mut addr_incoming = AddrIncoming::from_listener(ws_listener)?;
// let _ = addr_incoming.set_nodelay(true);
let addr_incoming = ProxyProtocolAccept {
incoming: addr_incoming,
incoming: ws_listener,
};
let ws_connections = tokio_util::task::task_tracker::TaskTracker::new();
let ws_connections2 = ws_connections.clone();
ws_connections.close(); // allows `ws_connections.wait` to complete
let tls_listener = TlsListener::new(tls_acceptor, addr_incoming).filter(|conn| {
if let Err(err) = conn {
error!("failed to accept TLS connection for websockets: {err:?}");
ready(false)
} else {
ready(true)
}
});
let mut tls_listener = TlsListener::new(Tls(tls_acceptor), addr_incoming);
let make_svc = hyper::service::make_service_fn(
|stream: &tokio_rustls::server::TlsStream<WithClientIp<AddrStream>>| {
tokio::spawn(async move {
loop {
let (stream, remote_addr) = select! {
res = tls_listener.accept() => {
match res {
Err(err) =>
{error!("failed to accept TLS connection for websockets: {err:?}"); continue},
Ok(s) => s,
}
}
_ = cancellation_token.cancelled() => break,
};
let (io, tls) = stream.get_ref();
let client_addr = io.client_addr();
let remote_addr = io.inner.remote_addr();
let sni_name = tls.server_name().map(|s| s.to_string());
let conn_pool = conn_pool.clone();
let ws_connections = ws_connections.clone();
let ws_connections = ws_connections2.clone();
let endpoint_rate_limiter = endpoint_rate_limiter.clone();
async move {
let peer_addr = match client_addr {
Some(addr) => addr,
None if config.require_client_ip => bail!("missing required client ip"),
None => remote_addr,
};
Ok(MetricService::new(hyper::service::service_fn(
move |req: Request<Body>| {
let peer_addr = match client_addr {
Some(addr) => addr,
None if config.require_client_ip => {
tracing::error!("Error serving connection: missing required client ip");
continue;
}
None => remote_addr,
};
let io = TokioIo::new(stream);
let cancellation_token = cancellation_token.clone();
tokio::task::spawn(async move {
let service = MetricService::new(hyper::service::service_fn(
move |req: Request<Incoming>| {
let sni_name = sni_name.clone();
let conn_pool = conn_pool.clone();
let ws_connections = ws_connections.clone();
@@ -144,15 +167,22 @@ pub async fn task_main(
.await
}
},
)))
}
},
);
hyper::Server::builder(accept::from_stream(tls_listener))
.serve(make_svc)
.with_graceful_shutdown(cancellation_token.cancelled())
.await?;
));
let builder = conn::auto::Builder::new(TokioExecutor::new());
let mut conn = pin!(builder.serve_connection(io, service));
let res = select! {
_ = cancellation_token.cancelled() => {
conn.as_mut().graceful_shutdown();
conn.await
}
res = conn.as_mut() => res,
};
if let Err(err) = res {
tracing::error!("Error serving connection: {:?}", err);
}
});
}
});
// await websocket connections
ws_connections.wait().await;
@@ -184,18 +214,14 @@ where
type Error = S::Error;
type Future = S::Future;
fn poll_ready(&mut self, cx: &mut std::task::Context<'_>) -> Poll<Result<(), Self::Error>> {
self.inner.poll_ready(cx)
}
fn call(&mut self, req: Request<ReqBody>) -> Self::Future {
fn call(&self, req: Request<ReqBody>) -> Self::Future {
self.inner.call(req)
}
}
#[allow(clippy::too_many_arguments)]
async fn request_handler(
mut request: Request<Body>,
mut request: Request<Incoming>,
config: &'static ProxyConfig,
tls: &'static TlsConfig,
conn_pool: Arc<conn_pool::GlobalConnPool>,
@@ -205,7 +231,7 @@ async fn request_handler(
sni_hostname: Option<String>,
peer_addr: IpAddr,
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
) -> Result<Response<Body>, ApiError> {
) -> Result<Response<Full<Bytes>>, ApiError> {
let host = request
.headers()
.get("host")
@@ -264,7 +290,7 @@ async fn request_handler(
)
.header("Access-Control-Max-Age", "86400" /* 24 hours */)
.status(StatusCode::OK) // 204 is also valid, but see: https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods/OPTIONS#status_code
.body(Body::empty())
.body(Full::new(Bytes::new()))
.map_err(|e| ApiError::InternalServerError(e.into()))
} else {
json_response(StatusCode::BAD_REQUEST, "query is not supported")
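A reviewer-oriented aside on the shutdown handling earlier in this file: hyper 0.14's Server::with_graceful_shutdown is gone, and hyper-util's per-connection future exposes graceful_shutdown only through Pin<&mut Self>. That is why the connection is pinned once with pin! and re-borrowed with as_mut() in both select! arms; the cancellation branch signals shutdown and then awaits the same pinned future so in-flight requests can drain. Reduced to its skeleton (restating the code above under the same hyper-util 0.1 assumption, error logging omitted):

let mut conn = pin!(builder.serve_connection(io, service));
let res = select! {
    _ = cancellation_token.cancelled() => {
        conn.as_mut().graceful_shutdown(); // requires Pin<&mut _>
        conn.await                         // let in-flight requests finish
    }
    res = conn.as_mut() => res,
};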

View File

@@ -26,7 +26,7 @@ use tokio_postgres::{AsyncMessage, ReadyForQueryStatus};
use crate::{
auth::{self, backend::ComputeUserInfo, check_peer_addr_is_in_list},
console::{self, messages::MetricsAuxInfo},
console,
context::RequestMonitoring,
metrics::NUM_DB_CONNECTIONS_GAUGE,
proxy::connect_compute::ConnectMechanism,
@@ -362,7 +362,6 @@ impl GlobalConnPool {
// ok return cached connection if found and establish a new one otherwise
let new_client = if let Some(client) = client {
ctx.set_project(client.aux.clone());
if client.inner.is_closed() {
let conn_id = uuid::Uuid::new_v4();
info!(%conn_id, "pool: cached connection '{conn_info}' is closed, opening a new one");
@@ -594,6 +593,10 @@ async fn connect_to_compute_once(
span.in_scope(|| {
info!(%conn_info, %session, "new connection");
});
let ids = Ids {
endpoint_id: node_info.aux.endpoint_id.clone(),
branch_id: node_info.aux.branch_id.clone(),
};
let db_user = conn_info.db_and_user();
tokio::spawn(
@@ -661,7 +664,7 @@ async fn connect_to_compute_once(
Ok(ClientInner {
inner: client,
session: tx,
aux: node_info.aux.clone(),
ids,
conn_id,
})
}
@@ -669,17 +672,13 @@ async fn connect_to_compute_once(
struct ClientInner {
inner: tokio_postgres::Client,
session: tokio::sync::watch::Sender<uuid::Uuid>,
aux: MetricsAuxInfo,
ids: Ids,
conn_id: uuid::Uuid,
}
impl Client {
pub fn metrics(&self) -> Arc<MetricCounter> {
let aux = &self.inner.as_ref().unwrap().aux;
USAGE_METRICS.register(Ids {
endpoint_id: aux.endpoint_id.clone(),
branch_id: aux.branch_id.clone(),
})
USAGE_METRICS.register(self.inner.as_ref().unwrap().ids.clone())
}
}

View File

@@ -1,15 +1,20 @@
use std::sync::Arc;
use anyhow::bail;
use bytes::Buf;
use bytes::Bytes;
use futures::pin_mut;
use futures::StreamExt;
use hyper::body::HttpBody;
use http_body::Body;
use http_body_util::BodyExt;
use http_body_util::Full;
use hyper::body::Incoming;
use hyper::header;
use hyper::http::HeaderName;
use hyper::http::HeaderValue;
use hyper::Response;
use hyper::StatusCode;
use hyper::{Body, HeaderMap, Request};
use hyper::{HeaderMap, Request};
use serde_json::json;
use serde_json::Map;
use serde_json::Value;
@@ -235,10 +240,10 @@ pub async fn handle(
tls: &'static TlsConfig,
config: &'static HttpConfig,
ctx: &mut RequestMonitoring,
request: Request<Body>,
request: Request<Incoming>,
sni_hostname: Option<String>,
conn_pool: Arc<GlobalConnPool>,
) -> Result<Response<Body>, ApiError> {
) -> Result<Response<Full<Bytes>>, ApiError> {
let result = tokio::time::timeout(
config.request_timeout,
handle_inner(tls, config, ctx, request, sni_hostname, conn_pool),
@@ -347,10 +352,10 @@ async fn handle_inner(
tls: &'static TlsConfig,
config: &'static HttpConfig,
ctx: &mut RequestMonitoring,
request: Request<Body>,
request: Request<Incoming>,
sni_hostname: Option<String>,
conn_pool: Arc<GlobalConnPool>,
) -> anyhow::Result<Response<Body>> {
) -> anyhow::Result<Response<Full<Bytes>>> {
let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE
.with_label_values(&["http"])
.guard();
@@ -406,8 +411,8 @@ async fn handle_inner(
//
// Read the query and query params from the request body
//
let body = hyper::body::to_bytes(request.into_body()).await?;
let payload: Payload = serde_json::from_slice(&body)?;
let body = request.into_body().collect().await?.aggregate().reader();
let payload: Payload = serde_json::from_reader(body)?;
let mut client = conn_pool.get(ctx, conn_info, !allow_pool).await?;
@@ -497,7 +502,6 @@ async fn handle_inner(
}
};
ctx.set_success();
ctx.log();
let metrics = client.metrics();
@@ -505,7 +509,7 @@ async fn handle_inner(
let body = serde_json::to_string(&result).expect("json serialization should not fail");
let len = body.len();
let response = response
.body(Body::from(body))
.body(Full::from(body))
// only fails if invalid status code or invalid header/values are given.
// these are not user configurable so it cannot fail dynamically
.expect("building response payload should not fail");

View File

@@ -235,18 +235,19 @@ async fn collect_metrics_iteration(
#[cfg(test)]
mod tests {
use std::{
net::TcpListener,
sync::{Arc, Mutex},
};
use std::sync::{Arc, Mutex};
use anyhow::Error;
use bytes::{Buf, Bytes};
use chrono::Utc;
use consumption_metrics::{Event, EventChunk};
use hyper::{
service::{make_service_fn, service_fn},
Body, Response,
use http_body_util::{BodyExt, Empty};
use hyper::{body::Incoming, service::service_fn, Response};
use hyper_util::{
rt::{TokioExecutor, TokioIo},
server::conn,
};
use tokio::net::TcpListener;
use url::Url;
use super::{collect_metrics_iteration, Ids, Metrics};
@@ -254,30 +255,43 @@ mod tests {
#[tokio::test]
async fn metrics() {
let listener = TcpListener::bind("0.0.0.0:0").unwrap();
let listener = TcpListener::bind("0.0.0.0:0").await.unwrap();
let addr = listener.local_addr().unwrap();
let reports = Arc::new(Mutex::new(vec![]));
let reports2 = reports.clone();
let server = hyper::server::Server::from_tcp(listener)
.unwrap()
.serve(make_service_fn(move |_| {
let reports = reports.clone();
async move {
Ok::<_, Error>(service_fn(move |req| {
let reports = reports.clone();
async move {
let bytes = hyper::body::to_bytes(req.into_body()).await?;
let events: EventChunk<'static, Event<Ids, String>> =
serde_json::from_slice(&bytes)?;
reports.lock().unwrap().push(events);
Ok::<_, Error>(Response::new(Body::from(vec![])))
}
}))
}
}));
let addr = server.local_addr();
tokio::spawn(server);
let service = service_fn(move |req: hyper::Request<Incoming>| {
let reports = reports.clone();
async move {
let bytes = req
.into_body()
.collect()
.await
.unwrap()
.aggregate()
.reader();
let events: EventChunk<'static, Event<Ids, String>> =
serde_json::from_reader(bytes)?;
reports.lock().unwrap().push(events);
Ok::<_, Error>(Response::new(Empty::<Bytes>::new()))
}
});
tokio::spawn(async move {
loop {
let (stream, _) = listener.accept().await.unwrap();
let io = TokioIo::new(stream);
let service = service.clone();
tokio::task::spawn(async move {
let builder = conn::auto::Builder::new(TokioExecutor::new());
let res = builder.serve_connection(io, service).await;
if let Err(err) = res {
println!("Error serving connection: {:?}", err);
}
});
}
});
let metrics = Metrics::default();
let client = http::new_client(RateLimiterConfig::default());

View File

@@ -8,8 +8,6 @@ use futures::future::BoxFuture;
use futures::stream::FuturesUnordered;
use futures::{FutureExt, StreamExt};
use remote_storage::RemoteStorageConfig;
use safekeeper::control_file::FileStorage;
use safekeeper::state::TimelinePersistentState;
use sd_notify::NotifyState;
use tokio::runtime::Handle;
use tokio::signal::unix::{signal, SignalKind};
@@ -32,12 +30,12 @@ use safekeeper::defaults::{
DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES,
DEFAULT_PG_LISTEN_ADDR,
};
use safekeeper::wal_service;
use safekeeper::GlobalTimelines;
use safekeeper::SafeKeeperConf;
use safekeeper::{broker, WAL_SERVICE_RUNTIME};
use safekeeper::{control_file, BROKER_RUNTIME};
use safekeeper::{http, WAL_REMOVER_RUNTIME};
use safekeeper::{json_merge, wal_service};
use safekeeper::{remove_wal, WAL_BACKUP_RUNTIME};
use safekeeper::{wal_backup, HTTP_RUNTIME};
use storage_broker::DEFAULT_ENDPOINT;
@@ -107,6 +105,9 @@ struct Args {
/// Do not wait for changes to be written safely to disk. Unsafe.
#[arg(short, long)]
no_sync: bool,
/// Dump control file at path specified by this argument and exit.
#[arg(long)]
dump_control_file: Option<Utf8PathBuf>,
/// Broker endpoint for storage nodes coordination in the form
/// http[s]://host:port. In case of https schema a TLS connection is
/// established; plaintext otherwise.
@@ -165,21 +166,6 @@ struct Args {
/// useful for debugging.
#[arg(long)]
current_thread_runtime: bool,
/// Dump control file at path specified by this argument and exit.
#[arg(long)]
dump_control_file: Option<Utf8PathBuf>,
/// Patch control file at path specified by this argument and exit.
/// Patch is specified in --patch option and imposed over
/// control file as per rfc7386.
/// Without --write-patched the result is only printed.
#[arg(long, verbatim_doc_comment)]
patch_control_file: Option<Utf8PathBuf>,
/// The patch to apply to control file at --patch-control-file, in JSON.
#[arg(long, default_value = None)]
patch: Option<String>,
/// Write --patch-control-file result back in place.
#[arg(long, default_value = "false")]
write_patched: bool,
}
// Like PathBufValueParser, but allows empty string.
@@ -221,13 +207,7 @@ async fn main() -> anyhow::Result<()> {
if let Some(addr) = args.dump_control_file {
let state = control_file::FileStorage::load_control_file(addr)?;
let json = serde_json::to_string(&state)?;
println!("{json}");
return Ok(());
}
if let Some(cfile_path) = args.patch_control_file {
let patch = args.patch.ok_or(anyhow::anyhow!("patch is missing"))?;
patch_control_file(cfile_path, patch, args.write_patched).await?;
print!("{json}");
return Ok(());
}
@@ -549,26 +529,6 @@ fn parse_remote_storage(storage_conf: &str) -> anyhow::Result<RemoteStorageConfi
})
}
async fn patch_control_file(
cfile_path: Utf8PathBuf,
patch: String,
write: bool,
) -> anyhow::Result<()> {
let state = control_file::FileStorage::load_control_file(&cfile_path)?;
// serialize to json, impose patch and deserialize back
let mut state_json =
serde_json::to_value(state).context("failed to serialize state to json")?;
let patch_json = serde_json::from_str(&patch).context("failed to parse patch")?;
json_merge(&mut state_json, patch_json);
let patched_state: TimelinePersistentState =
serde_json::from_value(state_json.clone()).context("failed to deserialize patched json")?;
println!("{state_json}");
if write {
FileStorage::do_persist(&patched_state, &cfile_path, true).await?;
}
return Ok(());
}
#[test]
fn verify_cli() {
use clap::CommandFactory;

View File

@@ -2,7 +2,7 @@
use anyhow::{bail, ensure, Context, Result};
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use camino::{Utf8Path, Utf8PathBuf};
use camino::Utf8PathBuf;
use tokio::fs::{self, File};
use tokio::io::AsyncWriteExt;
@@ -155,46 +155,6 @@ impl FileStorage {
})?;
Ok(state)
}
/// Persist state s to dst_path, optionally fsyncing file.
pub async fn do_persist(
s: &TimelinePersistentState,
dst_path: &Utf8Path,
sync: bool,
) -> Result<()> {
let mut f = File::create(&dst_path)
.await
.with_context(|| format!("failed to create partial control file at: {}", &dst_path))?;
let mut buf: Vec<u8> = Vec::new();
WriteBytesExt::write_u32::<LittleEndian>(&mut buf, SK_MAGIC)?;
WriteBytesExt::write_u32::<LittleEndian>(&mut buf, SK_FORMAT_VERSION)?;
s.ser_into(&mut buf)?;
// calculate checksum before resize
let checksum = crc32c::crc32c(&buf);
buf.extend_from_slice(&checksum.to_le_bytes());
f.write_all(&buf).await.with_context(|| {
format!(
"failed to write safekeeper state into control file at: {}",
dst_path
)
})?;
f.flush().await.with_context(|| {
format!(
"failed to flush safekeeper state into control file at: {}",
dst_path
)
})?;
// fsync the file
if sync {
f.sync_all()
.await
.with_context(|| format!("failed to sync partial control file at {}", dst_path))?;
}
Ok(())
}
}
impl Deref for FileStorage {
@@ -207,7 +167,7 @@ impl Deref for FileStorage {
#[async_trait::async_trait]
impl Storage for FileStorage {
/// Atomically persists state durably to the underlying storage.
/// Persists state durably to the underlying storage.
///
/// For a description, see <https://lwn.net/Articles/457667/>.
async fn persist(&mut self, s: &TimelinePersistentState) -> Result<()> {
@@ -215,9 +175,46 @@ impl Storage for FileStorage {
// write data to safekeeper.control.partial
let control_partial_path = self.timeline_dir.join(CONTROL_FILE_NAME_PARTIAL);
FileStorage::do_persist(s, &control_partial_path, !self.conf.no_sync).await?;
let mut control_partial = File::create(&control_partial_path).await.with_context(|| {
format!(
"failed to create partial control file at: {}",
&control_partial_path
)
})?;
let mut buf: Vec<u8> = Vec::new();
WriteBytesExt::write_u32::<LittleEndian>(&mut buf, SK_MAGIC)?;
WriteBytesExt::write_u32::<LittleEndian>(&mut buf, SK_FORMAT_VERSION)?;
s.ser_into(&mut buf)?;
// calculate checksum before resize
let checksum = crc32c::crc32c(&buf);
buf.extend_from_slice(&checksum.to_le_bytes());
control_partial.write_all(&buf).await.with_context(|| {
format!(
"failed to write safekeeper state into control file at: {}",
control_partial_path
)
})?;
control_partial.flush().await.with_context(|| {
format!(
"failed to flush safekeeper state into control file at: {}",
control_partial_path
)
})?;
// fsync the file
if !self.conf.no_sync {
control_partial.sync_all().await.with_context(|| {
format!(
"failed to sync partial control file at {}",
control_partial_path
)
})?;
}
let control_path = self.timeline_dir.join(CONTROL_FILE_NAME);
// rename should be atomic
fs::rename(&control_partial_path, &control_path).await?;
// this sync is not required by any standard but postgres does this (see durable_rename)

View File

@@ -288,32 +288,34 @@ async fn timeline_files_handler(request: Request<Body>) -> Result<Response<Body>
}
/// Deactivates the timeline and removes its data directory.
async fn timeline_delete_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
async fn timeline_delete_force_handler(
mut request: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let ttid = TenantTimelineId::new(
parse_request_param(&request, "tenant_id")?,
parse_request_param(&request, "timeline_id")?,
);
let only_local = parse_query_param(&request, "only_local")?.unwrap_or(false);
check_permission(&request, Some(ttid.tenant_id))?;
ensure_no_body(&mut request).await?;
// FIXME: `delete_force` can fail from both internal errors and bad requests. Add better
// error handling here when we're able to.
let resp = GlobalTimelines::delete(&ttid, only_local)
let resp = GlobalTimelines::delete_force(&ttid)
.await
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, resp)
}
/// Deactivates all timelines for the tenant and removes its data directory.
/// See `timeline_delete_handler`.
async fn tenant_delete_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
/// See `timeline_delete_force_handler`.
async fn tenant_delete_force_handler(
mut request: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id = parse_request_param(&request, "tenant_id")?;
let only_local = parse_query_param(&request, "only_local")?.unwrap_or(false);
check_permission(&request, Some(tenant_id))?;
ensure_no_body(&mut request).await?;
// FIXME: `delete_force_all_for_tenant` can return an error for multiple different reasons;
// Using an `InternalServerError` should be fixed when the types support it
let delete_info = GlobalTimelines::delete_force_all_for_tenant(&tenant_id, only_local)
let delete_info = GlobalTimelines::delete_force_all_for_tenant(&tenant_id)
.await
.map_err(ApiError::InternalServerError)?;
json_response(
@@ -510,10 +512,10 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
request_span(r, timeline_status_handler)
})
.delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
request_span(r, timeline_delete_handler)
request_span(r, timeline_delete_force_handler)
})
.delete("/v1/tenant/:tenant_id", |r| {
request_span(r, tenant_delete_handler)
request_span(r, tenant_delete_force_handler)
})
.post("/v1/pull_timeline", |r| {
request_span(r, timeline_pull_handler)

View File

@@ -2,7 +2,6 @@
use camino::Utf8PathBuf;
use once_cell::sync::Lazy;
use remote_storage::RemoteStorageConfig;
use serde_json::Value;
use tokio::runtime::Runtime;
use std::time::Duration;
@@ -89,10 +88,6 @@ impl SafeKeeperConf {
self.tenant_dir(&ttid.tenant_id)
.join(ttid.timeline_id.to_string())
}
pub fn is_wal_backup_enabled(&self) -> bool {
self.remote_storage.is_some() && self.wal_backup_enabled
}
}
impl SafeKeeperConf {
@@ -176,24 +171,3 @@ pub static METRICS_SHIFTER_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
.build()
.expect("Failed to create broker runtime")
});
/// Merge json b into json a according to
/// https://www.rfc-editor.org/rfc/rfc7396
/// https://stackoverflow.com/a/54118457/4014587
pub fn json_merge(a: &mut Value, b: Value) {
if let Value::Object(a) = a {
if let Value::Object(b) = b {
for (k, v) in b {
if v.is_null() {
a.remove(&k);
} else {
json_merge(a.entry(k).or_insert(Value::Null), v);
}
}
return;
}
}
*a = b;
}

View File

@@ -742,11 +742,6 @@ where
state.timeline_start_lsn
);
}
if state.peer_horizon_lsn == Lsn(0) {
// Update peer_horizon_lsn as soon as we know where timeline starts.
// It means that peer_horizon_lsn cannot be zero after we know timeline_start_lsn.
state.peer_horizon_lsn = msg.timeline_start_lsn;
}
if state.local_start_lsn == Lsn(0) {
state.local_start_lsn = msg.start_streaming_at;
info!("setting local_start_lsn to {:?}", state.local_start_lsn);

View File

@@ -407,7 +407,7 @@ impl SafekeeperPostgresHandler {
self.conf.timeline_dir(&tli.ttid),
&persisted_state,
start_pos,
self.conf.is_wal_backup_enabled(),
self.conf.wal_backup_enabled,
)?;
// Split to concurrently receive and send data; replies are generally

View File

@@ -33,13 +33,12 @@ use crate::safekeeper::{
};
use crate::send_wal::WalSenders;
use crate::state::{TimelineMemState, TimelinePersistentState};
use crate::wal_backup::{self};
use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION};
use crate::metrics::FullTimelineInfo;
use crate::wal_storage::Storage as wal_storage_iface;
use crate::SafeKeeperConf;
use crate::{debug_dump, wal_storage};
use crate::{GlobalTimelines, SafeKeeperConf};
/// Things safekeeper should know about timeline state on peers.
#[derive(Debug, Clone, Serialize, Deserialize)]
@@ -472,29 +471,14 @@ impl Timeline {
}
}
/// Delete timeline from disk completely, by removing timeline directory.
/// Background timeline activities will stop eventually.
///
/// Also deletes WAL in s3. Might fail if e.g. s3 is unavailable, but
/// deletion API endpoint is retriable.
pub async fn delete(
/// Delete timeline from disk completely, by removing timeline directory. Background
/// timeline activities will stop eventually.
pub async fn delete_from_disk(
&self,
shared_state: &mut MutexGuard<'_, SharedState>,
only_local: bool,
) -> Result<(bool, bool)> {
let was_active = shared_state.active;
self.cancel(shared_state);
// TODO: It's better to wait for s3 offloader termination before
// removing data from s3. Though since s3 doesn't have transactions it
// still wouldn't guarantee absense of data after removal.
let conf = GlobalTimelines::get_global_config();
if !only_local && conf.is_wal_backup_enabled() {
// Note: we concurrently delete remote storage data from multiple
// safekeepers. That's ok, s3 replies 200 if object doesn't exist and we
// do some retries anyway.
wal_backup::delete_timeline(&self.ttid).await?;
}
let dir_existed = delete_dir(&self.timeline_dir).await?;
Ok((dir_existed, was_active))
}

View File

@@ -327,20 +327,16 @@ impl GlobalTimelines {
}
/// Cancels timeline, then deletes the corresponding data directory.
/// If only_local, doesn't remove WAL segments in remote storage.
pub async fn delete(
ttid: &TenantTimelineId,
only_local: bool,
) -> Result<TimelineDeleteForceResult> {
pub async fn delete_force(ttid: &TenantTimelineId) -> Result<TimelineDeleteForceResult> {
let tli_res = TIMELINES_STATE.lock().unwrap().get(ttid);
match tli_res {
Ok(timeline) => {
// Take a lock and finish the deletion holding this mutex.
let mut shared_state = timeline.write_shared_state().await;
info!("deleting timeline {}, only_local={}", ttid, only_local);
info!("deleting timeline {}", ttid);
let (dir_existed, was_active) =
timeline.delete(&mut shared_state, only_local).await?;
timeline.delete_from_disk(&mut shared_state).await?;
// Remove timeline from the map.
// FIXME: re-enable it once we fix the issue with recreation of deleted timelines
@@ -373,11 +369,8 @@ impl GlobalTimelines {
/// the tenant had, `true` if a timeline was active. There may be a race if new timelines are
/// created simultaneously. In that case the function will return error and the caller should
/// retry tenant deletion again later.
///
/// If only_local, doesn't remove WAL segments in remote storage.
pub async fn delete_force_all_for_tenant(
tenant_id: &TenantId,
only_local: bool,
) -> Result<HashMap<TenantTimelineId, TimelineDeleteForceResult>> {
info!("deleting all timelines for tenant {}", tenant_id);
let to_delete = Self::get_all_for_tenant(*tenant_id);
@@ -386,7 +379,7 @@ impl GlobalTimelines {
let mut deleted = HashMap::new();
for tli in &to_delete {
match Self::delete(&tli.ttid, only_local).await {
match Self::delete_force(&tli.ttid).await {
Ok(result) => {
deleted.insert(tli.ttid, result);
}

View File

@@ -4,8 +4,6 @@ use camino::{Utf8Path, Utf8PathBuf};
use futures::stream::FuturesOrdered;
use futures::StreamExt;
use tokio::task::JoinHandle;
use tokio_util::sync::CancellationToken;
use utils::backoff;
use utils::id::NodeId;
use std::cmp::min;
@@ -168,17 +166,6 @@ async fn update_task(
}
}
static REMOTE_STORAGE: OnceCell<Option<GenericRemoteStorage>> = OnceCell::new();
// Storage must be configured and initialized when this is called.
fn get_configured_remote_storage() -> &'static GenericRemoteStorage {
REMOTE_STORAGE
.get()
.expect("failed to get remote storage")
.as_ref()
.unwrap()
}
const CHECK_TASKS_INTERVAL_MSEC: u64 = 1000;
/// Sits on wal_backup_launcher_rx and starts/stops per timeline wal backup
@@ -212,7 +199,7 @@ pub async fn wal_backup_launcher_task_main(
ttid = wal_backup_launcher_rx.recv() => {
// channel is never expected to get closed
let ttid = ttid.unwrap();
if !conf.is_wal_backup_enabled() {
if conf.remote_storage.is_none() || !conf.wal_backup_enabled {
continue; /* just drain the channel and do nothing */
}
async {
@@ -497,12 +484,18 @@ fn get_segments(start: Lsn, end: Lsn, seg_size: usize) -> Vec<Segment> {
res
}
static REMOTE_STORAGE: OnceCell<Option<GenericRemoteStorage>> = OnceCell::new();
async fn backup_object(
source_file: &Utf8Path,
target_file: &RemotePath,
size: usize,
) -> Result<()> {
let storage = get_configured_remote_storage();
let storage = REMOTE_STORAGE
.get()
.expect("failed to get remote storage")
.as_ref()
.unwrap();
let file = File::open(&source_file)
.await
@@ -539,39 +532,6 @@ pub async fn read_object(
Ok(Box::pin(reader))
}
/// Delete WAL files for the given timeline. Remote storage must be configured
/// when called.
pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> {
let storage = get_configured_remote_storage();
let ttid_path = Utf8Path::new(&ttid.tenant_id.to_string()).join(ttid.timeline_id.to_string());
let remote_path = RemotePath::new(&ttid_path)?;
// A backoff::retry is used here for two reasons:
// - To provide a backoff rather than busy-polling the API on errors
// - To absorb transient 429/503 conditions without hitting our error
// logging path for issues deleting objects.
//
// Note: listing segments might take a long time if there are many of them.
// We don't currently have HTTP request timeout/cancellation, but if/once we
// do, listing should get a streaming interface so it can make progress.
let token = CancellationToken::new(); // not really used
backoff::retry(
|| async {
let files = storage.list_files(Some(&remote_path)).await?;
storage.delete_objects(&files).await?;
Ok(())
},
|_| false,
3,
10,
"executing WAL segments deletion batch",
backoff::Cancel::new(token, || anyhow::anyhow!("canceled")),
)
.await?;
Ok(())
}
/// Copy segments from one timeline to another. Used in copy_timeline.
pub async fn copy_s3_segments(
wal_seg_size: usize,

View File

@@ -18,7 +18,11 @@ futures-core.workspace = true
futures-util.workspace = true
git-version.workspace = true
humantime.workspace = true
hyper = { workspace = true, features = ["full"] }
hyper = { workspace = true, features = ["server"] }
hyper-util = { workspace = true, features = ["tokio", "server", "server-auto"] }
http = { workspace = true, features = [] }
http-body = { workspace = true, features = [] }
http-body-util = { workspace = true, features = [] }
once_cell.workspace = true
parking_lot.workspace = true
prost.workspace = true
@@ -29,6 +33,9 @@ tracing.workspace = true
metrics.workspace = true
utils.workspace = true
# needed for tonic
http0_2 = { package = "http", version = "0.2" }
workspace_hack.workspace = true
[build-dependencies]
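The extra `http0_2` dependency exists because tonic 0.10 still speaks the http 0.2 / http-body 0.4 types, while hyper 1.0 uses http 1.x. A rough sketch of bridging the request head by hand is shown below; it is illustrative only, not code from this change, and it ignores the request body, which is the part the TODO in the broker below still leaves unresolved.
// Convert an http 1.x request head into an http 0.2 one. The two crates'
// `Parts` are distinct types, so `Request::from_parts` cannot be reused
// across them directly.
fn downgrade_head(parts: http::request::Parts) -> http0_2::request::Parts {
    let mut builder = http0_2::Request::builder()
        .method(parts.method.as_str())
        .uri(parts.uri.to_string());
    for (name, value) in parts.headers.iter() {
        builder = builder.header(name.as_str(), value.as_bytes());
    }
    // Build a body-less request just to obtain the converted Parts.
    let (head, ()) = builder
        .body(())
        .expect("converted request head is valid")
        .into_parts();
    head
}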

View File

@@ -13,10 +13,14 @@
use clap::{command, Parser};
use futures_core::Stream;
use futures_util::StreamExt;
use http::Request;
use hyper::body::Incoming;
use hyper::header::CONTENT_TYPE;
use hyper::server::conn::AddrStream;
use hyper::service::{make_service_fn, service_fn};
use hyper::{Body, Method, StatusCode};
use hyper_util::rt::{TokioExecutor, TokioIo};
use hyper_util::server::conn;
use parking_lot::RwLock;
use std::collections::HashMap;
use std::convert::Infallible;
@@ -24,6 +28,7 @@ use std::net::SocketAddr;
use std::pin::Pin;
use std::sync::Arc;
use std::time::Duration;
use tokio::net::TcpListener;
use tokio::sync::broadcast;
use tokio::sync::broadcast::error::RecvError;
use tokio::time;
@@ -596,9 +601,7 @@ impl BrokerService for Broker {
}
// We serve only metrics and healthcheck through http1.
async fn http1_handler(
req: hyper::Request<hyper::body::Body>,
) -> Result<hyper::Response<Body>, Infallible> {
async fn http1_handler(req: hyper::Request<Body>) -> Result<hyper::Response<Body>, Infallible> {
let resp = match (req.method(), req.uri().path()) {
(&Method::GET, "/metrics") => {
let mut buffer = vec![];
@@ -662,16 +665,19 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
let storage_broker_server = BrokerServiceServer::new(storage_broker_impl);
info!("listening on {}", &args.listen_addr);
let listener = TcpListener::bind(args.listen_addr).await?;
// grpc is served along with http1 for metrics on a single port, hence we
// don't use tonic's Server.
hyper::Server::bind(&args.listen_addr)
.http2_keep_alive_interval(Some(args.http2_keepalive_interval))
.serve(make_service_fn(move |conn: &AddrStream| {
loop {
let (stream, remote_addr) = listener.accept().await?;
let io = TokioIo::new(stream);
tokio::task::spawn(async move {
let storage_broker_server_cloned = storage_broker_server.clone();
let connect_info = conn.connect_info();
async move {
Ok::<_, Infallible>(service_fn(move |mut req| {
let service = async move {
Ok::<_, Infallible>(service_fn(move |mut req: Request<Incoming>| {
// That's what tonic's MakeSvc.call does to pass conninfo to
// the request handler (and where its request.remote_addr()
// expects it to find).
@@ -690,6 +696,9 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
if req.headers().get("content-type").map(|x| x.as_bytes())
== Some(b"application/grpc")
{
// TODO: this doesn't work :(
let (parts, body) = req.into_parts();
let req = http0_2::Request::from_parts(parts, body);
let res_resp = storage_broker_server_svc.call(req).await;
// Grpc and http1 handlers have slightly different
// Response types: it is UnsyncBoxBody for the
@@ -703,10 +712,17 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
}
}
}))
};
let builder = conn::auto::Builder::new(TokioExecutor::new())
.http2()
.keep_alive_interval(Some(args.http2_keepalive_interval));
if let Err(err) = builder.serve_connection(io, service).await {
tracing::error!("Error serving connection: {:?}", err);
}
}))
.await?;
Ok(())
});
}
}
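Since hyper 1.0 no longer ships hyper::Server, the loop above drives each accepted connection by hand. Below is a stripped-down sketch of the same pattern with a placeholder handler instead of the gRPC/metrics routing; the listen address and response body are made up for illustration.
use std::convert::Infallible;
use http_body_util::Full;
use hyper::body::{Bytes, Incoming};
use hyper::service::service_fn;
use hyper::{Request, Response};
use hyper_util::rt::{TokioExecutor, TokioIo};
use hyper_util::server::conn::auto;
use tokio::net::TcpListener;
#[tokio::main]
async fn main() -> std::io::Result<()> {
    let listener = TcpListener::bind("127.0.0.1:50051").await?;
    loop {
        let (stream, _remote_addr) = listener.accept().await?;
        // hyper 1.0 speaks its own Read/Write traits; TokioIo adapts a TcpStream.
        let io = TokioIo::new(stream);
        tokio::spawn(async move {
            let service = service_fn(|_req: Request<Incoming>| async {
                Ok::<_, Infallible>(Response::new(Full::new(Bytes::from_static(b"ok"))))
            });
            // The auto builder negotiates HTTP/1 vs HTTP/2 per connection.
            let builder = auto::Builder::new(TokioExecutor::new());
            if let Err(err) = builder.serve_connection(io, service).await {
                eprintln!("error serving connection: {err:?}");
            }
        });
    }
}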
#[cfg(test)]

View File

@@ -1,8 +1,6 @@
use hyper::body::HttpBody;
use std::pin::Pin;
use std::task::{Context, Poll};
use std::time::Duration;
use tonic::codegen::StdError;
pub use tonic::transport::Uri;
use tonic::transport::{ClientTlsConfig, Endpoint};
use tonic::{transport::Channel, Status};
use utils::id::{TenantId, TenantTimelineId, TimelineId};
@@ -27,8 +25,6 @@ pub use tonic::Code;
pub use tonic::Request;
pub use tonic::Streaming;
pub use hyper::Uri;
pub const DEFAULT_LISTEN_ADDR: &str = "127.0.0.1:50051";
pub const DEFAULT_ENDPOINT: &str = const_format::formatcp!("http://{DEFAULT_LISTEN_ADDR}");
@@ -99,50 +95,7 @@ pub fn parse_proto_ttid(proto_ttid: &ProtoTenantTimelineId) -> Result<TenantTime
// well.
type AnyError = Box<dyn std::error::Error + Send + Sync + 'static>;
// Provides impl HttpBody for two different types implementing it. Inspired by
// https://github.com/hyperium/tonic/blob/master/examples/src/hyper_warp/server.rs
pub enum EitherBody<A, B> {
Left(A),
Right(B),
}
impl<A, B> HttpBody for EitherBody<A, B>
where
A: HttpBody + Send + Unpin,
B: HttpBody<Data = A::Data> + Send + Unpin,
A::Error: Into<AnyError>,
B::Error: Into<AnyError>,
{
type Data = A::Data;
type Error = Box<dyn std::error::Error + Send + Sync + 'static>;
fn is_end_stream(&self) -> bool {
match self {
EitherBody::Left(b) => b.is_end_stream(),
EitherBody::Right(b) => b.is_end_stream(),
}
}
fn poll_data(
self: Pin<&mut Self>,
cx: &mut Context<'_>,
) -> Poll<Option<Result<Self::Data, Self::Error>>> {
match self.get_mut() {
EitherBody::Left(b) => Pin::new(b).poll_data(cx).map(map_option_err),
EitherBody::Right(b) => Pin::new(b).poll_data(cx).map(map_option_err),
}
}
fn poll_trailers(
self: Pin<&mut Self>,
cx: &mut Context<'_>,
) -> Poll<Result<Option<hyper::HeaderMap>, Self::Error>> {
match self.get_mut() {
EitherBody::Left(b) => Pin::new(b).poll_trailers(cx).map_err(Into::into),
EitherBody::Right(b) => Pin::new(b).poll_trailers(cx).map_err(Into::into),
}
}
}
pub type EitherBody<L, R> = http_body_util::Either<L, R>;
fn map_option_err<T, U: Into<AnyError>>(err: Option<Result<T, U>>) -> Option<Result<T, AnyError>> {
err.map(|e| e.map_err(Into::into))
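With the alias now pointing at http_body_util::Either, callers keep the same Left/Right construction as before without the hand-rolled HttpBody impl. A minimal usage sketch follows, assuming a handler that returns either a metrics payload or an empty 404; the function name and status codes are illustrative, not from this crate.
use bytes::Bytes;
use http::Response;
use http_body_util::{Either, Empty, Full};
// Both arms share one concrete response type thanks to Either.
fn metrics_or_not_found(
    found: bool,
    payload: Bytes,
) -> Response<Either<Full<Bytes>, Empty<Bytes>>> {
    if found {
        Response::new(Either::Left(Full::new(payload)))
    } else {
        Response::builder()
            .status(404)
            .body(Either::Right(Empty::new()))
            .expect("static response")
    }
}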

View File

@@ -12,11 +12,9 @@ from pathlib import Path
# Type-related stuff
from typing import Callable, ClassVar, Dict, Iterator, Optional
import allure
import pytest
from _pytest.config import Config
from _pytest.config.argparsing import Parser
from _pytest.fixtures import FixtureRequest
from _pytest.terminal import TerminalReporter
from fixtures.log_helper import log
@@ -413,10 +411,7 @@ class NeonBenchmarker:
@pytest.fixture(scope="function")
def zenbenchmark(
request: FixtureRequest,
record_property: Callable[[str, object], None],
) -> Iterator[NeonBenchmarker]:
def zenbenchmark(record_property: Callable[[str, object], None]) -> Iterator[NeonBenchmarker]:
"""
This is a pytest fixture for benchmarks. It contains functions for
recording measurements, and prints them out at the end.
@@ -424,21 +419,6 @@ def zenbenchmark(
benchmarker = NeonBenchmarker(record_property)
yield benchmarker
results = {}
for _, recorded_property in request.node.user_properties:
name = recorded_property["name"]
value = str(recorded_property["value"])
if (unit := recorded_property["unit"].strip()) != "":
value += f" {unit}"
results[name] = value
content = json.dumps(results, indent=2)
allure.attach(
content,
"benchmarks.json",
allure.attachment_type.JSON,
)
def pytest_addoption(parser: Parser):
parser.addoption(

View File

@@ -3352,15 +3352,9 @@ class SafekeeperHttpClient(requests.Session):
)
res.raise_for_status()
# only_local doesn't remove segments in the remote storage.
def timeline_delete(
self, tenant_id: TenantId, timeline_id: TimelineId, only_local: bool = False
) -> Dict[Any, Any]:
def timeline_delete_force(self, tenant_id: TenantId, timeline_id: TimelineId) -> Dict[Any, Any]:
res = self.delete(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}",
params={
"only_local": str(only_local).lower(),
},
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}"
)
res.raise_for_status()
res_json = res.json()

View File

@@ -1,11 +1,11 @@
import time
from typing import Any, Dict, List, Optional, Union
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
from mypy_boto3_s3.type_defs import ListObjectsV2OutputTypeDef, ObjectTypeDef
from fixtures.log_helper import log
from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
from fixtures.remote_storage import RemoteStorage, RemoteStorageKind, S3Storage
from fixtures.remote_storage import RemoteStorageKind, S3Storage
from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId
from fixtures.utils import wait_until
@@ -233,18 +233,23 @@ def timeline_delete_wait_completed(
wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id, iterations, interval)
# remote_storage must not be None; accepting Optional just makes it easier for callers to satisfy mypy
if TYPE_CHECKING:
# TODO avoid by combining remote storage related stuff in single type
# and just passing in this type instead of whole builder
from fixtures.neon_fixtures import NeonEnvBuilder
def assert_prefix_empty(
remote_storage: Optional[RemoteStorage],
neon_env_builder: "NeonEnvBuilder",
prefix: Optional[str] = None,
allowed_postfix: Optional[str] = None,
):
assert remote_storage is not None
response = list_prefix(remote_storage, prefix)
response = list_prefix(neon_env_builder, prefix)
keys = response["KeyCount"]
objects: List[ObjectTypeDef] = response.get("Contents", [])
common_prefixes = response.get("CommonPrefixes", [])
remote_storage = neon_env_builder.pageserver_remote_storage
is_mock_s3 = isinstance(remote_storage, S3Storage) and not remote_storage.cleanup
if is_mock_s3:
@@ -278,20 +283,19 @@ def assert_prefix_empty(
), f"remote dir with prefix {prefix} is not empty after deletion: {objects}"
# remote_storage must not be None; accepting Optional just makes it easier for callers to satisfy mypy
def assert_prefix_not_empty(remote_storage: Optional[RemoteStorage], prefix: Optional[str] = None):
assert remote_storage is not None
response = list_prefix(remote_storage, prefix)
def assert_prefix_not_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None):
response = list_prefix(neon_env_builder, prefix)
assert response["KeyCount"] != 0, f"remote dir with prefix {prefix} is empty: {response}"
def list_prefix(
remote: RemoteStorage, prefix: Optional[str] = None, delimiter: str = "/"
neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None, delimiter: str = "/"
) -> ListObjectsV2OutputTypeDef:
"""
Note that this function takes into account prefix_in_bucket.
"""
# For local_fs we need to properly handle empty directories, which we currently don't, so for simplicity stick to the S3 API.
remote = neon_env_builder.pageserver_remote_storage
assert isinstance(remote, S3Storage), "localfs is currently not supported"
assert remote.client is not None

View File

@@ -216,14 +216,8 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
log.info(f"group: {m.group(1)}")
return int(m.group(1), 16)
assert neon_env_builder.pageserver_remote_storage is not None
pre_upgrade_keys = list(
[
o["Key"]
for o in list_prefix(neon_env_builder.pageserver_remote_storage, delimiter="")[
"Contents"
]
]
[o["Key"] for o in list_prefix(neon_env_builder, delimiter="")["Contents"]]
)
for key in pre_upgrade_keys:
assert parse_generation_suffix(key) is None
@@ -238,12 +232,7 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
legacy_objects: list[str] = []
suffixed_objects = []
post_upgrade_keys = list(
[
o["Key"]
for o in list_prefix(neon_env_builder.pageserver_remote_storage, delimiter="")[
"Contents"
]
]
[o["Key"] for o in list_prefix(neon_env_builder, delimiter="")["Contents"]]
)
for key in post_upgrade_keys:
log.info(f"post-upgrade key: {key}")

View File

@@ -504,7 +504,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
tenant_delete_wait_completed(ps_attached.http_client(), tenant_id, 10)
assert_prefix_empty(
neon_env_builder.pageserver_remote_storage,
neon_env_builder,
prefix="/".join(
(
"tenants",

View File

@@ -75,7 +75,7 @@ def test_tenant_delete_smoke(
wait_for_last_flush_lsn(env, endpoint, tenant=tenant_id, timeline=timeline_id)
assert_prefix_not_empty(
neon_env_builder.pageserver_remote_storage,
neon_env_builder,
prefix="/".join(
(
"tenants",
@@ -96,7 +96,7 @@ def test_tenant_delete_smoke(
assert not tenant_path.exists()
assert_prefix_empty(
neon_env_builder.pageserver_remote_storage,
neon_env_builder,
prefix="/".join(
(
"tenants",
@@ -207,7 +207,7 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
assert_prefix_not_empty(
neon_env_builder.pageserver_remote_storage,
neon_env_builder,
prefix="/".join(
(
"tenants",
@@ -268,7 +268,7 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
# Check remote is empty
assert_prefix_empty(
neon_env_builder.pageserver_remote_storage,
neon_env_builder,
prefix="/".join(
(
"tenants",
@@ -304,7 +304,7 @@ def test_tenant_delete_is_resumed_on_attach(
# sanity check, data should be there
assert_prefix_not_empty(
neon_env_builder.pageserver_remote_storage,
neon_env_builder,
prefix="/".join(
(
"tenants",
@@ -343,7 +343,7 @@ def test_tenant_delete_is_resumed_on_attach(
)
assert_prefix_not_empty(
neon_env_builder.pageserver_remote_storage,
neon_env_builder,
prefix="/".join(
(
"tenants",
@@ -378,7 +378,7 @@ def test_tenant_delete_is_resumed_on_attach(
ps_http.deletion_queue_flush(execute=True)
assert_prefix_empty(
neon_env_builder.pageserver_remote_storage,
neon_env_builder,
prefix="/".join(
(
"tenants",
@@ -543,7 +543,7 @@ def test_tenant_delete_concurrent(
# Physical deletion should have happened
assert_prefix_empty(
neon_env_builder.pageserver_remote_storage,
neon_env_builder,
prefix="/".join(
(
"tenants",
@@ -645,7 +645,7 @@ def test_tenant_delete_races_timeline_creation(
# Physical deletion should have happened
assert_prefix_empty(
neon_env_builder.pageserver_remote_storage,
neon_env_builder,
prefix="/".join(
(
"tenants",

View File

@@ -191,7 +191,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
last_flush_lsn_upload(env, endpoint, env.initial_tenant, timeline_id)
assert_prefix_not_empty(
neon_env_builder.pageserver_remote_storage,
neon_env_builder,
prefix="/".join(
(
"tenants",
@@ -275,7 +275,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
# Check remote is empty
if remote_storage_kind is RemoteStorageKind.MOCK_S3:
assert_prefix_empty(
neon_env_builder.pageserver_remote_storage,
neon_env_builder,
prefix="/".join(
(
"tenants",
@@ -449,7 +449,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
assert all([tl["state"] == "Active" for tl in timelines])
assert_prefix_empty(
neon_env_builder.pageserver_remote_storage,
neon_env_builder,
prefix="/".join(
(
"tenants",
@@ -466,7 +466,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
)
assert_prefix_empty(
neon_env_builder.pageserver_remote_storage,
neon_env_builder,
prefix="/".join(
(
"tenants",
@@ -482,7 +482,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
wait_until(
2,
0.5,
lambda: assert_prefix_empty(neon_env_builder.pageserver_remote_storage),
lambda: assert_prefix_empty(neon_env_builder),
)
@@ -673,7 +673,7 @@ def test_timeline_delete_works_for_remote_smoke(
for timeline_id in timeline_ids:
assert_prefix_not_empty(
neon_env_builder.pageserver_remote_storage,
neon_env_builder,
prefix="/".join(
(
"tenants",
@@ -690,7 +690,7 @@ def test_timeline_delete_works_for_remote_smoke(
timeline_delete_wait_completed(ps_http, tenant_id=tenant_id, timeline_id=timeline_id)
assert_prefix_empty(
neon_env_builder.pageserver_remote_storage,
neon_env_builder,
prefix="/".join(
(
"tenants",
@@ -703,7 +703,7 @@ def test_timeline_delete_works_for_remote_smoke(
# for some reason the check above doesn't immediately take effect for the check below.
# Assume it is mock server inconsistency and check twice.
wait_until(2, 0.5, lambda: assert_prefix_empty(neon_env_builder.pageserver_remote_storage))
wait_until(2, 0.5, lambda: assert_prefix_empty(neon_env_builder))
def test_delete_orphaned_objects(
@@ -791,7 +791,7 @@ def test_timeline_delete_resumed_on_attach(
last_flush_lsn_upload(env, endpoint, env.initial_tenant, timeline_id)
assert_prefix_not_empty(
neon_env_builder.pageserver_remote_storage,
neon_env_builder,
prefix="/".join(
(
"tenants",
@@ -839,7 +839,7 @@ def test_timeline_delete_resumed_on_attach(
assert reason.endswith(f"failpoint: {failpoint}"), reason
assert_prefix_not_empty(
neon_env_builder.pageserver_remote_storage,
neon_env_builder,
prefix="/".join(
(
"tenants",
@@ -870,7 +870,7 @@ def test_timeline_delete_resumed_on_attach(
assert not tenant_path.exists()
assert_prefix_empty(
neon_env_builder.pageserver_remote_storage,
neon_env_builder,
prefix="/".join(
(
"tenants",

View File

@@ -33,19 +33,13 @@ from fixtures.neon_fixtures import (
last_flush_lsn_upload,
)
from fixtures.pageserver.utils import (
assert_prefix_empty,
assert_prefix_not_empty,
timeline_delete_wait_completed,
wait_for_last_record_lsn,
wait_for_upload,
)
from fixtures.pg_version import PgVersion
from fixtures.port_distributor import PortDistributor
from fixtures.remote_storage import (
RemoteStorageKind,
default_remote_storage,
s3_storage,
)
from fixtures.remote_storage import RemoteStorageKind, default_remote_storage
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import get_dir_size, query_scalar, start_in_background
@@ -124,8 +118,7 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder):
with env.pageserver.http_client() as pageserver_http:
timeline_details = [
pageserver_http.timeline_detail(
tenant_id=tenant_id,
timeline_id=branch_names_to_timeline_ids[branch_name],
tenant_id=tenant_id, timeline_id=branch_names_to_timeline_ids[branch_name]
)
for branch_name in branch_names
]
@@ -464,19 +457,10 @@ def is_wal_trimmed(sk: Safekeeper, tenant_id: TenantId, timeline_id: TimelineId,
def test_wal_backup(neon_env_builder: NeonEnvBuilder):
neon_env_builder.num_safekeepers = 3
remote_storage_kind = s3_storage()
neon_env_builder.enable_safekeeper_remote_storage(remote_storage_kind)
neon_env_builder.enable_safekeeper_remote_storage(default_remote_storage())
env = neon_env_builder.init_start()
# These are expected after timeline deletion on safekeepers.
env.pageserver.allowed_errors.extend(
[
".*Timeline .* was not found in global map.*",
".*Timeline .* was cancelled and cannot be used anymore.*",
]
)
tenant_id = env.initial_tenant
timeline_id = env.neon_cli.create_branch("test_safekeepers_wal_backup")
endpoint = env.endpoints.create_start("test_safekeepers_wal_backup")
@@ -504,8 +488,7 @@ def test_wal_backup(neon_env_builder: NeonEnvBuilder):
# put one of safekeepers down again
env.safekeepers[0].stop()
# restart postgres
endpoint.stop()
endpoint = env.endpoints.create_start("test_safekeepers_wal_backup")
endpoint.stop_and_destroy().create_start("test_safekeepers_wal_backup")
# and ensure offloading still works
with closing(endpoint.connect()) as conn:
with conn.cursor() as cur:
@@ -515,17 +498,6 @@ def test_wal_backup(neon_env_builder: NeonEnvBuilder):
partial(is_segment_offloaded, env.safekeepers[1], tenant_id, timeline_id, seg_end),
f"segment ending at {seg_end} get offloaded",
)
env.safekeepers[0].start()
endpoint.stop()
# Test that after timeline deletion remote objects are gone.
prefix = "/".join([str(tenant_id), str(timeline_id)])
assert_prefix_not_empty(neon_env_builder.safekeepers_remote_storage, prefix)
for sk in env.safekeepers:
sk_http = sk.http_client()
sk_http.timeline_delete(tenant_id, timeline_id)
assert_prefix_empty(neon_env_builder.safekeepers_remote_storage, prefix)
def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder):
@@ -614,7 +586,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder):
# advancing peer_horizon_lsn.
for sk in env.safekeepers:
cli = sk.http_client()
cli.timeline_delete(tenant_id, timeline_id, only_local=True)
cli.timeline_delete_force(tenant_id, timeline_id)
# restart safekeeper to clear its in-memory state
sk.stop()
# wait for all potential in-flight pushes to the broker to arrive before starting
@@ -1651,7 +1623,7 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
endpoint_3.stop_and_destroy()
# Remove initial tenant's br1 (active)
assert sk_http.timeline_delete(tenant_id, timeline_id_1)["dir_existed"]
assert sk_http.timeline_delete_force(tenant_id, timeline_id_1)["dir_existed"]
assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists()
assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir()
assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir()
@@ -1659,7 +1631,7 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir()
# Ensure repeated deletion succeeds
assert not sk_http.timeline_delete(tenant_id, timeline_id_1)["dir_existed"]
assert not sk_http.timeline_delete_force(tenant_id, timeline_id_1)["dir_existed"]
assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists()
assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir()
assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir()
@@ -1670,13 +1642,13 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
# Ensure we cannot delete the other tenant
for sk_h in [sk_http, sk_http_noauth]:
with pytest.raises(sk_h.HTTPError, match="Forbidden|Unauthorized"):
assert sk_h.timeline_delete(tenant_id_other, timeline_id_other)
assert sk_h.timeline_delete_force(tenant_id_other, timeline_id_other)
with pytest.raises(sk_h.HTTPError, match="Forbidden|Unauthorized"):
assert sk_h.tenant_delete_force(tenant_id_other)
assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir()
# Remove initial tenant's br2 (inactive)
assert sk_http.timeline_delete(tenant_id, timeline_id_2)["dir_existed"]
assert sk_http.timeline_delete_force(tenant_id, timeline_id_2)["dir_existed"]
assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists()
assert not (sk_data_dir / str(tenant_id) / str(timeline_id_2)).exists()
assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir()
@@ -1684,7 +1656,7 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir()
# Remove non-existing branch, should succeed
assert not sk_http.timeline_delete(tenant_id, TimelineId("00" * 16))["dir_existed"]
assert not sk_http.timeline_delete_force(tenant_id, TimelineId("00" * 16))["dir_existed"]
assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists()
assert not (sk_data_dir / str(tenant_id) / str(timeline_id_2)).exists()
assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).exists()

View File

@@ -6,7 +6,7 @@ commands:
sysvInitAction: sysinit
shell: 'cgconfigparser -l /etc/cgconfig.conf -s 1664'
- name: pgbouncer
user: postgres
user: nobody
sysvInitAction: respawn
shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini'
- name: postgres-exporter
@@ -36,9 +36,7 @@ files:
max_client_conn=10000
default_pool_size=64
max_prepared_statements=0
admin_users=postgres
unix_socket_dir=/tmp/
unix_socket_mode=0777
admin_users=cloud_admin
- filename: cgconfig.conf
content: |
# Configuration for cgroups in VM compute nodes
@@ -200,7 +198,7 @@ merge: |
RUN set -e \
&& chown postgres:postgres /etc/pgbouncer.ini \
&& chmod 0666 /etc/pgbouncer.ini \
&& chmod 0644 /etc/pgbouncer.ini \
&& chmod 0644 /etc/cgconfig.conf \
&& chmod 0644 /etc/sql_exporter.yml \
&& chmod 0644 /etc/neon_collector.yml