mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-18 05:30:37 +00:00
Compare commits
36 Commits
erik/oltp-
...
ci-run/pr-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
7cdb292b37 | ||
|
|
a55e0192dc | ||
|
|
fa3ceab30e | ||
|
|
7710c18761 | ||
|
|
1e14d784f4 | ||
|
|
2574bbe072 | ||
|
|
d74c715602 | ||
|
|
69cfd1f7e0 | ||
|
|
5dea3e2195 | ||
|
|
a04e33ceb6 | ||
|
|
af0be11503 | ||
|
|
405a17bf0b | ||
|
|
63ee8e2181 | ||
|
|
2c21a65b0b | ||
|
|
ec66b788e2 | ||
|
|
af12647b9d | ||
|
|
1c237d0c6d | ||
|
|
afd34291ca | ||
|
|
66f80e77ba | ||
|
|
72832b3214 | ||
|
|
d11f23a341 | ||
|
|
e7502a3d63 | ||
|
|
ef8101a9be | ||
|
|
d2825e72ad | ||
|
|
a6ff8ec3d4 | ||
|
|
cf62017a5b | ||
|
|
c610f3584d | ||
|
|
c9ca8b7c4a | ||
|
|
7679b63a2c | ||
|
|
d177654e5f | ||
|
|
a09c933de3 | ||
|
|
6138d61592 | ||
|
|
a7142f3bc6 | ||
|
|
7791a49dd4 | ||
|
|
8a6d0dccaa | ||
|
|
7ffcbfde9a |
@@ -19,6 +19,7 @@
|
||||
!pageserver/
|
||||
!pgxn/
|
||||
!proxy/
|
||||
!object_storage/
|
||||
!storage_scrubber/
|
||||
!safekeeper/
|
||||
!storage_broker/
|
||||
|
||||
1
.gitignore
vendored
1
.gitignore
vendored
@@ -1,3 +1,4 @@
|
||||
/artifact_cache
|
||||
/pg_install
|
||||
/target
|
||||
/tmp_check
|
||||
|
||||
55
Cargo.lock
generated
55
Cargo.lock
generated
@@ -3991,6 +3991,33 @@ dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "object_storage"
|
||||
version = "0.0.1"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"axum",
|
||||
"axum-extra",
|
||||
"camino",
|
||||
"camino-tempfile",
|
||||
"futures",
|
||||
"http-body-util",
|
||||
"itertools 0.10.5",
|
||||
"jsonwebtoken",
|
||||
"prometheus",
|
||||
"rand 0.8.5",
|
||||
"remote_storage",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"test-log",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
"tower 0.5.2",
|
||||
"tracing",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "once_cell"
|
||||
version = "1.20.2"
|
||||
@@ -4693,7 +4720,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres-protocol"
|
||||
version = "0.6.6"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#1f21e7959a96a34dcfbfce1b14b73286cdadffe9"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#f3cf448febde5fd298071d54d568a9c875a7a62b"
|
||||
dependencies = [
|
||||
"base64 0.22.1",
|
||||
"byteorder",
|
||||
@@ -4727,7 +4754,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres-types"
|
||||
version = "0.2.6"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#1f21e7959a96a34dcfbfce1b14b73286cdadffe9"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#f3cf448febde5fd298071d54d568a9c875a7a62b"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"chrono",
|
||||
@@ -6925,6 +6952,28 @@ dependencies = [
|
||||
"syn 2.0.100",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "test-log"
|
||||
version = "0.2.17"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e7f46083d221181166e5b6f6b1e5f1d499f3a76888826e6cb1d057554157cd0f"
|
||||
dependencies = [
|
||||
"env_logger",
|
||||
"test-log-macros",
|
||||
"tracing-subscriber",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "test-log-macros"
|
||||
version = "0.2.17"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "888d0c3c6db53c0fdab160d2ed5e12ba745383d3e85813f2ea0f2b1475ab553f"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.100",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thiserror"
|
||||
version = "1.0.69"
|
||||
@@ -7172,7 +7221,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "tokio-postgres"
|
||||
version = "0.7.10"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#1f21e7959a96a34dcfbfce1b14b73286cdadffe9"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#f3cf448febde5fd298071d54d568a9c875a7a62b"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"byteorder",
|
||||
|
||||
@@ -40,6 +40,7 @@ members = [
|
||||
"libs/proxy/postgres-protocol2",
|
||||
"libs/proxy/postgres-types2",
|
||||
"libs/proxy/tokio-postgres2",
|
||||
"object_storage",
|
||||
]
|
||||
|
||||
[workspace.package]
|
||||
@@ -208,6 +209,7 @@ tracing-opentelemetry = "0.28"
|
||||
tracing-serde = "0.2.0"
|
||||
tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
|
||||
try-lock = "0.2.5"
|
||||
test-log = { version = "0.2.17", default-features = false, features = ["log"] }
|
||||
twox-hash = { version = "1.6.3", default-features = false }
|
||||
typed-json = "0.1"
|
||||
url = "2.2"
|
||||
|
||||
@@ -89,6 +89,7 @@ RUN set -e \
|
||||
--bin storage_broker \
|
||||
--bin storage_controller \
|
||||
--bin proxy \
|
||||
--bin object_storage \
|
||||
--bin neon_local \
|
||||
--bin storage_scrubber \
|
||||
--locked --release
|
||||
@@ -121,6 +122,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_controller /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/object_storage /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_scrubber /usr/local/bin
|
||||
|
||||
|
||||
@@ -118,16 +118,18 @@ struct Cli {
|
||||
#[arg(long)]
|
||||
pub set_disk_quota_for_fs: Option<String>,
|
||||
|
||||
#[arg(short = 's', long = "spec", group = "spec")]
|
||||
pub spec_json: Option<String>,
|
||||
|
||||
#[arg(short = 'S', long, group = "spec-path")]
|
||||
pub spec_path: Option<OsString>,
|
||||
|
||||
#[arg(short = 'i', long, group = "compute-id")]
|
||||
pub compute_id: String,
|
||||
|
||||
#[arg(short = 'p', long, conflicts_with_all = ["spec", "spec-path"], value_name = "CONTROL_PLANE_API_BASE_URL")]
|
||||
#[arg(
|
||||
short = 'p',
|
||||
long,
|
||||
conflicts_with = "spec-path",
|
||||
value_name = "CONTROL_PLANE_API_BASE_URL"
|
||||
)]
|
||||
pub control_plane_uri: Option<String>,
|
||||
}
|
||||
|
||||
@@ -172,7 +174,6 @@ fn main() -> Result<()> {
|
||||
cgroup: cli.cgroup,
|
||||
#[cfg(target_os = "linux")]
|
||||
vm_monitor_addr: cli.vm_monitor_addr,
|
||||
live_config_allowed: cli_spec.live_config_allowed,
|
||||
},
|
||||
cli_spec.spec,
|
||||
cli_spec.compute_ctl_config,
|
||||
@@ -201,23 +202,12 @@ async fn init() -> Result<()> {
|
||||
}
|
||||
|
||||
fn try_spec_from_cli(cli: &Cli) -> Result<CliSpecParams> {
|
||||
// First, try to get cluster spec from the cli argument
|
||||
if let Some(ref spec_json) = cli.spec_json {
|
||||
info!("got spec from cli argument {}", spec_json);
|
||||
return Ok(CliSpecParams {
|
||||
spec: Some(serde_json::from_str(spec_json)?),
|
||||
compute_ctl_config: ComputeCtlConfig::default(),
|
||||
live_config_allowed: false,
|
||||
});
|
||||
}
|
||||
|
||||
// Second, try to read it from the file if path is provided
|
||||
// First, read spec from the path if provided
|
||||
if let Some(ref spec_path) = cli.spec_path {
|
||||
let file = File::open(Path::new(spec_path))?;
|
||||
return Ok(CliSpecParams {
|
||||
spec: Some(serde_json::from_reader(file)?),
|
||||
compute_ctl_config: ComputeCtlConfig::default(),
|
||||
live_config_allowed: true,
|
||||
});
|
||||
}
|
||||
|
||||
@@ -225,11 +215,12 @@ fn try_spec_from_cli(cli: &Cli) -> Result<CliSpecParams> {
|
||||
panic!("must specify --control-plane-uri");
|
||||
};
|
||||
|
||||
// If the spec wasn't provided in the CLI arguments, then retrieve it from
|
||||
// the control plane
|
||||
match get_spec_from_control_plane(cli.control_plane_uri.as_ref().unwrap(), &cli.compute_id) {
|
||||
Ok(resp) => Ok(CliSpecParams {
|
||||
spec: resp.0,
|
||||
compute_ctl_config: resp.1,
|
||||
live_config_allowed: true,
|
||||
}),
|
||||
Err(e) => {
|
||||
error!(
|
||||
@@ -247,7 +238,6 @@ struct CliSpecParams {
|
||||
spec: Option<ComputeSpec>,
|
||||
#[allow(dead_code)]
|
||||
compute_ctl_config: ComputeCtlConfig,
|
||||
live_config_allowed: bool,
|
||||
}
|
||||
|
||||
fn deinit_and_exit(exit_code: Option<i32>) -> ! {
|
||||
|
||||
@@ -98,13 +98,15 @@ pub async fn get_database_schema(
|
||||
.kill_on_drop(true)
|
||||
.spawn()?;
|
||||
|
||||
let stdout = cmd.stdout.take().ok_or_else(|| {
|
||||
std::io::Error::new(std::io::ErrorKind::Other, "Failed to capture stdout.")
|
||||
})?;
|
||||
let stdout = cmd
|
||||
.stdout
|
||||
.take()
|
||||
.ok_or_else(|| std::io::Error::other("Failed to capture stdout."))?;
|
||||
|
||||
let stderr = cmd.stderr.take().ok_or_else(|| {
|
||||
std::io::Error::new(std::io::ErrorKind::Other, "Failed to capture stderr.")
|
||||
})?;
|
||||
let stderr = cmd
|
||||
.stderr
|
||||
.take()
|
||||
.ok_or_else(|| std::io::Error::other("Failed to capture stderr."))?;
|
||||
|
||||
let mut stdout_reader = FramedRead::new(stdout, BytesCodec::new());
|
||||
let stderr_reader = BufReader::new(stderr);
|
||||
@@ -128,8 +130,7 @@ pub async fn get_database_schema(
|
||||
}
|
||||
});
|
||||
|
||||
return Err(SchemaDumpError::IO(std::io::Error::new(
|
||||
std::io::ErrorKind::Other,
|
||||
return Err(SchemaDumpError::IO(std::io::Error::other(
|
||||
"failed to start pg_dump",
|
||||
)));
|
||||
}
|
||||
|
||||
@@ -93,20 +93,6 @@ pub struct ComputeNodeParams {
|
||||
|
||||
/// the address of extension storage proxy gateway
|
||||
pub ext_remote_storage: Option<String>,
|
||||
|
||||
/// We should only allow live re- / configuration of the compute node if
|
||||
/// it uses 'pull model', i.e. it can go to control-plane and fetch
|
||||
/// the latest configuration. Otherwise, there could be a case:
|
||||
/// - we start compute with some spec provided as argument
|
||||
/// - we push new spec and it does reconfiguration
|
||||
/// - but then something happens and compute pod / VM is destroyed,
|
||||
/// so k8s controller starts it again with the **old** spec
|
||||
///
|
||||
/// and the same for empty computes:
|
||||
/// - we started compute without any spec
|
||||
/// - we push spec and it does configuration
|
||||
/// - but then it is restarted without any spec again
|
||||
pub live_config_allowed: bool,
|
||||
}
|
||||
|
||||
/// Compute node info shared across several `compute_ctl` threads.
|
||||
@@ -661,15 +647,8 @@ impl ComputeNode {
|
||||
}
|
||||
|
||||
// Configure and start rsyslog for Postgres logs export
|
||||
if self.has_feature(ComputeFeature::PostgresLogsExport) {
|
||||
if let Some(ref project_id) = pspec.spec.cluster.cluster_id {
|
||||
let host = PostgresLogsRsyslogConfig::default_host(project_id);
|
||||
let conf = PostgresLogsRsyslogConfig::new(Some(&host));
|
||||
configure_postgres_logs_export(conf)?;
|
||||
} else {
|
||||
warn!("not configuring rsyslog for Postgres logs export: project ID is missing")
|
||||
}
|
||||
}
|
||||
let conf = PostgresLogsRsyslogConfig::new(pspec.spec.logs_export_host.as_deref());
|
||||
configure_postgres_logs_export(conf)?;
|
||||
|
||||
// Launch remaining service threads
|
||||
let _monitor_handle = launch_monitor(self);
|
||||
@@ -1573,6 +1552,10 @@ impl ComputeNode {
|
||||
});
|
||||
}
|
||||
|
||||
// Reconfigure rsyslog for Postgres logs export
|
||||
let conf = PostgresLogsRsyslogConfig::new(spec.logs_export_host.as_deref());
|
||||
configure_postgres_logs_export(conf)?;
|
||||
|
||||
// Write new config
|
||||
let pgdata_path = Path::new(&self.params.pgdata);
|
||||
config::write_postgres_conf(
|
||||
|
||||
@@ -7,7 +7,7 @@ use std::io::prelude::*;
|
||||
use std::path::Path;
|
||||
|
||||
use compute_api::responses::TlsConfig;
|
||||
use compute_api::spec::{ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, GenericOption};
|
||||
use compute_api::spec::{ComputeAudit, ComputeMode, ComputeSpec, GenericOption};
|
||||
|
||||
use crate::pg_helpers::{
|
||||
GenericOptionExt, GenericOptionsSearch, PgOptionsSerialize, escape_conf_value,
|
||||
@@ -255,7 +255,7 @@ pub fn write_postgres_conf(
|
||||
|
||||
// We need Postgres to send logs to rsyslog so that we can forward them
|
||||
// further to customers' log aggregation systems.
|
||||
if spec.features.contains(&ComputeFeature::PostgresLogsExport) {
|
||||
if spec.logs_export_host.is_some() {
|
||||
writeln!(file, "log_destination='stderr,syslog'")?;
|
||||
}
|
||||
|
||||
|
||||
@@ -6,20 +6,15 @@ use axum_extra::{
|
||||
TypedHeader,
|
||||
headers::{Authorization, authorization::Bearer},
|
||||
};
|
||||
use compute_api::requests::ComputeClaims;
|
||||
use futures::future::BoxFuture;
|
||||
use http::{Request, Response, StatusCode};
|
||||
use jsonwebtoken::{Algorithm, DecodingKey, TokenData, Validation, jwk::JwkSet};
|
||||
use serde::Deserialize;
|
||||
use tower_http::auth::AsyncAuthorizeRequest;
|
||||
use tracing::warn;
|
||||
|
||||
use crate::http::{JsonResponse, extract::RequestId};
|
||||
|
||||
#[derive(Clone, Debug, Deserialize)]
|
||||
pub(in crate::http) struct Claims {
|
||||
compute_id: String,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub(in crate::http) struct Authorize {
|
||||
compute_id: String,
|
||||
@@ -112,7 +107,11 @@ impl AsyncAuthorizeRequest<Body> for Authorize {
|
||||
|
||||
impl Authorize {
|
||||
/// Verify the token using the JSON Web Key set and return the token data.
|
||||
fn verify(jwks: &JwkSet, token: &str, validation: &Validation) -> Result<TokenData<Claims>> {
|
||||
fn verify(
|
||||
jwks: &JwkSet,
|
||||
token: &str,
|
||||
validation: &Validation,
|
||||
) -> Result<TokenData<ComputeClaims>> {
|
||||
for jwk in jwks.keys.iter() {
|
||||
let decoding_key = match DecodingKey::from_jwk(jwk) {
|
||||
Ok(key) => key,
|
||||
@@ -127,7 +126,7 @@ impl Authorize {
|
||||
}
|
||||
};
|
||||
|
||||
match jsonwebtoken::decode::<Claims>(token, &decoding_key, validation) {
|
||||
match jsonwebtoken::decode::<ComputeClaims>(token, &decoding_key, validation) {
|
||||
Ok(data) => return Ok(data),
|
||||
Err(e) => {
|
||||
warn!(
|
||||
|
||||
@@ -306,36 +306,6 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/GenericError"
|
||||
|
||||
/configure_telemetry:
|
||||
post:
|
||||
tags:
|
||||
- Configure
|
||||
summary: Configure rsyslog
|
||||
description: |
|
||||
This API endpoint configures rsyslog to forward Postgres logs
|
||||
to a specified otel collector.
|
||||
operationId: configureTelemetry
|
||||
requestBody:
|
||||
required: true
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
type: object
|
||||
properties:
|
||||
logs_export_host:
|
||||
type: string
|
||||
description: |
|
||||
Hostname and the port of the otel collector. Leave empty to disable logs forwarding.
|
||||
Example: config-shy-breeze-123-collector-monitoring.neon-telemetry.svc.cluster.local:54526
|
||||
responses:
|
||||
204:
|
||||
description: "Telemetry configured successfully"
|
||||
500:
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/GenericError"
|
||||
|
||||
components:
|
||||
securitySchemes:
|
||||
JWT:
|
||||
|
||||
@@ -1,11 +1,9 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use axum::body::Body;
|
||||
use axum::extract::State;
|
||||
use axum::response::Response;
|
||||
use compute_api::requests::{ConfigurationRequest, ConfigureTelemetryRequest};
|
||||
use compute_api::requests::ConfigurationRequest;
|
||||
use compute_api::responses::{ComputeStatus, ComputeStatusResponse};
|
||||
use compute_api::spec::ComputeFeature;
|
||||
use http::StatusCode;
|
||||
use tokio::task;
|
||||
use tracing::info;
|
||||
@@ -13,7 +11,6 @@ use tracing::info;
|
||||
use crate::compute::{ComputeNode, ParsedSpec};
|
||||
use crate::http::JsonResponse;
|
||||
use crate::http::extract::Json;
|
||||
use crate::rsyslog::{PostgresLogsRsyslogConfig, configure_postgres_logs_export};
|
||||
|
||||
// Accept spec in JSON format and request compute configuration. If anything
|
||||
// goes wrong after we set the compute status to `ConfigurationPending` and
|
||||
@@ -25,13 +22,6 @@ pub(in crate::http) async fn configure(
|
||||
State(compute): State<Arc<ComputeNode>>,
|
||||
request: Json<ConfigurationRequest>,
|
||||
) -> Response {
|
||||
if !compute.params.live_config_allowed {
|
||||
return JsonResponse::error(
|
||||
StatusCode::PRECONDITION_FAILED,
|
||||
"live configuration is not allowed for this compute node".to_string(),
|
||||
);
|
||||
}
|
||||
|
||||
let pspec = match ParsedSpec::try_from(request.spec.clone()) {
|
||||
Ok(p) => p,
|
||||
Err(e) => return JsonResponse::error(StatusCode::BAD_REQUEST, e),
|
||||
@@ -95,25 +85,3 @@ pub(in crate::http) async fn configure(
|
||||
|
||||
JsonResponse::success(StatusCode::OK, body)
|
||||
}
|
||||
|
||||
pub(in crate::http) async fn configure_telemetry(
|
||||
State(compute): State<Arc<ComputeNode>>,
|
||||
request: Json<ConfigureTelemetryRequest>,
|
||||
) -> Response {
|
||||
if !compute.has_feature(ComputeFeature::PostgresLogsExport) {
|
||||
return JsonResponse::error(
|
||||
StatusCode::PRECONDITION_FAILED,
|
||||
"Postgres logs export feature is not enabled".to_string(),
|
||||
);
|
||||
}
|
||||
|
||||
let conf = PostgresLogsRsyslogConfig::new(request.logs_export_host.as_deref());
|
||||
if let Err(err) = configure_postgres_logs_export(conf) {
|
||||
return JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, err.to_string());
|
||||
}
|
||||
|
||||
Response::builder()
|
||||
.status(StatusCode::NO_CONTENT)
|
||||
.body(Body::from(""))
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
@@ -87,7 +87,6 @@ impl From<&Server> for Router<Arc<ComputeNode>> {
|
||||
let authenticated_router = Router::<Arc<ComputeNode>>::new()
|
||||
.route("/check_writability", post(check_writability::is_writable))
|
||||
.route("/configure", post(configure::configure))
|
||||
.route("/configure_telemetry", post(configure::configure_telemetry))
|
||||
.route("/database_schema", get(database_schema::get_schema_dump))
|
||||
.route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects))
|
||||
.route("/insights", get(insights::get_insights))
|
||||
|
||||
@@ -119,16 +119,9 @@ impl<'a> PostgresLogsRsyslogConfig<'a> {
|
||||
};
|
||||
Ok(config_content)
|
||||
}
|
||||
|
||||
/// Returns the default host for otel collector that receives Postgres logs
|
||||
pub fn default_host(project_id: &str) -> String {
|
||||
format!(
|
||||
"config-{}-collector.neon-telemetry.svc.cluster.local:10514",
|
||||
project_id
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/// Writes rsyslogd configuration for Postgres logs export and restarts rsyslog.
|
||||
pub fn configure_postgres_logs_export(conf: PostgresLogsRsyslogConfig) -> Result<()> {
|
||||
let new_config = conf.build()?;
|
||||
let current_config = PostgresLogsRsyslogConfig::current_config()?;
|
||||
@@ -261,16 +254,5 @@ mod tests {
|
||||
let res = conf.build();
|
||||
assert!(res.is_err());
|
||||
}
|
||||
|
||||
{
|
||||
// Verify config with default host
|
||||
let host = PostgresLogsRsyslogConfig::default_host("shy-breeze-123");
|
||||
let conf = PostgresLogsRsyslogConfig::new(Some(&host));
|
||||
let res = conf.build();
|
||||
assert!(res.is_ok());
|
||||
let conf_str = res.unwrap();
|
||||
assert!(conf_str.contains(r#"shy-breeze-123"#));
|
||||
assert!(conf_str.contains(r#"port="10514""#));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -20,8 +20,10 @@ use compute_api::spec::ComputeMode;
|
||||
use control_plane::endpoint::ComputeControlPlane;
|
||||
use control_plane::local_env::{
|
||||
InitForceMode, LocalEnv, NeonBroker, NeonLocalInitConf, NeonLocalInitPageserverConf,
|
||||
SafekeeperConf,
|
||||
ObjectStorageConf, SafekeeperConf,
|
||||
};
|
||||
use control_plane::object_storage::OBJECT_STORAGE_DEFAULT_PORT;
|
||||
use control_plane::object_storage::ObjectStorage;
|
||||
use control_plane::pageserver::PageServerNode;
|
||||
use control_plane::safekeeper::SafekeeperNode;
|
||||
use control_plane::storage_controller::{
|
||||
@@ -39,7 +41,7 @@ use pageserver_api::controller_api::{
|
||||
use pageserver_api::models::{
|
||||
ShardParameters, TenantConfigRequest, TimelineCreateRequest, TimelineInfo,
|
||||
};
|
||||
use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId};
|
||||
use pageserver_api::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardStripeSize, TenantShardId};
|
||||
use postgres_backend::AuthType;
|
||||
use postgres_connection::parse_host_port;
|
||||
use safekeeper_api::membership::SafekeeperGeneration;
|
||||
@@ -91,6 +93,8 @@ enum NeonLocalCmd {
|
||||
#[command(subcommand)]
|
||||
Safekeeper(SafekeeperCmd),
|
||||
#[command(subcommand)]
|
||||
ObjectStorage(ObjectStorageCmd),
|
||||
#[command(subcommand)]
|
||||
Endpoint(EndpointCmd),
|
||||
#[command(subcommand)]
|
||||
Mappings(MappingsCmd),
|
||||
@@ -454,6 +458,32 @@ enum SafekeeperCmd {
|
||||
Restart(SafekeeperRestartCmdArgs),
|
||||
}
|
||||
|
||||
#[derive(clap::Subcommand)]
|
||||
#[clap(about = "Manage object storage")]
|
||||
enum ObjectStorageCmd {
|
||||
Start(ObjectStorageStartCmd),
|
||||
Stop(ObjectStorageStopCmd),
|
||||
}
|
||||
|
||||
#[derive(clap::Args)]
|
||||
#[clap(about = "Start object storage")]
|
||||
struct ObjectStorageStartCmd {
|
||||
#[clap(short = 't', long, help = "timeout until we fail the command")]
|
||||
#[arg(default_value = "10s")]
|
||||
start_timeout: humantime::Duration,
|
||||
}
|
||||
|
||||
#[derive(clap::Args)]
|
||||
#[clap(about = "Stop object storage")]
|
||||
struct ObjectStorageStopCmd {
|
||||
#[arg(value_enum, default_value = "fast")]
|
||||
#[clap(
|
||||
short = 'm',
|
||||
help = "If 'immediate', don't flush repository data at shutdown"
|
||||
)]
|
||||
stop_mode: StopMode,
|
||||
}
|
||||
|
||||
#[derive(clap::Args)]
|
||||
#[clap(about = "Start local safekeeper")]
|
||||
struct SafekeeperStartCmdArgs {
|
||||
@@ -759,6 +789,7 @@ fn main() -> Result<()> {
|
||||
}
|
||||
NeonLocalCmd::StorageBroker(subcmd) => rt.block_on(handle_storage_broker(&subcmd, env)),
|
||||
NeonLocalCmd::Safekeeper(subcmd) => rt.block_on(handle_safekeeper(&subcmd, env)),
|
||||
NeonLocalCmd::ObjectStorage(subcmd) => rt.block_on(handle_object_storage(&subcmd, env)),
|
||||
NeonLocalCmd::Endpoint(subcmd) => rt.block_on(handle_endpoint(&subcmd, env)),
|
||||
NeonLocalCmd::Mappings(subcmd) => handle_mappings(&subcmd, env),
|
||||
};
|
||||
@@ -975,6 +1006,9 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result<LocalEnv> {
|
||||
}
|
||||
})
|
||||
.collect(),
|
||||
object_storage: ObjectStorageConf {
|
||||
port: OBJECT_STORAGE_DEFAULT_PORT,
|
||||
},
|
||||
pg_distrib_dir: None,
|
||||
neon_distrib_dir: None,
|
||||
default_tenant_id: TenantId::from_array(std::array::from_fn(|_| 0)),
|
||||
@@ -1083,7 +1117,7 @@ async fn handle_tenant(subcmd: &TenantCmd, env: &mut local_env::LocalEnv) -> any
|
||||
stripe_size: args
|
||||
.shard_stripe_size
|
||||
.map(ShardStripeSize)
|
||||
.unwrap_or(ShardParameters::DEFAULT_STRIPE_SIZE),
|
||||
.unwrap_or(DEFAULT_STRIPE_SIZE),
|
||||
},
|
||||
placement_policy: args.placement_policy.clone(),
|
||||
config: tenant_conf,
|
||||
@@ -1396,7 +1430,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
|
||||
vec![(parsed.0, parsed.1.unwrap_or(5432))],
|
||||
// If caller is telling us what pageserver to use, this is not a tenant which is
|
||||
// full managed by storage controller, therefore not sharded.
|
||||
ShardParameters::DEFAULT_STRIPE_SIZE,
|
||||
DEFAULT_STRIPE_SIZE,
|
||||
)
|
||||
} else {
|
||||
// Look up the currently attached location of the tenant, and its striping metadata,
|
||||
@@ -1683,6 +1717,41 @@ async fn handle_safekeeper(subcmd: &SafekeeperCmd, env: &local_env::LocalEnv) ->
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn handle_object_storage(subcmd: &ObjectStorageCmd, env: &local_env::LocalEnv) -> Result<()> {
|
||||
use ObjectStorageCmd::*;
|
||||
let storage = ObjectStorage::from_env(env);
|
||||
|
||||
// In tests like test_forward_compatibility or test_graceful_cluster_restart
|
||||
// old neon binaries (without object_storage) are present
|
||||
if !storage.bin.exists() {
|
||||
eprintln!(
|
||||
"{} binary not found. Ignore if this is a compatibility test",
|
||||
storage.bin
|
||||
);
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
match subcmd {
|
||||
Start(ObjectStorageStartCmd { start_timeout }) => {
|
||||
if let Err(e) = storage.start(start_timeout).await {
|
||||
eprintln!("object_storage start failed: {e}");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
Stop(ObjectStorageStopCmd { stop_mode }) => {
|
||||
let immediate = match stop_mode {
|
||||
StopMode::Fast => false,
|
||||
StopMode::Immediate => true,
|
||||
};
|
||||
if let Err(e) = storage.stop(immediate) {
|
||||
eprintln!("proxy stop failed: {e}");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
};
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn handle_storage_broker(subcmd: &StorageBrokerCmd, env: &local_env::LocalEnv) -> Result<()> {
|
||||
match subcmd {
|
||||
StorageBrokerCmd::Start(args) => {
|
||||
@@ -1777,6 +1846,13 @@ async fn handle_start_all_impl(
|
||||
.map_err(|e| e.context(format!("start safekeeper {}", safekeeper.id)))
|
||||
});
|
||||
}
|
||||
|
||||
js.spawn(async move {
|
||||
ObjectStorage::from_env(env)
|
||||
.start(&retry_timeout)
|
||||
.await
|
||||
.map_err(|e| e.context("start object_storage"))
|
||||
});
|
||||
})();
|
||||
|
||||
let mut errors = Vec::new();
|
||||
@@ -1874,6 +1950,11 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
|
||||
}
|
||||
}
|
||||
|
||||
let storage = ObjectStorage::from_env(env);
|
||||
if let Err(e) = storage.stop(immediate) {
|
||||
eprintln!("object_storage stop failed: {:#}", e);
|
||||
}
|
||||
|
||||
for ps_conf in &env.pageservers {
|
||||
let pageserver = PageServerNode::from_env(env, ps_conf);
|
||||
if let Err(e) = pageserver.stop(immediate) {
|
||||
|
||||
@@ -670,6 +670,7 @@ impl Endpoint {
|
||||
reconfigure_concurrency: self.reconfigure_concurrency,
|
||||
drop_subscriptions_before_start: self.drop_subscriptions_before_start,
|
||||
audit_log_level: ComputeAudit::Disabled,
|
||||
logs_export_host: None::<String>,
|
||||
};
|
||||
|
||||
// this strange code is needed to support respec() in tests
|
||||
|
||||
@@ -10,6 +10,7 @@ mod background_process;
|
||||
pub mod broker;
|
||||
pub mod endpoint;
|
||||
pub mod local_env;
|
||||
pub mod object_storage;
|
||||
pub mod pageserver;
|
||||
pub mod postgresql_conf;
|
||||
pub mod safekeeper;
|
||||
|
||||
@@ -15,9 +15,10 @@ use clap::ValueEnum;
|
||||
use postgres_backend::AuthType;
|
||||
use reqwest::Url;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::auth::{Claims, encode_from_key_file};
|
||||
use utils::auth::encode_from_key_file;
|
||||
use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId};
|
||||
|
||||
use crate::object_storage::{OBJECT_STORAGE_REMOTE_STORAGE_DIR, ObjectStorage};
|
||||
use crate::pageserver::{PAGESERVER_REMOTE_STORAGE_DIR, PageServerNode};
|
||||
use crate::safekeeper::SafekeeperNode;
|
||||
|
||||
@@ -55,6 +56,7 @@ pub struct LocalEnv {
|
||||
|
||||
// used to issue tokens during e.g pg start
|
||||
pub private_key_path: PathBuf,
|
||||
pub public_key_path: PathBuf,
|
||||
|
||||
pub broker: NeonBroker,
|
||||
|
||||
@@ -68,6 +70,8 @@ pub struct LocalEnv {
|
||||
|
||||
pub safekeepers: Vec<SafekeeperConf>,
|
||||
|
||||
pub object_storage: ObjectStorageConf,
|
||||
|
||||
// Control plane upcall API for pageserver: if None, we will not run storage_controller If set, this will
|
||||
// be propagated into each pageserver's configuration.
|
||||
pub control_plane_api: Url,
|
||||
@@ -95,6 +99,7 @@ pub struct OnDiskConfig {
|
||||
pub neon_distrib_dir: PathBuf,
|
||||
pub default_tenant_id: Option<TenantId>,
|
||||
pub private_key_path: PathBuf,
|
||||
pub public_key_path: PathBuf,
|
||||
pub broker: NeonBroker,
|
||||
pub storage_controller: NeonStorageControllerConf,
|
||||
#[serde(
|
||||
@@ -103,6 +108,7 @@ pub struct OnDiskConfig {
|
||||
)]
|
||||
pub pageservers: Vec<PageServerConf>,
|
||||
pub safekeepers: Vec<SafekeeperConf>,
|
||||
pub object_storage: ObjectStorageConf,
|
||||
pub control_plane_api: Option<Url>,
|
||||
pub control_plane_hooks_api: Option<Url>,
|
||||
pub control_plane_compute_hook_api: Option<Url>,
|
||||
@@ -136,11 +142,18 @@ pub struct NeonLocalInitConf {
|
||||
pub storage_controller: Option<NeonStorageControllerConf>,
|
||||
pub pageservers: Vec<NeonLocalInitPageserverConf>,
|
||||
pub safekeepers: Vec<SafekeeperConf>,
|
||||
pub object_storage: ObjectStorageConf,
|
||||
pub control_plane_api: Option<Url>,
|
||||
pub control_plane_hooks_api: Option<Url>,
|
||||
pub generate_local_ssl_certs: bool,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Default, Deserialize, PartialEq, Eq, Clone, Debug)]
|
||||
#[serde(default)]
|
||||
pub struct ObjectStorageConf {
|
||||
pub port: u16,
|
||||
}
|
||||
|
||||
/// Broker config for cluster internal communication.
|
||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
|
||||
#[serde(default)]
|
||||
@@ -398,6 +411,10 @@ impl LocalEnv {
|
||||
self.pg_dir(pg_version, "lib")
|
||||
}
|
||||
|
||||
pub fn object_storage_bin(&self) -> PathBuf {
|
||||
self.neon_distrib_dir.join("object_storage")
|
||||
}
|
||||
|
||||
pub fn pageserver_bin(&self) -> PathBuf {
|
||||
self.neon_distrib_dir.join("pageserver")
|
||||
}
|
||||
@@ -431,6 +448,10 @@ impl LocalEnv {
|
||||
self.base_data_dir.join("safekeepers").join(data_dir_name)
|
||||
}
|
||||
|
||||
pub fn object_storage_data_dir(&self) -> PathBuf {
|
||||
self.base_data_dir.join("object_storage")
|
||||
}
|
||||
|
||||
pub fn get_pageserver_conf(&self, id: NodeId) -> anyhow::Result<&PageServerConf> {
|
||||
if let Some(conf) = self.pageservers.iter().find(|node| node.id == id) {
|
||||
Ok(conf)
|
||||
@@ -582,6 +603,7 @@ impl LocalEnv {
|
||||
neon_distrib_dir,
|
||||
default_tenant_id,
|
||||
private_key_path,
|
||||
public_key_path,
|
||||
broker,
|
||||
storage_controller,
|
||||
pageservers,
|
||||
@@ -591,6 +613,7 @@ impl LocalEnv {
|
||||
control_plane_compute_hook_api: _,
|
||||
branch_name_mappings,
|
||||
generate_local_ssl_certs,
|
||||
object_storage,
|
||||
} = on_disk_config;
|
||||
LocalEnv {
|
||||
base_data_dir: repopath.to_owned(),
|
||||
@@ -598,6 +621,7 @@ impl LocalEnv {
|
||||
neon_distrib_dir,
|
||||
default_tenant_id,
|
||||
private_key_path,
|
||||
public_key_path,
|
||||
broker,
|
||||
storage_controller,
|
||||
pageservers,
|
||||
@@ -606,6 +630,7 @@ impl LocalEnv {
|
||||
control_plane_hooks_api,
|
||||
branch_name_mappings,
|
||||
generate_local_ssl_certs,
|
||||
object_storage,
|
||||
}
|
||||
};
|
||||
|
||||
@@ -705,6 +730,7 @@ impl LocalEnv {
|
||||
neon_distrib_dir: self.neon_distrib_dir.clone(),
|
||||
default_tenant_id: self.default_tenant_id,
|
||||
private_key_path: self.private_key_path.clone(),
|
||||
public_key_path: self.public_key_path.clone(),
|
||||
broker: self.broker.clone(),
|
||||
storage_controller: self.storage_controller.clone(),
|
||||
pageservers: vec![], // it's skip_serializing anyway
|
||||
@@ -714,6 +740,7 @@ impl LocalEnv {
|
||||
control_plane_compute_hook_api: None,
|
||||
branch_name_mappings: self.branch_name_mappings.clone(),
|
||||
generate_local_ssl_certs: self.generate_local_ssl_certs,
|
||||
object_storage: self.object_storage.clone(),
|
||||
},
|
||||
)
|
||||
}
|
||||
@@ -730,7 +757,7 @@ impl LocalEnv {
|
||||
}
|
||||
|
||||
// this function is used only for testing purposes in CLI e g generate tokens during init
|
||||
pub fn generate_auth_token(&self, claims: &Claims) -> anyhow::Result<String> {
|
||||
pub fn generate_auth_token<S: Serialize>(&self, claims: &S) -> anyhow::Result<String> {
|
||||
let private_key_path = self.get_private_key_path();
|
||||
let key_data = fs::read(private_key_path)?;
|
||||
encode_from_key_file(claims, &key_data)
|
||||
@@ -797,6 +824,7 @@ impl LocalEnv {
|
||||
control_plane_api,
|
||||
generate_local_ssl_certs,
|
||||
control_plane_hooks_api,
|
||||
object_storage,
|
||||
} = conf;
|
||||
|
||||
// Find postgres binaries.
|
||||
@@ -828,6 +856,7 @@ impl LocalEnv {
|
||||
)
|
||||
.context("generate auth keys")?;
|
||||
let private_key_path = PathBuf::from("auth_private_key.pem");
|
||||
let public_key_path = PathBuf::from("auth_public_key.pem");
|
||||
|
||||
// create the runtime type because the remaining initialization code below needs
|
||||
// a LocalEnv instance op operation
|
||||
@@ -838,6 +867,7 @@ impl LocalEnv {
|
||||
neon_distrib_dir,
|
||||
default_tenant_id: Some(default_tenant_id),
|
||||
private_key_path,
|
||||
public_key_path,
|
||||
broker,
|
||||
storage_controller: storage_controller.unwrap_or_default(),
|
||||
pageservers: pageservers.iter().map(Into::into).collect(),
|
||||
@@ -846,6 +876,7 @@ impl LocalEnv {
|
||||
control_plane_hooks_api,
|
||||
branch_name_mappings: Default::default(),
|
||||
generate_local_ssl_certs,
|
||||
object_storage,
|
||||
};
|
||||
|
||||
if generate_local_ssl_certs {
|
||||
@@ -873,8 +904,13 @@ impl LocalEnv {
|
||||
.context("pageserver init failed")?;
|
||||
}
|
||||
|
||||
ObjectStorage::from_env(&env)
|
||||
.init()
|
||||
.context("object storage init failed")?;
|
||||
|
||||
// setup remote remote location for default LocalFs remote storage
|
||||
std::fs::create_dir_all(env.base_data_dir.join(PAGESERVER_REMOTE_STORAGE_DIR))?;
|
||||
std::fs::create_dir_all(env.base_data_dir.join(OBJECT_STORAGE_REMOTE_STORAGE_DIR))?;
|
||||
|
||||
env.persist_config()
|
||||
}
|
||||
|
||||
107
control_plane/src/object_storage.rs
Normal file
107
control_plane/src/object_storage.rs
Normal file
@@ -0,0 +1,107 @@
|
||||
use crate::background_process::{self, start_process, stop_process};
|
||||
use crate::local_env::LocalEnv;
|
||||
use anyhow::anyhow;
|
||||
use anyhow::{Context, Result};
|
||||
use camino::Utf8PathBuf;
|
||||
use std::io::Write;
|
||||
use std::time::Duration;
|
||||
|
||||
/// Directory within .neon which will be used by default for LocalFs remote storage.
|
||||
pub const OBJECT_STORAGE_REMOTE_STORAGE_DIR: &str = "local_fs_remote_storage/object_storage";
|
||||
pub const OBJECT_STORAGE_DEFAULT_PORT: u16 = 9993;
|
||||
|
||||
pub struct ObjectStorage {
|
||||
pub bin: Utf8PathBuf,
|
||||
pub data_dir: Utf8PathBuf,
|
||||
pub pemfile: Utf8PathBuf,
|
||||
pub port: u16,
|
||||
}
|
||||
|
||||
impl ObjectStorage {
|
||||
pub fn from_env(env: &LocalEnv) -> ObjectStorage {
|
||||
ObjectStorage {
|
||||
bin: Utf8PathBuf::from_path_buf(env.object_storage_bin()).unwrap(),
|
||||
data_dir: Utf8PathBuf::from_path_buf(env.object_storage_data_dir()).unwrap(),
|
||||
pemfile: Utf8PathBuf::from_path_buf(env.public_key_path.clone()).unwrap(),
|
||||
port: env.object_storage.port,
|
||||
}
|
||||
}
|
||||
|
||||
fn config_path(&self) -> Utf8PathBuf {
|
||||
self.data_dir.join("object_storage.json")
|
||||
}
|
||||
|
||||
fn listen_addr(&self) -> Utf8PathBuf {
|
||||
format!("127.0.0.1:{}", self.port).into()
|
||||
}
|
||||
|
||||
pub fn init(&self) -> Result<()> {
|
||||
println!("Initializing object storage in {:?}", self.data_dir);
|
||||
let parent = self.data_dir.parent().unwrap();
|
||||
|
||||
#[derive(serde::Serialize)]
|
||||
struct Cfg {
|
||||
listen: Utf8PathBuf,
|
||||
pemfile: Utf8PathBuf,
|
||||
local_path: Utf8PathBuf,
|
||||
r#type: String,
|
||||
}
|
||||
let cfg = Cfg {
|
||||
listen: self.listen_addr(),
|
||||
pemfile: parent.join(self.pemfile.clone()),
|
||||
local_path: parent.join(OBJECT_STORAGE_REMOTE_STORAGE_DIR),
|
||||
r#type: "LocalFs".to_string(),
|
||||
};
|
||||
std::fs::create_dir_all(self.config_path().parent().unwrap())?;
|
||||
std::fs::write(self.config_path(), serde_json::to_string(&cfg)?)
|
||||
.context("write object storage config")?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn start(&self, retry_timeout: &Duration) -> Result<()> {
|
||||
println!("Starting s3 proxy at {}", self.listen_addr());
|
||||
std::io::stdout().flush().context("flush stdout")?;
|
||||
|
||||
let process_status_check = || async {
|
||||
tokio::time::sleep(Duration::from_millis(500)).await;
|
||||
let res = reqwest::Client::new()
|
||||
.get(format!("http://{}/metrics", self.listen_addr()))
|
||||
.send()
|
||||
.await;
|
||||
match res {
|
||||
Ok(response) if response.status().is_success() => Ok(true),
|
||||
Ok(_) => Err(anyhow!("Failed to query /metrics")),
|
||||
Err(e) => Err(anyhow!("Failed to check node status: {e}")),
|
||||
}
|
||||
};
|
||||
|
||||
let res = start_process(
|
||||
"object_storage",
|
||||
&self.data_dir.clone().into_std_path_buf(),
|
||||
&self.bin.clone().into_std_path_buf(),
|
||||
vec![self.config_path().to_string()],
|
||||
vec![("RUST_LOG".into(), "debug".into())],
|
||||
background_process::InitialPidFile::Create(self.pid_file()),
|
||||
retry_timeout,
|
||||
process_status_check,
|
||||
)
|
||||
.await;
|
||||
if res.is_err() {
|
||||
eprintln!("Logs:\n{}", std::fs::read_to_string(self.log_file())?);
|
||||
}
|
||||
|
||||
res
|
||||
}
|
||||
|
||||
pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
|
||||
stop_process(immediate, "object_storage", &self.pid_file())
|
||||
}
|
||||
|
||||
fn log_file(&self) -> Utf8PathBuf {
|
||||
self.data_dir.join("object_storage.log")
|
||||
}
|
||||
|
||||
fn pid_file(&self) -> Utf8PathBuf {
|
||||
self.data_dir.join("object_storage.pid")
|
||||
}
|
||||
}
|
||||
@@ -318,7 +318,7 @@ impl PageServerNode {
|
||||
self.conf.id, datadir,
|
||||
)
|
||||
})?;
|
||||
let args = vec!["-D", datadir_path_str];
|
||||
let args = vec!["-D", datadir_path_str, "--dev"];
|
||||
|
||||
background_process::start_process(
|
||||
"pageserver",
|
||||
|
||||
@@ -162,6 +162,7 @@ impl SafekeeperNode {
|
||||
listen_http,
|
||||
"--availability-zone".to_owned(),
|
||||
availability_zone,
|
||||
"--dev".to_owned(),
|
||||
];
|
||||
if let Some(pg_tenant_only_port) = self.conf.pg_tenant_only_port {
|
||||
let listen_pg_tenant_only = format!("{}:{}", self.listen_addr, pg_tenant_only_port);
|
||||
|
||||
@@ -941,7 +941,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
let mut node_to_fill_descs = Vec::new();
|
||||
|
||||
for desc in node_descs {
|
||||
let to_drain = nodes.iter().any(|id| *id == desc.id);
|
||||
let to_drain = nodes.contains(&desc.id);
|
||||
if to_drain {
|
||||
node_to_drain_descs.push(desc);
|
||||
} else {
|
||||
|
||||
@@ -151,7 +151,7 @@ Example body:
|
||||
```
|
||||
{
|
||||
"tenant_id": "1f359dd625e519a1a4e8d7509690f6fc",
|
||||
"stripe_size": 32768,
|
||||
"stripe_size": 2048,
|
||||
"shards": [
|
||||
{"node_id": 344, "shard_number": 0},
|
||||
{"node_id": 722, "shard_number": 1},
|
||||
|
||||
@@ -5,6 +5,14 @@ use crate::privilege::Privilege;
|
||||
use crate::responses::ComputeCtlConfig;
|
||||
use crate::spec::{ComputeSpec, ExtVersion, PgIdent};
|
||||
|
||||
/// When making requests to the `compute_ctl` external HTTP server, the client
|
||||
/// must specify a set of claims in `Authorization` header JWTs such that
|
||||
/// `compute_ctl` can authorize the request.
|
||||
#[derive(Clone, Debug, Deserialize, Serialize)]
|
||||
pub struct ComputeClaims {
|
||||
pub compute_id: String,
|
||||
}
|
||||
|
||||
/// Request of the /configure API
|
||||
///
|
||||
/// We now pass only `spec` in the configuration request, but later we can
|
||||
@@ -30,9 +38,3 @@ pub struct SetRoleGrantsRequest {
|
||||
pub privileges: Vec<Privilege>,
|
||||
pub role: PgIdent,
|
||||
}
|
||||
|
||||
/// Request of the /configure_telemetry API
|
||||
#[derive(Debug, Deserialize, Serialize)]
|
||||
pub struct ConfigureTelemetryRequest {
|
||||
pub logs_export_host: Option<String>,
|
||||
}
|
||||
|
||||
@@ -168,6 +168,10 @@ pub struct ComputeSpec {
|
||||
/// Extensions should be present in shared_preload_libraries
|
||||
#[serde(default)]
|
||||
pub audit_log_level: ComputeAudit,
|
||||
|
||||
/// Hostname and the port of the otel collector. Leave empty to disable Postgres logs forwarding.
|
||||
/// Example: config-shy-breeze-123-collector-monitoring.neon-telemetry.svc.cluster.local:10514
|
||||
pub logs_export_host: Option<String>,
|
||||
}
|
||||
|
||||
/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
|
||||
@@ -179,9 +183,6 @@ pub enum ComputeFeature {
|
||||
/// track short-lived connections as user activity.
|
||||
ActivityMonitorExperimental,
|
||||
|
||||
/// Allow to configure rsyslog for Postgres logs export
|
||||
PostgresLogsExport,
|
||||
|
||||
/// This is a special feature flag that is used to represent unknown feature flags.
|
||||
/// Basically all unknown to enum flags are represented as this one. See unit test
|
||||
/// `parse_unknown_features()` for more details.
|
||||
|
||||
@@ -613,8 +613,7 @@ mod tests {
|
||||
use rand::{RngCore, SeedableRng};
|
||||
|
||||
use super::*;
|
||||
use crate::models::ShardParameters;
|
||||
use crate::shard::{ShardCount, ShardNumber};
|
||||
use crate::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardNumber, ShardStripeSize};
|
||||
|
||||
// Helper function to create a key range.
|
||||
//
|
||||
@@ -964,12 +963,8 @@ mod tests {
|
||||
}
|
||||
#[test]
|
||||
fn sharded_range_relation_gap() {
|
||||
let shard_identity = ShardIdentity::new(
|
||||
ShardNumber(0),
|
||||
ShardCount::new(4),
|
||||
ShardParameters::DEFAULT_STRIPE_SIZE,
|
||||
)
|
||||
.unwrap();
|
||||
let shard_identity =
|
||||
ShardIdentity::new(ShardNumber(0), ShardCount::new(4), DEFAULT_STRIPE_SIZE).unwrap();
|
||||
|
||||
let range = ShardedRange::new(
|
||||
Range {
|
||||
@@ -985,12 +980,8 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn shard_identity_keyspaces_single_key() {
|
||||
let shard_identity = ShardIdentity::new(
|
||||
ShardNumber(1),
|
||||
ShardCount::new(4),
|
||||
ShardParameters::DEFAULT_STRIPE_SIZE,
|
||||
)
|
||||
.unwrap();
|
||||
let shard_identity =
|
||||
ShardIdentity::new(ShardNumber(1), ShardCount::new(4), DEFAULT_STRIPE_SIZE).unwrap();
|
||||
|
||||
let range = ShardedRange::new(
|
||||
Range {
|
||||
@@ -1034,12 +1025,8 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn shard_identity_keyspaces_forkno_gap() {
|
||||
let shard_identity = ShardIdentity::new(
|
||||
ShardNumber(1),
|
||||
ShardCount::new(4),
|
||||
ShardParameters::DEFAULT_STRIPE_SIZE,
|
||||
)
|
||||
.unwrap();
|
||||
let shard_identity =
|
||||
ShardIdentity::new(ShardNumber(1), ShardCount::new(4), DEFAULT_STRIPE_SIZE).unwrap();
|
||||
|
||||
let range = ShardedRange::new(
|
||||
Range {
|
||||
@@ -1061,7 +1048,7 @@ mod tests {
|
||||
let shard_identity = ShardIdentity::new(
|
||||
ShardNumber(shard_number),
|
||||
ShardCount::new(4),
|
||||
ShardParameters::DEFAULT_STRIPE_SIZE,
|
||||
DEFAULT_STRIPE_SIZE,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
@@ -1144,37 +1131,44 @@ mod tests {
|
||||
/// for a single tenant.
|
||||
#[test]
|
||||
fn sharded_range_fragment_simple() {
|
||||
const SHARD_COUNT: u8 = 4;
|
||||
const STRIPE_SIZE: u32 = DEFAULT_STRIPE_SIZE.0;
|
||||
|
||||
let shard_identity = ShardIdentity::new(
|
||||
ShardNumber(0),
|
||||
ShardCount::new(4),
|
||||
ShardParameters::DEFAULT_STRIPE_SIZE,
|
||||
ShardCount::new(SHARD_COUNT),
|
||||
ShardStripeSize(STRIPE_SIZE),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
// A range which we happen to know covers exactly one stripe which belongs to this shard
|
||||
let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
|
||||
let input_end = Key::from_hex("000000067f00000001000000ae0000008000").unwrap();
|
||||
let mut input_end = input_start;
|
||||
input_end.field6 += STRIPE_SIZE; // field6 is block number
|
||||
|
||||
// Ask for stripe_size blocks, we get the whole stripe
|
||||
assert_eq!(
|
||||
do_fragment(input_start, input_end, &shard_identity, 32768),
|
||||
(32768, vec![(32768, input_start..input_end)])
|
||||
do_fragment(input_start, input_end, &shard_identity, STRIPE_SIZE),
|
||||
(STRIPE_SIZE, vec![(STRIPE_SIZE, input_start..input_end)])
|
||||
);
|
||||
|
||||
// Ask for more, we still get the whole stripe
|
||||
assert_eq!(
|
||||
do_fragment(input_start, input_end, &shard_identity, 10000000),
|
||||
(32768, vec![(32768, input_start..input_end)])
|
||||
do_fragment(input_start, input_end, &shard_identity, 10 * STRIPE_SIZE),
|
||||
(STRIPE_SIZE, vec![(STRIPE_SIZE, input_start..input_end)])
|
||||
);
|
||||
|
||||
// Ask for target_nblocks of half the stripe size, we get two halves
|
||||
assert_eq!(
|
||||
do_fragment(input_start, input_end, &shard_identity, 16384),
|
||||
do_fragment(input_start, input_end, &shard_identity, STRIPE_SIZE / 2),
|
||||
(
|
||||
32768,
|
||||
STRIPE_SIZE,
|
||||
vec![
|
||||
(16384, input_start..input_start.add(16384)),
|
||||
(16384, input_start.add(16384)..input_end)
|
||||
(
|
||||
STRIPE_SIZE / 2,
|
||||
input_start..input_start.add(STRIPE_SIZE / 2)
|
||||
),
|
||||
(STRIPE_SIZE / 2, input_start.add(STRIPE_SIZE / 2)..input_end)
|
||||
]
|
||||
)
|
||||
);
|
||||
@@ -1182,40 +1176,53 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn sharded_range_fragment_multi_stripe() {
|
||||
const SHARD_COUNT: u8 = 4;
|
||||
const STRIPE_SIZE: u32 = DEFAULT_STRIPE_SIZE.0;
|
||||
const RANGE_SIZE: u32 = SHARD_COUNT as u32 * STRIPE_SIZE;
|
||||
|
||||
let shard_identity = ShardIdentity::new(
|
||||
ShardNumber(0),
|
||||
ShardCount::new(4),
|
||||
ShardParameters::DEFAULT_STRIPE_SIZE,
|
||||
ShardCount::new(SHARD_COUNT),
|
||||
ShardStripeSize(STRIPE_SIZE),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
// A range which covers multiple stripes, exactly one of which belongs to the current shard.
|
||||
let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
|
||||
let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap();
|
||||
let mut input_end = input_start;
|
||||
input_end.field6 += RANGE_SIZE; // field6 is block number
|
||||
|
||||
// Ask for all the blocks, get a fragment that covers the whole range but reports
|
||||
// its size to be just the blocks belonging to our shard.
|
||||
assert_eq!(
|
||||
do_fragment(input_start, input_end, &shard_identity, 131072),
|
||||
(32768, vec![(32768, input_start..input_end)])
|
||||
do_fragment(input_start, input_end, &shard_identity, RANGE_SIZE),
|
||||
(STRIPE_SIZE, vec![(STRIPE_SIZE, input_start..input_end)])
|
||||
);
|
||||
|
||||
// Ask for a sub-stripe quantity
|
||||
// Ask for a sub-stripe quantity that results in 3 fragments.
|
||||
let limit = STRIPE_SIZE / 3 + 1;
|
||||
assert_eq!(
|
||||
do_fragment(input_start, input_end, &shard_identity, 16000),
|
||||
do_fragment(input_start, input_end, &shard_identity, limit),
|
||||
(
|
||||
32768,
|
||||
STRIPE_SIZE,
|
||||
vec![
|
||||
(16000, input_start..input_start.add(16000)),
|
||||
(16000, input_start.add(16000)..input_start.add(32000)),
|
||||
(768, input_start.add(32000)..input_end),
|
||||
(limit, input_start..input_start.add(limit)),
|
||||
(limit, input_start.add(limit)..input_start.add(2 * limit)),
|
||||
(
|
||||
STRIPE_SIZE - 2 * limit,
|
||||
input_start.add(2 * limit)..input_end
|
||||
),
|
||||
]
|
||||
)
|
||||
);
|
||||
|
||||
// Try on a range that starts slightly after our owned stripe
|
||||
assert_eq!(
|
||||
do_fragment(input_start.add(1), input_end, &shard_identity, 131072),
|
||||
(32767, vec![(32767, input_start.add(1)..input_end)])
|
||||
do_fragment(input_start.add(1), input_end, &shard_identity, RANGE_SIZE),
|
||||
(
|
||||
STRIPE_SIZE - 1,
|
||||
vec![(STRIPE_SIZE - 1, input_start.add(1)..input_end)]
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
@@ -1223,32 +1230,40 @@ mod tests {
|
||||
/// a previous relation.
|
||||
#[test]
|
||||
fn sharded_range_fragment_starting_from_logical_size() {
|
||||
const SHARD_COUNT: u8 = 4;
|
||||
const STRIPE_SIZE: u32 = DEFAULT_STRIPE_SIZE.0;
|
||||
const RANGE_SIZE: u32 = SHARD_COUNT as u32 * STRIPE_SIZE;
|
||||
|
||||
let input_start = Key::from_hex("000000067f00000001000000ae00ffffffff").unwrap();
|
||||
let input_end = Key::from_hex("000000067f00000001000000ae0100008000").unwrap();
|
||||
let mut input_end = Key::from_hex("000000067f00000001000000ae0100000000").unwrap();
|
||||
input_end.field6 += RANGE_SIZE; // field6 is block number
|
||||
|
||||
// Shard 0 owns the first stripe in the relation, and the preceding logical size is shard local too
|
||||
let shard_identity = ShardIdentity::new(
|
||||
ShardNumber(0),
|
||||
ShardCount::new(4),
|
||||
ShardParameters::DEFAULT_STRIPE_SIZE,
|
||||
ShardCount::new(SHARD_COUNT),
|
||||
ShardStripeSize(STRIPE_SIZE),
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
do_fragment(input_start, input_end, &shard_identity, 0x10000),
|
||||
(0x8001, vec![(0x8001, input_start..input_end)])
|
||||
do_fragment(input_start, input_end, &shard_identity, 2 * STRIPE_SIZE),
|
||||
(
|
||||
STRIPE_SIZE + 1,
|
||||
vec![(STRIPE_SIZE + 1, input_start..input_end)]
|
||||
)
|
||||
);
|
||||
|
||||
// Shard 1 does not own the first stripe in the relation, but it does own the logical size (all shards
|
||||
// store all logical sizes)
|
||||
let shard_identity = ShardIdentity::new(
|
||||
ShardNumber(1),
|
||||
ShardCount::new(4),
|
||||
ShardParameters::DEFAULT_STRIPE_SIZE,
|
||||
ShardCount::new(SHARD_COUNT),
|
||||
ShardStripeSize(STRIPE_SIZE),
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
do_fragment(input_start, input_end, &shard_identity, 0x10000),
|
||||
(0x1, vec![(0x1, input_start..input_end)])
|
||||
do_fragment(input_start, input_end, &shard_identity, 2 * STRIPE_SIZE),
|
||||
(1, vec![(1, input_start..input_end)])
|
||||
);
|
||||
}
|
||||
|
||||
@@ -1284,12 +1299,8 @@ mod tests {
|
||||
);
|
||||
|
||||
// Same, but using a sharded identity
|
||||
let shard_identity = ShardIdentity::new(
|
||||
ShardNumber(0),
|
||||
ShardCount::new(4),
|
||||
ShardParameters::DEFAULT_STRIPE_SIZE,
|
||||
)
|
||||
.unwrap();
|
||||
let shard_identity =
|
||||
ShardIdentity::new(ShardNumber(0), ShardCount::new(4), DEFAULT_STRIPE_SIZE).unwrap();
|
||||
assert_eq!(
|
||||
do_fragment(input_start, input_end, &shard_identity, 0x8000),
|
||||
(u32::MAX, vec![(u32::MAX, input_start..input_end),])
|
||||
@@ -1331,7 +1342,7 @@ mod tests {
|
||||
ShardIdentity::new(
|
||||
ShardNumber((prng.next_u32() % shard_count) as u8),
|
||||
ShardCount::new(shard_count as u8),
|
||||
ShardParameters::DEFAULT_STRIPE_SIZE,
|
||||
DEFAULT_STRIPE_SIZE,
|
||||
)
|
||||
.unwrap()
|
||||
};
|
||||
|
||||
@@ -26,7 +26,7 @@ use utils::{completion, serde_system_time};
|
||||
use crate::config::Ratio;
|
||||
use crate::key::{CompactKey, Key};
|
||||
use crate::reltag::RelTag;
|
||||
use crate::shard::{ShardCount, ShardStripeSize, TenantShardId};
|
||||
use crate::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardStripeSize, TenantShardId};
|
||||
|
||||
/// The state of a tenant in this pageserver.
|
||||
///
|
||||
@@ -438,8 +438,6 @@ pub struct ShardParameters {
|
||||
}
|
||||
|
||||
impl ShardParameters {
|
||||
pub const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
|
||||
|
||||
pub fn is_unsharded(&self) -> bool {
|
||||
self.count.is_unsharded()
|
||||
}
|
||||
@@ -449,7 +447,7 @@ impl Default for ShardParameters {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
count: ShardCount::new(0),
|
||||
stripe_size: Self::DEFAULT_STRIPE_SIZE,
|
||||
stripe_size: DEFAULT_STRIPE_SIZE,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1680,6 +1678,7 @@ pub struct SecondaryProgress {
|
||||
pub struct TenantScanRemoteStorageShard {
|
||||
pub tenant_shard_id: TenantShardId,
|
||||
pub generation: Option<u32>,
|
||||
pub stripe_size: Option<ShardStripeSize>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Default)]
|
||||
|
||||
@@ -58,6 +58,8 @@ pub enum NeonWalRecord {
|
||||
/// to true. This record does not need the history WALs to reconstruct. See [`NeonWalRecord::will_init`] and
|
||||
/// its references in `timeline.rs`.
|
||||
will_init: bool,
|
||||
/// Only append the record if the current image is the same as the one specified in this field.
|
||||
only_if: Option<String>,
|
||||
},
|
||||
}
|
||||
|
||||
@@ -81,6 +83,17 @@ impl NeonWalRecord {
|
||||
append: s.as_ref().to_string(),
|
||||
clear: false,
|
||||
will_init: false,
|
||||
only_if: None,
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
pub fn wal_append_conditional(s: impl AsRef<str>, only_if: impl AsRef<str>) -> Self {
|
||||
Self::Test {
|
||||
append: s.as_ref().to_string(),
|
||||
clear: false,
|
||||
will_init: false,
|
||||
only_if: Some(only_if.as_ref().to_string()),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -90,6 +103,7 @@ impl NeonWalRecord {
|
||||
append: s.as_ref().to_string(),
|
||||
clear: true,
|
||||
will_init: false,
|
||||
only_if: None,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -99,6 +113,7 @@ impl NeonWalRecord {
|
||||
append: s.as_ref().to_string(),
|
||||
clear: true,
|
||||
will_init: true,
|
||||
only_if: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -78,6 +78,12 @@ impl Default for ShardStripeSize {
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for ShardStripeSize {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
self.0.fmt(f)
|
||||
}
|
||||
}
|
||||
|
||||
/// Layout version: for future upgrades where we might change how the key->shard mapping works
|
||||
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Hash, Debug)]
|
||||
pub struct ShardLayout(u8);
|
||||
@@ -86,8 +92,11 @@ const LAYOUT_V1: ShardLayout = ShardLayout(1);
|
||||
/// ShardIdentity uses a magic layout value to indicate if it is unusable
|
||||
const LAYOUT_BROKEN: ShardLayout = ShardLayout(255);
|
||||
|
||||
/// Default stripe size in pages: 256MiB divided by 8kiB page size.
|
||||
const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
|
||||
/// The default stripe size in pages. 16 MiB divided by 8 kiB page size.
|
||||
///
|
||||
/// A lower stripe size distributes ingest load better across shards, but reduces IO amortization.
|
||||
/// 16 MiB appears to be a reasonable balance: <https://github.com/neondatabase/neon/pull/10510>.
|
||||
pub const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(16 * 1024 / 8);
|
||||
|
||||
#[derive(thiserror::Error, Debug, PartialEq, Eq)]
|
||||
pub enum ShardConfigError {
|
||||
@@ -537,7 +546,7 @@ mod tests {
|
||||
field6: 0x7d06,
|
||||
};
|
||||
|
||||
let shard = key_to_shard_number(ShardCount(10), DEFAULT_STRIPE_SIZE, &key);
|
||||
let shard = key_to_shard_number(ShardCount(10), ShardStripeSize(32768), &key);
|
||||
assert_eq!(shard, ShardNumber(8));
|
||||
}
|
||||
|
||||
|
||||
@@ -5,7 +5,6 @@
|
||||
#![deny(unsafe_code)]
|
||||
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||
use std::future::Future;
|
||||
use std::io::ErrorKind;
|
||||
use std::net::SocketAddr;
|
||||
use std::os::fd::{AsRawFd, RawFd};
|
||||
use std::pin::Pin;
|
||||
@@ -227,7 +226,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> MaybeWriteOnly<IO> {
|
||||
match self {
|
||||
MaybeWriteOnly::Full(framed) => framed.read_startup_message().await,
|
||||
MaybeWriteOnly::WriteOnly(_) => {
|
||||
Err(io::Error::new(ErrorKind::Other, "reading from write only half").into())
|
||||
Err(io::Error::other("reading from write only half").into())
|
||||
}
|
||||
MaybeWriteOnly::Broken => panic!("IO on invalid MaybeWriteOnly"),
|
||||
}
|
||||
@@ -237,7 +236,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> MaybeWriteOnly<IO> {
|
||||
match self {
|
||||
MaybeWriteOnly::Full(framed) => framed.read_message().await,
|
||||
MaybeWriteOnly::WriteOnly(_) => {
|
||||
Err(io::Error::new(ErrorKind::Other, "reading from write only half").into())
|
||||
Err(io::Error::other("reading from write only half").into())
|
||||
}
|
||||
MaybeWriteOnly::Broken => panic!("IO on invalid MaybeWriteOnly"),
|
||||
}
|
||||
@@ -975,7 +974,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> AsyncWrite for CopyDataWriter<'_, IO> {
|
||||
.write_message_noflush(&BeMessage::CopyData(buf))
|
||||
// write_message only writes to the buffer, so it can fail iff the
|
||||
// message is invaid, but CopyData can't be invalid.
|
||||
.map_err(|_| io::Error::new(ErrorKind::Other, "failed to serialize CopyData"))?;
|
||||
.map_err(|_| io::Error::other("failed to serialize CopyData"))?;
|
||||
|
||||
Poll::Ready(Ok(buf.len()))
|
||||
}
|
||||
|
||||
@@ -85,8 +85,8 @@ static KEY: Lazy<rustls::pki_types::PrivateKeyDer<'static>> = Lazy::new(|| {
|
||||
|
||||
static CERT: Lazy<rustls::pki_types::CertificateDer<'static>> = Lazy::new(|| {
|
||||
let mut cursor = Cursor::new(include_bytes!("cert.pem"));
|
||||
let cert = rustls_pemfile::certs(&mut cursor).next().unwrap().unwrap();
|
||||
cert
|
||||
|
||||
rustls_pemfile::certs(&mut cursor).next().unwrap().unwrap()
|
||||
});
|
||||
|
||||
// test that basic select with ssl works
|
||||
|
||||
@@ -35,7 +35,7 @@ impl ConnectionError {
|
||||
pub fn into_io_error(self) -> io::Error {
|
||||
match self {
|
||||
ConnectionError::Io(io) => io,
|
||||
ConnectionError::Protocol(pe) => io::Error::new(io::ErrorKind::Other, pe.to_string()),
|
||||
ConnectionError::Protocol(pe) => io::Error::other(pe.to_string()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -257,7 +257,7 @@ pub enum ProtocolError {
|
||||
impl ProtocolError {
|
||||
/// Proxy stream.rs uses only io::Error; provide it.
|
||||
pub fn into_io_error(self) -> io::Error {
|
||||
io::Error::new(io::ErrorKind::Other, self.to_string())
|
||||
io::Error::other(self.to_string())
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -212,7 +212,7 @@ impl ScramSha256 {
|
||||
password,
|
||||
channel_binding,
|
||||
} => (nonce, password, channel_binding),
|
||||
_ => return Err(io::Error::new(io::ErrorKind::Other, "invalid SCRAM state")),
|
||||
_ => return Err(io::Error::other("invalid SCRAM state")),
|
||||
};
|
||||
|
||||
let message =
|
||||
@@ -291,7 +291,7 @@ impl ScramSha256 {
|
||||
server_key,
|
||||
auth_message,
|
||||
} => (server_key, auth_message),
|
||||
_ => return Err(io::Error::new(io::ErrorKind::Other, "invalid SCRAM state")),
|
||||
_ => return Err(io::Error::other("invalid SCRAM state")),
|
||||
};
|
||||
|
||||
let message =
|
||||
@@ -301,10 +301,7 @@ impl ScramSha256 {
|
||||
|
||||
let verifier = match parsed {
|
||||
ServerFinalMessage::Error(e) => {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::Other,
|
||||
format!("SCRAM error: {}", e),
|
||||
));
|
||||
return Err(io::Error::other(format!("SCRAM error: {}", e)));
|
||||
}
|
||||
ServerFinalMessage::Verifier(verifier) => verifier,
|
||||
};
|
||||
|
||||
@@ -28,7 +28,7 @@ toml_edit.workspace = true
|
||||
tracing.workspace = true
|
||||
scopeguard.workspace = true
|
||||
metrics.workspace = true
|
||||
utils.workspace = true
|
||||
utils = { path = "../utils", default-features = false }
|
||||
pin-project-lite.workspace = true
|
||||
|
||||
azure_core.workspace = true
|
||||
|
||||
@@ -801,8 +801,7 @@ where
|
||||
// that support needs to be hacked in.
|
||||
//
|
||||
// including {self:?} into the message would be useful, but unsure how to unproject.
|
||||
_ => std::task::Poll::Ready(Err(std::io::Error::new(
|
||||
std::io::ErrorKind::Other,
|
||||
_ => std::task::Poll::Ready(Err(std::io::Error::other(
|
||||
"cloned or initial values cannot be read",
|
||||
))),
|
||||
}
|
||||
@@ -855,7 +854,7 @@ where
|
||||
};
|
||||
Err(azure_core::error::Error::new(
|
||||
azure_core::error::ErrorKind::Io,
|
||||
std::io::Error::new(std::io::ErrorKind::Other, msg),
|
||||
std::io::Error::other(msg),
|
||||
))
|
||||
}
|
||||
|
||||
|
||||
@@ -5,7 +5,8 @@ edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[features]
|
||||
default = []
|
||||
default = ["rename_noreplace"]
|
||||
rename_noreplace = []
|
||||
# Enables test-only APIs, incuding failpoints. In particular, enables the `fail_point!` macro,
|
||||
# which adds some runtime cost to run tests on outage conditions
|
||||
testing = ["fail/failpoints"]
|
||||
@@ -35,7 +36,7 @@ serde_with.workspace = true
|
||||
serde_json.workspace = true
|
||||
signal-hook.workspace = true
|
||||
thiserror.workspace = true
|
||||
tokio.workspace = true
|
||||
tokio = { workspace = true, features = ["signal"] }
|
||||
tokio-tar.workspace = true
|
||||
tokio-util.workspace = true
|
||||
toml_edit = { workspace = true, features = ["serde"] }
|
||||
|
||||
@@ -173,7 +173,7 @@ impl std::fmt::Debug for JwtAuth {
|
||||
}
|
||||
|
||||
// this function is used only for testing purposes in CLI e g generate tokens during init
|
||||
pub fn encode_from_key_file(claims: &Claims, key_data: &[u8]) -> Result<String> {
|
||||
pub fn encode_from_key_file<S: Serialize>(claims: &S, key_data: &[u8]) -> Result<String> {
|
||||
let key = EncodingKey::from_ed_pem(key_data)?;
|
||||
Ok(encode(&Header::new(STORAGE_TOKEN_ALGORITHM), claims, &key)?)
|
||||
}
|
||||
|
||||
@@ -81,12 +81,9 @@ pub fn path_with_suffix_extension(
|
||||
}
|
||||
|
||||
pub fn fsync_file_and_parent(file_path: &Utf8Path) -> io::Result<()> {
|
||||
let parent = file_path.parent().ok_or_else(|| {
|
||||
io::Error::new(
|
||||
io::ErrorKind::Other,
|
||||
format!("File {file_path:?} has no parent"),
|
||||
)
|
||||
})?;
|
||||
let parent = file_path
|
||||
.parent()
|
||||
.ok_or_else(|| io::Error::other(format!("File {file_path:?} has no parent")))?;
|
||||
|
||||
fsync(file_path)?;
|
||||
fsync(parent)?;
|
||||
|
||||
@@ -3,7 +3,9 @@ use std::{fs, io, path::Path};
|
||||
|
||||
use anyhow::Context;
|
||||
|
||||
#[cfg(feature = "rename_noreplace")]
|
||||
mod rename_noreplace;
|
||||
#[cfg(feature = "rename_noreplace")]
|
||||
pub use rename_noreplace::rename_noreplace;
|
||||
|
||||
pub trait PathExt {
|
||||
|
||||
@@ -8,7 +8,7 @@ pub fn rename_noreplace<P1: ?Sized + NixPath, P2: ?Sized + NixPath>(
|
||||
dst: &P2,
|
||||
) -> nix::Result<()> {
|
||||
{
|
||||
#[cfg(target_os = "linux")]
|
||||
#[cfg(all(target_os = "linux", target_env = "gnu"))]
|
||||
{
|
||||
nix::fcntl::renameat2(
|
||||
None,
|
||||
@@ -29,7 +29,7 @@ pub fn rename_noreplace<P1: ?Sized + NixPath, P2: ?Sized + NixPath>(
|
||||
})??;
|
||||
nix::errno::Errno::result(res).map(drop)
|
||||
}
|
||||
#[cfg(not(any(target_os = "linux", target_os = "macos")))]
|
||||
#[cfg(not(any(all(target_os = "linux", target_env = "gnu"), target_os = "macos")))]
|
||||
{
|
||||
std::compile_error!("OS does not support no-replace renames");
|
||||
}
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
pub use signal_hook::consts::TERM_SIGNALS;
|
||||
pub use signal_hook::consts::signal::*;
|
||||
use signal_hook::iterator::Signals;
|
||||
use tokio::signal::unix::{SignalKind, signal};
|
||||
use tracing::info;
|
||||
|
||||
pub enum Signal {
|
||||
Quit,
|
||||
@@ -36,3 +38,30 @@ impl ShutdownSignals {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Runs in a loop since we want to be responsive to multiple signals
|
||||
/// even after triggering shutdown (e.g. a SIGQUIT after a slow SIGTERM shutdown)
|
||||
/// <https://github.com/neondatabase/neon/issues/9740>
|
||||
pub async fn signal_handler(token: tokio_util::sync::CancellationToken) {
|
||||
let mut sigint = signal(SignalKind::interrupt()).unwrap();
|
||||
let mut sigterm = signal(SignalKind::terminate()).unwrap();
|
||||
let mut sigquit = signal(SignalKind::quit()).unwrap();
|
||||
|
||||
loop {
|
||||
let signal = tokio::select! {
|
||||
_ = sigquit.recv() => {
|
||||
info!("Got signal SIGQUIT. Terminating in immediate shutdown mode.");
|
||||
std::process::exit(111);
|
||||
}
|
||||
_ = sigint.recv() => "SIGINT",
|
||||
_ = sigterm.recv() => "SIGTERM",
|
||||
};
|
||||
|
||||
if !token.is_cancelled() {
|
||||
info!("Got signal {signal}. Terminating gracefully in fast shutdown mode.");
|
||||
token.cancel();
|
||||
} else {
|
||||
info!("Got signal {signal}. Already shutting down.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
28
object_storage/Cargo.toml
Normal file
28
object_storage/Cargo.toml
Normal file
@@ -0,0 +1,28 @@
|
||||
[package]
|
||||
name = "object_storage"
|
||||
version = "0.0.1"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
axum-extra.workspace = true
|
||||
axum.workspace = true
|
||||
camino.workspace = true
|
||||
futures.workspace = true
|
||||
jsonwebtoken.workspace = true
|
||||
prometheus.workspace = true
|
||||
remote_storage.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
tokio-util.workspace = true
|
||||
tokio.workspace = true
|
||||
tracing.workspace = true
|
||||
utils = { path = "../libs/utils", default-features = false }
|
||||
workspace_hack.workspace = true
|
||||
[dev-dependencies]
|
||||
camino-tempfile.workspace = true
|
||||
http-body-util.workspace = true
|
||||
itertools.workspace = true
|
||||
rand.workspace = true
|
||||
test-log.workspace = true
|
||||
tower.workspace = true
|
||||
561
object_storage/src/app.rs
Normal file
561
object_storage/src/app.rs
Normal file
@@ -0,0 +1,561 @@
|
||||
use anyhow::anyhow;
|
||||
use axum::body::{Body, Bytes};
|
||||
use axum::response::{IntoResponse, Response};
|
||||
use axum::{Router, http::StatusCode};
|
||||
use object_storage::{PrefixS3Path, S3Path, Storage, bad_request, internal_error, not_found, ok};
|
||||
use remote_storage::TimeoutOrCancel;
|
||||
use remote_storage::{DownloadError, DownloadOpts, GenericRemoteStorage, RemotePath};
|
||||
use std::{sync::Arc, time::SystemTime, time::UNIX_EPOCH};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{error, info};
|
||||
use utils::backoff::retry;
|
||||
|
||||
pub fn app(state: Arc<Storage>) -> Router<()> {
|
||||
use axum::routing::{delete as _delete, get as _get};
|
||||
let delete_prefix = _delete(delete_prefix);
|
||||
Router::new()
|
||||
.route(
|
||||
"/{tenant_id}/{timeline_id}/{endpoint_id}/{*path}",
|
||||
_get(get).put(set).delete(delete),
|
||||
)
|
||||
.route(
|
||||
"/{tenant_id}/{timeline_id}/{endpoint_id}",
|
||||
delete_prefix.clone(),
|
||||
)
|
||||
.route("/{tenant_id}/{timeline_id}", delete_prefix.clone())
|
||||
.route("/{tenant_id}", delete_prefix)
|
||||
.route("/metrics", _get(metrics))
|
||||
.route("/status", _get(async || StatusCode::OK.into_response()))
|
||||
.with_state(state)
|
||||
}
|
||||
|
||||
type Result = anyhow::Result<Response, Response>;
|
||||
type State = axum::extract::State<Arc<Storage>>;
|
||||
|
||||
const CONTENT_TYPE: &str = "content-type";
|
||||
const APPLICATION_OCTET_STREAM: &str = "application/octet-stream";
|
||||
const WARN_THRESHOLD: u32 = 3;
|
||||
const MAX_RETRIES: u32 = 10;
|
||||
|
||||
async fn metrics() -> Result {
|
||||
prometheus::TextEncoder::new()
|
||||
.encode_to_string(&prometheus::gather())
|
||||
.map(|s| s.into_response())
|
||||
.map_err(|e| internal_error(e, "/metrics", "collecting metrics"))
|
||||
}
|
||||
|
||||
async fn get(S3Path { path }: S3Path, state: State) -> Result {
|
||||
info!(%path, "downloading");
|
||||
let download_err = |e| {
|
||||
if let DownloadError::NotFound = e {
|
||||
info!(%path, %e, "downloading"); // 404 is not an issue of _this_ service
|
||||
return not_found(&path);
|
||||
}
|
||||
internal_error(e, &path, "downloading")
|
||||
};
|
||||
let cancel = state.cancel.clone();
|
||||
let opts = &DownloadOpts::default();
|
||||
|
||||
let stream = retry(
|
||||
async || state.storage.download(&path, opts, &cancel).await,
|
||||
DownloadError::is_permanent,
|
||||
WARN_THRESHOLD,
|
||||
MAX_RETRIES,
|
||||
"downloading",
|
||||
&cancel,
|
||||
)
|
||||
.await
|
||||
.unwrap_or(Err(DownloadError::Cancelled))
|
||||
.map_err(download_err)?
|
||||
.download_stream;
|
||||
|
||||
Response::builder()
|
||||
.status(StatusCode::OK)
|
||||
.header(CONTENT_TYPE, APPLICATION_OCTET_STREAM)
|
||||
.body(Body::from_stream(stream))
|
||||
.map_err(|e| internal_error(e, path, "reading response"))
|
||||
}
|
||||
|
||||
// Best solution for files is multipart upload, but remote_storage doesn't support it,
|
||||
// so we can either read Bytes in memory and push at once or forward BodyDataStream to
|
||||
// remote_storage. The latter may seem more peformant, but BodyDataStream doesn't have a
|
||||
// guaranteed size() which may produce issues while uploading to s3.
|
||||
// So, currently we're going with an in-memory copy plus a boundary to prevent uploading
|
||||
// very large files.
|
||||
async fn set(S3Path { path }: S3Path, state: State, bytes: Bytes) -> Result {
|
||||
info!(%path, "uploading");
|
||||
let request_len = bytes.len();
|
||||
let max_len = state.max_upload_file_limit;
|
||||
if request_len > max_len {
|
||||
return Err(bad_request(
|
||||
anyhow!("File size {request_len} exceeds max {max_len}"),
|
||||
"uploading",
|
||||
));
|
||||
}
|
||||
|
||||
let cancel = state.cancel.clone();
|
||||
let fun = async || {
|
||||
let stream = bytes_to_stream(bytes.clone());
|
||||
state
|
||||
.storage
|
||||
.upload(stream, request_len, &path, None, &cancel)
|
||||
.await
|
||||
};
|
||||
retry(
|
||||
fun,
|
||||
TimeoutOrCancel::caused_by_cancel,
|
||||
WARN_THRESHOLD,
|
||||
MAX_RETRIES,
|
||||
"uploading",
|
||||
&cancel,
|
||||
)
|
||||
.await
|
||||
.unwrap_or(Err(anyhow!("uploading cancelled")))
|
||||
.map_err(|e| internal_error(e, path, "reading response"))?;
|
||||
Ok(ok())
|
||||
}
|
||||
|
||||
async fn delete(S3Path { path }: S3Path, state: State) -> Result {
|
||||
info!(%path, "deleting");
|
||||
let cancel = state.cancel.clone();
|
||||
retry(
|
||||
async || state.storage.delete(&path, &cancel).await,
|
||||
TimeoutOrCancel::caused_by_cancel,
|
||||
WARN_THRESHOLD,
|
||||
MAX_RETRIES,
|
||||
"deleting",
|
||||
&cancel,
|
||||
)
|
||||
.await
|
||||
.unwrap_or(Err(anyhow!("deleting cancelled")))
|
||||
.map_err(|e| internal_error(e, path, "deleting"))?;
|
||||
Ok(ok())
|
||||
}
|
||||
|
||||
async fn delete_prefix(PrefixS3Path { path }: PrefixS3Path, state: State) -> Result {
|
||||
info!(%path, "deleting prefix");
|
||||
let cancel = state.cancel.clone();
|
||||
retry(
|
||||
async || state.storage.delete_prefix(&path, &cancel).await,
|
||||
TimeoutOrCancel::caused_by_cancel,
|
||||
WARN_THRESHOLD,
|
||||
MAX_RETRIES,
|
||||
"deleting prefix",
|
||||
&cancel,
|
||||
)
|
||||
.await
|
||||
.unwrap_or(Err(anyhow!("deleting prefix cancelled")))
|
||||
.map_err(|e| internal_error(e, path, "deleting prefix"))?;
|
||||
Ok(ok())
|
||||
}
|
||||
|
||||
pub async fn check_storage_permissions(
|
||||
client: &GenericRemoteStorage,
|
||||
cancel: CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
info!("storage permissions check");
|
||||
|
||||
// as_nanos() as multiple instances proxying same bucket may be started at once
|
||||
let now = SystemTime::now()
|
||||
.duration_since(UNIX_EPOCH)?
|
||||
.as_nanos()
|
||||
.to_string();
|
||||
|
||||
let path = RemotePath::from_string(&format!("write_access_{now}"))?;
|
||||
info!(%path, "uploading");
|
||||
|
||||
let body = now.to_string();
|
||||
let stream = bytes_to_stream(Bytes::from(body.clone()));
|
||||
client
|
||||
.upload(stream, body.len(), &path, None, &cancel)
|
||||
.await?;
|
||||
|
||||
use tokio::io::AsyncReadExt;
|
||||
info!(%path, "downloading");
|
||||
let download_opts = DownloadOpts {
|
||||
kind: remote_storage::DownloadKind::Small,
|
||||
..Default::default()
|
||||
};
|
||||
let mut body_read_buf = Vec::new();
|
||||
let stream = client
|
||||
.download(&path, &download_opts, &cancel)
|
||||
.await?
|
||||
.download_stream;
|
||||
tokio_util::io::StreamReader::new(stream)
|
||||
.read_to_end(&mut body_read_buf)
|
||||
.await?;
|
||||
let body_read = String::from_utf8(body_read_buf)?;
|
||||
if body != body_read {
|
||||
error!(%body, %body_read, "File contents do not match");
|
||||
anyhow::bail!("Read back file doesn't match original")
|
||||
}
|
||||
|
||||
info!(%path, "removing");
|
||||
client.delete(&path, &cancel).await
|
||||
}
|
||||
|
||||
fn bytes_to_stream(bytes: Bytes) -> impl futures::Stream<Item = std::io::Result<Bytes>> {
|
||||
futures::stream::once(futures::future::ready(Ok(bytes)))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use axum::{body::Body, extract::Request, response::Response};
|
||||
use http_body_util::BodyExt;
|
||||
use itertools::iproduct;
|
||||
use std::env::var;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use test_log::test as testlog;
|
||||
use tower::{Service, util::ServiceExt};
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
// see libs/remote_storage/tests/test_real_s3.rs
|
||||
const REAL_S3_ENV: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";
|
||||
const REAL_S3_BUCKET: &str = "REMOTE_STORAGE_S3_BUCKET";
|
||||
const REAL_S3_REGION: &str = "REMOTE_STORAGE_S3_REGION";
|
||||
|
||||
async fn proxy() -> (Storage, Option<camino_tempfile::Utf8TempDir>) {
|
||||
let cancel = CancellationToken::new();
|
||||
let (dir, storage) = if var(REAL_S3_ENV).is_err() {
|
||||
// tests execute in parallel and we need a new directory for each of them
|
||||
let dir = camino_tempfile::tempdir().unwrap();
|
||||
let fs =
|
||||
remote_storage::LocalFs::new(dir.path().into(), Duration::from_secs(5)).unwrap();
|
||||
(Some(dir), GenericRemoteStorage::LocalFs(fs))
|
||||
} else {
|
||||
// test_real_s3::create_s3_client is hard to reference, reimplementing here
|
||||
let millis = SystemTime::now()
|
||||
.duration_since(UNIX_EPOCH)
|
||||
.unwrap()
|
||||
.as_millis();
|
||||
use rand::Rng;
|
||||
let random = rand::thread_rng().r#gen::<u32>();
|
||||
|
||||
let s3_config = remote_storage::S3Config {
|
||||
bucket_name: var(REAL_S3_BUCKET).unwrap(),
|
||||
bucket_region: var(REAL_S3_REGION).unwrap(),
|
||||
prefix_in_bucket: Some(format!("test_{millis}_{random:08x}/")),
|
||||
endpoint: None,
|
||||
concurrency_limit: std::num::NonZeroUsize::new(100).unwrap(),
|
||||
max_keys_per_list_response: None,
|
||||
upload_storage_class: None,
|
||||
};
|
||||
let bucket = remote_storage::S3Bucket::new(&s3_config, Duration::from_secs(1))
|
||||
.await
|
||||
.unwrap();
|
||||
(None, GenericRemoteStorage::AwsS3(Arc::new(bucket)))
|
||||
};
|
||||
|
||||
let proxy = Storage {
|
||||
auth: object_storage::JwtAuth::new(TEST_PUB_KEY_ED25519).unwrap(),
|
||||
storage,
|
||||
cancel: cancel.clone(),
|
||||
max_upload_file_limit: usize::MAX,
|
||||
};
|
||||
check_storage_permissions(&proxy.storage, cancel)
|
||||
.await
|
||||
.unwrap();
|
||||
(proxy, dir)
|
||||
}
|
||||
|
||||
// see libs/utils/src/auth.rs
|
||||
const TEST_PUB_KEY_ED25519: &[u8] = b"
|
||||
-----BEGIN PUBLIC KEY-----
|
||||
MCowBQYDK2VwAyEARYwaNBayR+eGI0iXB4s3QxE3Nl2g1iWbr6KtLWeVD/w=
|
||||
-----END PUBLIC KEY-----
|
||||
";
|
||||
|
||||
const TEST_PRIV_KEY_ED25519: &[u8] = br#"
|
||||
-----BEGIN PRIVATE KEY-----
|
||||
MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
|
||||
-----END PRIVATE KEY-----
|
||||
"#;
|
||||
|
||||
async fn request(req: Request<Body>) -> Response<Body> {
|
||||
let (proxy, _) = proxy().await;
|
||||
app(Arc::new(proxy))
|
||||
.into_service()
|
||||
.oneshot(req)
|
||||
.await
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
#[testlog(tokio::test)]
|
||||
async fn status() {
|
||||
let res = Request::builder()
|
||||
.uri("/status")
|
||||
.body(Body::empty())
|
||||
.map(request)
|
||||
.unwrap()
|
||||
.await;
|
||||
assert_eq!(res.status(), StatusCode::OK);
|
||||
}
|
||||
|
||||
fn routes() -> impl Iterator<Item = (&'static str, &'static str)> {
|
||||
iproduct!(
|
||||
vec!["/1", "/1/2", "/1/2/3", "/1/2/3/4"],
|
||||
vec!["GET", "PUT", "DELETE"]
|
||||
)
|
||||
}
|
||||
|
||||
#[testlog(tokio::test)]
|
||||
async fn no_token() {
|
||||
for (uri, method) in routes() {
|
||||
info!(%uri, %method);
|
||||
let res = Request::builder()
|
||||
.uri(uri)
|
||||
.method(method)
|
||||
.body(Body::empty())
|
||||
.map(request)
|
||||
.unwrap()
|
||||
.await;
|
||||
assert!(matches!(
|
||||
res.status(),
|
||||
StatusCode::METHOD_NOT_ALLOWED | StatusCode::BAD_REQUEST
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
#[testlog(tokio::test)]
|
||||
async fn invalid_token() {
|
||||
for (uri, method) in routes() {
|
||||
info!(%uri, %method);
|
||||
let status = Request::builder()
|
||||
.uri(uri)
|
||||
.header("Authorization", "Bearer 123")
|
||||
.method(method)
|
||||
.body(Body::empty())
|
||||
.map(request)
|
||||
.unwrap()
|
||||
.await;
|
||||
assert!(matches!(
|
||||
status.status(),
|
||||
StatusCode::METHOD_NOT_ALLOWED | StatusCode::BAD_REQUEST
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
const TENANT_ID: TenantId =
|
||||
TenantId::from_array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6]);
|
||||
const TIMELINE_ID: TimelineId =
|
||||
TimelineId::from_array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 7]);
|
||||
const ENDPOINT_ID: &str = "ep-winter-frost-a662z3vg";
|
||||
fn token() -> String {
|
||||
let claims = object_storage::Claims {
|
||||
tenant_id: TENANT_ID,
|
||||
timeline_id: TIMELINE_ID,
|
||||
endpoint_id: ENDPOINT_ID.into(),
|
||||
exp: u64::MAX,
|
||||
};
|
||||
let key = jsonwebtoken::EncodingKey::from_ed_pem(TEST_PRIV_KEY_ED25519).unwrap();
|
||||
let header = jsonwebtoken::Header::new(object_storage::VALIDATION_ALGO);
|
||||
jsonwebtoken::encode(&header, &claims, &key).unwrap()
|
||||
}
|
||||
|
||||
#[testlog(tokio::test)]
|
||||
async fn unauthorized() {
|
||||
let (proxy, _) = proxy().await;
|
||||
let mut app = app(Arc::new(proxy)).into_service();
|
||||
let token = token();
|
||||
let args = itertools::iproduct!(
|
||||
vec![TENANT_ID.to_string(), TenantId::generate().to_string()],
|
||||
vec![TIMELINE_ID.to_string(), TimelineId::generate().to_string()],
|
||||
vec![ENDPOINT_ID, "ep-ololo"]
|
||||
)
|
||||
.skip(1);
|
||||
|
||||
for ((uri, method), (tenant, timeline, endpoint)) in iproduct!(routes(), args) {
|
||||
info!(%uri, %method, %tenant, %timeline, %endpoint);
|
||||
let request = Request::builder()
|
||||
.uri(format!("/{tenant}/{timeline}/{endpoint}/sub/path/key"))
|
||||
.method(method)
|
||||
.header("Authorization", format!("Bearer {}", token))
|
||||
.body(Body::empty())
|
||||
.unwrap();
|
||||
let status = ServiceExt::ready(&mut app)
|
||||
.await
|
||||
.unwrap()
|
||||
.call(request)
|
||||
.await
|
||||
.unwrap()
|
||||
.status();
|
||||
assert_eq!(status, StatusCode::UNAUTHORIZED);
|
||||
}
|
||||
}
|
||||
|
||||
#[testlog(tokio::test)]
|
||||
async fn method_not_allowed() {
|
||||
let token = token();
|
||||
let iter = iproduct!(vec!["", "/.."], vec!["GET", "PUT"]);
|
||||
for (key, method) in iter {
|
||||
let status = Request::builder()
|
||||
.uri(format!("/{TENANT_ID}/{TIMELINE_ID}/{ENDPOINT_ID}{key}"))
|
||||
.method(method)
|
||||
.header("Authorization", format!("Bearer {token}"))
|
||||
.body(Body::empty())
|
||||
.map(request)
|
||||
.unwrap()
|
||||
.await
|
||||
.status();
|
||||
assert!(matches!(
|
||||
status,
|
||||
StatusCode::BAD_REQUEST | StatusCode::METHOD_NOT_ALLOWED
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
async fn requests_chain(
|
||||
chain: impl Iterator<Item = (String, &str, &'static str, StatusCode, bool)>,
|
||||
token: impl Fn(&str) -> String,
|
||||
) {
|
||||
let (proxy, _) = proxy().await;
|
||||
let mut app = app(Arc::new(proxy)).into_service();
|
||||
for (uri, method, body, expected_status, compare_body) in chain {
|
||||
info!(%uri, %method, %body, %expected_status);
|
||||
let bearer = format!("Bearer {}", token(&uri));
|
||||
let request = Request::builder()
|
||||
.uri(uri)
|
||||
.method(method)
|
||||
.header("Authorization", &bearer)
|
||||
.body(Body::from(body))
|
||||
.unwrap();
|
||||
let response = ServiceExt::ready(&mut app)
|
||||
.await
|
||||
.unwrap()
|
||||
.call(request)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(response.status(), expected_status);
|
||||
if !compare_body {
|
||||
continue;
|
||||
}
|
||||
let read_body = response.into_body().collect().await.unwrap().to_bytes();
|
||||
assert_eq!(body, read_body);
|
||||
}
|
||||
}
|
||||
|
||||
#[testlog(tokio::test)]
|
||||
async fn metrics() {
|
||||
let uri = format!("/{TENANT_ID}/{TIMELINE_ID}/{ENDPOINT_ID}/key");
|
||||
let req = vec![
|
||||
(uri.clone(), "PUT", "body", StatusCode::OK, false),
|
||||
(uri.clone(), "DELETE", "", StatusCode::OK, false),
|
||||
];
|
||||
requests_chain(req.into_iter(), |_| token()).await;
|
||||
|
||||
let res = Request::builder()
|
||||
.uri("/metrics")
|
||||
.body(Body::empty())
|
||||
.map(request)
|
||||
.unwrap()
|
||||
.await;
|
||||
assert_eq!(res.status(), StatusCode::OK);
|
||||
let body = res.into_body().collect().await.unwrap().to_bytes();
|
||||
let body = String::from_utf8_lossy(&body);
|
||||
tracing::debug!(%body);
|
||||
// Storage metrics are not gathered for LocalFs
|
||||
if var(REAL_S3_ENV).is_ok() {
|
||||
assert!(body.contains("remote_storage_s3_deleted_objects_total"));
|
||||
}
|
||||
assert!(body.contains("process_threads"));
|
||||
}
|
||||
|
||||
#[testlog(tokio::test)]
|
||||
async fn insert_retrieve_remove() {
|
||||
let uri = format!("/{TENANT_ID}/{TIMELINE_ID}/{ENDPOINT_ID}/key");
|
||||
let chain = vec![
|
||||
(uri.clone(), "GET", "", StatusCode::NOT_FOUND, false),
|
||||
(uri.clone(), "PUT", "пыщьпыщь", StatusCode::OK, false),
|
||||
(uri.clone(), "GET", "пыщьпыщь", StatusCode::OK, true),
|
||||
(uri.clone(), "DELETE", "", StatusCode::OK, false),
|
||||
(uri, "GET", "", StatusCode::NOT_FOUND, false),
|
||||
];
|
||||
requests_chain(chain.into_iter(), |_| token()).await;
|
||||
}
|
||||
|
||||
fn delete_prefix_token(uri: &str) -> String {
|
||||
use serde::Serialize;
|
||||
let parts = uri.split("/").collect::<Vec<&str>>();
|
||||
#[derive(Serialize)]
|
||||
struct PrefixClaims {
|
||||
tenant_id: TenantId,
|
||||
timeline_id: Option<TimelineId>,
|
||||
endpoint_id: Option<object_storage::EndpointId>,
|
||||
exp: u64,
|
||||
}
|
||||
let claims = PrefixClaims {
|
||||
tenant_id: parts.get(1).map(|c| c.parse().unwrap()).unwrap(),
|
||||
timeline_id: parts.get(2).map(|c| c.parse().unwrap()),
|
||||
endpoint_id: parts.get(3).map(ToString::to_string),
|
||||
exp: u64::MAX,
|
||||
};
|
||||
let key = jsonwebtoken::EncodingKey::from_ed_pem(TEST_PRIV_KEY_ED25519).unwrap();
|
||||
let header = jsonwebtoken::Header::new(object_storage::VALIDATION_ALGO);
|
||||
jsonwebtoken::encode(&header, &claims, &key).unwrap()
|
||||
}
|
||||
|
||||
// Can't use single digit numbers as they won't be validated as TimelineId and EndpointId
|
||||
#[testlog(tokio::test)]
|
||||
async fn delete_prefix() {
|
||||
let tenant_id =
|
||||
TenantId::from_array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]).to_string();
|
||||
let t2 = TimelineId::from_array([2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
|
||||
let t3 = TimelineId::from_array([3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
|
||||
let t4 = TimelineId::from_array([4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
|
||||
let f = |timeline, path| format!("/{tenant_id}/{timeline}{path}");
|
||||
// Why extra slash in string literals? Axum is weird with URIs:
|
||||
// /1/2 and 1/2/ match different routes, thus first yields OK and second NOT_FOUND
|
||||
// as it matches /tenant/timeline/endpoint, see https://stackoverflow.com/a/75355932
|
||||
// The cost of removing trailing slash is suprisingly hard:
|
||||
// * Add tower dependency with NormalizePath layer
|
||||
// * wrap Router<()> in this layer https://github.com/tokio-rs/axum/discussions/2377
|
||||
// * Rewrite make_service() -> into_make_service()
|
||||
// * Rewrite oneshot() (not available for NormalizePath)
|
||||
// I didn't manage to get it working correctly
|
||||
let chain = vec![
|
||||
// create 1/2/3/4, 1/2/3/5, delete prefix 1/2/3 -> empty
|
||||
(f(t2, "/3/4"), "PUT", "", StatusCode::OK, false),
|
||||
(f(t2, "/3/4"), "PUT", "", StatusCode::OK, false), // we can override file contents
|
||||
(f(t2, "/3/5"), "PUT", "", StatusCode::OK, false),
|
||||
(f(t2, "/3"), "DELETE", "", StatusCode::OK, false),
|
||||
(f(t2, "/3/4"), "GET", "", StatusCode::NOT_FOUND, false),
|
||||
(f(t2, "/3/5"), "GET", "", StatusCode::NOT_FOUND, false),
|
||||
// create 1/2/3/4, 1/2/5/6, delete prefix 1/2/3 -> 1/2/5/6
|
||||
(f(t2, "/3/4"), "PUT", "", StatusCode::OK, false),
|
||||
(f(t2, "/5/6"), "PUT", "", StatusCode::OK, false),
|
||||
(f(t2, "/3"), "DELETE", "", StatusCode::OK, false),
|
||||
(f(t2, "/3/4"), "GET", "", StatusCode::NOT_FOUND, false),
|
||||
(f(t2, "/5/6"), "GET", "", StatusCode::OK, false),
|
||||
// create 1/2/3/4, 1/2/7/8, delete prefix 1/2 -> empty
|
||||
(f(t2, "/3/4"), "PUT", "", StatusCode::OK, false),
|
||||
(f(t2, "/7/8"), "PUT", "", StatusCode::OK, false),
|
||||
(f(t2, ""), "DELETE", "", StatusCode::OK, false),
|
||||
(f(t2, "/3/4"), "GET", "", StatusCode::NOT_FOUND, false),
|
||||
(f(t2, "/7/8"), "GET", "", StatusCode::NOT_FOUND, false),
|
||||
// create 1/2/3/4, 1/2/5/6, 1/3/8/9, delete prefix 1/2/3 -> 1/2/5/6, 1/3/8/9
|
||||
(f(t2, "/3/4"), "PUT", "", StatusCode::OK, false),
|
||||
(f(t2, "/5/6"), "PUT", "", StatusCode::OK, false),
|
||||
(f(t3, "/8/9"), "PUT", "", StatusCode::OK, false),
|
||||
(f(t2, "/3"), "DELETE", "", StatusCode::OK, false),
|
||||
(f(t2, "/3/4"), "GET", "", StatusCode::NOT_FOUND, false),
|
||||
(f(t2, "/5/6"), "GET", "", StatusCode::OK, false),
|
||||
(f(t3, "/8/9"), "GET", "", StatusCode::OK, false),
|
||||
// create 1/4/5/6, delete prefix 1/2 -> 1/3/8/9, 1/4/5/6
|
||||
(f(t4, "/5/6"), "PUT", "", StatusCode::OK, false),
|
||||
(f(t2, ""), "DELETE", "", StatusCode::OK, false),
|
||||
(f(t2, "/3/4"), "GET", "", StatusCode::NOT_FOUND, false),
|
||||
(f(t2, "/5/6"), "GET", "", StatusCode::NOT_FOUND, false),
|
||||
(f(t3, "/8/9"), "GET", "", StatusCode::OK, false),
|
||||
(f(t4, "/5/6"), "GET", "", StatusCode::OK, false),
|
||||
// delete prefix 1 -> empty
|
||||
(format!("/{tenant_id}"), "DELETE", "", StatusCode::OK, false),
|
||||
(f(t2, "/3/4"), "GET", "", StatusCode::NOT_FOUND, false),
|
||||
(f(t2, "/5/6"), "GET", "", StatusCode::NOT_FOUND, false),
|
||||
(f(t3, "/8/9"), "GET", "", StatusCode::NOT_FOUND, false),
|
||||
(f(t4, "/5/6"), "GET", "", StatusCode::NOT_FOUND, false),
|
||||
];
|
||||
requests_chain(chain.into_iter(), delete_prefix_token).await;
|
||||
}
|
||||
}
|
||||
344
object_storage/src/lib.rs
Normal file
344
object_storage/src/lib.rs
Normal file
@@ -0,0 +1,344 @@
|
||||
use anyhow::Result;
|
||||
use axum::extract::{FromRequestParts, Path};
|
||||
use axum::response::{IntoResponse, Response};
|
||||
use axum::{RequestPartsExt, http::StatusCode, http::request::Parts};
|
||||
use axum_extra::TypedHeader;
|
||||
use axum_extra::headers::{Authorization, authorization::Bearer};
|
||||
use camino::Utf8PathBuf;
|
||||
use jsonwebtoken::{DecodingKey, Validation};
|
||||
use remote_storage::{GenericRemoteStorage, RemotePath};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::fmt::Display;
|
||||
use std::result::Result as StdResult;
|
||||
use std::sync::Arc;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, error};
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
// simplified version of utils::auth::JwtAuth
|
||||
pub struct JwtAuth {
|
||||
decoding_key: DecodingKey,
|
||||
validation: Validation,
|
||||
}
|
||||
|
||||
pub const VALIDATION_ALGO: jsonwebtoken::Algorithm = jsonwebtoken::Algorithm::EdDSA;
|
||||
impl JwtAuth {
|
||||
pub fn new(key: &[u8]) -> Result<Self> {
|
||||
Ok(Self {
|
||||
decoding_key: DecodingKey::from_ed_pem(key)?,
|
||||
validation: Validation::new(VALIDATION_ALGO),
|
||||
})
|
||||
}
|
||||
|
||||
pub fn decode<T: serde::de::DeserializeOwned>(&self, token: &str) -> Result<T> {
|
||||
Ok(jsonwebtoken::decode(token, &self.decoding_key, &self.validation).map(|t| t.claims)?)
|
||||
}
|
||||
}
|
||||
|
||||
fn normalize_key(key: &str) -> StdResult<Utf8PathBuf, String> {
|
||||
let key = clean_utf8(&Utf8PathBuf::from(key));
|
||||
if key.starts_with("..") || key == "." || key == "/" {
|
||||
return Err(format!("invalid key {key}"));
|
||||
}
|
||||
match key.strip_prefix("/").map(Utf8PathBuf::from) {
|
||||
Ok(p) => Ok(p),
|
||||
_ => Ok(key),
|
||||
}
|
||||
}
|
||||
|
||||
// Copied from path_clean crate with PathBuf->Utf8PathBuf
|
||||
fn clean_utf8(path: &camino::Utf8Path) -> Utf8PathBuf {
|
||||
use camino::Utf8Component as Comp;
|
||||
let mut out = Vec::new();
|
||||
for comp in path.components() {
|
||||
match comp {
|
||||
Comp::CurDir => (),
|
||||
Comp::ParentDir => match out.last() {
|
||||
Some(Comp::RootDir) => (),
|
||||
Some(Comp::Normal(_)) => {
|
||||
out.pop();
|
||||
}
|
||||
None | Some(Comp::CurDir) | Some(Comp::ParentDir) | Some(Comp::Prefix(_)) => {
|
||||
out.push(comp)
|
||||
}
|
||||
},
|
||||
comp => out.push(comp),
|
||||
}
|
||||
}
|
||||
if !out.is_empty() {
|
||||
out.iter().collect()
|
||||
} else {
|
||||
Utf8PathBuf::from(".")
|
||||
}
|
||||
}
|
||||
|
||||
pub struct Storage {
|
||||
pub auth: JwtAuth,
|
||||
pub storage: GenericRemoteStorage,
|
||||
pub cancel: CancellationToken,
|
||||
pub max_upload_file_limit: usize,
|
||||
}
|
||||
|
||||
pub type EndpointId = String; // If needed, reuse small string from proxy/src/types.rc
|
||||
|
||||
#[derive(Deserialize, Serialize, PartialEq)]
|
||||
pub struct Claims {
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: TimelineId,
|
||||
pub endpoint_id: EndpointId,
|
||||
pub exp: u64,
|
||||
}
|
||||
|
||||
impl Display for Claims {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"Claims(tenant_id {} timeline_id {} endpoint_id {} exp {})",
|
||||
self.tenant_id, self.timeline_id, self.endpoint_id, self.exp
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Serialize)]
|
||||
struct KeyRequest {
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
endpoint_id: EndpointId,
|
||||
path: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq)]
|
||||
pub struct S3Path {
|
||||
pub path: RemotePath,
|
||||
}
|
||||
|
||||
impl TryFrom<&KeyRequest> for S3Path {
|
||||
type Error = String;
|
||||
fn try_from(req: &KeyRequest) -> StdResult<Self, Self::Error> {
|
||||
let KeyRequest {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
endpoint_id,
|
||||
path,
|
||||
} = &req;
|
||||
let prefix = format!("{tenant_id}/{timeline_id}/{endpoint_id}",);
|
||||
let path = Utf8PathBuf::from(prefix).join(normalize_key(path)?);
|
||||
let path = RemotePath::new(&path).unwrap(); // unwrap() because the path is already relative
|
||||
Ok(S3Path { path })
|
||||
}
|
||||
}
|
||||
|
||||
fn unauthorized(route: impl Display, claims: impl Display) -> Response {
|
||||
debug!(%route, %claims, "route doesn't match claims");
|
||||
StatusCode::UNAUTHORIZED.into_response()
|
||||
}
|
||||
|
||||
pub fn bad_request(err: impl Display, desc: &'static str) -> Response {
|
||||
debug!(%err, desc);
|
||||
(StatusCode::BAD_REQUEST, err.to_string()).into_response()
|
||||
}
|
||||
|
||||
pub fn ok() -> Response {
|
||||
StatusCode::OK.into_response()
|
||||
}
|
||||
|
||||
pub fn internal_error(err: impl Display, path: impl Display, desc: &'static str) -> Response {
|
||||
error!(%err, %path, desc);
|
||||
StatusCode::INTERNAL_SERVER_ERROR.into_response()
|
||||
}
|
||||
|
||||
pub fn not_found(key: impl ToString) -> Response {
|
||||
(StatusCode::NOT_FOUND, key.to_string()).into_response()
|
||||
}
|
||||
|
||||
impl FromRequestParts<Arc<Storage>> for S3Path {
|
||||
type Rejection = Response;
|
||||
async fn from_request_parts(
|
||||
parts: &mut Parts,
|
||||
state: &Arc<Storage>,
|
||||
) -> Result<Self, Self::Rejection> {
|
||||
let Path(path): Path<KeyRequest> = parts
|
||||
.extract()
|
||||
.await
|
||||
.map_err(|e| bad_request(e, "invalid route"))?;
|
||||
let TypedHeader(Authorization(bearer)) = parts
|
||||
.extract::<TypedHeader<Authorization<Bearer>>>()
|
||||
.await
|
||||
.map_err(|e| bad_request(e, "invalid token"))?;
|
||||
let claims: Claims = state
|
||||
.auth
|
||||
.decode(bearer.token())
|
||||
.map_err(|e| bad_request(e, "decoding token"))?;
|
||||
let route = Claims {
|
||||
tenant_id: path.tenant_id,
|
||||
timeline_id: path.timeline_id,
|
||||
endpoint_id: path.endpoint_id.clone(),
|
||||
exp: claims.exp,
|
||||
};
|
||||
if route != claims {
|
||||
return Err(unauthorized(route, claims));
|
||||
}
|
||||
(&path)
|
||||
.try_into()
|
||||
.map_err(|e| bad_request(e, "invalid route"))
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Serialize, PartialEq)]
|
||||
pub struct PrefixKeyPath {
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: Option<TimelineId>,
|
||||
pub endpoint_id: Option<EndpointId>,
|
||||
}
|
||||
|
||||
impl Display for PrefixKeyPath {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"PrefixKeyPath(tenant_id {} timeline_id {} endpoint_id {})",
|
||||
self.tenant_id,
|
||||
self.timeline_id
|
||||
.as_ref()
|
||||
.map(ToString::to_string)
|
||||
.unwrap_or("".to_string()),
|
||||
self.endpoint_id
|
||||
.as_ref()
|
||||
.map(ToString::to_string)
|
||||
.unwrap_or("".to_string())
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq)]
|
||||
pub struct PrefixS3Path {
|
||||
pub path: RemotePath,
|
||||
}
|
||||
|
||||
impl From<&PrefixKeyPath> for PrefixS3Path {
|
||||
fn from(path: &PrefixKeyPath) -> Self {
|
||||
let timeline_id = path
|
||||
.timeline_id
|
||||
.as_ref()
|
||||
.map(ToString::to_string)
|
||||
.unwrap_or("".to_string());
|
||||
let endpoint_id = path
|
||||
.endpoint_id
|
||||
.as_ref()
|
||||
.map(ToString::to_string)
|
||||
.unwrap_or("".to_string());
|
||||
let path = Utf8PathBuf::from(path.tenant_id.to_string())
|
||||
.join(timeline_id)
|
||||
.join(endpoint_id);
|
||||
let path = RemotePath::new(&path).unwrap(); // unwrap() because the path is already relative
|
||||
PrefixS3Path { path }
|
||||
}
|
||||
}
|
||||
|
||||
impl FromRequestParts<Arc<Storage>> for PrefixS3Path {
|
||||
type Rejection = Response;
|
||||
async fn from_request_parts(
|
||||
parts: &mut Parts,
|
||||
state: &Arc<Storage>,
|
||||
) -> Result<Self, Self::Rejection> {
|
||||
let Path(path) = parts
|
||||
.extract::<Path<PrefixKeyPath>>()
|
||||
.await
|
||||
.map_err(|e| bad_request(e, "invalid route"))?;
|
||||
let TypedHeader(Authorization(bearer)) = parts
|
||||
.extract::<TypedHeader<Authorization<Bearer>>>()
|
||||
.await
|
||||
.map_err(|e| bad_request(e, "invalid token"))?;
|
||||
let claims: PrefixKeyPath = state
|
||||
.auth
|
||||
.decode(bearer.token())
|
||||
.map_err(|e| bad_request(e, "invalid token"))?;
|
||||
if path != claims {
|
||||
return Err(unauthorized(path, claims));
|
||||
}
|
||||
Ok((&path).into())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn normalize_key() {
|
||||
let f = super::normalize_key;
|
||||
assert_eq!(f("hello/world/..").unwrap(), Utf8PathBuf::from("hello"));
|
||||
assert_eq!(
|
||||
f("ololo/1/../../not_ololo").unwrap(),
|
||||
Utf8PathBuf::from("not_ololo")
|
||||
);
|
||||
assert!(f("ololo/1/../../../").is_err());
|
||||
assert!(f(".").is_err());
|
||||
assert!(f("../").is_err());
|
||||
assert!(f("").is_err());
|
||||
assert_eq!(f("/1/2/3").unwrap(), Utf8PathBuf::from("1/2/3"));
|
||||
assert!(f("/1/2/3/../../../").is_err());
|
||||
assert!(f("/1/2/3/../../../../").is_err());
|
||||
}
|
||||
|
||||
const TENANT_ID: TenantId =
|
||||
TenantId::from_array([1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6]);
|
||||
const TIMELINE_ID: TimelineId =
|
||||
TimelineId::from_array([1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 7]);
|
||||
const ENDPOINT_ID: &str = "ep-winter-frost-a662z3vg";
|
||||
|
||||
#[test]
|
||||
fn s3_path() {
|
||||
let auth = Claims {
|
||||
tenant_id: TENANT_ID,
|
||||
timeline_id: TIMELINE_ID,
|
||||
endpoint_id: ENDPOINT_ID.into(),
|
||||
exp: u64::MAX,
|
||||
};
|
||||
let s3_path = |key| {
|
||||
let path = &format!("{TENANT_ID}/{TIMELINE_ID}/{ENDPOINT_ID}/{key}");
|
||||
let path = RemotePath::from_string(path).unwrap();
|
||||
S3Path { path }
|
||||
};
|
||||
|
||||
let path = "cache_key".to_string();
|
||||
let mut key_path = KeyRequest {
|
||||
path,
|
||||
tenant_id: auth.tenant_id,
|
||||
timeline_id: auth.timeline_id,
|
||||
endpoint_id: auth.endpoint_id,
|
||||
};
|
||||
assert_eq!(S3Path::try_from(&key_path).unwrap(), s3_path(key_path.path));
|
||||
|
||||
key_path.path = "we/can/have/nested/paths".to_string();
|
||||
assert_eq!(S3Path::try_from(&key_path).unwrap(), s3_path(key_path.path));
|
||||
|
||||
key_path.path = "../error/hello/../".to_string();
|
||||
assert!(S3Path::try_from(&key_path).is_err());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn prefix_s3_path() {
|
||||
let mut path = PrefixKeyPath {
|
||||
tenant_id: TENANT_ID,
|
||||
timeline_id: None,
|
||||
endpoint_id: None,
|
||||
};
|
||||
let prefix_path = |s: String| RemotePath::from_string(&s).unwrap();
|
||||
assert_eq!(
|
||||
PrefixS3Path::from(&path).path,
|
||||
prefix_path(format!("{TENANT_ID}"))
|
||||
);
|
||||
|
||||
path.timeline_id = Some(TIMELINE_ID);
|
||||
assert_eq!(
|
||||
PrefixS3Path::from(&path).path,
|
||||
prefix_path(format!("{TENANT_ID}/{TIMELINE_ID}"))
|
||||
);
|
||||
|
||||
path.endpoint_id = Some(ENDPOINT_ID.into());
|
||||
assert_eq!(
|
||||
PrefixS3Path::from(&path).path,
|
||||
prefix_path(format!("{TENANT_ID}/{TIMELINE_ID}/{ENDPOINT_ID}"))
|
||||
);
|
||||
}
|
||||
}
|
||||
65
object_storage/src/main.rs
Normal file
65
object_storage/src/main.rs
Normal file
@@ -0,0 +1,65 @@
|
||||
//! `object_storage` is a service which provides API for uploading and downloading
|
||||
//! files. It is used by compute and control plane for accessing LFC prewarm data.
|
||||
//! This service is deployed either as a separate component or as part of compute image
|
||||
//! for large computes.
|
||||
mod app;
|
||||
use anyhow::Context;
|
||||
use tracing::info;
|
||||
use utils::logging;
|
||||
|
||||
//see set()
|
||||
const fn max_upload_file_limit() -> usize {
|
||||
100 * 1024 * 1024
|
||||
}
|
||||
|
||||
#[derive(serde::Deserialize)]
|
||||
#[serde(tag = "type")]
|
||||
struct Config {
|
||||
listen: std::net::SocketAddr,
|
||||
pemfile: camino::Utf8PathBuf,
|
||||
#[serde(flatten)]
|
||||
storage_config: remote_storage::RemoteStorageConfig,
|
||||
#[serde(default = "max_upload_file_limit")]
|
||||
max_upload_file_limit: usize,
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
logging::init(
|
||||
logging::LogFormat::Plain,
|
||||
logging::TracingErrorLayerEnablement::EnableWithRustLogFilter,
|
||||
logging::Output::Stdout,
|
||||
)?;
|
||||
|
||||
let config: String = std::env::args().skip(1).take(1).collect();
|
||||
if config.is_empty() {
|
||||
anyhow::bail!("Usage: object_storage config.json")
|
||||
}
|
||||
info!("Reading config from {config}");
|
||||
let config = std::fs::read_to_string(config.clone())?;
|
||||
let config: Config = serde_json::from_str(&config).context("parsing config")?;
|
||||
info!("Reading pemfile from {}", config.pemfile.clone());
|
||||
let pemfile = std::fs::read(config.pemfile.clone())?;
|
||||
info!("Loading public key from {}", config.pemfile.clone());
|
||||
let auth = object_storage::JwtAuth::new(&pemfile)?;
|
||||
|
||||
let listener = tokio::net::TcpListener::bind(config.listen).await.unwrap();
|
||||
info!("listening on {}", listener.local_addr().unwrap());
|
||||
|
||||
let storage = remote_storage::GenericRemoteStorage::from_config(&config.storage_config).await?;
|
||||
let cancel = tokio_util::sync::CancellationToken::new();
|
||||
app::check_storage_permissions(&storage, cancel.clone()).await?;
|
||||
|
||||
let proxy = std::sync::Arc::new(object_storage::Storage {
|
||||
auth,
|
||||
storage,
|
||||
cancel: cancel.clone(),
|
||||
max_upload_file_limit: config.max_upload_file_limit,
|
||||
});
|
||||
|
||||
tokio::spawn(utils::signals::signal_handler(cancel.clone()));
|
||||
axum::serve(listener, app::app(proxy))
|
||||
.with_graceful_shutdown(async move { cancel.cancelled().await })
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
@@ -65,7 +65,7 @@ use bytes::{Buf, Bytes};
|
||||
use criterion::{BenchmarkId, Criterion};
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver::config::PageServerConf;
|
||||
use pageserver::walredo::PostgresRedoManager;
|
||||
use pageserver::walredo::{PostgresRedoManager, RedoAttemptType};
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
@@ -223,7 +223,14 @@ impl Request {
|
||||
|
||||
// TODO: avoid these clones
|
||||
manager
|
||||
.request_redo(*key, *lsn, base_img.clone(), records.clone(), *pg_version)
|
||||
.request_redo(
|
||||
*key,
|
||||
*lsn,
|
||||
base_img.clone(),
|
||||
records.clone(),
|
||||
*pg_version,
|
||||
RedoAttemptType::ReadPage,
|
||||
)
|
||||
.await
|
||||
.context("request_redo")
|
||||
}
|
||||
|
||||
@@ -9,7 +9,7 @@ use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::{Context, anyhow};
|
||||
use anyhow::{Context, anyhow, bail};
|
||||
use camino::Utf8Path;
|
||||
use clap::{Arg, ArgAction, Command};
|
||||
use http_utils::tls_certs::ReloadingCertificateResolver;
|
||||
@@ -31,7 +31,6 @@ use pageserver::{
|
||||
};
|
||||
use postgres_backend::AuthType;
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use tokio::signal::unix::SignalKind;
|
||||
use tokio::time::Instant;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
@@ -80,6 +79,8 @@ fn main() -> anyhow::Result<()> {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let dev_mode = arg_matches.get_flag("dev");
|
||||
|
||||
// Initialize up failpoints support
|
||||
let scenario = failpoint_support::init();
|
||||
|
||||
@@ -100,6 +101,20 @@ fn main() -> anyhow::Result<()> {
|
||||
|
||||
let (conf, ignored) = initialize_config(&identity_file_path, &cfg_file_path, &workdir)?;
|
||||
|
||||
if !dev_mode {
|
||||
if matches!(conf.http_auth_type, AuthType::Trust)
|
||||
|| matches!(conf.pg_auth_type, AuthType::Trust)
|
||||
{
|
||||
bail!(
|
||||
"Pageserver refuses to start with HTTP or PostgreSQL API authentication disabled.\n\
|
||||
Run with --dev to allow running without authentication.\n\
|
||||
This is insecure and should only be used in development environments."
|
||||
);
|
||||
}
|
||||
} else {
|
||||
warn!("Starting in dev mode: this may be an insecure configuration.");
|
||||
}
|
||||
|
||||
// Initialize logging.
|
||||
//
|
||||
// It must be initialized before the custom panic hook is installed below.
|
||||
@@ -744,32 +759,7 @@ fn start_pageserver(
|
||||
let signal_token = CancellationToken::new();
|
||||
let signal_cancel = signal_token.child_token();
|
||||
|
||||
// Spawn signal handlers. Runs in a loop since we want to be responsive to multiple signals
|
||||
// even after triggering shutdown (e.g. a SIGQUIT after a slow SIGTERM shutdown). See:
|
||||
// https://github.com/neondatabase/neon/issues/9740.
|
||||
tokio::spawn(async move {
|
||||
let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt()).unwrap();
|
||||
let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate()).unwrap();
|
||||
let mut sigquit = tokio::signal::unix::signal(SignalKind::quit()).unwrap();
|
||||
|
||||
loop {
|
||||
let signal = tokio::select! {
|
||||
_ = sigquit.recv() => {
|
||||
info!("Got signal SIGQUIT. Terminating in immediate shutdown mode.");
|
||||
std::process::exit(111);
|
||||
}
|
||||
_ = sigint.recv() => "SIGINT",
|
||||
_ = sigterm.recv() => "SIGTERM",
|
||||
};
|
||||
|
||||
if !signal_token.is_cancelled() {
|
||||
info!("Got signal {signal}. Terminating gracefully in fast shutdown mode.");
|
||||
signal_token.cancel();
|
||||
} else {
|
||||
info!("Got signal {signal}. Already shutting down.");
|
||||
}
|
||||
}
|
||||
});
|
||||
tokio::spawn(utils::signals::signal_handler(signal_token));
|
||||
|
||||
// Wait for cancellation signal and shut down the pageserver.
|
||||
//
|
||||
@@ -842,6 +832,12 @@ fn cli() -> Command {
|
||||
.action(ArgAction::SetTrue)
|
||||
.help("Show enabled compile time features"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("dev")
|
||||
.long("dev")
|
||||
.action(ArgAction::SetTrue)
|
||||
.help("Run in development mode (disables security checks)"),
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -212,6 +212,12 @@ paths:
|
||||
schema:
|
||||
type: string
|
||||
format: date-time
|
||||
"412":
|
||||
description: No timestamp is found for given LSN, e.g. if there had been no commits till LSN
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/PreconditionFailedError"
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp:
|
||||
parameters:
|
||||
|
||||
@@ -67,7 +67,7 @@ use crate::tenant::mgr::{
|
||||
};
|
||||
use crate::tenant::remote_timeline_client::index::GcCompactionState;
|
||||
use crate::tenant::remote_timeline_client::{
|
||||
download_index_part, list_remote_tenant_shards, list_remote_timelines,
|
||||
download_index_part, download_tenant_manifest, list_remote_tenant_shards, list_remote_timelines,
|
||||
};
|
||||
use crate::tenant::secondary::SecondaryController;
|
||||
use crate::tenant::size::ModelInputs;
|
||||
@@ -989,7 +989,7 @@ async fn get_lsn_by_timestamp_handler(
|
||||
if !tenant_shard_id.is_shard_zero() {
|
||||
// Requires SLRU contents, which are only stored on shard zero
|
||||
return Err(ApiError::BadRequest(anyhow!(
|
||||
"Size calculations are only available on shard zero"
|
||||
"Lsn calculations by timestamp are only available on shard zero"
|
||||
)));
|
||||
}
|
||||
|
||||
@@ -1064,7 +1064,7 @@ async fn get_timestamp_of_lsn_handler(
|
||||
if !tenant_shard_id.is_shard_zero() {
|
||||
// Requires SLRU contents, which are only stored on shard zero
|
||||
return Err(ApiError::BadRequest(anyhow!(
|
||||
"Size calculations are only available on shard zero"
|
||||
"Timestamp calculations by lsn are only available on shard zero"
|
||||
)));
|
||||
}
|
||||
|
||||
@@ -1090,8 +1090,8 @@ async fn get_timestamp_of_lsn_handler(
|
||||
.to_string();
|
||||
json_response(StatusCode::OK, time)
|
||||
}
|
||||
None => Err(ApiError::NotFound(
|
||||
anyhow::anyhow!("Timestamp for lsn {} not found", lsn).into(),
|
||||
None => Err(ApiError::PreconditionFailed(
|
||||
format!("Timestamp for lsn {} not found", lsn).into(),
|
||||
)),
|
||||
}
|
||||
}
|
||||
@@ -2274,6 +2274,7 @@ async fn timeline_compact_handler(
|
||||
if Some(true) == parse_query_param::<_, bool>(&request, "dry_run")? {
|
||||
flags |= CompactFlags::DryRun;
|
||||
}
|
||||
// Manual compaction does not yield for L0.
|
||||
|
||||
let wait_until_uploaded =
|
||||
parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);
|
||||
@@ -2911,9 +2912,22 @@ async fn tenant_scan_remote_handler(
|
||||
};
|
||||
}
|
||||
|
||||
let result =
|
||||
download_tenant_manifest(&state.remote_storage, &tenant_shard_id, generation, &cancel)
|
||||
.instrument(info_span!("download_tenant_manifest",
|
||||
tenant_id=%tenant_shard_id.tenant_id,
|
||||
shard_id=%tenant_shard_id.shard_slug()))
|
||||
.await;
|
||||
let stripe_size = match result {
|
||||
Ok((manifest, _, _)) => manifest.stripe_size,
|
||||
Err(DownloadError::NotFound) => None,
|
||||
Err(err) => return Err(ApiError::InternalServerError(anyhow!(err))),
|
||||
};
|
||||
|
||||
response.shards.push(TenantScanRemoteStorageShard {
|
||||
tenant_shard_id,
|
||||
generation: generation.into(),
|
||||
stripe_size,
|
||||
});
|
||||
}
|
||||
|
||||
@@ -3368,11 +3382,11 @@ async fn put_tenant_timeline_import_basebackup(
|
||||
|
||||
let broker_client = state.broker_client.clone();
|
||||
|
||||
let mut body = StreamReader::new(request.into_body().map(|res| {
|
||||
res.map_err(|error| {
|
||||
std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!(error))
|
||||
})
|
||||
}));
|
||||
let mut body = StreamReader::new(
|
||||
request
|
||||
.into_body()
|
||||
.map(|res| res.map_err(|error| std::io::Error::other(anyhow::anyhow!(error)))),
|
||||
);
|
||||
|
||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||
|
||||
@@ -3446,7 +3460,7 @@ async fn put_tenant_timeline_import_wal(
|
||||
|
||||
let mut body = StreamReader::new(request.into_body().map(|res| {
|
||||
res.map_err(|error| {
|
||||
std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!(error))
|
||||
std::io::Error::other( anyhow::anyhow!(error))
|
||||
})
|
||||
}));
|
||||
|
||||
|
||||
@@ -691,7 +691,7 @@ impl Timeline {
|
||||
Ok(buf.get_u32_le())
|
||||
}
|
||||
|
||||
/// Get size of an SLRU segment
|
||||
/// Does the slru segment exist?
|
||||
pub(crate) async fn get_slru_segment_exists(
|
||||
&self,
|
||||
kind: SlruKind,
|
||||
@@ -844,9 +844,9 @@ impl Timeline {
|
||||
.await
|
||||
}
|
||||
|
||||
/// Obtain the possible timestamp range for the given lsn.
|
||||
/// Obtain the timestamp for the given lsn.
|
||||
///
|
||||
/// If the lsn has no timestamps, returns None. returns `(min, max, median)` if it has timestamps.
|
||||
/// If the lsn has no timestamps (e.g. no commits), returns None.
|
||||
pub(crate) async fn get_timestamp_for_lsn(
|
||||
&self,
|
||||
probe_lsn: Lsn,
|
||||
|
||||
@@ -100,7 +100,7 @@ use crate::tenant::timeline::delete::DeleteTimelineFlow;
|
||||
use crate::tenant::timeline::uninit::cleanup_timeline_directory;
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use crate::walingest::WalLagCooldown;
|
||||
use crate::walredo::PostgresRedoManager;
|
||||
use crate::walredo::{PostgresRedoManager, RedoAttemptType};
|
||||
use crate::{InitializationOrder, TEMP_FILE_SUFFIX, import_datadir, span, task_mgr, walredo};
|
||||
|
||||
static INIT_DB_SEMAPHORE: Lazy<Semaphore> = Lazy::new(|| Semaphore::new(8));
|
||||
@@ -473,15 +473,16 @@ impl WalRedoManager {
|
||||
base_img: Option<(Lsn, bytes::Bytes)>,
|
||||
records: Vec<(Lsn, pageserver_api::record::NeonWalRecord)>,
|
||||
pg_version: u32,
|
||||
redo_attempt_type: RedoAttemptType,
|
||||
) -> Result<bytes::Bytes, walredo::Error> {
|
||||
match self {
|
||||
Self::Prod(_, mgr) => {
|
||||
mgr.request_redo(key, lsn, base_img, records, pg_version)
|
||||
mgr.request_redo(key, lsn, base_img, records, pg_version, redo_attempt_type)
|
||||
.await
|
||||
}
|
||||
#[cfg(test)]
|
||||
Self::Test(mgr) => {
|
||||
mgr.request_redo(key, lsn, base_img, records, pg_version)
|
||||
mgr.request_redo(key, lsn, base_img, records, pg_version, redo_attempt_type)
|
||||
.await
|
||||
}
|
||||
}
|
||||
@@ -920,6 +921,7 @@ enum StartCreatingTimelineResult {
|
||||
Idempotent(Arc<Timeline>),
|
||||
}
|
||||
|
||||
#[allow(clippy::large_enum_variant, reason = "TODO")]
|
||||
enum TimelineInitAndSyncResult {
|
||||
ReadyToActivate(Arc<Timeline>),
|
||||
NeedsSpawnImportPgdata(TimelineInitAndSyncNeedsSpawnImportPgdata),
|
||||
@@ -1006,6 +1008,7 @@ enum CreateTimelineCause {
|
||||
Delete,
|
||||
}
|
||||
|
||||
#[allow(clippy::large_enum_variant, reason = "TODO")]
|
||||
enum LoadTimelineCause {
|
||||
Attach,
|
||||
Unoffload,
|
||||
@@ -4079,6 +4082,7 @@ impl Tenant {
|
||||
|
||||
TenantManifest {
|
||||
version: LATEST_TENANT_MANIFEST_VERSION,
|
||||
stripe_size: Some(self.get_shard_stripe_size()),
|
||||
offloaded_timelines,
|
||||
}
|
||||
}
|
||||
@@ -4398,10 +4402,7 @@ impl Tenant {
|
||||
.to_string();
|
||||
|
||||
fail::fail_point!("tenant-config-before-write", |_| {
|
||||
Err(std::io::Error::new(
|
||||
std::io::ErrorKind::Other,
|
||||
"tenant-config-before-write",
|
||||
))
|
||||
Err(std::io::Error::other("tenant-config-before-write"))
|
||||
});
|
||||
|
||||
// Convert the config to a toml file.
|
||||
@@ -5879,6 +5880,7 @@ pub(crate) mod harness {
|
||||
base_img: Option<(Lsn, Bytes)>,
|
||||
records: Vec<(Lsn, NeonWalRecord)>,
|
||||
_pg_version: u32,
|
||||
_redo_attempt_type: RedoAttemptType,
|
||||
) -> Result<Bytes, walredo::Error> {
|
||||
let records_neon = records.iter().all(|r| apply_neon::can_apply_in_neon(&r.1));
|
||||
if records_neon {
|
||||
@@ -8733,6 +8735,21 @@ mod tests {
|
||||
Lsn(0x20),
|
||||
Value::WalRecord(NeonWalRecord::wal_init("i")),
|
||||
),
|
||||
(
|
||||
get_key(4),
|
||||
Lsn(0x30),
|
||||
Value::WalRecord(NeonWalRecord::wal_append_conditional("j", "i")),
|
||||
),
|
||||
(
|
||||
get_key(5),
|
||||
Lsn(0x20),
|
||||
Value::WalRecord(NeonWalRecord::wal_init("1")),
|
||||
),
|
||||
(
|
||||
get_key(5),
|
||||
Lsn(0x30),
|
||||
Value::WalRecord(NeonWalRecord::wal_append_conditional("j", "2")),
|
||||
),
|
||||
];
|
||||
let image1 = vec![(get_key(1), "0x10".into())];
|
||||
|
||||
@@ -8763,8 +8780,18 @@ mod tests {
|
||||
|
||||
// Need to remove the limit of "Neon WAL redo requires base image".
|
||||
|
||||
// assert_eq!(tline.get(get_key(3), Lsn(0x50), &ctx).await?, Bytes::new());
|
||||
// assert_eq!(tline.get(get_key(4), Lsn(0x50), &ctx).await?, Bytes::new());
|
||||
assert_eq!(
|
||||
tline.get(get_key(3), Lsn(0x50), &ctx).await?,
|
||||
Bytes::from_static(b"c")
|
||||
);
|
||||
assert_eq!(
|
||||
tline.get(get_key(4), Lsn(0x50), &ctx).await?,
|
||||
Bytes::from_static(b"ij")
|
||||
);
|
||||
|
||||
// Manual testing required: currently, read errors will panic the process in debug mode. So we
|
||||
// cannot enable this assertion in the unit test.
|
||||
// assert!(tline.get(get_key(5), Lsn(0x50), &ctx).await.is_err());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
//! len >= 128: 1CCCXXXX XXXXXXXX XXXXXXXX XXXXXXXX
|
||||
//!
|
||||
use std::cmp::min;
|
||||
use std::io::{Error, ErrorKind};
|
||||
use std::io::Error;
|
||||
|
||||
use async_compression::Level;
|
||||
use bytes::{BufMut, BytesMut};
|
||||
@@ -331,10 +331,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
||||
return (
|
||||
(
|
||||
io_buf.slice_len(),
|
||||
Err(Error::new(
|
||||
ErrorKind::Other,
|
||||
format!("blob too large ({len} bytes)"),
|
||||
)),
|
||||
Err(Error::other(format!("blob too large ({len} bytes)"))),
|
||||
),
|
||||
srcbuf,
|
||||
);
|
||||
|
||||
@@ -216,12 +216,8 @@ impl<'a> FileBlockReader<'a> {
|
||||
match cache
|
||||
.read_immutable_buf(self.file_id, blknum, ctx)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
std::io::Error::new(
|
||||
std::io::ErrorKind::Other,
|
||||
format!("Failed to read immutable buf: {e:#}"),
|
||||
)
|
||||
})? {
|
||||
.map_err(|e| std::io::Error::other(format!("Failed to read immutable buf: {e:#}")))?
|
||||
{
|
||||
ReadBufResult::Found(guard) => Ok(guard.into()),
|
||||
ReadBufResult::NotFound(write_guard) => {
|
||||
// Read the page from disk into the buffer
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
use chrono::NaiveDateTime;
|
||||
use pageserver_api::shard::ShardStripeSize;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::id::TimelineId;
|
||||
use utils::lsn::Lsn;
|
||||
@@ -14,6 +15,12 @@ pub struct TenantManifest {
|
||||
/// allow release rollbacks.
|
||||
pub version: usize,
|
||||
|
||||
/// This tenant's stripe size. This is only advisory, and used to recover tenant data from
|
||||
/// remote storage. The autoritative source is the storage controller. If None, assume the
|
||||
/// original default value of 32768 blocks (256 MB).
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub stripe_size: Option<ShardStripeSize>,
|
||||
|
||||
/// The list of offloaded timelines together with enough information
|
||||
/// to not have to actually load them.
|
||||
///
|
||||
@@ -42,7 +49,12 @@ pub struct OffloadedTimelineManifest {
|
||||
|
||||
/// The newest manifest version. This should be incremented on changes, even non-breaking ones. We
|
||||
/// do not use deny_unknown_fields, so new fields are not breaking.
|
||||
pub const LATEST_TENANT_MANIFEST_VERSION: usize = 1;
|
||||
///
|
||||
/// 1: initial version
|
||||
/// 2: +stripe_size
|
||||
///
|
||||
/// When adding new versions, also add a parse_vX test case below.
|
||||
pub const LATEST_TENANT_MANIFEST_VERSION: usize = 2;
|
||||
|
||||
impl TenantManifest {
|
||||
/// Returns true if the manifests are equal, ignoring the version number. This avoids
|
||||
@@ -56,10 +68,11 @@ impl TenantManifest {
|
||||
// We could alternatively just clone and modify the version here.
|
||||
let Self {
|
||||
version: _, // ignore version
|
||||
stripe_size,
|
||||
offloaded_timelines,
|
||||
} = self;
|
||||
|
||||
offloaded_timelines == &other.offloaded_timelines
|
||||
stripe_size == &other.stripe_size && offloaded_timelines == &other.offloaded_timelines
|
||||
}
|
||||
|
||||
/// Decodes a manifest from JSON.
|
||||
@@ -89,6 +102,7 @@ mod tests {
|
||||
}"#;
|
||||
let expected = TenantManifest {
|
||||
version: 0,
|
||||
stripe_size: None,
|
||||
offloaded_timelines: Vec::new(),
|
||||
};
|
||||
assert_eq!(expected, TenantManifest::from_json_bytes(json.as_bytes())?);
|
||||
@@ -104,6 +118,7 @@ mod tests {
|
||||
}"#;
|
||||
let expected = TenantManifest {
|
||||
version: 1,
|
||||
stripe_size: None,
|
||||
offloaded_timelines: Vec::new(),
|
||||
};
|
||||
assert_eq!(expected, TenantManifest::from_json_bytes(json.as_bytes())?);
|
||||
@@ -130,6 +145,50 @@ mod tests {
|
||||
}"#;
|
||||
let expected = TenantManifest {
|
||||
version: 1,
|
||||
stripe_size: None,
|
||||
offloaded_timelines: vec![
|
||||
OffloadedTimelineManifest {
|
||||
timeline_id: TimelineId::from_str("5c4df612fd159e63c1b7853fe94d97da")?,
|
||||
ancestor_timeline_id: None,
|
||||
ancestor_retain_lsn: None,
|
||||
archived_at: NaiveDateTime::from_str("2025-03-07T11:07:11.373105434")?,
|
||||
},
|
||||
OffloadedTimelineManifest {
|
||||
timeline_id: TimelineId::from_str("f3def5823ad7080d2ea538d8e12163fa")?,
|
||||
ancestor_timeline_id: Some(TimelineId::from_str(
|
||||
"5c4df612fd159e63c1b7853fe94d97da",
|
||||
)?),
|
||||
ancestor_retain_lsn: Some(Lsn::from_str("0/1F79038")?),
|
||||
archived_at: NaiveDateTime::from_str("2025-03-05T11:10:22.257901390")?,
|
||||
},
|
||||
],
|
||||
};
|
||||
assert_eq!(expected, TenantManifest::from_json_bytes(json.as_bytes())?);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// v2 manifests should be parsed, for backwards compatibility.
|
||||
#[test]
|
||||
fn parse_v2() -> anyhow::Result<()> {
|
||||
let json = r#"{
|
||||
"version": 2,
|
||||
"stripe_size": 32768,
|
||||
"offloaded_timelines": [
|
||||
{
|
||||
"timeline_id": "5c4df612fd159e63c1b7853fe94d97da",
|
||||
"archived_at": "2025-03-07T11:07:11.373105434"
|
||||
},
|
||||
{
|
||||
"timeline_id": "f3def5823ad7080d2ea538d8e12163fa",
|
||||
"ancestor_timeline_id": "5c4df612fd159e63c1b7853fe94d97da",
|
||||
"ancestor_retain_lsn": "0/1F79038",
|
||||
"archived_at": "2025-03-05T11:10:22.257901390"
|
||||
}
|
||||
]
|
||||
}"#;
|
||||
let expected = TenantManifest {
|
||||
version: 2,
|
||||
stripe_size: Some(ShardStripeSize(32768)),
|
||||
offloaded_timelines: vec![
|
||||
OffloadedTimelineManifest {
|
||||
timeline_id: TimelineId::from_str("5c4df612fd159e63c1b7853fe94d97da")?,
|
||||
|
||||
@@ -366,7 +366,7 @@ impl SplitDeltaLayerWriter {
|
||||
)
|
||||
.await?;
|
||||
let (start_key, prev_delta_writer) =
|
||||
std::mem::replace(&mut self.inner, Some((key, next_delta_writer))).unwrap();
|
||||
self.inner.replace((key, next_delta_writer)).unwrap();
|
||||
self.batches.add_unfinished_delta_writer(
|
||||
prev_delta_writer,
|
||||
start_key..key,
|
||||
|
||||
@@ -766,7 +766,7 @@ mod tests {
|
||||
rand::Rng::fill(&mut rand::thread_rng(), &mut dst_slice[len..]); // to discover bugs
|
||||
Ok((dst, len))
|
||||
}
|
||||
Err(e) => Err(std::io::Error::new(std::io::ErrorKind::Other, e)),
|
||||
Err(e) => Err(std::io::Error::other(e)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -59,6 +59,7 @@ impl LayerIterRef<'_> {
|
||||
/// 1. Unified iterator for image and delta layers.
|
||||
/// 2. `Ord` for use in [`MergeIterator::heap`] (for the k-merge).
|
||||
/// 3. Lazy creation of the real delta/image iterator.
|
||||
#[allow(clippy::large_enum_variant, reason = "TODO")]
|
||||
pub(crate) enum IteratorWrapper<'a> {
|
||||
NotLoaded {
|
||||
ctx: &'a RequestContext,
|
||||
|
||||
@@ -24,6 +24,7 @@ use std::sync::{Arc, Mutex, OnceLock, RwLock, Weak};
|
||||
use std::time::{Duration, Instant, SystemTime};
|
||||
|
||||
use crate::PERF_TRACE_TARGET;
|
||||
use crate::walredo::RedoAttemptType;
|
||||
use anyhow::{Context, Result, anyhow, bail, ensure};
|
||||
use arc_swap::{ArcSwap, ArcSwapOption};
|
||||
use bytes::Bytes;
|
||||
@@ -115,7 +116,7 @@ use crate::pgdatadir_mapping::{
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::config::AttachmentMode;
|
||||
use crate::tenant::gc_result::GcResult;
|
||||
use crate::tenant::layer_map::{LayerMap, SearchResult};
|
||||
use crate::tenant::layer_map::LayerMap;
|
||||
use crate::tenant::metadata::TimelineMetadata;
|
||||
use crate::tenant::storage_layer::delta_layer::DeltaEntry;
|
||||
use crate::tenant::storage_layer::inmemory_layer::IndexEntry;
|
||||
@@ -1039,6 +1040,7 @@ pub(crate) enum ShutdownMode {
|
||||
Hard,
|
||||
}
|
||||
|
||||
#[allow(clippy::large_enum_variant, reason = "TODO")]
|
||||
enum ImageLayerCreationOutcome {
|
||||
/// We generated an image layer
|
||||
Generated {
|
||||
@@ -1292,6 +1294,12 @@ impl Timeline {
|
||||
};
|
||||
reconstruct_state.read_path = read_path;
|
||||
|
||||
let redo_attempt_type = if ctx.task_kind() == TaskKind::Compaction {
|
||||
RedoAttemptType::LegacyCompaction
|
||||
} else {
|
||||
RedoAttemptType::ReadPage
|
||||
};
|
||||
|
||||
let traversal_res: Result<(), _> = {
|
||||
let ctx = RequestContextBuilder::from(ctx)
|
||||
.perf_span(|crnt_perf_span| {
|
||||
@@ -1379,7 +1387,7 @@ impl Timeline {
|
||||
|
||||
let walredo_deltas = converted.num_deltas();
|
||||
let walredo_res = walredo_self
|
||||
.reconstruct_value(key, lsn, converted)
|
||||
.reconstruct_value(key, lsn, converted, redo_attempt_type)
|
||||
.maybe_perf_instrument(&ctx, |crnt_perf_span| {
|
||||
info_span!(
|
||||
target: PERF_TRACE_TARGET,
|
||||
@@ -4104,12 +4112,6 @@ impl Timeline {
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<TimelineVisitOutcome, GetVectoredError> {
|
||||
let mut unmapped_keyspace = keyspace.clone();
|
||||
let mut fringe = LayerFringe::new();
|
||||
|
||||
let mut completed_keyspace = KeySpace::default();
|
||||
let mut image_covered_keyspace = KeySpaceRandomAccum::new();
|
||||
|
||||
// Prevent GC from progressing while visiting the current timeline.
|
||||
// If we are GC-ing because a new image layer was added while traversing
|
||||
// the timeline, then it will remove layers that are required for fulfilling
|
||||
@@ -4120,11 +4122,44 @@ impl Timeline {
|
||||
// See `compaction::compact_with_gc` for why we need this.
|
||||
let _guard = timeline.gc_compaction_layer_update_lock.read().await;
|
||||
|
||||
loop {
|
||||
// Initialize the fringe
|
||||
let mut fringe = {
|
||||
let mut fringe = LayerFringe::new();
|
||||
|
||||
let guard = timeline.layers.read().await;
|
||||
guard.update_search_fringe(&keyspace, cont_lsn, &mut fringe)?;
|
||||
|
||||
fringe
|
||||
};
|
||||
|
||||
let mut completed_keyspace = KeySpace::default();
|
||||
let mut image_covered_keyspace = KeySpaceRandomAccum::new();
|
||||
|
||||
while let Some((layer_to_read, keyspace_to_read, lsn_range)) = fringe.next_layer() {
|
||||
if cancel.is_cancelled() {
|
||||
return Err(GetVectoredError::Cancelled);
|
||||
}
|
||||
|
||||
if let Some(ref mut read_path) = reconstruct_state.read_path {
|
||||
read_path.record_layer_visit(&layer_to_read, &keyspace_to_read, &lsn_range);
|
||||
}
|
||||
|
||||
// Visit the layer and plan IOs for it
|
||||
let next_cont_lsn = lsn_range.start;
|
||||
layer_to_read
|
||||
.get_values_reconstruct_data(
|
||||
keyspace_to_read.clone(),
|
||||
lsn_range,
|
||||
reconstruct_state,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let mut unmapped_keyspace = keyspace_to_read;
|
||||
cont_lsn = next_cont_lsn;
|
||||
|
||||
reconstruct_state.on_layer_visited(&layer_to_read);
|
||||
|
||||
let (keys_done_last_step, keys_with_image_coverage) =
|
||||
reconstruct_state.consume_done_keys();
|
||||
unmapped_keyspace.remove_overlapping_with(&keys_done_last_step);
|
||||
@@ -4135,31 +4170,15 @@ impl Timeline {
|
||||
image_covered_keyspace.add_range(keys_with_image_coverage);
|
||||
}
|
||||
|
||||
// Query the layer map for the next layers to read.
|
||||
//
|
||||
// Do not descent any further if the last layer we visited
|
||||
// completed all keys in the keyspace it inspected. This is not
|
||||
// required for correctness, but avoids visiting extra layers
|
||||
// which turns out to be a perf bottleneck in some cases.
|
||||
if !unmapped_keyspace.is_empty() {
|
||||
let guard = timeline.layers.read().await;
|
||||
let layers = guard.layer_map()?;
|
||||
|
||||
for range in unmapped_keyspace.ranges.iter() {
|
||||
let results = layers.range_search(range.clone(), cont_lsn);
|
||||
|
||||
results
|
||||
.found
|
||||
.into_iter()
|
||||
.map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| {
|
||||
(
|
||||
guard.upgrade(layer),
|
||||
keyspace_accum.to_keyspace(),
|
||||
lsn_floor..cont_lsn,
|
||||
)
|
||||
})
|
||||
.for_each(|(layer, keyspace, lsn_range)| {
|
||||
fringe.update(layer, keyspace, lsn_range)
|
||||
});
|
||||
}
|
||||
guard.update_search_fringe(&unmapped_keyspace, cont_lsn, &mut fringe)?;
|
||||
|
||||
// It's safe to drop the layer map lock after planning the next round of reads.
|
||||
// The fringe keeps readable handles for the layers which are safe to read even
|
||||
@@ -4173,28 +4192,6 @@ impl Timeline {
|
||||
// at two different time points.
|
||||
drop(guard);
|
||||
}
|
||||
|
||||
if let Some((layer_to_read, keyspace_to_read, lsn_range)) = fringe.next_layer() {
|
||||
if let Some(ref mut read_path) = reconstruct_state.read_path {
|
||||
read_path.record_layer_visit(&layer_to_read, &keyspace_to_read, &lsn_range);
|
||||
}
|
||||
let next_cont_lsn = lsn_range.start;
|
||||
layer_to_read
|
||||
.get_values_reconstruct_data(
|
||||
keyspace_to_read.clone(),
|
||||
lsn_range,
|
||||
reconstruct_state,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
unmapped_keyspace = keyspace_to_read;
|
||||
cont_lsn = next_cont_lsn;
|
||||
|
||||
reconstruct_state.on_layer_visited(&layer_to_read);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(TimelineVisitOutcome {
|
||||
@@ -6353,37 +6350,21 @@ impl Timeline {
|
||||
|
||||
/// Reconstruct a value, using the given base image and WAL records in 'data'.
|
||||
async fn reconstruct_value(
|
||||
&self,
|
||||
key: Key,
|
||||
request_lsn: Lsn,
|
||||
data: ValueReconstructState,
|
||||
) -> Result<Bytes, PageReconstructError> {
|
||||
self.reconstruct_value_inner(key, request_lsn, data, false)
|
||||
.await
|
||||
}
|
||||
|
||||
/// Reconstruct a value, using the given base image and WAL records in 'data'. It does not fire critical errors because
|
||||
/// sometimes it is expected to fail due to unreplayable history described in <https://github.com/neondatabase/neon/issues/10395>.
|
||||
async fn reconstruct_value_wo_critical_error(
|
||||
&self,
|
||||
key: Key,
|
||||
request_lsn: Lsn,
|
||||
data: ValueReconstructState,
|
||||
) -> Result<Bytes, PageReconstructError> {
|
||||
self.reconstruct_value_inner(key, request_lsn, data, true)
|
||||
.await
|
||||
}
|
||||
|
||||
async fn reconstruct_value_inner(
|
||||
&self,
|
||||
key: Key,
|
||||
request_lsn: Lsn,
|
||||
mut data: ValueReconstructState,
|
||||
no_critical_error: bool,
|
||||
redo_attempt_type: RedoAttemptType,
|
||||
) -> Result<Bytes, PageReconstructError> {
|
||||
// Perform WAL redo if needed
|
||||
data.records.reverse();
|
||||
|
||||
let fire_critical_error = match redo_attempt_type {
|
||||
RedoAttemptType::ReadPage => true,
|
||||
RedoAttemptType::LegacyCompaction => true,
|
||||
RedoAttemptType::GcCompaction => false,
|
||||
};
|
||||
|
||||
// If we have a page image, and no WAL, we're all set
|
||||
if data.records.is_empty() {
|
||||
if let Some((img_lsn, img)) = &data.img {
|
||||
@@ -6430,13 +6411,20 @@ impl Timeline {
|
||||
.as_ref()
|
||||
.context("timeline has no walredo manager")
|
||||
.map_err(PageReconstructError::WalRedo)?
|
||||
.request_redo(key, request_lsn, data.img, data.records, self.pg_version)
|
||||
.request_redo(
|
||||
key,
|
||||
request_lsn,
|
||||
data.img,
|
||||
data.records,
|
||||
self.pg_version,
|
||||
redo_attempt_type,
|
||||
)
|
||||
.await;
|
||||
let img = match res {
|
||||
Ok(img) => img,
|
||||
Err(walredo::Error::Cancelled) => return Err(PageReconstructError::Cancelled),
|
||||
Err(walredo::Error::Other(err)) => {
|
||||
if !no_critical_error {
|
||||
if fire_critical_error {
|
||||
critical!("walredo failure during page reconstruction: {err:?}");
|
||||
}
|
||||
return Err(PageReconstructError::WalRedo(
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
use std::collections::{BinaryHeap, HashMap, HashSet, VecDeque};
|
||||
use std::ops::{Deref, Range};
|
||||
use std::sync::Arc;
|
||||
use std::time::Instant;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use super::layer_manager::LayerManager;
|
||||
use super::{
|
||||
@@ -16,6 +16,8 @@ use super::{
|
||||
Timeline,
|
||||
};
|
||||
|
||||
use crate::tenant::timeline::DeltaEntry;
|
||||
use crate::walredo::RedoAttemptType;
|
||||
use anyhow::{Context, anyhow};
|
||||
use bytes::Bytes;
|
||||
use enumset::EnumSet;
|
||||
@@ -315,6 +317,9 @@ impl GcCompactionQueue {
|
||||
flags: {
|
||||
let mut flags = EnumSet::new();
|
||||
flags |= CompactFlags::EnhancedGcBottomMostCompaction;
|
||||
if timeline.get_compaction_l0_first() {
|
||||
flags |= CompactFlags::YieldForL0;
|
||||
}
|
||||
flags
|
||||
},
|
||||
sub_compaction: true,
|
||||
@@ -819,15 +824,16 @@ pub struct CompactionStatistics {
|
||||
time_acquire_lock_secs: f64,
|
||||
time_analyze_secs: f64,
|
||||
time_download_layer_secs: f64,
|
||||
time_to_first_kv_pair_secs: f64,
|
||||
time_main_loop_secs: f64,
|
||||
time_final_phase_secs: f64,
|
||||
time_total_secs: f64,
|
||||
|
||||
// Summary
|
||||
/// Ratio of the key-value size before/after gc-compaction.
|
||||
uncompressed_size_ratio: f64,
|
||||
/// Ratio of the physical size before/after gc-compaction.
|
||||
physical_size_ratio: f64,
|
||||
/// Ratio of the key-value size after/before gc-compaction.
|
||||
uncompressed_retention_ratio: f64,
|
||||
/// Ratio of the physical size after/before gc-compaction.
|
||||
compressed_retention_ratio: f64,
|
||||
}
|
||||
|
||||
impl CompactionStatistics {
|
||||
@@ -896,15 +902,15 @@ impl CompactionStatistics {
|
||||
fn finalize(&mut self) {
|
||||
let original_key_value_size = self.image_keys_visited.size + self.wal_keys_visited.size;
|
||||
let produced_key_value_size = self.image_produced.size + self.wal_produced.size;
|
||||
self.uncompressed_size_ratio =
|
||||
original_key_value_size as f64 / (produced_key_value_size as f64 + 1.0); // avoid div by 0
|
||||
self.uncompressed_retention_ratio =
|
||||
produced_key_value_size as f64 / (original_key_value_size as f64 + 1.0); // avoid div by 0
|
||||
let original_physical_size = self.image_layer_visited.size + self.delta_layer_visited.size;
|
||||
let produced_physical_size = self.image_layer_produced.size
|
||||
+ self.delta_layer_produced.size
|
||||
+ self.image_layer_discarded.size
|
||||
+ self.delta_layer_discarded.size; // Also include the discarded layers to make the ratio accurate
|
||||
self.physical_size_ratio =
|
||||
original_physical_size as f64 / (produced_physical_size as f64 + 1.0); // avoid div by 0
|
||||
self.compressed_retention_ratio =
|
||||
produced_physical_size as f64 / (original_physical_size as f64 + 1.0); // avoid div by 0
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2411,7 +2417,7 @@ impl Timeline {
|
||||
lsn_split_points[i]
|
||||
};
|
||||
let img = self
|
||||
.reconstruct_value_wo_critical_error(key, request_lsn, state)
|
||||
.reconstruct_value(key, request_lsn, state, RedoAttemptType::GcCompaction)
|
||||
.await?;
|
||||
Some((request_lsn, img))
|
||||
} else {
|
||||
@@ -3032,7 +3038,7 @@ impl Timeline {
|
||||
.map_err(CompactionError::Other)?;
|
||||
|
||||
let time_download_layer = timer.elapsed();
|
||||
let timer = Instant::now();
|
||||
let mut timer = Instant::now();
|
||||
|
||||
// Step 2: Produce images+deltas.
|
||||
let mut accumulated_values = Vec::new();
|
||||
@@ -3107,6 +3113,7 @@ impl Timeline {
|
||||
// Actually, we can decide not to write to the image layer at all at this point because
|
||||
// the key and LSN range are determined. However, to keep things simple here, we still
|
||||
// create this writer, and discard the writer in the end.
|
||||
let mut time_to_first_kv_pair = None;
|
||||
|
||||
while let Some(((key, lsn, val), desc)) = merge_iter
|
||||
.next_with_trace()
|
||||
@@ -3114,6 +3121,11 @@ impl Timeline {
|
||||
.context("failed to get next key-value pair")
|
||||
.map_err(CompactionError::Other)?
|
||||
{
|
||||
if time_to_first_kv_pair.is_none() {
|
||||
time_to_first_kv_pair = Some(timer.elapsed());
|
||||
timer = Instant::now();
|
||||
}
|
||||
|
||||
if cancel.is_cancelled() {
|
||||
return Err(CompactionError::ShuttingDown);
|
||||
}
|
||||
@@ -3449,6 +3461,9 @@ impl Timeline {
|
||||
let time_final_phase = timer.elapsed();
|
||||
|
||||
stat.time_final_phase_secs = time_final_phase.as_secs_f64();
|
||||
stat.time_to_first_kv_pair_secs = time_to_first_kv_pair
|
||||
.unwrap_or(Duration::ZERO)
|
||||
.as_secs_f64();
|
||||
stat.time_main_loop_secs = time_main_loop.as_secs_f64();
|
||||
stat.time_acquire_lock_secs = time_acquire_lock.as_secs_f64();
|
||||
stat.time_download_layer_secs = time_download_layer.as_secs_f64();
|
||||
@@ -3909,8 +3924,6 @@ impl CompactionLayer<Key> for OwnArc<DeltaLayer> {
|
||||
}
|
||||
}
|
||||
|
||||
use crate::tenant::timeline::DeltaEntry;
|
||||
|
||||
impl CompactionLayer<Key> for ResidentDeltaLayer {
|
||||
fn key_range(&self) -> &Range<Key> {
|
||||
&self.0.layer_desc().key_range
|
||||
|
||||
@@ -3,17 +3,18 @@ use std::sync::Arc;
|
||||
|
||||
use anyhow::{Context, bail, ensure};
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::trace;
|
||||
use utils::id::TimelineId;
|
||||
use utils::lsn::{AtomicLsn, Lsn};
|
||||
|
||||
use super::{ReadableLayer, TimelineWriterState};
|
||||
use super::{LayerFringe, ReadableLayer, TimelineWriterState};
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::RequestContext;
|
||||
use crate::metrics::TimelineMetrics;
|
||||
use crate::tenant::layer_map::{BatchedUpdates, LayerMap};
|
||||
use crate::tenant::layer_map::{BatchedUpdates, LayerMap, SearchResult};
|
||||
use crate::tenant::storage_layer::{
|
||||
AsLayerDesc, InMemoryLayer, Layer, LayerVisibilityHint, PersistentLayerDesc,
|
||||
PersistentLayerKey, ReadableLayerWeak, ResidentLayer,
|
||||
@@ -38,7 +39,7 @@ impl Default for LayerManager {
|
||||
}
|
||||
|
||||
impl LayerManager {
|
||||
pub(crate) fn upgrade(&self, weak: ReadableLayerWeak) -> ReadableLayer {
|
||||
fn upgrade(&self, weak: ReadableLayerWeak) -> ReadableLayer {
|
||||
match weak {
|
||||
ReadableLayerWeak::PersistentLayer(desc) => {
|
||||
ReadableLayer::PersistentLayer(self.get_from_desc(&desc))
|
||||
@@ -147,6 +148,36 @@ impl LayerManager {
|
||||
self.layers().keys().cloned().collect_vec()
|
||||
}
|
||||
|
||||
/// Update the [`LayerFringe`] of a read request
|
||||
///
|
||||
/// Take a key space at a given LSN and query the layer map below each range
|
||||
/// of the key space to find the next layers to visit.
|
||||
pub(crate) fn update_search_fringe(
|
||||
&self,
|
||||
keyspace: &KeySpace,
|
||||
cont_lsn: Lsn,
|
||||
fringe: &mut LayerFringe,
|
||||
) -> Result<(), Shutdown> {
|
||||
let map = self.layer_map()?;
|
||||
|
||||
for range in keyspace.ranges.iter() {
|
||||
let results = map.range_search(range.clone(), cont_lsn);
|
||||
results
|
||||
.found
|
||||
.into_iter()
|
||||
.map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| {
|
||||
(
|
||||
self.upgrade(layer),
|
||||
keyspace_accum.to_keyspace(),
|
||||
lsn_floor..cont_lsn,
|
||||
)
|
||||
})
|
||||
.for_each(|(layer, keyspace, lsn_range)| fringe.update(layer, keyspace, lsn_range));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn layers(&self) -> &HashMap<PersistentLayerKey, Layer> {
|
||||
use LayerManager::*;
|
||||
match self {
|
||||
|
||||
@@ -445,7 +445,7 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
.inspect_err(|err| {
|
||||
// TODO: we can't differentiate cancellation errors with
|
||||
// anyhow::Error, so just ignore it if we're cancelled.
|
||||
if !cancellation.is_cancelled() {
|
||||
if !cancellation.is_cancelled() && !timeline.is_stopping() {
|
||||
critical!("{err:?}")
|
||||
}
|
||||
})?;
|
||||
@@ -577,7 +577,7 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
.inspect_err(|err| {
|
||||
// TODO: we can't differentiate cancellation errors with
|
||||
// anyhow::Error, so just ignore it if we're cancelled.
|
||||
if !cancellation.is_cancelled() {
|
||||
if !cancellation.is_cancelled() && !timeline.is_stopping() {
|
||||
critical!("{err:?}")
|
||||
}
|
||||
})?;
|
||||
|
||||
@@ -302,6 +302,7 @@ pub struct UploadQueueStoppedDeletable {
|
||||
pub(super) deleted_at: SetDeletedFlagProgress,
|
||||
}
|
||||
|
||||
#[allow(clippy::large_enum_variant, reason = "TODO")]
|
||||
pub enum UploadQueueStopped {
|
||||
Deletable(UploadQueueStoppedDeletable),
|
||||
Uninitialized,
|
||||
|
||||
@@ -136,6 +136,16 @@ macro_rules! bail {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub enum RedoAttemptType {
|
||||
/// Used for the read path. Will fire critical errors and retry twice if failure.
|
||||
ReadPage,
|
||||
// Used for legacy compaction (only used in image compaction). Will fire critical errors and retry once if failure.
|
||||
LegacyCompaction,
|
||||
// Used for gc compaction. Will not fire critical errors and not retry.
|
||||
GcCompaction,
|
||||
}
|
||||
|
||||
///
|
||||
/// Public interface of WAL redo manager
|
||||
///
|
||||
@@ -156,11 +166,18 @@ impl PostgresRedoManager {
|
||||
base_img: Option<(Lsn, Bytes)>,
|
||||
records: Vec<(Lsn, NeonWalRecord)>,
|
||||
pg_version: u32,
|
||||
redo_attempt_type: RedoAttemptType,
|
||||
) -> Result<Bytes, Error> {
|
||||
if records.is_empty() {
|
||||
bail!("invalid WAL redo request with no records");
|
||||
}
|
||||
|
||||
let max_retry_attempts = match redo_attempt_type {
|
||||
RedoAttemptType::ReadPage => 2,
|
||||
RedoAttemptType::LegacyCompaction => 1,
|
||||
RedoAttemptType::GcCompaction => 0,
|
||||
};
|
||||
|
||||
let base_img_lsn = base_img.as_ref().map(|p| p.0).unwrap_or(Lsn::INVALID);
|
||||
let mut img = base_img.map(|p| p.1);
|
||||
let mut batch_neon = apply_neon::can_apply_in_neon(&records[0].1);
|
||||
@@ -180,6 +197,7 @@ impl PostgresRedoManager {
|
||||
&records[batch_start..i],
|
||||
self.conf.wal_redo_timeout,
|
||||
pg_version,
|
||||
max_retry_attempts,
|
||||
)
|
||||
.await
|
||||
};
|
||||
@@ -201,6 +219,7 @@ impl PostgresRedoManager {
|
||||
&records[batch_start..],
|
||||
self.conf.wal_redo_timeout,
|
||||
pg_version,
|
||||
max_retry_attempts,
|
||||
)
|
||||
.await
|
||||
}
|
||||
@@ -424,11 +443,11 @@ impl PostgresRedoManager {
|
||||
records: &[(Lsn, NeonWalRecord)],
|
||||
wal_redo_timeout: Duration,
|
||||
pg_version: u32,
|
||||
max_retry_attempts: u32,
|
||||
) -> Result<Bytes, Error> {
|
||||
*(self.last_redo_at.lock().unwrap()) = Some(Instant::now());
|
||||
|
||||
let (rel, blknum) = key.to_rel_block().context("invalid record")?;
|
||||
const MAX_RETRY_ATTEMPTS: u32 = 1;
|
||||
let mut n_attempts = 0u32;
|
||||
loop {
|
||||
let base_img = &base_img;
|
||||
@@ -486,7 +505,7 @@ impl PostgresRedoManager {
|
||||
info!(n_attempts, "retried walredo succeeded");
|
||||
}
|
||||
n_attempts += 1;
|
||||
if n_attempts > MAX_RETRY_ATTEMPTS || result.is_ok() {
|
||||
if n_attempts > max_retry_attempts || result.is_ok() {
|
||||
return result;
|
||||
}
|
||||
}
|
||||
@@ -560,6 +579,7 @@ mod tests {
|
||||
|
||||
use super::PostgresRedoManager;
|
||||
use crate::config::PageServerConf;
|
||||
use crate::walredo::RedoAttemptType;
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_ping() {
|
||||
@@ -593,6 +613,7 @@ mod tests {
|
||||
None,
|
||||
short_records(),
|
||||
14,
|
||||
RedoAttemptType::ReadPage,
|
||||
)
|
||||
.instrument(h.span())
|
||||
.await
|
||||
@@ -621,6 +642,7 @@ mod tests {
|
||||
None,
|
||||
short_records(),
|
||||
14,
|
||||
RedoAttemptType::ReadPage,
|
||||
)
|
||||
.instrument(h.span())
|
||||
.await
|
||||
@@ -642,6 +664,7 @@ mod tests {
|
||||
None,
|
||||
short_records(),
|
||||
16, /* 16 currently produces stderr output on startup, which adds a nice extra edge */
|
||||
RedoAttemptType::ReadPage,
|
||||
)
|
||||
.instrument(h.span())
|
||||
.await
|
||||
|
||||
@@ -276,6 +276,7 @@ pub(crate) fn apply_in_neon(
|
||||
append,
|
||||
clear,
|
||||
will_init,
|
||||
only_if,
|
||||
} => {
|
||||
use bytes::BufMut;
|
||||
if *will_init {
|
||||
@@ -288,6 +289,13 @@ pub(crate) fn apply_in_neon(
|
||||
if *clear {
|
||||
page.clear();
|
||||
}
|
||||
if let Some(only_if) = only_if {
|
||||
if page != only_if.as_bytes() {
|
||||
return Err(anyhow::anyhow!(
|
||||
"the current image does not match the expected image, cannot append"
|
||||
));
|
||||
}
|
||||
}
|
||||
page.put_slice(append.as_bytes());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
MODULE_big = neon
|
||||
OBJS = \
|
||||
$(WIN32RES) \
|
||||
communicator.o \
|
||||
extension_server.o \
|
||||
file_cache.o \
|
||||
hll.o \
|
||||
|
||||
2504
pgxn/neon/communicator.c
Normal file
2504
pgxn/neon/communicator.c
Normal file
File diff suppressed because it is too large
Load Diff
48
pgxn/neon/communicator.h
Normal file
48
pgxn/neon/communicator.h
Normal file
@@ -0,0 +1,48 @@
|
||||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
* communicator.h
|
||||
* internal interface for communicating with remote pageservers
|
||||
*
|
||||
*
|
||||
* Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
#ifndef COMMUNICATOR_h
|
||||
#define COMMUNICATOR_h
|
||||
|
||||
#include "neon_pgversioncompat.h"
|
||||
|
||||
#include "storage/buf_internals.h"
|
||||
|
||||
#include "pagestore_client.h"
|
||||
|
||||
/* initialization at postmaster startup */
|
||||
extern void pg_init_communicator(void);
|
||||
|
||||
/* initialization at backend startup */
|
||||
extern void communicator_init(void);
|
||||
|
||||
extern bool communicator_exists(NRelFileInfo rinfo, ForkNumber forkNum,
|
||||
neon_request_lsns *request_lsns);
|
||||
extern BlockNumber communicator_nblocks(NRelFileInfo rinfo, ForkNumber forknum,
|
||||
neon_request_lsns *request_lsns);
|
||||
extern int64 communicator_dbsize(Oid dbNode, neon_request_lsns *request_lsns);
|
||||
extern void communicator_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum,
|
||||
BlockNumber base_blockno, neon_request_lsns *request_lsns,
|
||||
void **buffers, BlockNumber nblocks, const bits8 *mask);
|
||||
extern int communicator_prefetch_lookupv(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blocknum,
|
||||
neon_request_lsns *lsns,
|
||||
BlockNumber nblocks, void **buffers, bits8 *mask);
|
||||
extern void communicator_prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns,
|
||||
BlockNumber nblocks, const bits8 *mask);
|
||||
extern int communicator_read_slru_segment(SlruKind kind, int64 segno,
|
||||
neon_request_lsns *request_lsns,
|
||||
void *buffer);
|
||||
|
||||
extern void communicator_reconfigure_timeout_if_needed(void);
|
||||
extern void communicator_prefetch_pump_state(bool IsHandlingInterrupts);
|
||||
|
||||
|
||||
#endif
|
||||
@@ -21,7 +21,6 @@
|
||||
#include "access/xlog.h"
|
||||
#include "funcapi.h"
|
||||
#include "miscadmin.h"
|
||||
#include "pagestore_client.h"
|
||||
#include "common/hashfn.h"
|
||||
#include "pgstat.h"
|
||||
#include "port/pg_iovec.h"
|
||||
@@ -43,6 +42,7 @@
|
||||
|
||||
#include "hll.h"
|
||||
#include "bitmap.h"
|
||||
#include "file_cache.h"
|
||||
#include "neon.h"
|
||||
#include "neon_lwlsncache.h"
|
||||
#include "neon_perf_counters.h"
|
||||
|
||||
52
pgxn/neon/file_cache.h
Normal file
52
pgxn/neon/file_cache.h
Normal file
@@ -0,0 +1,52 @@
|
||||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
* file_cache.h
|
||||
* Local File Cache definitions
|
||||
*
|
||||
* Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
#ifndef FILE_CACHE_h
|
||||
#define FILE_CACHE_h
|
||||
|
||||
#include "neon_pgversioncompat.h"
|
||||
|
||||
/* GUCs */
|
||||
extern bool lfc_store_prefetch_result;
|
||||
|
||||
/* functions for local file cache */
|
||||
extern void lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum,
|
||||
BlockNumber blkno, const void *const *buffers,
|
||||
BlockNumber nblocks);
|
||||
/* returns number of blocks read, with one bit set in *read for each */
|
||||
extern int lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum,
|
||||
BlockNumber blkno, void **buffers,
|
||||
BlockNumber nblocks, bits8 *mask);
|
||||
|
||||
extern bool lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum,
|
||||
BlockNumber blkno);
|
||||
extern int lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum,
|
||||
BlockNumber blkno, int nblocks, bits8 *bitmap);
|
||||
extern void lfc_init(void);
|
||||
extern bool lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
|
||||
const void* buffer, XLogRecPtr lsn);
|
||||
|
||||
|
||||
static inline bool
|
||||
lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
void *buffer)
|
||||
{
|
||||
bits8 rv = 0;
|
||||
return lfc_readv_select(rinfo, forkNum, blkno, &buffer, 1, &rv) == 1;
|
||||
}
|
||||
|
||||
static inline void
|
||||
lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
const void *buffer)
|
||||
{
|
||||
return lfc_writev(rinfo, forkNum, blkno, &buffer, 1);
|
||||
}
|
||||
|
||||
#endif /* FILE_CACHE_H */
|
||||
@@ -1475,6 +1475,4 @@ pg_init_libpagestore(void)
|
||||
}
|
||||
|
||||
memset(page_servers, 0, sizeof(page_servers));
|
||||
|
||||
lfc_init();
|
||||
}
|
||||
|
||||
@@ -28,7 +28,9 @@
|
||||
#include "utils/guc.h"
|
||||
#include "utils/guc_tables.h"
|
||||
|
||||
#include "communicator.h"
|
||||
#include "extension_server.h"
|
||||
#include "file_cache.h"
|
||||
#include "neon.h"
|
||||
#include "neon_lwlsncache.h"
|
||||
#include "control_plane_connector.h"
|
||||
@@ -434,10 +436,11 @@ _PG_init(void)
|
||||
#endif
|
||||
|
||||
pg_init_libpagestore();
|
||||
lfc_init();
|
||||
pg_init_walproposer();
|
||||
init_lwlsncache();
|
||||
|
||||
pagestore_smgr_init();
|
||||
pg_init_communicator();
|
||||
Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;
|
||||
|
||||
InitUnstableExtensionsSupport();
|
||||
|
||||
@@ -47,9 +47,18 @@ extern uint32 WAIT_EVENT_NEON_WAL_DL;
|
||||
#define WAIT_EVENT_NEON_WAL_DL WAIT_EVENT_WAL_READ
|
||||
#endif
|
||||
|
||||
|
||||
#define NEON_TAG "[NEON_SMGR] "
|
||||
#define neon_log(tag, fmt, ...) ereport(tag, \
|
||||
(errmsg(NEON_TAG fmt, ##__VA_ARGS__), \
|
||||
errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0)))
|
||||
#define neon_shard_log(shard_no, tag, fmt, ...) ereport(tag, \
|
||||
(errmsg(NEON_TAG "[shard %d] " fmt, shard_no, ##__VA_ARGS__), \
|
||||
errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0)))
|
||||
|
||||
|
||||
extern void pg_init_libpagestore(void);
|
||||
extern void pg_init_walproposer(void);
|
||||
extern void pagestore_smgr_init(void);
|
||||
|
||||
extern uint64 BackpressureThrottlingTime(void);
|
||||
extern void SetNeonCurrentClusterSize(uint64 size);
|
||||
|
||||
@@ -58,14 +58,6 @@ typedef struct
|
||||
|
||||
#define messageTag(m) (((const NeonMessage *)(m))->tag)
|
||||
|
||||
#define NEON_TAG "[NEON_SMGR] "
|
||||
#define neon_log(tag, fmt, ...) ereport(tag, \
|
||||
(errmsg(NEON_TAG fmt, ##__VA_ARGS__), \
|
||||
errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0)))
|
||||
#define neon_shard_log(shard_no, tag, fmt, ...) ereport(tag, \
|
||||
(errmsg(NEON_TAG "[shard %d] " fmt, shard_no, ##__VA_ARGS__), \
|
||||
errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0)))
|
||||
|
||||
/* SLRUs downloadable from page server */
|
||||
typedef enum {
|
||||
SLRU_CLOG,
|
||||
@@ -234,7 +226,6 @@ extern char *neon_timeline;
|
||||
extern char *neon_tenant;
|
||||
extern int32 max_cluster_size;
|
||||
extern int neon_protocol_version;
|
||||
extern bool lfc_store_prefetch_result;
|
||||
|
||||
extern shardno_t get_shard_number(BufferTag* tag);
|
||||
|
||||
@@ -242,6 +233,7 @@ extern const f_smgr *smgr_neon(ProcNumber backend, NRelFileInfo rinfo);
|
||||
extern void smgr_init_neon(void);
|
||||
extern void readahead_buffer_resize(int newsize, void *extra);
|
||||
|
||||
|
||||
/*
|
||||
* LSN values associated with each request to the pageserver
|
||||
*/
|
||||
@@ -278,6 +270,10 @@ extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum,
|
||||
neon_request_lsns request_lsns, void *buffer);
|
||||
extern int64 neon_dbsize(Oid dbNode);
|
||||
|
||||
extern void neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum,
|
||||
BlockNumber blkno, neon_request_lsns *output,
|
||||
BlockNumber nblocks);
|
||||
|
||||
/* utils for neon relsize cache */
|
||||
extern void relsize_hash_init(void);
|
||||
extern bool get_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber *size);
|
||||
@@ -285,37 +281,4 @@ extern void set_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumb
|
||||
extern void update_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size);
|
||||
extern void forget_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum);
|
||||
|
||||
/* functions for local file cache */
|
||||
extern void lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum,
|
||||
BlockNumber blkno, const void *const *buffers,
|
||||
BlockNumber nblocks);
|
||||
/* returns number of blocks read, with one bit set in *read for each */
|
||||
extern int lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum,
|
||||
BlockNumber blkno, void **buffers,
|
||||
BlockNumber nblocks, bits8 *mask);
|
||||
|
||||
extern bool lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum,
|
||||
BlockNumber blkno);
|
||||
extern int lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum,
|
||||
BlockNumber blkno, int nblocks, bits8 *bitmap);
|
||||
extern void lfc_init(void);
|
||||
extern bool lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
|
||||
const void* buffer, XLogRecPtr lsn);
|
||||
|
||||
|
||||
static inline bool
|
||||
lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
void *buffer)
|
||||
{
|
||||
bits8 rv = 0;
|
||||
return lfc_readv_select(rinfo, forkNum, blkno, &buffer, 1, &rv) == 1;
|
||||
}
|
||||
|
||||
static inline void
|
||||
lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
const void *buffer)
|
||||
{
|
||||
return lfc_writev(rinfo, forkNum, blkno, &buffer, 1);
|
||||
}
|
||||
|
||||
#endif /* PAGESTORE_CLIENT_H */
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -425,12 +425,7 @@ impl CancelClosure {
|
||||
&mut mk_tls,
|
||||
&self.hostname,
|
||||
)
|
||||
.map_err(|e| {
|
||||
CancelError::IO(std::io::Error::new(
|
||||
std::io::ErrorKind::Other,
|
||||
e.to_string(),
|
||||
))
|
||||
})?;
|
||||
.map_err(|e| CancelError::IO(std::io::Error::other(e.to_string())))?;
|
||||
|
||||
self.cancel_token.cancel_query_raw(socket, tls).await?;
|
||||
debug!("query was cancelled");
|
||||
|
||||
@@ -568,7 +568,7 @@ fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeIn
|
||||
fn helper_create_connect_info(
|
||||
mechanism: &TestConnectMechanism,
|
||||
) -> auth::Backend<'static, ComputeCredentials> {
|
||||
let user_info = auth::Backend::ControlPlane(
|
||||
auth::Backend::ControlPlane(
|
||||
MaybeOwned::Owned(ControlPlaneClient::Test(Box::new(mechanism.clone()))),
|
||||
ComputeCredentials {
|
||||
info: ComputeUserInfo {
|
||||
@@ -578,8 +578,7 @@ fn helper_create_connect_info(
|
||||
},
|
||||
keys: ComputeCredentialKeys::Password("password".into()),
|
||||
},
|
||||
);
|
||||
user_info
|
||||
)
|
||||
}
|
||||
|
||||
fn config() -> ComputeConfig {
|
||||
|
||||
@@ -47,6 +47,7 @@ impl ConnInfo {
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
#[allow(clippy::large_enum_variant, reason = "TODO")]
|
||||
pub(crate) enum ClientDataEnum {
|
||||
Remote(ClientDataRemote),
|
||||
Local(ClientDataLocal),
|
||||
|
||||
@@ -226,6 +226,9 @@ struct Args {
|
||||
/// Path to the JWT auth token used to authenticate with other safekeepers.
|
||||
#[arg(long)]
|
||||
auth_token_path: Option<Utf8PathBuf>,
|
||||
|
||||
#[arg(long, help = "Run in development mode (disables security checks)")]
|
||||
dev: bool,
|
||||
}
|
||||
|
||||
// Like PathBufValueParser, but allows empty string.
|
||||
@@ -343,6 +346,21 @@ async fn main() -> anyhow::Result<()> {
|
||||
}
|
||||
};
|
||||
|
||||
if !args.dev {
|
||||
let http_auth_enabled = args.http_auth_public_key_path.is_some();
|
||||
let pg_auth_enabled = args.pg_auth_public_key_path.is_some();
|
||||
let pg_tenant_only_auth_enabled = args.pg_tenant_only_auth_public_key_path.is_some();
|
||||
if !http_auth_enabled || !pg_auth_enabled || !pg_tenant_only_auth_enabled {
|
||||
bail!(
|
||||
"Safekeeper refuses to start with HTTP, PostgreSQL, or tenant-only PostgreSQL API authentication disabled.\n\
|
||||
Run with --dev to allow running without authentication.\n\
|
||||
This is insecure and should only be used in development environments."
|
||||
);
|
||||
}
|
||||
} else {
|
||||
warn!("Starting in dev mode: this may be an insecure configuration.");
|
||||
}
|
||||
|
||||
// Load JWT auth token to connect to other safekeepers for pull_timeline.
|
||||
// First check if the env var is present, then check the arg with the path.
|
||||
// We want to deprecate and remove the env var method in the future.
|
||||
|
||||
@@ -138,6 +138,7 @@ impl Drop for WriteGuardSharedState<'_> {
|
||||
/// Usually it holds SafeKeeper, but it also supports offloaded timeline state. In this
|
||||
/// case, SafeKeeper is not available (because WAL is not present on disk) and all
|
||||
/// operations can be done only with control file.
|
||||
#[allow(clippy::large_enum_variant, reason = "TODO")]
|
||||
pub enum StateSK {
|
||||
Loaded(SafeKeeper<control_file::FileStorage, wal_storage::PhysicalStorage>),
|
||||
Offloaded(Box<TimelineState<control_file::FileStorage>>),
|
||||
|
||||
@@ -35,7 +35,7 @@ impl Manager {
|
||||
next_event: &Option<tokio::time::Instant>,
|
||||
state: &StateSnapshot,
|
||||
) -> bool {
|
||||
let ready = self.backup_task.is_none()
|
||||
self.backup_task.is_none()
|
||||
&& self.recovery_task.is_none()
|
||||
&& self.wal_removal_task.is_none()
|
||||
&& self.partial_backup_task.is_none()
|
||||
@@ -61,8 +61,7 @@ impl Manager {
|
||||
.unwrap()
|
||||
.flush_lsn
|
||||
.segment_number(self.wal_seg_size)
|
||||
== self.last_removed_segno + 1;
|
||||
ready
|
||||
== self.last_removed_segno + 1
|
||||
}
|
||||
|
||||
/// Evict the timeline to remote storage. Returns whether the eviction was successful.
|
||||
|
||||
@@ -96,6 +96,7 @@ enum Message {
|
||||
|
||||
impl Message {
|
||||
/// Convert proto message to internal message.
|
||||
#[allow(clippy::result_large_err, reason = "TODO")]
|
||||
pub fn from(proto_msg: TypedMessage) -> Result<Self, Status> {
|
||||
match proto_msg.r#type() {
|
||||
MessageType::SafekeeperTimelineInfo => Ok(Message::SafekeeperTimelineInfo(
|
||||
@@ -127,6 +128,7 @@ impl Message {
|
||||
}
|
||||
|
||||
/// Get the tenant_timeline_id from the message.
|
||||
#[allow(clippy::result_large_err, reason = "TODO")]
|
||||
pub fn tenant_timeline_id(&self) -> Result<Option<TenantTimelineId>, Status> {
|
||||
match self {
|
||||
Message::SafekeeperTimelineInfo(msg) => Ok(msg
|
||||
@@ -185,6 +187,7 @@ enum SubscriptionKey {
|
||||
|
||||
impl SubscriptionKey {
|
||||
/// Parse protobuf subkey (protobuf doesn't have fixed size bytes, we get vectors).
|
||||
#[allow(clippy::result_large_err, reason = "TODO")]
|
||||
pub fn from_proto_subscription_key(key: ProtoSubscriptionKey) -> Result<Self, Status> {
|
||||
match key {
|
||||
ProtoSubscriptionKey::All(_) => Ok(SubscriptionKey::All),
|
||||
@@ -195,6 +198,7 @@ impl SubscriptionKey {
|
||||
}
|
||||
|
||||
/// Parse from FilterTenantTimelineId
|
||||
#[allow(clippy::result_large_err, reason = "TODO")]
|
||||
pub fn from_proto_filter_tenant_timeline_id(
|
||||
opt: Option<&FilterTenantTimelineId>,
|
||||
) -> Result<Self, Status> {
|
||||
@@ -385,6 +389,7 @@ impl Registry {
|
||||
}
|
||||
|
||||
/// Send msg to relevant subscribers.
|
||||
#[allow(clippy::result_large_err, reason = "TODO")]
|
||||
pub fn send_msg(&self, msg: &Message) -> Result<(), Status> {
|
||||
PROCESSED_MESSAGES_TOTAL.inc();
|
||||
|
||||
@@ -436,6 +441,7 @@ struct Publisher {
|
||||
|
||||
impl Publisher {
|
||||
/// Send msg to relevant subscribers.
|
||||
#[allow(clippy::result_large_err, reason = "TODO")]
|
||||
pub fn send_msg(&mut self, msg: &Message) -> Result<(), Status> {
|
||||
self.registry.send_msg(msg)
|
||||
}
|
||||
|
||||
@@ -79,6 +79,7 @@ impl BrokerClientChannel {
|
||||
}
|
||||
|
||||
// parse variable length bytes from protobuf
|
||||
#[allow(clippy::result_large_err, reason = "TODO")]
|
||||
pub fn parse_proto_ttid(proto_ttid: &ProtoTenantTimelineId) -> Result<TenantTimelineId, Status> {
|
||||
let tenant_id = TenantId::from_slice(&proto_ttid.tenant_id)
|
||||
.map_err(|e| Status::new(Code::InvalidArgument, format!("malformed tenant_id: {}", e)))?;
|
||||
|
||||
@@ -800,7 +800,7 @@ impl ComputeHook {
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) mod tests {
|
||||
use pageserver_api::shard::{ShardCount, ShardNumber};
|
||||
use pageserver_api::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardNumber};
|
||||
use utils::id::TenantId;
|
||||
|
||||
use super::*;
|
||||
@@ -808,6 +808,7 @@ pub(crate) mod tests {
|
||||
#[test]
|
||||
fn tenant_updates() -> anyhow::Result<()> {
|
||||
let tenant_id = TenantId::generate();
|
||||
let stripe_size = DEFAULT_STRIPE_SIZE;
|
||||
let mut tenant_state = ComputeHookTenant::new(
|
||||
TenantShardId {
|
||||
tenant_id,
|
||||
@@ -848,7 +849,7 @@ pub(crate) mod tests {
|
||||
shard_count: ShardCount::new(2),
|
||||
shard_number: ShardNumber(1),
|
||||
},
|
||||
stripe_size: ShardStripeSize(32768),
|
||||
stripe_size,
|
||||
preferred_az: None,
|
||||
node_id: NodeId(1),
|
||||
});
|
||||
@@ -864,7 +865,7 @@ pub(crate) mod tests {
|
||||
shard_count: ShardCount::new(2),
|
||||
shard_number: ShardNumber(0),
|
||||
},
|
||||
stripe_size: ShardStripeSize(32768),
|
||||
stripe_size,
|
||||
preferred_az: None,
|
||||
node_id: NodeId(1),
|
||||
});
|
||||
@@ -874,7 +875,7 @@ pub(crate) mod tests {
|
||||
anyhow::bail!("Wrong send result");
|
||||
};
|
||||
assert_eq!(request.shards.len(), 2);
|
||||
assert_eq!(request.stripe_size, Some(ShardStripeSize(32768)));
|
||||
assert_eq!(request.stripe_size, Some(stripe_size));
|
||||
|
||||
// Simulate successful send
|
||||
*guard = Some(ComputeRemoteState {
|
||||
|
||||
@@ -44,6 +44,15 @@ pub(crate) struct StorageControllerMetricGroup {
|
||||
/// Size of the in-memory map of pageserver_nodes
|
||||
pub(crate) storage_controller_pageserver_nodes: measured::Gauge,
|
||||
|
||||
/// Count of how many pageserver nodes from in-memory map have https configured
|
||||
pub(crate) storage_controller_https_pageserver_nodes: measured::Gauge,
|
||||
|
||||
/// Size of the in-memory map of safekeeper_nodes
|
||||
pub(crate) storage_controller_safekeeper_nodes: measured::Gauge,
|
||||
|
||||
/// Count of how many safekeeper nodes from in-memory map have https configured
|
||||
pub(crate) storage_controller_https_safekeeper_nodes: measured::Gauge,
|
||||
|
||||
/// Reconciler tasks completed, broken down by success/failure/cancelled
|
||||
pub(crate) storage_controller_reconcile_complete:
|
||||
measured::CounterVec<ReconcileCompleteLabelGroupSet>,
|
||||
|
||||
@@ -89,6 +89,10 @@ impl Node {
|
||||
self.scheduling = scheduling
|
||||
}
|
||||
|
||||
pub(crate) fn has_https_port(&self) -> bool {
|
||||
self.listen_https_port.is_some()
|
||||
}
|
||||
|
||||
/// Does this registration request match `self`? This is used when deciding whether a registration
|
||||
/// request should be allowed to update an existing record with the same node ID.
|
||||
pub(crate) fn registration_match(&self, register_req: &NodeRegisterRequest) -> bool {
|
||||
|
||||
@@ -89,6 +89,9 @@ impl Safekeeper {
|
||||
pub(crate) fn availability(&self) -> SafekeeperState {
|
||||
self.availability.clone()
|
||||
}
|
||||
pub(crate) fn has_https_port(&self) -> bool {
|
||||
self.listen_https_port.is_some()
|
||||
}
|
||||
/// Perform an operation (which is given a [`SafekeeperClient`]) with retries
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub(crate) async fn with_client_retries<T, O, F>(
|
||||
|
||||
@@ -43,7 +43,7 @@ use pageserver_api::models::{
|
||||
TimelineInfo, TopTenantShardItem, TopTenantShardsRequest,
|
||||
};
|
||||
use pageserver_api::shard::{
|
||||
ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId,
|
||||
DEFAULT_STRIPE_SIZE, ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId,
|
||||
};
|
||||
use pageserver_api::upcall_api::{
|
||||
ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, ValidateResponse,
|
||||
@@ -1509,6 +1509,10 @@ impl Service {
|
||||
.metrics_group
|
||||
.storage_controller_pageserver_nodes
|
||||
.set(nodes.len() as i64);
|
||||
metrics::METRICS_REGISTRY
|
||||
.metrics_group
|
||||
.storage_controller_https_pageserver_nodes
|
||||
.set(nodes.values().filter(|n| n.has_https_port()).count() as i64);
|
||||
|
||||
tracing::info!("Loading safekeepers from database...");
|
||||
let safekeepers = persistence
|
||||
@@ -1526,6 +1530,14 @@ impl Service {
|
||||
let safekeepers: HashMap<NodeId, Safekeeper> =
|
||||
safekeepers.into_iter().map(|n| (n.get_id(), n)).collect();
|
||||
tracing::info!("Loaded {} safekeepers from database.", safekeepers.len());
|
||||
metrics::METRICS_REGISTRY
|
||||
.metrics_group
|
||||
.storage_controller_safekeeper_nodes
|
||||
.set(safekeepers.len() as i64);
|
||||
metrics::METRICS_REGISTRY
|
||||
.metrics_group
|
||||
.storage_controller_https_safekeeper_nodes
|
||||
.set(safekeepers.values().filter(|s| s.has_https_port()).count() as i64);
|
||||
|
||||
tracing::info!("Loading shards from database...");
|
||||
let mut tenant_shard_persistence = persistence.load_active_tenant_shards().await?;
|
||||
@@ -2742,7 +2754,7 @@ impl Service {
|
||||
count: tenant_shard_id.shard_count,
|
||||
// We only import un-sharded or single-sharded tenants, so stripe
|
||||
// size can be made up arbitrarily here.
|
||||
stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE,
|
||||
stripe_size: DEFAULT_STRIPE_SIZE,
|
||||
},
|
||||
placement_policy: Some(placement_policy),
|
||||
config: req.config.tenant_conf,
|
||||
@@ -6014,9 +6026,21 @@ impl Service {
|
||||
.max()
|
||||
.expect("We already validated >0 shards");
|
||||
|
||||
// FIXME: we have no way to recover the shard stripe size from contents of remote storage: this will
|
||||
// only work if they were using the default stripe size.
|
||||
let stripe_size = ShardParameters::DEFAULT_STRIPE_SIZE;
|
||||
// Find the tenant's stripe size. This wasn't always persisted in the tenant manifest, so
|
||||
// fall back to the original default stripe size of 32768 (256 MB) if it's not specified.
|
||||
const ORIGINAL_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(32768);
|
||||
let stripe_size = scan_result
|
||||
.shards
|
||||
.iter()
|
||||
.find(|s| s.tenant_shard_id.shard_count == shard_count && s.generation == generation)
|
||||
.expect("we validated >0 shards above")
|
||||
.stripe_size
|
||||
.unwrap_or_else(|| {
|
||||
if shard_count.count() > 1 {
|
||||
warn!("unknown stripe size, assuming {ORIGINAL_STRIPE_SIZE}");
|
||||
}
|
||||
ORIGINAL_STRIPE_SIZE
|
||||
});
|
||||
|
||||
let (response, waiters) = self
|
||||
.do_tenant_create(TenantCreateRequest {
|
||||
@@ -6242,6 +6266,10 @@ impl Service {
|
||||
.metrics_group
|
||||
.storage_controller_pageserver_nodes
|
||||
.set(locked.nodes.len() as i64);
|
||||
metrics::METRICS_REGISTRY
|
||||
.metrics_group
|
||||
.storage_controller_https_pageserver_nodes
|
||||
.set(locked.nodes.values().filter(|n| n.has_https_port()).count() as i64);
|
||||
|
||||
locked.scheduler.node_remove(node_id);
|
||||
|
||||
@@ -6333,6 +6361,10 @@ impl Service {
|
||||
.metrics_group
|
||||
.storage_controller_pageserver_nodes
|
||||
.set(nodes.len() as i64);
|
||||
metrics::METRICS_REGISTRY
|
||||
.metrics_group
|
||||
.storage_controller_https_pageserver_nodes
|
||||
.set(nodes.values().filter(|n| n.has_https_port()).count() as i64);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6557,6 +6589,10 @@ impl Service {
|
||||
.metrics_group
|
||||
.storage_controller_pageserver_nodes
|
||||
.set(locked.nodes.len() as i64);
|
||||
metrics::METRICS_REGISTRY
|
||||
.metrics_group
|
||||
.storage_controller_https_pageserver_nodes
|
||||
.set(locked.nodes.values().filter(|n| n.has_https_port()).count() as i64);
|
||||
|
||||
match registration_status {
|
||||
RegistrationStatus::New => {
|
||||
@@ -7270,7 +7306,7 @@ impl Service {
|
||||
}
|
||||
|
||||
// Eventual consistency: if an earlier reconcile job failed, and the shard is still
|
||||
// dirty, spawn another rone
|
||||
// dirty, spawn another one
|
||||
if self
|
||||
.maybe_reconcile_shard(shard, &pageservers, ReconcilerPriority::Normal)
|
||||
.is_some()
|
||||
@@ -7829,7 +7865,7 @@ impl Service {
|
||||
// old, persisted stripe size.
|
||||
let new_stripe_size = match candidate.id.shard_count.count() {
|
||||
0 => panic!("invalid shard count 0"),
|
||||
1 => Some(ShardParameters::DEFAULT_STRIPE_SIZE),
|
||||
1 => Some(DEFAULT_STRIPE_SIZE),
|
||||
2.. => None,
|
||||
};
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@ use std::time::Duration;
|
||||
|
||||
use super::safekeeper_reconciler::ScheduleRequest;
|
||||
use crate::heartbeater::SafekeeperState;
|
||||
use crate::metrics;
|
||||
use crate::persistence::{
|
||||
DatabaseError, SafekeeperTimelineOpKind, TimelinePendingOpPersistence, TimelinePersistence,
|
||||
};
|
||||
@@ -590,6 +591,20 @@ impl Service {
|
||||
}
|
||||
}
|
||||
locked.safekeepers = Arc::new(safekeepers);
|
||||
metrics::METRICS_REGISTRY
|
||||
.metrics_group
|
||||
.storage_controller_safekeeper_nodes
|
||||
.set(locked.safekeepers.len() as i64);
|
||||
metrics::METRICS_REGISTRY
|
||||
.metrics_group
|
||||
.storage_controller_https_safekeeper_nodes
|
||||
.set(
|
||||
locked
|
||||
.safekeepers
|
||||
.values()
|
||||
.filter(|s| s.has_https_port())
|
||||
.count() as i64,
|
||||
);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -2000,7 +2000,7 @@ pub(crate) mod tests {
|
||||
use std::rc::Rc;
|
||||
|
||||
use pageserver_api::controller_api::NodeAvailability;
|
||||
use pageserver_api::shard::{ShardCount, ShardNumber};
|
||||
use pageserver_api::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardNumber};
|
||||
use rand::SeedableRng;
|
||||
use rand::rngs::StdRng;
|
||||
use utils::id::TenantId;
|
||||
@@ -2012,6 +2012,7 @@ pub(crate) mod tests {
|
||||
let tenant_id = TenantId::generate();
|
||||
let shard_number = ShardNumber(0);
|
||||
let shard_count = ShardCount::new(1);
|
||||
let stripe_size = DEFAULT_STRIPE_SIZE;
|
||||
|
||||
let tenant_shard_id = TenantShardId {
|
||||
tenant_id,
|
||||
@@ -2020,12 +2021,7 @@ pub(crate) mod tests {
|
||||
};
|
||||
TenantShard::new(
|
||||
tenant_shard_id,
|
||||
ShardIdentity::new(
|
||||
shard_number,
|
||||
shard_count,
|
||||
pageserver_api::shard::ShardStripeSize(32768),
|
||||
)
|
||||
.unwrap(),
|
||||
ShardIdentity::new(shard_number, shard_count, stripe_size).unwrap(),
|
||||
policy,
|
||||
None,
|
||||
)
|
||||
@@ -2045,6 +2041,7 @@ pub(crate) mod tests {
|
||||
shard_count: ShardCount,
|
||||
preferred_az: Option<AvailabilityZone>,
|
||||
) -> Vec<TenantShard> {
|
||||
let stripe_size = DEFAULT_STRIPE_SIZE;
|
||||
(0..shard_count.count())
|
||||
.map(|i| {
|
||||
let shard_number = ShardNumber(i);
|
||||
@@ -2056,12 +2053,7 @@ pub(crate) mod tests {
|
||||
};
|
||||
TenantShard::new(
|
||||
tenant_shard_id,
|
||||
ShardIdentity::new(
|
||||
shard_number,
|
||||
shard_count,
|
||||
pageserver_api::shard::ShardStripeSize(32768),
|
||||
)
|
||||
.unwrap(),
|
||||
ShardIdentity::new(shard_number, shard_count, stripe_size).unwrap(),
|
||||
policy.clone(),
|
||||
preferred_az.clone(),
|
||||
)
|
||||
|
||||
@@ -417,6 +417,19 @@ class NeonLocalCli(AbstractNeonCli):
|
||||
cmd.append(f"--instance-id={instance_id}")
|
||||
return self.raw_cli(cmd)
|
||||
|
||||
def object_storage_start(self, timeout_in_seconds: int | None = None):
|
||||
cmd = ["object-storage", "start"]
|
||||
if timeout_in_seconds is not None:
|
||||
cmd.append(f"--start-timeout={timeout_in_seconds}s")
|
||||
return self.raw_cli(cmd)
|
||||
|
||||
def object_storage_stop(self, immediate: bool):
|
||||
cmd = ["object-storage", "stop"]
|
||||
if immediate:
|
||||
cmd.extend(["-m", "immediate"])
|
||||
return self.raw_cli(cmd)
|
||||
pass
|
||||
|
||||
def pageserver_start(
|
||||
self,
|
||||
id: int,
|
||||
|
||||
@@ -1023,6 +1023,8 @@ class NeonEnvBuilder:
|
||||
|
||||
self.env.broker.assert_no_errors()
|
||||
|
||||
self.env.object_storage.assert_no_errors()
|
||||
|
||||
try:
|
||||
self.overlay_cleanup_teardown()
|
||||
except Exception as e:
|
||||
@@ -1118,6 +1120,8 @@ class NeonEnv:
|
||||
pagectl_env_vars["RUST_LOG"] = self.rust_log_override
|
||||
self.pagectl = Pagectl(extra_env=pagectl_env_vars, binpath=self.neon_binpath)
|
||||
|
||||
self.object_storage = ObjectStorage(self)
|
||||
|
||||
# The URL for the pageserver to use as its control_plane_api config
|
||||
if config.storage_controller_port_override is not None:
|
||||
log.info(
|
||||
@@ -1173,6 +1177,7 @@ class NeonEnv:
|
||||
},
|
||||
"safekeepers": [],
|
||||
"pageservers": [],
|
||||
"object_storage": {"port": self.port_distributor.get_port()},
|
||||
"generate_local_ssl_certs": self.generate_local_ssl_certs,
|
||||
}
|
||||
|
||||
@@ -1408,6 +1413,8 @@ class NeonEnv:
|
||||
self.storage_controller.on_safekeeper_deploy(sk_id, body)
|
||||
self.storage_controller.safekeeper_scheduling_policy(sk_id, "Active")
|
||||
|
||||
self.object_storage.start(timeout_in_seconds=timeout_in_seconds)
|
||||
|
||||
def stop(self, immediate=False, ps_assert_metric_no_errors=False, fail_on_endpoint_errors=True):
|
||||
"""
|
||||
After this method returns, there should be no child processes running.
|
||||
@@ -1425,6 +1432,8 @@ class NeonEnv:
|
||||
except Exception as e:
|
||||
raise_later = e
|
||||
|
||||
self.object_storage.stop(immediate=immediate)
|
||||
|
||||
# Stop storage controller before pageservers: we don't want it to spuriously
|
||||
# detect a pageserver "failure" during test teardown
|
||||
self.storage_controller.stop(immediate=immediate)
|
||||
@@ -2635,6 +2644,26 @@ class NeonStorageController(MetricsGetter, LogUtils):
|
||||
self.stop(immediate=True)
|
||||
|
||||
|
||||
class ObjectStorage(LogUtils):
|
||||
def __init__(self, env: NeonEnv):
|
||||
service_dir = env.repo_dir / "object_storage"
|
||||
super().__init__(logfile=service_dir / "object_storage.log")
|
||||
self.conf_path = service_dir / "object_storage.json"
|
||||
self.env = env
|
||||
|
||||
def base_url(self):
|
||||
return json.loads(self.conf_path.read_text())["listen"]
|
||||
|
||||
def start(self, timeout_in_seconds: int | None = None):
|
||||
self.env.neon_cli.object_storage_start(timeout_in_seconds)
|
||||
|
||||
def stop(self, immediate: bool = False):
|
||||
self.env.neon_cli.object_storage_stop(immediate)
|
||||
|
||||
def assert_no_errors(self):
|
||||
assert_no_errors(self.logfile, "object_storage", [])
|
||||
|
||||
|
||||
class NeonProxiedStorageController(NeonStorageController):
|
||||
def __init__(self, env: NeonEnv, proxy_port: int, auth_enabled: bool, use_https: bool):
|
||||
super().__init__(env, proxy_port, auth_enabled, use_https)
|
||||
|
||||
@@ -126,6 +126,8 @@ DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [
|
||||
".*startup_reconcile: Could not scan node.*",
|
||||
# Tests run in dev mode
|
||||
".*Starting in dev mode.*",
|
||||
".*Starting in dev mode - authentication security checks are disabled.*",
|
||||
".*Starting in dev mode: this may be an insecure configuration.*",
|
||||
# Tests that stop endpoints & use the storage controller's neon_local notification
|
||||
# mechanism might fail (neon_local's stopping and endpoint isn't atomic wrt the storage
|
||||
# controller's attempts to notify the endpoint).
|
||||
|
||||
@@ -105,7 +105,7 @@ def parse_layer_file_name(file_name: str) -> LayerName:
|
||||
except InvalidFileName:
|
||||
pass
|
||||
|
||||
raise InvalidFileName("neither image nor delta layer")
|
||||
raise InvalidFileName(f"neither image nor delta layer: {file_name}")
|
||||
|
||||
|
||||
def is_future_layer(layer_file_name: LayerName, disk_consistent_lsn: Lsn):
|
||||
|
||||
@@ -40,6 +40,8 @@ def ensure_pageserver_ready_for_benchmarking(env: NeonEnv, n_tenants: int):
|
||||
for layer in info.historic_layers:
|
||||
assert not layer.remote
|
||||
|
||||
env.storage_controller.reconcile_until_idle(timeout_secs=60)
|
||||
|
||||
log.info("ready")
|
||||
|
||||
|
||||
|
||||
@@ -145,11 +145,14 @@ def run_database_maintenance(env: PgCompare):
|
||||
END $$;
|
||||
"""
|
||||
)
|
||||
|
||||
log.info("start REINDEX TABLE CONCURRENTLY transaction.transaction")
|
||||
with env.zenbenchmark.record_duration("reindex concurrently"):
|
||||
cur.execute("REINDEX TABLE CONCURRENTLY transaction.transaction;")
|
||||
log.info("finished REINDEX TABLE CONCURRENTLY transaction.transaction")
|
||||
# in production a customer would likely use reindex concurrently
|
||||
# but for our test we don't care about the downtime
|
||||
# and it would just about double the time we report in the test
|
||||
# because we need one more table scan for each index
|
||||
log.info("start REINDEX TABLE transaction.transaction")
|
||||
with env.zenbenchmark.record_duration("reindex"):
|
||||
cur.execute("REINDEX TABLE transaction.transaction;")
|
||||
log.info("finished REINDEX TABLE transaction.transaction")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("custom_scripts", get_custom_scripts())
|
||||
|
||||
@@ -808,9 +808,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
|
||||
|
||||
[[package]]
|
||||
name = "tokio"
|
||||
version = "1.38.0"
|
||||
version = "1.38.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ba4f4a02a7a80d6f274636f0aa95c7e383b912d41fe721a31f29e29698585a4a"
|
||||
checksum = "68722da18b0fc4a05fdc1120b302b82051265792a1e1b399086e9b204b10ad3d"
|
||||
dependencies = [
|
||||
"backtrace",
|
||||
"bytes",
|
||||
|
||||
@@ -38,12 +38,34 @@ PREEMPT_COMPACTION_TENANT_CONF = {
|
||||
"compaction_target_size": 1024**2,
|
||||
"image_creation_threshold": 1,
|
||||
"image_creation_preempt_threshold": 1,
|
||||
# compact more frequently
|
||||
# Compact more frequently
|
||||
"compaction_threshold": 3,
|
||||
"compaction_upper_limit": 6,
|
||||
"lsn_lease_length": "0s",
|
||||
}
|
||||
|
||||
PREEMPT_GC_COMPACTION_TENANT_CONF = {
|
||||
"gc_period": "5s",
|
||||
"compaction_period": "5s",
|
||||
# Small checkpoint distance to create many layers
|
||||
"checkpoint_distance": 1024**2,
|
||||
# Compact small layers
|
||||
"compaction_target_size": 1024**2,
|
||||
"image_creation_threshold": 10000, # Do not create image layers at all
|
||||
"image_creation_preempt_threshold": 10000,
|
||||
# Compact more frequently
|
||||
"compaction_threshold": 3,
|
||||
"compaction_upper_limit": 6,
|
||||
"lsn_lease_length": "0s",
|
||||
# Enable gc-compaction
|
||||
"gc_compaction_enabled": "true",
|
||||
"gc_compaction_initial_threshold_kb": 1024, # At a small threshold
|
||||
"gc_compaction_ratio_percent": 1,
|
||||
# No PiTR interval and small GC horizon
|
||||
"pitr_interval": "0s",
|
||||
"gc_horizon": f"{1024**2}",
|
||||
}
|
||||
|
||||
|
||||
@skip_in_debug_build("only run with release build")
|
||||
@pytest.mark.parametrize(
|
||||
@@ -165,6 +187,41 @@ def test_pageserver_compaction_preempt(
|
||||
env.pageserver.assert_log_contains("resuming image layer creation")
|
||||
|
||||
|
||||
@skip_in_debug_build("only run with release build")
|
||||
def test_pageserver_gc_compaction_preempt(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
):
|
||||
# Ideally we should be able to do unit tests for this, but we need real Postgres
|
||||
# WALs in order to do unit testing...
|
||||
|
||||
conf = PREEMPT_GC_COMPACTION_TENANT_CONF.copy()
|
||||
env = neon_env_builder.init_start(initial_tenant_conf=conf)
|
||||
|
||||
tenant_id = env.initial_tenant
|
||||
timeline_id = env.initial_timeline
|
||||
|
||||
row_count = 200000
|
||||
churn_rounds = 10
|
||||
|
||||
ps_http = env.pageserver.http_client()
|
||||
|
||||
workload = Workload(env, tenant_id, timeline_id)
|
||||
workload.init(env.pageserver.id)
|
||||
|
||||
log.info("Writing initial data ...")
|
||||
workload.write_rows(row_count, env.pageserver.id)
|
||||
|
||||
for i in range(1, churn_rounds + 1):
|
||||
log.info(f"Running churn round {i}/{churn_rounds} ...")
|
||||
workload.churn_rows(row_count, env.pageserver.id, upload=False)
|
||||
workload.validate(env.pageserver.id)
|
||||
ps_http.timeline_compact(tenant_id, timeline_id, wait_until_uploaded=True)
|
||||
log.info("Validating at workload end ...")
|
||||
workload.validate(env.pageserver.id)
|
||||
# ensure gc_compaction gets preempted and then resumed
|
||||
env.pageserver.assert_log_contains("preempt gc-compaction")
|
||||
|
||||
|
||||
@skip_in_debug_build("only run with release build")
|
||||
@pytest.mark.timeout(900) # This test is slow with sanitizers enabled, especially on ARM
|
||||
@pytest.mark.parametrize(
|
||||
|
||||
@@ -148,9 +148,9 @@ def test_create_snapshot(
|
||||
env = neon_env_builder.init_start(
|
||||
initial_tenant_conf={
|
||||
# Miniature layers to enable generating non-trivial layer map without writing lots of data.
|
||||
"checkpoint_distance": f"{128 * 1024}",
|
||||
"compaction_threshold": "1",
|
||||
"compaction_target_size": f"{128 * 1024}",
|
||||
"checkpoint_distance": f"{256 * 1024}",
|
||||
"compaction_threshold": "5",
|
||||
"compaction_target_size": f"{256 * 1024}",
|
||||
}
|
||||
)
|
||||
endpoint = env.endpoints.create_start("main")
|
||||
@@ -492,6 +492,13 @@ HISTORIC_DATA_SETS = [
|
||||
PgVersion.V17,
|
||||
"https://neon-github-public-dev.s3.eu-central-1.amazonaws.com/compatibility-data-snapshots/2025-02-07-pgv17-nogenerations.tar.zst",
|
||||
),
|
||||
# Tenant manifest v1.
|
||||
HistoricDataSet(
|
||||
"2025-04-08-tenant-manifest-v1",
|
||||
TenantId("c547c28588abf1d7b7139ff1f1158345"),
|
||||
PgVersion.V17,
|
||||
"https://neon-github-public-dev.s3.eu-central-1.amazonaws.com/compatibility-data-snapshots/2025-04-08-pgv17-tenant-manifest-v1.tar.zst",
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
|
||||
@@ -276,3 +276,34 @@ def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder):
|
||||
if i > 1:
|
||||
before_timestamp = tbl[i - step_size][1]
|
||||
assert timestamp >= before_timestamp, "before_timestamp before timestamp"
|
||||
|
||||
|
||||
def test_timestamp_of_lsn_empty_branch(neon_env_builder: NeonEnvBuilder):
|
||||
"""
|
||||
Test that getting the timestamp of the head LSN of a newly created branch works.
|
||||
This verifies that we don't get a 404 error when trying to get the timestamp
|
||||
of the head LSN of a branch that was just created.
|
||||
We now return a special status code 412 to indicate if there is no timestamp found for lsn.
|
||||
|
||||
Reproducer for https://github.com/neondatabase/neon/issues/11439
|
||||
"""
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
# Create a new branch
|
||||
new_timeline_id = env.create_branch("test_timestamp_of_lsn_empty_branch")
|
||||
|
||||
# Retrieve the commit LSN of the empty branch, which we have never run postgres on
|
||||
detail = env.pageserver.http_client().timeline_detail(
|
||||
tenant_id=env.initial_tenant, timeline_id=new_timeline_id
|
||||
)
|
||||
head_lsn = detail["last_record_lsn"]
|
||||
|
||||
# Verify that we get 412 status code
|
||||
with env.pageserver.http_client() as client:
|
||||
with pytest.raises(PageserverApiException) as err:
|
||||
client.timeline_get_timestamp_of_lsn(
|
||||
env.initial_tenant,
|
||||
new_timeline_id,
|
||||
head_lsn,
|
||||
)
|
||||
assert err.value.status_code == 412
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user