Compare commits

..

1 Commits

Author SHA1 Message Date
Heikki Linnakangas
791f181034 WIP: Add pg_tracing extension 2025-04-07 19:53:52 +03:00
190 changed files with 4702 additions and 9997 deletions

View File

@@ -19,7 +19,6 @@
!pageserver/
!pgxn/
!proxy/
!object_storage/
!storage_scrubber/
!safekeeper/
!storage_broker/

View File

@@ -6,7 +6,6 @@ self-hosted-runner:
- small
- small-metal
- small-arm64
- unit-perf
- us-east-2
config-variables:
- AWS_ECR_REGION

View File

@@ -70,7 +70,6 @@ runs:
- name: Install Allure
shell: bash -euxo pipefail {0}
working-directory: /tmp
run: |
if ! which allure; then
ALLURE_ZIP=allure-${ALLURE_VERSION}.zip

View File

@@ -53,13 +53,10 @@ jobs:
|| inputs.component-name == 'Compute' && 'release-compute'
}}
run: |
now_date=$(date -u +'%Y-%m-%d')
now_time=$(date -u +'%H-%M-%Z')
{
echo "title=${COMPONENT_NAME} release ${now_date}"
echo "rc-branch=rc/${RELEASE_BRANCH}/${now_date}_${now_time}"
echo "release-branch=${RELEASE_BRANCH}"
} | tee -a ${GITHUB_OUTPUT}
today=$(date +'%Y-%m-%d')
echo "title=${COMPONENT_NAME} release ${today}" | tee -a ${GITHUB_OUTPUT}
echo "rc-branch=rc/${RELEASE_BRANCH}/${today}" | tee -a ${GITHUB_OUTPUT}
echo "release-branch=${RELEASE_BRANCH}" | tee -a ${GITHUB_OUTPUT}
- name: Configure git
run: |

View File

@@ -284,7 +284,7 @@ jobs:
statuses: write
contents: write
pull-requests: write
runs-on: [ self-hosted, unit-perf ]
runs-on: [ self-hosted, small-metal ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
credentials:
@@ -1271,7 +1271,7 @@ jobs:
exit 1
deploy:
needs: [ check-permissions, push-neon-image-dev, push-compute-image-dev, push-neon-image-prod, push-compute-image-prod, meta, trigger-custom-extensions-build-and-wait ]
needs: [ check-permissions, push-neon-image-dev, push-compute-image-dev, push-neon-image-prod, push-compute-image-prod, meta, build-and-test-locally, trigger-custom-extensions-build-and-wait ]
# `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-neon-image-prod` and `push-compute-image-prod`
if: ${{ contains(fromJSON('["push-main", "storage-release", "proxy-release", "compute-release"]'), needs.meta.outputs.run-kind) && !failure() && !cancelled() }}
permissions:

1
.gitignore vendored
View File

@@ -1,4 +1,3 @@
/artifact_cache
/pg_install
/target
/tmp_check

68
Cargo.lock generated
View File

@@ -2837,7 +2837,6 @@ dependencies = [
"utils",
"uuid",
"workspace_hack",
"x509-cert",
]
[[package]]
@@ -3992,33 +3991,6 @@ dependencies = [
"memchr",
]
[[package]]
name = "object_storage"
version = "0.0.1"
dependencies = [
"anyhow",
"axum",
"axum-extra",
"camino",
"camino-tempfile",
"futures",
"http-body-util",
"itertools 0.10.5",
"jsonwebtoken",
"prometheus",
"rand 0.8.5",
"remote_storage",
"serde",
"serde_json",
"test-log",
"tokio",
"tokio-util",
"tower 0.5.2",
"tracing",
"utils",
"workspace_hack",
]
[[package]]
name = "once_cell"
version = "1.20.2"
@@ -4721,7 +4693,7 @@ dependencies = [
[[package]]
name = "postgres-protocol"
version = "0.6.6"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#f3cf448febde5fd298071d54d568a9c875a7a62b"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#1f21e7959a96a34dcfbfce1b14b73286cdadffe9"
dependencies = [
"base64 0.22.1",
"byteorder",
@@ -4755,7 +4727,7 @@ dependencies = [
[[package]]
name = "postgres-types"
version = "0.2.6"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#f3cf448febde5fd298071d54d568a9c875a7a62b"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#1f21e7959a96a34dcfbfce1b14b73286cdadffe9"
dependencies = [
"bytes",
"chrono",
@@ -5495,16 +5467,6 @@ version = "1.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c707298afce11da2efef2f600116fa93ffa7a032b5d7b628aa17711ec81383ca"
[[package]]
name = "remote_keys"
version = "0.1.0"
dependencies = [
"anyhow",
"rand 0.8.5",
"utils",
"workspace_hack",
]
[[package]]
name = "remote_storage"
version = "0.1.0"
@@ -5520,7 +5482,6 @@ dependencies = [
"azure_identity",
"azure_storage",
"azure_storage_blobs",
"base64 0.13.1",
"bytes",
"camino",
"camino-tempfile",
@@ -5531,7 +5492,6 @@ dependencies = [
"humantime-serde",
"hyper 1.4.1",
"itertools 0.10.5",
"md5",
"metrics",
"once_cell",
"pin-project-lite",
@@ -6965,28 +6925,6 @@ dependencies = [
"syn 2.0.100",
]
[[package]]
name = "test-log"
version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e7f46083d221181166e5b6f6b1e5f1d499f3a76888826e6cb1d057554157cd0f"
dependencies = [
"env_logger",
"test-log-macros",
"tracing-subscriber",
]
[[package]]
name = "test-log-macros"
version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "888d0c3c6db53c0fdab160d2ed5e12ba745383d3e85813f2ea0f2b1475ab553f"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.100",
]
[[package]]
name = "thiserror"
version = "1.0.69"
@@ -7234,7 +7172,7 @@ dependencies = [
[[package]]
name = "tokio-postgres"
version = "0.7.10"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#f3cf448febde5fd298071d54d568a9c875a7a62b"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#1f21e7959a96a34dcfbfce1b14b73286cdadffe9"
dependencies = [
"async-trait",
"byteorder",

View File

@@ -30,7 +30,6 @@ members = [
"libs/tenant_size_model",
"libs/metrics",
"libs/postgres_connection",
"libs/remote_keys",
"libs/remote_storage",
"libs/tracing-utils",
"libs/postgres_ffi/wal_craft",
@@ -41,7 +40,6 @@ members = [
"libs/proxy/postgres-protocol2",
"libs/proxy/postgres-types2",
"libs/proxy/tokio-postgres2",
"object_storage",
]
[workspace.package]
@@ -210,7 +208,6 @@ tracing-opentelemetry = "0.28"
tracing-serde = "0.2.0"
tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
try-lock = "0.2.5"
test-log = { version = "0.2.17", default-features = false, features = ["log"] }
twox-hash = { version = "1.6.3", default-features = false }
typed-json = "0.1"
url = "2.2"

View File

@@ -89,7 +89,6 @@ RUN set -e \
--bin storage_broker \
--bin storage_controller \
--bin proxy \
--bin object_storage \
--bin neon_local \
--bin storage_scrubber \
--locked --release
@@ -122,7 +121,6 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper
COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_controller /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/object_storage /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_scrubber /usr/local/bin

View File

@@ -1388,6 +1388,38 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_partman.control
#########################################################################################
# Layer "pg_tracing"
# compile pg_tracing extension
#
#########################################################################################
FROM build-deps AS pg_tracing-src
ARG PG_VERSION
WORKDIR /ext-src
RUN case "${PG_VERSION:?}" in \
"v14" | "v15") \
echo "pg_tracing not supported on this PostgreSQL version." && exit 0 \
;; \
*) \
;; \
esac && \
wget https://github.com/DataDog/pg_tracing/archive/refs/tags/v0.1.3.tar.gz -O pg_tracing.tar.gz && \
echo "d0a7cca7279bb29601ba6c4c1aaeb3a44d71e6afa3b78aae1e3b7269e688f907 pg_tracing.tar.gz" | sha256sum --check && \
mkdir pg_tracing-src && cd pg_tracing-src && tar xzf ../pg_tracing.tar.gz --strip-components=1 -C .
FROM pg-build AS pg_tracing-build
COPY --from=pg_tracing-src /ext-src/ /ext-src/
WORKDIR /ext-src/pg_tracing-src
RUN case "${PG_VERSION:?}" in \
"v14" | "v15") \
echo "pg_tracing not supported on this PostgreSQL version." && exit 0 \
;; \
*) \
;; \
esac && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install
#########################################################################################
#
# Layer "pg_mooncake"
@@ -1617,6 +1649,7 @@ COPY --from=pg_semver-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=wal2json-build /usr/local/pgsql /usr/local/pgsql
COPY --from=pg_ivm-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_partman-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_tracing-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_mooncake-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_duckdb-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_repack-build /usr/local/pgsql/ /usr/local/pgsql/
@@ -1793,6 +1826,7 @@ COPY --from=pg_semver-src /ext-src/ /ext-src/
#COPY --from=wal2json-src /ext-src/ /ext-src/
COPY --from=pg_ivm-src /ext-src/ /ext-src/
COPY --from=pg_partman-src /ext-src/ /ext-src/
COPY --from=pg_tracing-src /ext-src/ /ext-src/
#COPY --from=pg_mooncake-src /ext-src/ /ext-src/
COPY --from=pg_repack-src /ext-src/ /ext-src/
COPY --from=pg_repack-build /usr/local/pgsql/ /usr/local/pgsql/

View File

@@ -29,12 +29,13 @@
//! ```sh
//! compute_ctl -D /var/db/postgres/compute \
//! -C 'postgresql://cloud_admin@localhost/postgres' \
//! -c /var/db/postgres/configs/config.json \
//! -S /var/db/postgres/specs/current.json \
//! -b /usr/local/bin/postgres \
//! -r http://pg-ext-s3-gateway \
//! ```
use std::ffi::OsString;
use std::fs::File;
use std::path::Path;
use std::process::exit;
use std::sync::mpsc;
use std::thread;
@@ -42,7 +43,8 @@ use std::time::Duration;
use anyhow::{Context, Result};
use clap::Parser;
use compute_api::responses::ComputeConfig;
use compute_api::responses::ComputeCtlConfig;
use compute_api::spec::ComputeSpec;
use compute_tools::compute::{
BUILD_TAG, ComputeNode, ComputeNodeParams, forward_termination_signal,
};
@@ -116,21 +118,16 @@ struct Cli {
#[arg(long)]
pub set_disk_quota_for_fs: Option<String>,
// TODO(tristan957): remove alias after compatibility tests are no longer
// an issue
#[arg(short = 'c', long, alias = "spec-path")]
pub config: Option<OsString>,
#[arg(short = 's', long = "spec", group = "spec")]
pub spec_json: Option<String>,
#[arg(short = 'S', long, group = "spec-path")]
pub spec_path: Option<OsString>,
#[arg(short = 'i', long, group = "compute-id")]
pub compute_id: String,
#[arg(
short = 'p',
long,
conflicts_with = "config",
value_name = "CONTROL_PLANE_API_BASE_URL",
requires = "compute-id"
)]
#[arg(short = 'p', long, conflicts_with_all = ["spec", "spec-path"], value_name = "CONTROL_PLANE_API_BASE_URL")]
pub control_plane_uri: Option<String>,
}
@@ -139,7 +136,7 @@ fn main() -> Result<()> {
let scenario = failpoint_support::init();
// For historical reasons, the main thread that processes the config and launches postgres
// For historical reasons, the main thread that processes the spec and launches postgres
// is synchronous, but we always have this tokio runtime available and we "enter" it so
// that you can use tokio::spawn() and tokio::runtime::Handle::current().block_on(...)
// from all parts of compute_ctl.
@@ -155,7 +152,7 @@ fn main() -> Result<()> {
let connstr = Url::parse(&cli.connstr).context("cannot parse connstr as a URL")?;
let config = get_config(&cli)?;
let cli_spec = try_spec_from_cli(&cli)?;
let compute_node = ComputeNode::new(
ComputeNodeParams {
@@ -175,8 +172,10 @@ fn main() -> Result<()> {
cgroup: cli.cgroup,
#[cfg(target_os = "linux")]
vm_monitor_addr: cli.vm_monitor_addr,
live_config_allowed: cli_spec.live_config_allowed,
},
config,
cli_spec.spec,
cli_spec.compute_ctl_config,
)?;
let exit_code = compute_node.run()?;
@@ -201,17 +200,37 @@ async fn init() -> Result<()> {
Ok(())
}
fn get_config(cli: &Cli) -> Result<ComputeConfig> {
// First, read the config from the path if provided
if let Some(ref config) = cli.config {
let file = File::open(config)?;
return Ok(serde_json::from_reader(&file)?);
fn try_spec_from_cli(cli: &Cli) -> Result<CliSpecParams> {
// First, try to get cluster spec from the cli argument
if let Some(ref spec_json) = cli.spec_json {
info!("got spec from cli argument {}", spec_json);
return Ok(CliSpecParams {
spec: Some(serde_json::from_str(spec_json)?),
compute_ctl_config: ComputeCtlConfig::default(),
live_config_allowed: false,
});
}
// If the config wasn't provided in the CLI arguments, then retrieve it from
// the control plane
match get_config_from_control_plane(cli.control_plane_uri.as_ref().unwrap(), &cli.compute_id) {
Ok(config) => Ok(config),
// Second, try to read it from the file if path is provided
if let Some(ref spec_path) = cli.spec_path {
let file = File::open(Path::new(spec_path))?;
return Ok(CliSpecParams {
spec: Some(serde_json::from_reader(file)?),
compute_ctl_config: ComputeCtlConfig::default(),
live_config_allowed: true,
});
}
if cli.control_plane_uri.is_none() {
panic!("must specify --control-plane-uri");
};
match get_spec_from_control_plane(cli.control_plane_uri.as_ref().unwrap(), &cli.compute_id) {
Ok(resp) => Ok(CliSpecParams {
spec: resp.0,
compute_ctl_config: resp.1,
live_config_allowed: true,
}),
Err(e) => {
error!(
"cannot get response from control plane: {}\n\
@@ -223,6 +242,14 @@ fn get_config(cli: &Cli) -> Result<ComputeConfig> {
}
}
struct CliSpecParams {
/// If a spec was provided via CLI or file, the [`ComputeSpec`]
spec: Option<ComputeSpec>,
#[allow(dead_code)]
compute_ctl_config: ComputeCtlConfig,
live_config_allowed: bool,
}
fn deinit_and_exit(exit_code: Option<i32>) -> ! {
// Shutdown trace pipeline gracefully, so that it has a chance to send any
// pending traces before we exit. Shutting down OTEL tracing provider may

View File

@@ -98,15 +98,13 @@ pub async fn get_database_schema(
.kill_on_drop(true)
.spawn()?;
let stdout = cmd
.stdout
.take()
.ok_or_else(|| std::io::Error::other("Failed to capture stdout."))?;
let stdout = cmd.stdout.take().ok_or_else(|| {
std::io::Error::new(std::io::ErrorKind::Other, "Failed to capture stdout.")
})?;
let stderr = cmd
.stderr
.take()
.ok_or_else(|| std::io::Error::other("Failed to capture stderr."))?;
let stderr = cmd.stderr.take().ok_or_else(|| {
std::io::Error::new(std::io::ErrorKind::Other, "Failed to capture stderr.")
})?;
let mut stdout_reader = FramedRead::new(stdout, BytesCodec::new());
let stderr_reader = BufReader::new(stderr);
@@ -130,7 +128,8 @@ pub async fn get_database_schema(
}
});
return Err(SchemaDumpError::IO(std::io::Error::other(
return Err(SchemaDumpError::IO(std::io::Error::new(
std::io::ErrorKind::Other,
"failed to start pg_dump",
)));
}

View File

@@ -11,7 +11,7 @@ use std::{env, fs};
use anyhow::{Context, Result};
use chrono::{DateTime, Utc};
use compute_api::privilege::Privilege;
use compute_api::responses::{ComputeConfig, ComputeCtlConfig, ComputeMetrics, ComputeStatus};
use compute_api::responses::{ComputeCtlConfig, ComputeMetrics, ComputeStatus};
use compute_api::spec::{
ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PgIdent,
};
@@ -93,6 +93,20 @@ pub struct ComputeNodeParams {
/// the address of extension storage proxy gateway
pub ext_remote_storage: Option<String>,
/// We should only allow live re- / configuration of the compute node if
/// it uses 'pull model', i.e. it can go to control-plane and fetch
/// the latest configuration. Otherwise, there could be a case:
/// - we start compute with some spec provided as argument
/// - we push new spec and it does reconfiguration
/// - but then something happens and compute pod / VM is destroyed,
/// so k8s controller starts it again with the **old** spec
///
/// and the same for empty computes:
/// - we started compute without any spec
/// - we push spec and it does configuration
/// - but then it is restarted without any spec again
pub live_config_allowed: bool,
}
/// Compute node info shared across several `compute_ctl` threads.
@@ -303,7 +317,11 @@ struct StartVmMonitorResult {
}
impl ComputeNode {
pub fn new(params: ComputeNodeParams, config: ComputeConfig) -> Result<Self> {
pub fn new(
params: ComputeNodeParams,
cli_spec: Option<ComputeSpec>,
compute_ctl_config: ComputeCtlConfig,
) -> Result<Self> {
let connstr = params.connstr.as_str();
let conn_conf = postgres::config::Config::from_str(connstr)
.context("cannot build postgres config from connstr")?;
@@ -311,8 +329,8 @@ impl ComputeNode {
.context("cannot build tokio postgres config from connstr")?;
let mut new_state = ComputeState::new();
if let Some(spec) = config.spec {
let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?;
if let Some(cli_spec) = cli_spec {
let pspec = ParsedSpec::try_from(cli_spec).map_err(|msg| anyhow::anyhow!(msg))?;
new_state.pspec = Some(pspec);
}
@@ -323,7 +341,7 @@ impl ComputeNode {
state: Mutex::new(new_state),
state_changed: Condvar::new(),
ext_download_progress: RwLock::new(HashMap::new()),
compute_ctl_config: config.compute_ctl_config,
compute_ctl_config,
})
}
@@ -519,14 +537,11 @@ impl ComputeNode {
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
info!(
"starting compute for project {}, operation {}, tenant {}, timeline {}, project {}, branch {}, endpoint {}, features {:?}, spec.remote_extensions {:?}",
"starting compute for project {}, operation {}, tenant {}, timeline {}, features {:?}, spec.remote_extensions {:?}",
pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None"),
pspec.spec.operation_uuid.as_deref().unwrap_or("None"),
pspec.tenant_id,
pspec.timeline_id,
pspec.spec.project_id.as_deref().unwrap_or("None"),
pspec.spec.branch_id.as_deref().unwrap_or("None"),
pspec.spec.endpoint_id.as_deref().unwrap_or("None"),
pspec.spec.features,
pspec.spec.remote_extensions,
);
@@ -630,28 +645,31 @@ impl ComputeNode {
});
}
// Configure and start rsyslog for compliance audit logging
match pspec.spec.audit_log_level {
ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => {
let remote_endpoint =
std::env::var("AUDIT_LOGGING_ENDPOINT").unwrap_or("".to_string());
if remote_endpoint.is_empty() {
anyhow::bail!("AUDIT_LOGGING_ENDPOINT is empty");
}
let log_directory_path = Path::new(&self.params.pgdata).join("log");
let log_directory_path = log_directory_path.to_string_lossy().to_string();
configure_audit_rsyslog(log_directory_path.clone(), "hipaa", &remote_endpoint)?;
// Launch a background task to clean up the audit logs
launch_pgaudit_gc(log_directory_path);
// Configure and start rsyslog for HIPAA if necessary
if let ComputeAudit::Hipaa = pspec.spec.audit_log_level {
let remote_endpoint = std::env::var("AUDIT_LOGGING_ENDPOINT").unwrap_or("".to_string());
if remote_endpoint.is_empty() {
anyhow::bail!("AUDIT_LOGGING_ENDPOINT is empty");
}
_ => {}
let log_directory_path = Path::new(&self.params.pgdata).join("log");
let log_directory_path = log_directory_path.to_string_lossy().to_string();
configure_audit_rsyslog(log_directory_path.clone(), "hipaa", &remote_endpoint)?;
// Launch a background task to clean up the audit logs
launch_pgaudit_gc(log_directory_path);
}
// Configure and start rsyslog for Postgres logs export
let conf = PostgresLogsRsyslogConfig::new(pspec.spec.logs_export_host.as_deref());
configure_postgres_logs_export(conf)?;
if self.has_feature(ComputeFeature::PostgresLogsExport) {
if let Some(ref project_id) = pspec.spec.cluster.cluster_id {
let host = PostgresLogsRsyslogConfig::default_host(project_id);
let conf = PostgresLogsRsyslogConfig::new(Some(&host));
configure_postgres_logs_export(conf)?;
} else {
warn!("not configuring rsyslog for Postgres logs export: project ID is missing")
}
}
// Launch remaining service threads
let _monitor_handle = launch_monitor(self);
@@ -1555,10 +1573,6 @@ impl ComputeNode {
});
}
// Reconfigure rsyslog for Postgres logs export
let conf = PostgresLogsRsyslogConfig::new(spec.logs_export_host.as_deref());
configure_postgres_logs_export(conf)?;
// Write new config
let pgdata_path = Path::new(&self.params.pgdata);
config::write_postgres_conf(

View File

@@ -7,7 +7,7 @@ use std::io::prelude::*;
use std::path::Path;
use compute_api::responses::TlsConfig;
use compute_api::spec::{ComputeAudit, ComputeMode, ComputeSpec, GenericOption};
use compute_api::spec::{ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, GenericOption};
use crate::pg_helpers::{
GenericOptionExt, GenericOptionsSearch, PgOptionsSerialize, escape_conf_value,
@@ -89,15 +89,6 @@ pub fn write_postgres_conf(
escape_conf_value(&s.to_string())
)?;
}
if let Some(s) = &spec.project_id {
writeln!(file, "neon.project_id={}", escape_conf_value(s))?;
}
if let Some(s) = &spec.branch_id {
writeln!(file, "neon.branch_id={}", escape_conf_value(s))?;
}
if let Some(s) = &spec.endpoint_id {
writeln!(file, "neon.endpoint_id={}", escape_conf_value(s))?;
}
// tls
if let Some(tls_config) = tls_config {
@@ -178,7 +169,7 @@ pub fn write_postgres_conf(
// and don't allow the user or the control plane admin to change them.
match spec.audit_log_level {
ComputeAudit::Disabled => {}
ComputeAudit::Log | ComputeAudit::Base => {
ComputeAudit::Log => {
writeln!(file, "# Managed by compute_ctl base audit settings: start")?;
writeln!(file, "pgaudit.log='ddl,role'")?;
// Disable logging of catalog queries to reduce the noise
@@ -202,20 +193,16 @@ pub fn write_postgres_conf(
}
writeln!(file, "# Managed by compute_ctl base audit settings: end")?;
}
ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => {
ComputeAudit::Hipaa => {
writeln!(
file,
"# Managed by compute_ctl compliance audit settings: begin"
)?;
// Enable logging of parameters.
// This is very verbose and may contain sensitive data.
if spec.audit_log_level == ComputeAudit::Full {
writeln!(file, "pgaudit.log_parameter=on")?;
writeln!(file, "pgaudit.log='all'")?;
} else {
writeln!(file, "pgaudit.log_parameter=off")?;
writeln!(file, "pgaudit.log='all, -misc'")?;
}
// This log level is very verbose
// but this is necessary for HIPAA compliance.
// Exclude 'misc' category, because it doesn't contain anythig relevant.
writeln!(file, "pgaudit.log='all, -misc'")?;
writeln!(file, "pgaudit.log_parameter=on")?;
// Disable logging of catalog queries
// The catalog doesn't contain sensitive data, so we don't need to audit it.
writeln!(file, "pgaudit.log_catalog=off")?;
@@ -268,7 +255,7 @@ pub fn write_postgres_conf(
// We need Postgres to send logs to rsyslog so that we can forward them
// further to customers' log aggregation systems.
if spec.logs_export_host.is_some() {
if spec.features.contains(&ComputeFeature::PostgresLogsExport) {
writeln!(file, "log_destination='stderr,syslog'")?;
}

View File

@@ -6,15 +6,20 @@ use axum_extra::{
TypedHeader,
headers::{Authorization, authorization::Bearer},
};
use compute_api::requests::ComputeClaims;
use futures::future::BoxFuture;
use http::{Request, Response, StatusCode};
use jsonwebtoken::{Algorithm, DecodingKey, TokenData, Validation, jwk::JwkSet};
use serde::Deserialize;
use tower_http::auth::AsyncAuthorizeRequest;
use tracing::{debug, warn};
use tracing::warn;
use crate::http::{JsonResponse, extract::RequestId};
#[derive(Clone, Debug, Deserialize)]
pub(in crate::http) struct Claims {
compute_id: String,
}
#[derive(Clone, Debug)]
pub(in crate::http) struct Authorize {
compute_id: String,
@@ -92,7 +97,7 @@ impl AsyncAuthorizeRequest<Body> for Authorize {
if data.claims.compute_id != compute_id {
return Err(JsonResponse::error(
StatusCode::UNAUTHORIZED,
"invalid compute ID in authorization token claims",
"invalid claims in authorization token",
));
}
@@ -107,19 +112,13 @@ impl AsyncAuthorizeRequest<Body> for Authorize {
impl Authorize {
/// Verify the token using the JSON Web Key set and return the token data.
fn verify(
jwks: &JwkSet,
token: &str,
validation: &Validation,
) -> Result<TokenData<ComputeClaims>> {
debug!("verifying token {}", token);
fn verify(jwks: &JwkSet, token: &str, validation: &Validation) -> Result<TokenData<Claims>> {
for jwk in jwks.keys.iter() {
let decoding_key = match DecodingKey::from_jwk(jwk) {
Ok(key) => key,
Err(e) => {
warn!(
"failed to construct decoding key from {}: {}",
"Failed to construct decoding key from {}: {}",
jwk.common.key_id.as_ref().unwrap(),
e
);
@@ -128,11 +127,11 @@ impl Authorize {
}
};
match jsonwebtoken::decode::<ComputeClaims>(token, &decoding_key, validation) {
match jsonwebtoken::decode::<Claims>(token, &decoding_key, validation) {
Ok(data) => return Ok(data),
Err(e) => {
warn!(
"failed to decode authorization token using {}: {}",
"Failed to decode authorization token using {}: {}",
jwk.common.key_id.as_ref().unwrap(),
e
);
@@ -142,6 +141,6 @@ impl Authorize {
}
}
Err(anyhow!("failed to verify authorization token"))
Err(anyhow!("Failed to verify authorization token"))
}
}

View File

@@ -306,6 +306,36 @@ paths:
schema:
$ref: "#/components/schemas/GenericError"
/configure_telemetry:
post:
tags:
- Configure
summary: Configure rsyslog
description: |
This API endpoint configures rsyslog to forward Postgres logs
to a specified otel collector.
operationId: configureTelemetry
requestBody:
required: true
content:
application/json:
schema:
type: object
properties:
logs_export_host:
type: string
description: |
Hostname and the port of the otel collector. Leave empty to disable logs forwarding.
Example: config-shy-breeze-123-collector-monitoring.neon-telemetry.svc.cluster.local:54526
responses:
204:
description: "Telemetry configured successfully"
500:
content:
application/json:
schema:
$ref: "#/components/schemas/GenericError"
components:
securitySchemes:
JWT:

View File

@@ -1,9 +1,11 @@
use std::sync::Arc;
use axum::body::Body;
use axum::extract::State;
use axum::response::Response;
use compute_api::requests::ConfigurationRequest;
use compute_api::requests::{ConfigurationRequest, ConfigureTelemetryRequest};
use compute_api::responses::{ComputeStatus, ComputeStatusResponse};
use compute_api::spec::ComputeFeature;
use http::StatusCode;
use tokio::task;
use tracing::info;
@@ -11,6 +13,7 @@ use tracing::info;
use crate::compute::{ComputeNode, ParsedSpec};
use crate::http::JsonResponse;
use crate::http::extract::Json;
use crate::rsyslog::{PostgresLogsRsyslogConfig, configure_postgres_logs_export};
// Accept spec in JSON format and request compute configuration. If anything
// goes wrong after we set the compute status to `ConfigurationPending` and
@@ -22,6 +25,13 @@ pub(in crate::http) async fn configure(
State(compute): State<Arc<ComputeNode>>,
request: Json<ConfigurationRequest>,
) -> Response {
if !compute.params.live_config_allowed {
return JsonResponse::error(
StatusCode::PRECONDITION_FAILED,
"live configuration is not allowed for this compute node".to_string(),
);
}
let pspec = match ParsedSpec::try_from(request.spec.clone()) {
Ok(p) => p,
Err(e) => return JsonResponse::error(StatusCode::BAD_REQUEST, e),
@@ -85,3 +95,25 @@ pub(in crate::http) async fn configure(
JsonResponse::success(StatusCode::OK, body)
}
pub(in crate::http) async fn configure_telemetry(
State(compute): State<Arc<ComputeNode>>,
request: Json<ConfigureTelemetryRequest>,
) -> Response {
if !compute.has_feature(ComputeFeature::PostgresLogsExport) {
return JsonResponse::error(
StatusCode::PRECONDITION_FAILED,
"Postgres logs export feature is not enabled".to_string(),
);
}
let conf = PostgresLogsRsyslogConfig::new(request.logs_export_host.as_deref());
if let Err(err) = configure_postgres_logs_export(conf) {
return JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, err.to_string());
}
Response::builder()
.status(StatusCode::NO_CONTENT)
.body(Body::from(""))
.unwrap()
}

View File

@@ -87,6 +87,7 @@ impl From<&Server> for Router<Arc<ComputeNode>> {
let authenticated_router = Router::<Arc<ComputeNode>>::new()
.route("/check_writability", post(check_writability::is_writable))
.route("/configure", post(configure::configure))
.route("/configure_telemetry", post(configure::configure_telemetry))
.route("/database_schema", get(database_schema::get_schema_dump))
.route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects))
.route("/insights", get(insights::get_insights))

View File

@@ -19,13 +19,13 @@ pub(crate) static INSTALLED_EXTENSIONS: Lazy<UIntGaugeVec> = Lazy::new(|| {
// but for all our APIs we defined a 'slug'/method/operationId in the OpenAPI spec.
// And it's fair to call it a 'RPC' (Remote Procedure Call).
pub enum CPlaneRequestRPC {
GetConfig,
GetSpec,
}
impl CPlaneRequestRPC {
pub fn as_str(&self) -> &str {
match self {
CPlaneRequestRPC::GetConfig => "GetConfig",
CPlaneRequestRPC::GetSpec => "GetSpec",
}
}
}

View File

@@ -119,9 +119,16 @@ impl<'a> PostgresLogsRsyslogConfig<'a> {
};
Ok(config_content)
}
/// Returns the default host for otel collector that receives Postgres logs
pub fn default_host(project_id: &str) -> String {
format!(
"config-{}-collector.neon-telemetry.svc.cluster.local:10514",
project_id
)
}
}
/// Writes rsyslogd configuration for Postgres logs export and restarts rsyslog.
pub fn configure_postgres_logs_export(conf: PostgresLogsRsyslogConfig) -> Result<()> {
let new_config = conf.build()?;
let current_config = PostgresLogsRsyslogConfig::current_config()?;
@@ -254,5 +261,16 @@ mod tests {
let res = conf.build();
assert!(res.is_err());
}
{
// Verify config with default host
let host = PostgresLogsRsyslogConfig::default_host("shy-breeze-123");
let conf = PostgresLogsRsyslogConfig::new(Some(&host));
let res = conf.build();
assert!(res.is_ok());
let conf_str = res.unwrap();
assert!(conf_str.contains(r#"shy-breeze-123"#));
assert!(conf_str.contains(r#"port="10514""#));
}
}
}

View File

@@ -3,8 +3,9 @@ use std::path::Path;
use anyhow::{Result, anyhow, bail};
use compute_api::responses::{
ComputeConfig, ControlPlaneComputeStatus, ControlPlaneConfigResponse,
ComputeCtlConfig, ControlPlaneComputeStatus, ControlPlaneSpecResponse,
};
use compute_api::spec::ComputeSpec;
use reqwest::StatusCode;
use tokio_postgres::Client;
use tracing::{error, info, instrument};
@@ -20,7 +21,7 @@ use crate::params::PG_HBA_ALL_MD5;
fn do_control_plane_request(
uri: &str,
jwt: &str,
) -> Result<ControlPlaneConfigResponse, (bool, String, String)> {
) -> Result<ControlPlaneSpecResponse, (bool, String, String)> {
let resp = reqwest::blocking::Client::new()
.get(uri)
.header("Authorization", format!("Bearer {}", jwt))
@@ -28,14 +29,14 @@ fn do_control_plane_request(
.map_err(|e| {
(
true,
format!("could not perform request to control plane: {:?}", e),
format!("could not perform spec request to control plane: {:?}", e),
UNKNOWN_HTTP_STATUS.to_string(),
)
})?;
let status = resp.status();
match status {
StatusCode::OK => match resp.json::<ControlPlaneConfigResponse>() {
StatusCode::OK => match resp.json::<ControlPlaneSpecResponse>() {
Ok(spec_resp) => Ok(spec_resp),
Err(e) => Err((
true,
@@ -68,35 +69,40 @@ fn do_control_plane_request(
}
}
/// Request config from the control-plane by compute_id. If
/// `NEON_CONTROL_PLANE_TOKEN` env variable is set, it will be used for
/// authorization.
pub fn get_config_from_control_plane(base_uri: &str, compute_id: &str) -> Result<ComputeConfig> {
/// Request spec from the control-plane by compute_id. If `NEON_CONTROL_PLANE_TOKEN`
/// env variable is set, it will be used for authorization.
pub fn get_spec_from_control_plane(
base_uri: &str,
compute_id: &str,
) -> Result<(Option<ComputeSpec>, ComputeCtlConfig)> {
let cp_uri = format!("{base_uri}/compute/api/v2/computes/{compute_id}/spec");
let jwt: String = std::env::var("NEON_CONTROL_PLANE_TOKEN").unwrap_or_default();
let jwt: String = match std::env::var("NEON_CONTROL_PLANE_TOKEN") {
Ok(v) => v,
Err(_) => "".to_string(),
};
let mut attempt = 1;
info!("getting config from control plane: {}", cp_uri);
info!("getting spec from control plane: {}", cp_uri);
// Do 3 attempts to get spec from the control plane using the following logic:
// - network error -> then retry
// - compute id is unknown or any other error -> bail out
// - no spec for compute yet (Empty state) -> return Ok(None)
// - got config -> return Ok(Some(config))
// - got spec -> return Ok(Some(spec))
while attempt < 4 {
let result = match do_control_plane_request(&cp_uri, &jwt) {
Ok(config_resp) => {
Ok(spec_resp) => {
CPLANE_REQUESTS_TOTAL
.with_label_values(&[
CPlaneRequestRPC::GetConfig.as_str(),
CPlaneRequestRPC::GetSpec.as_str(),
&StatusCode::OK.to_string(),
])
.inc();
match config_resp.status {
ControlPlaneComputeStatus::Empty => Ok(config_resp.into()),
match spec_resp.status {
ControlPlaneComputeStatus::Empty => Ok((None, spec_resp.compute_ctl_config)),
ControlPlaneComputeStatus::Attached => {
if config_resp.spec.is_some() {
Ok(config_resp.into())
if let Some(spec) = spec_resp.spec {
Ok((Some(spec), spec_resp.compute_ctl_config))
} else {
bail!("compute is attached, but spec is empty")
}
@@ -105,7 +111,7 @@ pub fn get_config_from_control_plane(base_uri: &str, compute_id: &str) -> Result
}
Err((retry, msg, status)) => {
CPLANE_REQUESTS_TOTAL
.with_label_values(&[CPlaneRequestRPC::GetConfig.as_str(), &status])
.with_label_values(&[CPlaneRequestRPC::GetSpec.as_str(), &status])
.inc();
if retry {
Err(anyhow!(msg))
@@ -116,7 +122,7 @@ pub fn get_config_from_control_plane(base_uri: &str, compute_id: &str) -> Result
};
if let Err(e) = &result {
error!("attempt {} to get config failed with: {}", attempt, e);
error!("attempt {} to get spec failed with: {}", attempt, e);
} else {
return result;
}
@@ -127,13 +133,13 @@ pub fn get_config_from_control_plane(base_uri: &str, compute_id: &str) -> Result
// All attempts failed, return error.
Err(anyhow::anyhow!(
"Exhausted all attempts to retrieve the config from the control plane"
"Exhausted all attempts to retrieve the spec from the control plane"
))
}
/// Check `pg_hba.conf` and update if needed to allow external connections.
pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> {
// XXX: consider making it a part of config.json
// XXX: consider making it a part of spec.json
let pghba_path = pgdata_path.join("pg_hba.conf");
if config::line_in_file(&pghba_path, PG_HBA_ALL_MD5)? {
@@ -147,7 +153,7 @@ pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> {
/// Create a standby.signal file
pub fn add_standby_signal(pgdata_path: &Path) -> Result<()> {
// XXX: consider making it a part of config.json
// XXX: consider making it a part of spec.json
let signalfile = pgdata_path.join("standby.signal");
if !signalfile.exists() {

View File

@@ -278,12 +278,12 @@ impl ComputeNode {
// so that all config operations are audit logged.
match spec.audit_log_level
{
ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => {
ComputeAudit::Hipaa => {
phases.push(CreatePgauditExtension);
phases.push(CreatePgauditlogtofileExtension);
phases.push(DisablePostgresDBPgAudit);
}
ComputeAudit::Log | ComputeAudit::Base => {
ComputeAudit::Log => {
phases.push(CreatePgauditExtension);
phases.push(DisablePostgresDBPgAudit);
}

View File

@@ -20,10 +20,8 @@ use compute_api::spec::ComputeMode;
use control_plane::endpoint::ComputeControlPlane;
use control_plane::local_env::{
InitForceMode, LocalEnv, NeonBroker, NeonLocalInitConf, NeonLocalInitPageserverConf,
ObjectStorageConf, SafekeeperConf,
SafekeeperConf,
};
use control_plane::object_storage::OBJECT_STORAGE_DEFAULT_PORT;
use control_plane::object_storage::ObjectStorage;
use control_plane::pageserver::PageServerNode;
use control_plane::safekeeper::SafekeeperNode;
use control_plane::storage_controller::{
@@ -41,7 +39,7 @@ use pageserver_api::controller_api::{
use pageserver_api::models::{
ShardParameters, TenantConfigRequest, TimelineCreateRequest, TimelineInfo,
};
use pageserver_api::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardStripeSize, TenantShardId};
use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId};
use postgres_backend::AuthType;
use postgres_connection::parse_host_port;
use safekeeper_api::membership::SafekeeperGeneration;
@@ -93,8 +91,6 @@ enum NeonLocalCmd {
#[command(subcommand)]
Safekeeper(SafekeeperCmd),
#[command(subcommand)]
ObjectStorage(ObjectStorageCmd),
#[command(subcommand)]
Endpoint(EndpointCmd),
#[command(subcommand)]
Mappings(MappingsCmd),
@@ -458,32 +454,6 @@ enum SafekeeperCmd {
Restart(SafekeeperRestartCmdArgs),
}
#[derive(clap::Subcommand)]
#[clap(about = "Manage object storage")]
enum ObjectStorageCmd {
Start(ObjectStorageStartCmd),
Stop(ObjectStorageStopCmd),
}
#[derive(clap::Args)]
#[clap(about = "Start object storage")]
struct ObjectStorageStartCmd {
#[clap(short = 't', long, help = "timeout until we fail the command")]
#[arg(default_value = "10s")]
start_timeout: humantime::Duration,
}
#[derive(clap::Args)]
#[clap(about = "Stop object storage")]
struct ObjectStorageStopCmd {
#[arg(value_enum, default_value = "fast")]
#[clap(
short = 'm',
help = "If 'immediate', don't flush repository data at shutdown"
)]
stop_mode: StopMode,
}
#[derive(clap::Args)]
#[clap(about = "Start local safekeeper")]
struct SafekeeperStartCmdArgs {
@@ -789,7 +759,6 @@ fn main() -> Result<()> {
}
NeonLocalCmd::StorageBroker(subcmd) => rt.block_on(handle_storage_broker(&subcmd, env)),
NeonLocalCmd::Safekeeper(subcmd) => rt.block_on(handle_safekeeper(&subcmd, env)),
NeonLocalCmd::ObjectStorage(subcmd) => rt.block_on(handle_object_storage(&subcmd, env)),
NeonLocalCmd::Endpoint(subcmd) => rt.block_on(handle_endpoint(&subcmd, env)),
NeonLocalCmd::Mappings(subcmd) => handle_mappings(&subcmd, env),
};
@@ -1006,9 +975,6 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result<LocalEnv> {
}
})
.collect(),
object_storage: ObjectStorageConf {
port: OBJECT_STORAGE_DEFAULT_PORT,
},
pg_distrib_dir: None,
neon_distrib_dir: None,
default_tenant_id: TenantId::from_array(std::array::from_fn(|_| 0)),
@@ -1117,7 +1083,7 @@ async fn handle_tenant(subcmd: &TenantCmd, env: &mut local_env::LocalEnv) -> any
stripe_size: args
.shard_stripe_size
.map(ShardStripeSize)
.unwrap_or(DEFAULT_STRIPE_SIZE),
.unwrap_or(ShardParameters::DEFAULT_STRIPE_SIZE),
},
placement_policy: args.placement_policy.clone(),
config: tenant_conf,
@@ -1430,7 +1396,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
vec![(parsed.0, parsed.1.unwrap_or(5432))],
// If caller is telling us what pageserver to use, this is not a tenant which is
// full managed by storage controller, therefore not sharded.
DEFAULT_STRIPE_SIZE,
ShardParameters::DEFAULT_STRIPE_SIZE,
)
} else {
// Look up the currently attached location of the tenant, and its striping metadata,
@@ -1717,41 +1683,6 @@ async fn handle_safekeeper(subcmd: &SafekeeperCmd, env: &local_env::LocalEnv) ->
Ok(())
}
async fn handle_object_storage(subcmd: &ObjectStorageCmd, env: &local_env::LocalEnv) -> Result<()> {
use ObjectStorageCmd::*;
let storage = ObjectStorage::from_env(env);
// In tests like test_forward_compatibility or test_graceful_cluster_restart
// old neon binaries (without object_storage) are present
if !storage.bin.exists() {
eprintln!(
"{} binary not found. Ignore if this is a compatibility test",
storage.bin
);
return Ok(());
}
match subcmd {
Start(ObjectStorageStartCmd { start_timeout }) => {
if let Err(e) = storage.start(start_timeout).await {
eprintln!("object_storage start failed: {e}");
exit(1);
}
}
Stop(ObjectStorageStopCmd { stop_mode }) => {
let immediate = match stop_mode {
StopMode::Fast => false,
StopMode::Immediate => true,
};
if let Err(e) = storage.stop(immediate) {
eprintln!("proxy stop failed: {e}");
exit(1);
}
}
};
Ok(())
}
async fn handle_storage_broker(subcmd: &StorageBrokerCmd, env: &local_env::LocalEnv) -> Result<()> {
match subcmd {
StorageBrokerCmd::Start(args) => {
@@ -1846,13 +1777,6 @@ async fn handle_start_all_impl(
.map_err(|e| e.context(format!("start safekeeper {}", safekeeper.id)))
});
}
js.spawn(async move {
ObjectStorage::from_env(env)
.start(&retry_timeout)
.await
.map_err(|e| e.context("start object_storage"))
});
})();
let mut errors = Vec::new();
@@ -1950,11 +1874,6 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
}
}
let storage = ObjectStorage::from_env(env);
if let Err(e) = storage.stop(immediate) {
eprintln!("object_storage stop failed: {:#}", e);
}
for ps_conf in &env.pageservers {
let pageserver = PageServerNode::from_env(env, ps_conf);
if let Err(e) = pageserver.stop(immediate) {

View File

@@ -29,7 +29,7 @@
//! compute.log - log output of `compute_ctl` and `postgres`
//! endpoint.json - serialized `EndpointConf` struct
//! postgresql.conf - postgresql settings
//! config.json - passed to `compute_ctl`
//! spec.json - passed to `compute_ctl`
//! pgdata/
//! postgresql.conf - copy of postgresql.conf created by `compute_ctl`
//! zenith.signal
@@ -46,9 +46,7 @@ use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
use anyhow::{Context, Result, anyhow, bail};
use compute_api::requests::ConfigurationRequest;
use compute_api::responses::{
ComputeConfig, ComputeCtlConfig, ComputeStatus, ComputeStatusResponse,
};
use compute_api::responses::{ComputeCtlConfig, ComputeStatus, ComputeStatusResponse};
use compute_api::spec::{
Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PgIdent,
RemoteExtSpec, Role,
@@ -621,101 +619,86 @@ impl Endpoint {
remote_extensions = None;
};
// Create config file
let config = {
let mut spec = ComputeSpec {
skip_pg_catalog_updates: self.skip_pg_catalog_updates,
format_version: 1.0,
operation_uuid: None,
features: self.features.clone(),
swap_size_bytes: None,
disk_quota_bytes: None,
disable_lfc_resizing: None,
cluster: Cluster {
cluster_id: None, // project ID: not used
name: None, // project name: not used
state: None,
roles: if create_test_user {
vec![Role {
name: PgIdent::from_str("test").unwrap(),
encrypted_password: None,
options: None,
}]
} else {
Vec::new()
},
databases: if create_test_user {
vec![Database {
name: PgIdent::from_str("neondb").unwrap(),
owner: PgIdent::from_str("test").unwrap(),
options: None,
restrict_conn: false,
invalid: false,
}]
} else {
Vec::new()
},
settings: None,
postgresql_conf: Some(postgresql_conf.clone()),
},
delta_operations: None,
tenant_id: Some(self.tenant_id),
timeline_id: Some(self.timeline_id),
project_id: None,
branch_id: None,
endpoint_id: Some(self.endpoint_id.clone()),
mode: self.mode,
pageserver_connstring: Some(pageserver_connstring),
safekeepers_generation: safekeepers_generation.map(|g| g.into_inner()),
safekeeper_connstrings,
storage_auth_token: auth_token.clone(),
remote_extensions,
pgbouncer_settings: None,
shard_stripe_size: Some(shard_stripe_size),
local_proxy_config: None,
reconfigure_concurrency: self.reconfigure_concurrency,
drop_subscriptions_before_start: self.drop_subscriptions_before_start,
audit_log_level: ComputeAudit::Disabled,
logs_export_host: None::<String>,
};
// this strange code is needed to support respec() in tests
if self.cluster.is_some() {
debug!("Cluster is already set in the endpoint spec, using it");
spec.cluster = self.cluster.clone().unwrap();
debug!("spec.cluster {:?}", spec.cluster);
// fill missing fields again
if create_test_user {
spec.cluster.roles.push(Role {
// Create spec file
let mut spec = ComputeSpec {
skip_pg_catalog_updates: self.skip_pg_catalog_updates,
format_version: 1.0,
operation_uuid: None,
features: self.features.clone(),
swap_size_bytes: None,
disk_quota_bytes: None,
disable_lfc_resizing: None,
cluster: Cluster {
cluster_id: None, // project ID: not used
name: None, // project name: not used
state: None,
roles: if create_test_user {
vec![Role {
name: PgIdent::from_str("test").unwrap(),
encrypted_password: None,
options: None,
});
spec.cluster.databases.push(Database {
}]
} else {
Vec::new()
},
databases: if create_test_user {
vec![Database {
name: PgIdent::from_str("neondb").unwrap(),
owner: PgIdent::from_str("test").unwrap(),
options: None,
restrict_conn: false,
invalid: false,
});
}
spec.cluster.postgresql_conf = Some(postgresql_conf);
}
ComputeConfig {
spec: Some(spec),
compute_ctl_config: ComputeCtlConfig::default(),
}
}]
} else {
Vec::new()
},
settings: None,
postgresql_conf: Some(postgresql_conf.clone()),
},
delta_operations: None,
tenant_id: Some(self.tenant_id),
timeline_id: Some(self.timeline_id),
mode: self.mode,
pageserver_connstring: Some(pageserver_connstring),
safekeepers_generation: safekeepers_generation.map(|g| g.into_inner()),
safekeeper_connstrings,
storage_auth_token: auth_token.clone(),
remote_extensions,
pgbouncer_settings: None,
shard_stripe_size: Some(shard_stripe_size),
local_proxy_config: None,
reconfigure_concurrency: self.reconfigure_concurrency,
drop_subscriptions_before_start: self.drop_subscriptions_before_start,
audit_log_level: ComputeAudit::Disabled,
};
// TODO(tristan957): Remove the write to spec.json after compatibility
// tests work themselves out
// this strange code is needed to support respec() in tests
if self.cluster.is_some() {
debug!("Cluster is already set in the endpoint spec, using it");
spec.cluster = self.cluster.clone().unwrap();
debug!("spec.cluster {:?}", spec.cluster);
// fill missing fields again
if create_test_user {
spec.cluster.roles.push(Role {
name: PgIdent::from_str("test").unwrap(),
encrypted_password: None,
options: None,
});
spec.cluster.databases.push(Database {
name: PgIdent::from_str("neondb").unwrap(),
owner: PgIdent::from_str("test").unwrap(),
options: None,
restrict_conn: false,
invalid: false,
});
}
spec.cluster.postgresql_conf = Some(postgresql_conf);
}
let spec_path = self.endpoint_path().join("spec.json");
std::fs::write(spec_path, serde_json::to_string_pretty(&config.spec)?)?;
let config_path = self.endpoint_path().join("config.json");
std::fs::write(config_path, serde_json::to_string_pretty(&config)?)?;
std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
// Open log file. We'll redirect the stdout and stderr of `compute_ctl` to it.
let logfile = std::fs::OpenOptions::new()
@@ -723,16 +706,6 @@ impl Endpoint {
.append(true)
.open(self.endpoint_path().join("compute.log"))?;
// TODO(tristan957): Remove when compatibility tests are no longer an
// issue
let old_compute_ctl = {
let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl"));
let help_output = cmd.arg("--help").output()?;
let help_output = String::from_utf8_lossy(&help_output.stdout);
!help_output.contains("--config")
};
// Launch compute_ctl
let conn_str = self.connstr("cloud_admin", "postgres");
println!("Starting postgres node at '{}'", conn_str);
@@ -751,18 +724,9 @@ impl Endpoint {
])
.args(["--pgdata", self.pgdata().to_str().unwrap()])
.args(["--connstr", &conn_str])
// TODO(tristan957): Change this to --config when compatibility tests
// are no longer an issue
.args([
"--spec-path",
self.endpoint_path()
.join(if old_compute_ctl {
"spec.json"
} else {
"config.json"
})
.to_str()
.unwrap(),
self.endpoint_path().join("spec.json").to_str().unwrap(),
])
.args([
"--pgbin",
@@ -905,12 +869,10 @@ impl Endpoint {
stripe_size: Option<ShardStripeSize>,
safekeepers: Option<Vec<NodeId>>,
) -> Result<()> {
let (mut spec, compute_ctl_config) = {
let config_path = self.endpoint_path().join("config.json");
let file = std::fs::File::open(config_path)?;
let config: ComputeConfig = serde_json::from_reader(file)?;
(config.spec.unwrap(), config.compute_ctl_config)
let mut spec: ComputeSpec = {
let spec_path = self.endpoint_path().join("spec.json");
let file = std::fs::File::open(spec_path)?;
serde_json::from_reader(file)?
};
let postgresql_conf = self.read_postgresql_conf()?;
@@ -960,7 +922,7 @@ impl Endpoint {
.body(
serde_json::to_string(&ConfigurationRequest {
spec,
compute_ctl_config,
compute_ctl_config: ComputeCtlConfig::default(),
})
.unwrap(),
)

View File

@@ -10,7 +10,6 @@ mod background_process;
pub mod broker;
pub mod endpoint;
pub mod local_env;
pub mod object_storage;
pub mod pageserver;
pub mod postgresql_conf;
pub mod safekeeper;

View File

@@ -15,10 +15,9 @@ use clap::ValueEnum;
use postgres_backend::AuthType;
use reqwest::Url;
use serde::{Deserialize, Serialize};
use utils::auth::encode_from_key_file;
use utils::auth::{Claims, encode_from_key_file};
use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId};
use crate::object_storage::{OBJECT_STORAGE_REMOTE_STORAGE_DIR, ObjectStorage};
use crate::pageserver::{PAGESERVER_REMOTE_STORAGE_DIR, PageServerNode};
use crate::safekeeper::SafekeeperNode;
@@ -56,7 +55,6 @@ pub struct LocalEnv {
// used to issue tokens during e.g pg start
pub private_key_path: PathBuf,
pub public_key_path: PathBuf,
pub broker: NeonBroker,
@@ -70,8 +68,6 @@ pub struct LocalEnv {
pub safekeepers: Vec<SafekeeperConf>,
pub object_storage: ObjectStorageConf,
// Control plane upcall API for pageserver: if None, we will not run storage_controller If set, this will
// be propagated into each pageserver's configuration.
pub control_plane_api: Url,
@@ -99,7 +95,6 @@ pub struct OnDiskConfig {
pub neon_distrib_dir: PathBuf,
pub default_tenant_id: Option<TenantId>,
pub private_key_path: PathBuf,
pub public_key_path: PathBuf,
pub broker: NeonBroker,
pub storage_controller: NeonStorageControllerConf,
#[serde(
@@ -108,7 +103,6 @@ pub struct OnDiskConfig {
)]
pub pageservers: Vec<PageServerConf>,
pub safekeepers: Vec<SafekeeperConf>,
pub object_storage: ObjectStorageConf,
pub control_plane_api: Option<Url>,
pub control_plane_hooks_api: Option<Url>,
pub control_plane_compute_hook_api: Option<Url>,
@@ -142,18 +136,11 @@ pub struct NeonLocalInitConf {
pub storage_controller: Option<NeonStorageControllerConf>,
pub pageservers: Vec<NeonLocalInitPageserverConf>,
pub safekeepers: Vec<SafekeeperConf>,
pub object_storage: ObjectStorageConf,
pub control_plane_api: Option<Url>,
pub control_plane_hooks_api: Option<Url>,
pub generate_local_ssl_certs: bool,
}
#[derive(Serialize, Default, Deserialize, PartialEq, Eq, Clone, Debug)]
#[serde(default)]
pub struct ObjectStorageConf {
pub port: u16,
}
/// Broker config for cluster internal communication.
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
#[serde(default)]
@@ -411,10 +398,6 @@ impl LocalEnv {
self.pg_dir(pg_version, "lib")
}
pub fn object_storage_bin(&self) -> PathBuf {
self.neon_distrib_dir.join("object_storage")
}
pub fn pageserver_bin(&self) -> PathBuf {
self.neon_distrib_dir.join("pageserver")
}
@@ -448,10 +431,6 @@ impl LocalEnv {
self.base_data_dir.join("safekeepers").join(data_dir_name)
}
pub fn object_storage_data_dir(&self) -> PathBuf {
self.base_data_dir.join("object_storage")
}
pub fn get_pageserver_conf(&self, id: NodeId) -> anyhow::Result<&PageServerConf> {
if let Some(conf) = self.pageservers.iter().find(|node| node.id == id) {
Ok(conf)
@@ -603,7 +582,6 @@ impl LocalEnv {
neon_distrib_dir,
default_tenant_id,
private_key_path,
public_key_path,
broker,
storage_controller,
pageservers,
@@ -613,7 +591,6 @@ impl LocalEnv {
control_plane_compute_hook_api: _,
branch_name_mappings,
generate_local_ssl_certs,
object_storage,
} = on_disk_config;
LocalEnv {
base_data_dir: repopath.to_owned(),
@@ -621,7 +598,6 @@ impl LocalEnv {
neon_distrib_dir,
default_tenant_id,
private_key_path,
public_key_path,
broker,
storage_controller,
pageservers,
@@ -630,7 +606,6 @@ impl LocalEnv {
control_plane_hooks_api,
branch_name_mappings,
generate_local_ssl_certs,
object_storage,
}
};
@@ -730,7 +705,6 @@ impl LocalEnv {
neon_distrib_dir: self.neon_distrib_dir.clone(),
default_tenant_id: self.default_tenant_id,
private_key_path: self.private_key_path.clone(),
public_key_path: self.public_key_path.clone(),
broker: self.broker.clone(),
storage_controller: self.storage_controller.clone(),
pageservers: vec![], // it's skip_serializing anyway
@@ -740,7 +714,6 @@ impl LocalEnv {
control_plane_compute_hook_api: None,
branch_name_mappings: self.branch_name_mappings.clone(),
generate_local_ssl_certs: self.generate_local_ssl_certs,
object_storage: self.object_storage.clone(),
},
)
}
@@ -757,7 +730,7 @@ impl LocalEnv {
}
// this function is used only for testing purposes in CLI e g generate tokens during init
pub fn generate_auth_token<S: Serialize>(&self, claims: &S) -> anyhow::Result<String> {
pub fn generate_auth_token(&self, claims: &Claims) -> anyhow::Result<String> {
let private_key_path = self.get_private_key_path();
let key_data = fs::read(private_key_path)?;
encode_from_key_file(claims, &key_data)
@@ -824,7 +797,6 @@ impl LocalEnv {
control_plane_api,
generate_local_ssl_certs,
control_plane_hooks_api,
object_storage,
} = conf;
// Find postgres binaries.
@@ -856,7 +828,6 @@ impl LocalEnv {
)
.context("generate auth keys")?;
let private_key_path = PathBuf::from("auth_private_key.pem");
let public_key_path = PathBuf::from("auth_public_key.pem");
// create the runtime type because the remaining initialization code below needs
// a LocalEnv instance op operation
@@ -867,7 +838,6 @@ impl LocalEnv {
neon_distrib_dir,
default_tenant_id: Some(default_tenant_id),
private_key_path,
public_key_path,
broker,
storage_controller: storage_controller.unwrap_or_default(),
pageservers: pageservers.iter().map(Into::into).collect(),
@@ -876,7 +846,6 @@ impl LocalEnv {
control_plane_hooks_api,
branch_name_mappings: Default::default(),
generate_local_ssl_certs,
object_storage,
};
if generate_local_ssl_certs {
@@ -904,13 +873,8 @@ impl LocalEnv {
.context("pageserver init failed")?;
}
ObjectStorage::from_env(&env)
.init()
.context("object storage init failed")?;
// setup remote remote location for default LocalFs remote storage
std::fs::create_dir_all(env.base_data_dir.join(PAGESERVER_REMOTE_STORAGE_DIR))?;
std::fs::create_dir_all(env.base_data_dir.join(OBJECT_STORAGE_REMOTE_STORAGE_DIR))?;
env.persist_config()
}
@@ -980,7 +944,7 @@ fn generate_ssl_ca_cert(cert_path: &Path, key_path: &Path) -> anyhow::Result<()>
// -out rootCA.crt -keyout rootCA.key
let keygen_output = Command::new("openssl")
.args([
"req", "-x509", "-newkey", "ed25519", "-nodes", "-days", "36500",
"req", "-x509", "-newkey", "rsa:2048", "-nodes", "-days", "36500",
])
.args(["-subj", "/CN=Neon Local CA"])
.args(["-out", cert_path.to_str().unwrap()])
@@ -1010,7 +974,7 @@ fn generate_ssl_cert(
// -subj "/CN=localhost" -addext "subjectAltName=DNS:localhost,IP:127.0.0.1"
let keygen_output = Command::new("openssl")
.args(["req", "-new", "-nodes"])
.args(["-newkey", "ed25519"])
.args(["-newkey", "rsa:2048"])
.args(["-subj", "/CN=localhost"])
.args(["-addext", "subjectAltName=DNS:localhost,IP:127.0.0.1"])
.args(["-keyout", key_path.to_str().unwrap()])

View File

@@ -1,107 +0,0 @@
use crate::background_process::{self, start_process, stop_process};
use crate::local_env::LocalEnv;
use anyhow::anyhow;
use anyhow::{Context, Result};
use camino::Utf8PathBuf;
use std::io::Write;
use std::time::Duration;
/// Directory within .neon which will be used by default for LocalFs remote storage.
pub const OBJECT_STORAGE_REMOTE_STORAGE_DIR: &str = "local_fs_remote_storage/object_storage";
pub const OBJECT_STORAGE_DEFAULT_PORT: u16 = 9993;
pub struct ObjectStorage {
pub bin: Utf8PathBuf,
pub data_dir: Utf8PathBuf,
pub pemfile: Utf8PathBuf,
pub port: u16,
}
impl ObjectStorage {
pub fn from_env(env: &LocalEnv) -> ObjectStorage {
ObjectStorage {
bin: Utf8PathBuf::from_path_buf(env.object_storage_bin()).unwrap(),
data_dir: Utf8PathBuf::from_path_buf(env.object_storage_data_dir()).unwrap(),
pemfile: Utf8PathBuf::from_path_buf(env.public_key_path.clone()).unwrap(),
port: env.object_storage.port,
}
}
fn config_path(&self) -> Utf8PathBuf {
self.data_dir.join("object_storage.json")
}
fn listen_addr(&self) -> Utf8PathBuf {
format!("127.0.0.1:{}", self.port).into()
}
pub fn init(&self) -> Result<()> {
println!("Initializing object storage in {:?}", self.data_dir);
let parent = self.data_dir.parent().unwrap();
#[derive(serde::Serialize)]
struct Cfg {
listen: Utf8PathBuf,
pemfile: Utf8PathBuf,
local_path: Utf8PathBuf,
r#type: String,
}
let cfg = Cfg {
listen: self.listen_addr(),
pemfile: parent.join(self.pemfile.clone()),
local_path: parent.join(OBJECT_STORAGE_REMOTE_STORAGE_DIR),
r#type: "LocalFs".to_string(),
};
std::fs::create_dir_all(self.config_path().parent().unwrap())?;
std::fs::write(self.config_path(), serde_json::to_string(&cfg)?)
.context("write object storage config")?;
Ok(())
}
pub async fn start(&self, retry_timeout: &Duration) -> Result<()> {
println!("Starting s3 proxy at {}", self.listen_addr());
std::io::stdout().flush().context("flush stdout")?;
let process_status_check = || async {
tokio::time::sleep(Duration::from_millis(500)).await;
let res = reqwest::Client::new()
.get(format!("http://{}/metrics", self.listen_addr()))
.send()
.await;
match res {
Ok(response) if response.status().is_success() => Ok(true),
Ok(_) => Err(anyhow!("Failed to query /metrics")),
Err(e) => Err(anyhow!("Failed to check node status: {e}")),
}
};
let res = start_process(
"object_storage",
&self.data_dir.clone().into_std_path_buf(),
&self.bin.clone().into_std_path_buf(),
vec![self.config_path().to_string()],
vec![("RUST_LOG".into(), "debug".into())],
background_process::InitialPidFile::Create(self.pid_file()),
retry_timeout,
process_status_check,
)
.await;
if res.is_err() {
eprintln!("Logs:\n{}", std::fs::read_to_string(self.log_file())?);
}
res
}
pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
stop_process(immediate, "object_storage", &self.pid_file())
}
fn log_file(&self) -> Utf8PathBuf {
self.data_dir.join("object_storage.log")
}
fn pid_file(&self) -> Utf8PathBuf {
self.data_dir.join("object_storage.pid")
}
}

View File

@@ -535,11 +535,6 @@ impl PageServerNode {
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'gc_compaction_enabled' as bool")?,
gc_compaction_verification: settings
.remove("gc_compaction_verification")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'gc_compaction_verification' as bool")?,
gc_compaction_initial_threshold_kb: settings
.remove("gc_compaction_initial_threshold_kb")
.map(|x| x.parse::<u64>())

View File

@@ -13,9 +13,7 @@ use pageserver_api::controller_api::{
NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest,
TenantCreateResponse, TenantLocateResponse,
};
use pageserver_api::models::{
TenantConfig, TenantConfigRequest, TimelineCreateRequest, TimelineInfo,
};
use pageserver_api::models::{TenantConfigRequest, TimelineCreateRequest, TimelineInfo};
use pageserver_api::shard::TenantShardId;
use pageserver_client::mgmt_api::ResponseErrorMessageExt;
use postgres_backend::AuthType;
@@ -84,8 +82,7 @@ impl NeonStorageControllerStopArgs {
pub struct AttachHookRequest {
pub tenant_shard_id: TenantShardId,
pub node_id: Option<NodeId>,
pub generation_override: Option<i32>, // only new tenants
pub config: Option<TenantConfig>, // only new tenants
pub generation_override: Option<i32>,
}
#[derive(Serialize, Deserialize)]
@@ -808,7 +805,6 @@ impl StorageController {
tenant_shard_id,
node_id: Some(pageserver_id),
generation_override: None,
config: None,
};
let response = self

View File

@@ -941,7 +941,7 @@ async fn main() -> anyhow::Result<()> {
let mut node_to_fill_descs = Vec::new();
for desc in node_descs {
let to_drain = nodes.contains(&desc.id);
let to_drain = nodes.iter().any(|id| *id == desc.id);
if to_drain {
node_to_drain_descs.push(desc);
} else {

View File

@@ -11,8 +11,8 @@ generate_id() {
PG_VERSION=${PG_VERSION:-14}
CONFIG_FILE_ORG=/var/db/postgres/configs/config.json
CONFIG_FILE=/tmp/config.json
SPEC_FILE_ORG=/var/db/postgres/specs/spec.json
SPEC_FILE=/tmp/spec.json
echo "Waiting pageserver become ready."
while ! nc -z pageserver 6400; do
@@ -20,7 +20,7 @@ while ! nc -z pageserver 6400; do
done
echo "Page server is ready."
cp ${CONFIG_FILE_ORG} ${CONFIG_FILE}
cp ${SPEC_FILE_ORG} ${SPEC_FILE}
if [ -n "${TENANT_ID:-}" ] && [ -n "${TIMELINE_ID:-}" ]; then
tenant_id=${TENANT_ID}
@@ -73,27 +73,17 @@ else
ulid_extension=ulid
fi
echo "Adding pgx_ulid"
shared_libraries=$(jq -r '.spec.cluster.settings[] | select(.name=="shared_preload_libraries").value' ${CONFIG_FILE})
sed -i "s/${shared_libraries}/${shared_libraries},${ulid_extension}/" ${CONFIG_FILE}
shared_libraries=$(jq -r '.cluster.settings[] | select(.name=="shared_preload_libraries").value' ${SPEC_FILE})
sed -i "s/${shared_libraries}/${shared_libraries},${ulid_extension}/" ${SPEC_FILE}
echo "Overwrite tenant id and timeline id in spec file"
sed -i "s/TENANT_ID/${tenant_id}/" ${CONFIG_FILE}
sed -i "s/TIMELINE_ID/${timeline_id}/" ${CONFIG_FILE}
sed -i "s/TENANT_ID/${tenant_id}/" ${SPEC_FILE}
sed -i "s/TIMELINE_ID/${timeline_id}/" ${SPEC_FILE}
cat ${CONFIG_FILE}
# TODO(tristan957): Remove these workarounds for backwards compatibility after
# the next compute release. That includes these next few lines and the
# --spec-path in the compute_ctl invocation.
if compute_ctl --help | grep --quiet -- '--config'; then
SPEC_PATH="$CONFIG_FILE"
else
jq '.spec' < "$CONFIG_FILE" > /tmp/spec.json
SPEC_PATH=/tmp/spec.json
fi
cat ${SPEC_FILE}
echo "Start compute node"
/usr/local/bin/compute_ctl --pgdata /var/db/postgres/compute \
-C "postgresql://cloud_admin@localhost:55433/postgres" \
-b /usr/local/bin/postgres \
--compute-id "compute-$RANDOM" \
--spec-path "$SPEC_PATH"
-S ${SPEC_FILE}

View File

@@ -1,148 +0,0 @@
{
"spec": {
"format_version": 1.0,
"timestamp": "2022-10-12T18:00:00.000Z",
"operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8c",
"cluster": {
"cluster_id": "docker_compose",
"name": "docker_compose_test",
"state": "restarted",
"roles": [
{
"name": "cloud_admin",
"encrypted_password": "b093c0d3b281ba6da1eacc608620abd8",
"options": null
}
],
"databases": [
],
"settings": [
{
"name": "fsync",
"value": "off",
"vartype": "bool"
},
{
"name": "wal_level",
"value": "logical",
"vartype": "enum"
},
{
"name": "wal_log_hints",
"value": "on",
"vartype": "bool"
},
{
"name": "log_connections",
"value": "on",
"vartype": "bool"
},
{
"name": "port",
"value": "55433",
"vartype": "integer"
},
{
"name": "shared_buffers",
"value": "1MB",
"vartype": "string"
},
{
"name": "max_connections",
"value": "100",
"vartype": "integer"
},
{
"name": "listen_addresses",
"value": "0.0.0.0",
"vartype": "string"
},
{
"name": "max_wal_senders",
"value": "10",
"vartype": "integer"
},
{
"name": "max_replication_slots",
"value": "10",
"vartype": "integer"
},
{
"name": "wal_sender_timeout",
"value": "5s",
"vartype": "string"
},
{
"name": "wal_keep_size",
"value": "0",
"vartype": "integer"
},
{
"name": "password_encryption",
"value": "md5",
"vartype": "enum"
},
{
"name": "restart_after_crash",
"value": "off",
"vartype": "bool"
},
{
"name": "synchronous_standby_names",
"value": "walproposer",
"vartype": "string"
},
{
"name": "shared_preload_libraries",
"value": "neon,pg_cron,timescaledb,pg_stat_statements",
"vartype": "string"
},
{
"name": "neon.safekeepers",
"value": "safekeeper1:5454,safekeeper2:5454,safekeeper3:5454",
"vartype": "string"
},
{
"name": "neon.timeline_id",
"value": "TIMELINE_ID",
"vartype": "string"
},
{
"name": "neon.tenant_id",
"value": "TENANT_ID",
"vartype": "string"
},
{
"name": "neon.pageserver_connstring",
"value": "host=pageserver port=6400",
"vartype": "string"
},
{
"name": "max_replication_write_lag",
"value": "500MB",
"vartype": "string"
},
{
"name": "max_replication_flush_lag",
"value": "10GB",
"vartype": "string"
},
{
"name": "cron.database",
"value": "postgres",
"vartype": "string"
}
]
},
"delta_operations": [
]
},
"compute_ctl_config": {
"jwks": {
"keys": []
}
}
}

View File

@@ -0,0 +1,141 @@
{
"format_version": 1.0,
"timestamp": "2022-10-12T18:00:00.000Z",
"operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8c",
"cluster": {
"cluster_id": "docker_compose",
"name": "docker_compose_test",
"state": "restarted",
"roles": [
{
"name": "cloud_admin",
"encrypted_password": "b093c0d3b281ba6da1eacc608620abd8",
"options": null
}
],
"databases": [
],
"settings": [
{
"name": "fsync",
"value": "off",
"vartype": "bool"
},
{
"name": "wal_level",
"value": "logical",
"vartype": "enum"
},
{
"name": "wal_log_hints",
"value": "on",
"vartype": "bool"
},
{
"name": "log_connections",
"value": "on",
"vartype": "bool"
},
{
"name": "port",
"value": "55433",
"vartype": "integer"
},
{
"name": "shared_buffers",
"value": "1MB",
"vartype": "string"
},
{
"name": "max_connections",
"value": "100",
"vartype": "integer"
},
{
"name": "listen_addresses",
"value": "0.0.0.0",
"vartype": "string"
},
{
"name": "max_wal_senders",
"value": "10",
"vartype": "integer"
},
{
"name": "max_replication_slots",
"value": "10",
"vartype": "integer"
},
{
"name": "wal_sender_timeout",
"value": "5s",
"vartype": "string"
},
{
"name": "wal_keep_size",
"value": "0",
"vartype": "integer"
},
{
"name": "password_encryption",
"value": "md5",
"vartype": "enum"
},
{
"name": "restart_after_crash",
"value": "off",
"vartype": "bool"
},
{
"name": "synchronous_standby_names",
"value": "walproposer",
"vartype": "string"
},
{
"name": "shared_preload_libraries",
"value": "neon,pg_cron,timescaledb,pg_stat_statements",
"vartype": "string"
},
{
"name": "neon.safekeepers",
"value": "safekeeper1:5454,safekeeper2:5454,safekeeper3:5454",
"vartype": "string"
},
{
"name": "neon.timeline_id",
"value": "TIMELINE_ID",
"vartype": "string"
},
{
"name": "neon.tenant_id",
"value": "TENANT_ID",
"vartype": "string"
},
{
"name": "neon.pageserver_connstring",
"value": "host=pageserver port=6400",
"vartype": "string"
},
{
"name": "max_replication_write_lag",
"value": "500MB",
"vartype": "string"
},
{
"name": "max_replication_flush_lag",
"value": "10GB",
"vartype": "string"
},
{
"name": "cron.database",
"value": "postgres",
"vartype": "string"
}
]
},
"delta_operations": [
]
}

View File

@@ -159,7 +159,7 @@ services:
#- RUST_BACKTRACE=1
# Mount the test files directly, for faster editing cycle.
volumes:
- ./compute_wrapper/var/db/postgres/configs/:/var/db/postgres/configs/
- ./compute_wrapper/var/db/postgres/specs/:/var/db/postgres/specs/
- ./compute_wrapper/shell/:/shell/
ports:
- 55433:55433 # pg protocol handler

View File

@@ -1,8 +0,0 @@
EXTENSION = pg_jsonschema
DATA = pg_jsonschema--1.0.sql
REGRESS = jsonschema_valid_api jsonschema_edge_cases
REGRESS_OPTS = --load-extension=pg_jsonschema
PG_CONFIG ?= pg_config
PGXS := $(shell $(PG_CONFIG) --pgxs)
include $(PGXS)

View File

@@ -1,87 +0,0 @@
-- Schema with enums, nulls, extra properties disallowed
SELECT jsonschema_is_valid('{
"type": "object",
"properties": {
"status": { "type": "string", "enum": ["active", "inactive", "pending"] },
"email": { "type": ["string", "null"], "format": "email" }
},
"required": ["status"],
"additionalProperties": false
}'::json);
jsonschema_is_valid
---------------------
t
(1 row)
-- Valid enum and null email
SELECT jsonschema_validation_errors(
'{
"type": "object",
"properties": {
"status": { "type": "string", "enum": ["active", "inactive", "pending"] },
"email": { "type": ["string", "null"], "format": "email" }
},
"required": ["status"],
"additionalProperties": false
}'::json,
'{"status": "active", "email": null}'::json
);
jsonschema_validation_errors
------------------------------
{}
(1 row)
-- Invalid enum value
SELECT jsonschema_validation_errors(
'{
"type": "object",
"properties": {
"status": { "type": "string", "enum": ["active", "inactive", "pending"] },
"email": { "type": ["string", "null"], "format": "email" }
},
"required": ["status"],
"additionalProperties": false
}'::json,
'{"status": "disabled", "email": null}'::json
);
jsonschema_validation_errors
----------------------------------------------------------------------
{"\"disabled\" is not one of [\"active\",\"inactive\",\"pending\"]"}
(1 row)
-- Invalid email format (assuming format is validated)
SELECT jsonschema_validation_errors(
'{
"type": "object",
"properties": {
"status": { "type": "string", "enum": ["active", "inactive", "pending"] },
"email": { "type": ["string", "null"], "format": "email" }
},
"required": ["status"],
"additionalProperties": false
}'::json,
'{"status": "active", "email": "not-an-email"}'::json
);
jsonschema_validation_errors
-----------------------------------------
{"\"not-an-email\" is not a \"email\""}
(1 row)
-- Extra property not allowed
SELECT jsonschema_validation_errors(
'{
"type": "object",
"properties": {
"status": { "type": "string", "enum": ["active", "inactive", "pending"] },
"email": { "type": ["string", "null"], "format": "email" }
},
"required": ["status"],
"additionalProperties": false
}'::json,
'{"status": "active", "extra": "should not be here"}'::json
);
jsonschema_validation_errors
--------------------------------------------------------------------
{"Additional properties are not allowed ('extra' was unexpected)"}
(1 row)

View File

@@ -1,65 +0,0 @@
-- Define schema
SELECT jsonschema_is_valid('{
"type": "object",
"properties": {
"username": { "type": "string" },
"age": { "type": "integer" }
},
"required": ["username"]
}'::json);
jsonschema_is_valid
---------------------
t
(1 row)
-- Valid instance
SELECT jsonschema_validation_errors(
'{
"type": "object",
"properties": {
"username": { "type": "string" },
"age": { "type": "integer" }
},
"required": ["username"]
}'::json,
'{"username": "alice", "age": 25}'::json
);
jsonschema_validation_errors
------------------------------
{}
(1 row)
-- Invalid instance: missing required "username"
SELECT jsonschema_validation_errors(
'{
"type": "object",
"properties": {
"username": { "type": "string" },
"age": { "type": "integer" }
},
"required": ["username"]
}'::json,
'{"age": 25}'::json
);
jsonschema_validation_errors
-----------------------------------------
{"\"username\" is a required property"}
(1 row)
-- Invalid instance: wrong type for "age"
SELECT jsonschema_validation_errors(
'{
"type": "object",
"properties": {
"username": { "type": "string" },
"age": { "type": "integer" }
},
"required": ["username"]
}'::json,
'{"username": "bob", "age": "twenty"}'::json
);
jsonschema_validation_errors
-------------------------------------------
{"\"twenty\" is not of type \"integer\""}
(1 row)

View File

@@ -1,66 +0,0 @@
-- Schema with enums, nulls, extra properties disallowed
SELECT jsonschema_is_valid('{
"type": "object",
"properties": {
"status": { "type": "string", "enum": ["active", "inactive", "pending"] },
"email": { "type": ["string", "null"], "format": "email" }
},
"required": ["status"],
"additionalProperties": false
}'::json);
-- Valid enum and null email
SELECT jsonschema_validation_errors(
'{
"type": "object",
"properties": {
"status": { "type": "string", "enum": ["active", "inactive", "pending"] },
"email": { "type": ["string", "null"], "format": "email" }
},
"required": ["status"],
"additionalProperties": false
}'::json,
'{"status": "active", "email": null}'::json
);
-- Invalid enum value
SELECT jsonschema_validation_errors(
'{
"type": "object",
"properties": {
"status": { "type": "string", "enum": ["active", "inactive", "pending"] },
"email": { "type": ["string", "null"], "format": "email" }
},
"required": ["status"],
"additionalProperties": false
}'::json,
'{"status": "disabled", "email": null}'::json
);
-- Invalid email format (assuming format is validated)
SELECT jsonschema_validation_errors(
'{
"type": "object",
"properties": {
"status": { "type": "string", "enum": ["active", "inactive", "pending"] },
"email": { "type": ["string", "null"], "format": "email" }
},
"required": ["status"],
"additionalProperties": false
}'::json,
'{"status": "active", "email": "not-an-email"}'::json
);
-- Extra property not allowed
SELECT jsonschema_validation_errors(
'{
"type": "object",
"properties": {
"status": { "type": "string", "enum": ["active", "inactive", "pending"] },
"email": { "type": ["string", "null"], "format": "email" }
},
"required": ["status"],
"additionalProperties": false
}'::json,
'{"status": "active", "extra": "should not be here"}'::json
);

View File

@@ -1,48 +0,0 @@
-- Define schema
SELECT jsonschema_is_valid('{
"type": "object",
"properties": {
"username": { "type": "string" },
"age": { "type": "integer" }
},
"required": ["username"]
}'::json);
-- Valid instance
SELECT jsonschema_validation_errors(
'{
"type": "object",
"properties": {
"username": { "type": "string" },
"age": { "type": "integer" }
},
"required": ["username"]
}'::json,
'{"username": "alice", "age": 25}'::json
);
-- Invalid instance: missing required "username"
SELECT jsonschema_validation_errors(
'{
"type": "object",
"properties": {
"username": { "type": "string" },
"age": { "type": "integer" }
},
"required": ["username"]
}'::json,
'{"age": 25}'::json
);
-- Invalid instance: wrong type for "age"
SELECT jsonschema_validation_errors(
'{
"type": "object",
"properties": {
"username": { "type": "string" },
"age": { "type": "integer" }
},
"required": ["username"]
}'::json,
'{"username": "bob", "age": "twenty"}'::json
);

View File

@@ -1,9 +0,0 @@
EXTENSION = pg_session_jwt
REGRESS = basic_functions
REGRESS_OPTS = --load-extension=$(EXTENSION)
export PGOPTIONS = -c pg_session_jwt.jwk={"crv":"Ed25519","kty":"OKP","x":"R_Abz-63zJ00l-IraL5fQhwkhGVZCSooQFV5ntC3C7M"}
PG_CONFIG ?= pg_config
PGXS := $(shell $(PG_CONFIG) --pgxs)
include $(PGXS)

View File

@@ -1,35 +0,0 @@
-- Basic functionality tests for pg_session_jwt
-- Test auth.init() function
SELECT auth.init();
init
------
(1 row)
-- Test an invalid JWT
SELECT auth.jwt_session_init('INVALID-JWT');
ERROR: invalid JWT encoding
-- Test creating a session with an expired JWT
SELECT auth.jwt_session_init('eyJhbGciOiJFZERTQSJ9.eyJleHAiOjE3NDI1NjQ0MzIsImlhdCI6MTc0MjU2NDI1MiwianRpIjo0MjQyNDIsInN1YiI6InVzZXIxMjMifQ.A6FwKuaSduHB9O7Gz37g0uoD_U9qVS0JNtT7YABGVgB7HUD1AMFc9DeyhNntWBqncg8k5brv-hrNTuUh5JYMAw');
ERROR: Token used after it has expired
-- Test creating a session with a valid JWT
SELECT auth.jwt_session_init('eyJhbGciOiJFZERTQSJ9.eyJleHAiOjQ4OTYxNjQyNTIsImlhdCI6MTc0MjU2NDI1MiwianRpIjo0MzQzNDMsInN1YiI6InVzZXIxMjMifQ.2TXVgjb6JSUq6_adlvp-m_SdOxZSyGS30RS9TLB0xu2N83dMSs2NybwE1NMU8Fb0tcAZR_ET7M2rSxbTrphfCg');
jwt_session_init
------------------
(1 row)
-- Test auth.session() function
SELECT auth.session();
session
-------------------------------------------------------------------------
{"exp": 4896164252, "iat": 1742564252, "jti": 434343, "sub": "user123"}
(1 row)
-- Test auth.user_id() function
SELECT auth.user_id() AS user_id;
user_id
---------
user123
(1 row)

View File

@@ -1,19 +0,0 @@
-- Basic functionality tests for pg_session_jwt
-- Test auth.init() function
SELECT auth.init();
-- Test an invalid JWT
SELECT auth.jwt_session_init('INVALID-JWT');
-- Test creating a session with an expired JWT
SELECT auth.jwt_session_init('eyJhbGciOiJFZERTQSJ9.eyJleHAiOjE3NDI1NjQ0MzIsImlhdCI6MTc0MjU2NDI1MiwianRpIjo0MjQyNDIsInN1YiI6InVzZXIxMjMifQ.A6FwKuaSduHB9O7Gz37g0uoD_U9qVS0JNtT7YABGVgB7HUD1AMFc9DeyhNntWBqncg8k5brv-hrNTuUh5JYMAw');
-- Test creating a session with a valid JWT
SELECT auth.jwt_session_init('eyJhbGciOiJFZERTQSJ9.eyJleHAiOjQ4OTYxNjQyNTIsImlhdCI6MTc0MjU2NDI1MiwianRpIjo0MzQzNDMsInN1YiI6InVzZXIxMjMifQ.2TXVgjb6JSUq6_adlvp-m_SdOxZSyGS30RS9TLB0xu2N83dMSs2NybwE1NMU8Fb0tcAZR_ET7M2rSxbTrphfCg');
-- Test auth.session() function
SELECT auth.session();
-- Test auth.user_id() function
SELECT auth.user_id() AS user_id;

View File

@@ -151,7 +151,7 @@ Example body:
```
{
"tenant_id": "1f359dd625e519a1a4e8d7509690f6fc",
"stripe_size": 2048,
"stripe_size": 32768,
"shards": [
{"node_id": 344, "shard_number": 0},
{"node_id": 722, "shard_number": 1},

View File

@@ -5,14 +5,6 @@ use crate::privilege::Privilege;
use crate::responses::ComputeCtlConfig;
use crate::spec::{ComputeSpec, ExtVersion, PgIdent};
/// When making requests to the `compute_ctl` external HTTP server, the client
/// must specify a set of claims in `Authorization` header JWTs such that
/// `compute_ctl` can authorize the request.
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct ComputeClaims {
pub compute_id: String,
}
/// Request of the /configure API
///
/// We now pass only `spec` in the configuration request, but later we can
@@ -38,3 +30,9 @@ pub struct SetRoleGrantsRequest {
pub privileges: Vec<Privilege>,
pub role: PgIdent,
}
/// Request of the /configure_telemetry API
#[derive(Debug, Deserialize, Serialize)]
pub struct ConfigureTelemetryRequest {
pub logs_export_host: Option<String>,
}

View File

@@ -14,32 +14,6 @@ pub struct GenericAPIError {
pub error: String,
}
/// All configuration parameters necessary for a compute. When
/// [`ComputeConfig::spec`] is provided, it means that the compute is attached
/// to a tenant. [`ComputeConfig::compute_ctl_config`] will always be provided
/// and contains parameters necessary for operating `compute_ctl` independently
/// of whether a tenant is attached to the compute or not.
///
/// This also happens to be the body of `compute_ctl`'s /configure request.
#[derive(Debug, Deserialize, Serialize)]
pub struct ComputeConfig {
/// The compute spec
pub spec: Option<ComputeSpec>,
/// The compute_ctl configuration
#[allow(dead_code)]
pub compute_ctl_config: ComputeCtlConfig,
}
impl From<ControlPlaneConfigResponse> for ComputeConfig {
fn from(value: ControlPlaneConfigResponse) -> Self {
Self {
spec: value.spec,
compute_ctl_config: value.compute_ctl_config,
}
}
}
#[derive(Debug, Clone, Serialize)]
pub struct ExtensionInstallResponse {
pub extension: PgIdent,
@@ -187,7 +161,7 @@ pub struct TlsConfig {
/// Response of the `/computes/{compute_id}/spec` control-plane API.
#[derive(Deserialize, Debug)]
pub struct ControlPlaneConfigResponse {
pub struct ControlPlaneSpecResponse {
pub spec: Option<ComputeSpec>,
pub status: ControlPlaneComputeStatus,
pub compute_ctl_config: ComputeCtlConfig,

View File

@@ -1,8 +1,8 @@
//! The ComputeSpec contains all the information needed to start up
//! the right version of PostgreSQL, and connect it to the storage nodes.
//! It can be passed as part of the `config.json`, or the control plane can
//! provide it by calling the compute_ctl's `/compute_ctl` endpoint, or
//! compute_ctl can fetch it by calling the control plane's API.
//! `ComputeSpec` represents the contents of the spec.json file.
//!
//! The spec.json file is used to pass information to 'compute_ctl'. It contains
//! all the information needed to start up the right version of PostgreSQL,
//! and connect it to the storage nodes.
use std::collections::HashMap;
use indexmap::IndexMap;
@@ -104,12 +104,6 @@ pub struct ComputeSpec {
pub timeline_id: Option<TimelineId>,
pub pageserver_connstring: Option<String>,
// More neon ids that we expose to the compute_ctl
// and to postgres as neon extension GUCs.
pub project_id: Option<String>,
pub branch_id: Option<String>,
pub endpoint_id: Option<String>,
/// Safekeeper membership config generation. It is put in
/// neon.safekeepers GUC and serves two purposes:
/// 1) Non zero value forces walproposer to use membership configurations.
@@ -165,13 +159,15 @@ pub struct ComputeSpec {
#[serde(default)] // Default false
pub drop_subscriptions_before_start: bool,
/// Log level for compute audit logging
/// Log level for audit logging:
///
/// Disabled - no audit logging. This is the default.
/// log - log masked statements to the postgres log using pgaudit extension
/// hipaa - log unmasked statements to the file using pgaudit and pgauditlogtofile extension
///
/// Extensions should be present in shared_preload_libraries
#[serde(default)]
pub audit_log_level: ComputeAudit,
/// Hostname and the port of the otel collector. Leave empty to disable Postgres logs forwarding.
/// Example: config-shy-breeze-123-collector-monitoring.neon-telemetry.svc.cluster.local:10514
pub logs_export_host: Option<String>,
}
/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
@@ -183,6 +179,9 @@ pub enum ComputeFeature {
/// track short-lived connections as user activity.
ActivityMonitorExperimental,
/// Allow to configure rsyslog for Postgres logs export
PostgresLogsExport,
/// This is a special feature flag that is used to represent unknown feature flags.
/// Basically all unknown to enum flags are represented as this one. See unit test
/// `parse_unknown_features()` for more details.
@@ -289,25 +288,14 @@ impl ComputeMode {
}
/// Log level for audit logging
/// Disabled, log, hipaa
/// Default is Disabled
#[derive(Clone, Debug, Default, Eq, PartialEq, Deserialize, Serialize)]
pub enum ComputeAudit {
#[default]
Disabled,
// Deprecated, use Base instead
Log,
// (pgaudit.log = 'ddl', pgaudit.log_parameter='off')
// logged to the standard postgresql log stream
Base,
// Deprecated, use Full or Extended instead
Hipaa,
// (pgaudit.log = 'all, -misc', pgaudit.log_parameter='off')
// logged to separate files collected by rsyslog
// into dedicated log storage with strict access
Extended,
// (pgaudit.log='all', pgaudit.log_parameter='on'),
// logged to separate files collected by rsyslog
// into dedicated log storage with strict access.
Full,
}
#[derive(Clone, Debug, Default, Deserialize, Serialize, PartialEq, Eq)]

View File

@@ -30,7 +30,6 @@ tokio.workspace = true
tracing.workspace = true
url.workspace = true
uuid.workspace = true
x509-cert.workspace = true
# to use tokio channels as streams, this is faster to compile than async_stream
# why is it only here? no other crate should use it, streams are rarely needed.

View File

@@ -4,8 +4,6 @@ use futures::StreamExt;
use futures::stream::FuturesUnordered;
use hyper0::Body;
use hyper0::server::conn::Http;
use metrics::{IntCounterVec, register_int_counter_vec};
use once_cell::sync::Lazy;
use routerify::{RequestService, RequestServiceBuilder};
use tokio::io::{AsyncRead, AsyncWrite};
use tokio_rustls::TlsAcceptor;
@@ -28,24 +26,6 @@ pub struct Server {
tls_acceptor: Option<TlsAcceptor>,
}
static CONNECTION_STARTED_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"http_server_connection_started_total",
"Number of established http/https connections",
&["scheme"]
)
.expect("failed to define a metric")
});
static CONNECTION_ERROR_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"http_server_connection_errors_total",
"Number of occured connection errors by type",
&["type"]
)
.expect("failed to define a metric")
});
impl Server {
pub fn new(
request_service: Arc<RequestServiceBuilder<Body, ApiError>>,
@@ -80,15 +60,6 @@ impl Server {
false
}
let tcp_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["tcp"]);
let tls_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["tls"]);
let http_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["http"]);
let https_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["https"]);
let panic_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["panic"]);
let http_connection_cnt = CONNECTION_STARTED_COUNT.with_label_values(&["http"]);
let https_connection_cnt = CONNECTION_STARTED_COUNT.with_label_values(&["https"]);
let mut connections = FuturesUnordered::new();
loop {
tokio::select! {
@@ -96,7 +67,6 @@ impl Server {
let (tcp_stream, remote_addr) = match stream {
Ok(stream) => stream,
Err(err) => {
tcp_error_cnt.inc();
if !suppress_io_error(&err) {
info!("Failed to accept TCP connection: {err:#}");
}
@@ -108,18 +78,11 @@ impl Server {
let tls_acceptor = self.tls_acceptor.clone();
let cancel = cancel.clone();
let tls_error_cnt = tls_error_cnt.clone();
let http_error_cnt = http_error_cnt.clone();
let https_error_cnt = https_error_cnt.clone();
let http_connection_cnt = http_connection_cnt.clone();
let https_connection_cnt = https_connection_cnt.clone();
connections.push(tokio::spawn(
async move {
match tls_acceptor {
Some(tls_acceptor) => {
// Handle HTTPS connection.
https_connection_cnt.inc();
let tls_stream = tokio::select! {
tls_stream = tls_acceptor.accept(tcp_stream) => tls_stream,
_ = cancel.cancelled() => return,
@@ -127,7 +90,6 @@ impl Server {
let tls_stream = match tls_stream {
Ok(tls_stream) => tls_stream,
Err(err) => {
tls_error_cnt.inc();
if !suppress_io_error(&err) {
info!(%remote_addr, "Failed to accept TLS connection: {err:#}");
}
@@ -135,7 +97,6 @@ impl Server {
}
};
if let Err(err) = Self::serve_connection(tls_stream, service, cancel).await {
https_error_cnt.inc();
if !suppress_hyper_error(&err) {
info!(%remote_addr, "Failed to serve HTTPS connection: {err:#}");
}
@@ -143,9 +104,7 @@ impl Server {
}
None => {
// Handle HTTP connection.
http_connection_cnt.inc();
if let Err(err) = Self::serve_connection(tcp_stream, service, cancel).await {
http_error_cnt.inc();
if !suppress_hyper_error(&err) {
info!(%remote_addr, "Failed to serve HTTP connection: {err:#}");
}
@@ -156,7 +115,6 @@ impl Server {
}
Some(conn) = connections.next() => {
if let Err(err) = conn {
panic_error_cnt.inc();
error!("Connection panicked: {err:#}");
}
}
@@ -164,7 +122,6 @@ impl Server {
// Wait for graceful shutdown of all connections.
while let Some(conn) = connections.next().await {
if let Err(err) = conn {
panic_error_cnt.inc();
error!("Connection panicked: {err:#}");
}
}

View File

@@ -3,14 +3,11 @@ use std::{sync::Arc, time::Duration};
use anyhow::Context;
use arc_swap::ArcSwap;
use camino::Utf8Path;
use metrics::{IntCounterVec, UIntGaugeVec, register_int_counter_vec, register_uint_gauge_vec};
use once_cell::sync::Lazy;
use rustls::{
pki_types::{CertificateDer, PrivateKeyDer, UnixTime},
pki_types::{CertificateDer, PrivateKeyDer},
server::{ClientHello, ResolvesServerCert},
sign::CertifiedKey,
};
use x509_cert::der::Reader;
pub async fn load_cert_chain(filename: &Utf8Path) -> anyhow::Result<Vec<CertificateDer<'static>>> {
let cert_data = tokio::fs::read(filename)
@@ -56,76 +53,6 @@ pub async fn load_certified_key(
Ok(certified_key)
}
/// rustls's CertifiedKey with extra parsed fields used for metrics.
struct ParsedCertifiedKey {
certified_key: CertifiedKey,
expiration_time: UnixTime,
}
/// Parse expiration time from an X509 certificate.
fn parse_expiration_time(cert: &CertificateDer<'_>) -> anyhow::Result<UnixTime> {
let parsed_cert = x509_cert::der::SliceReader::new(cert)
.context("Failed to parse cerficiate")?
.decode::<x509_cert::Certificate>()
.context("Failed to parse cerficiate")?;
Ok(UnixTime::since_unix_epoch(
parsed_cert
.tbs_certificate
.validity
.not_after
.to_unix_duration(),
))
}
async fn load_and_parse_certified_key(
key_filename: &Utf8Path,
cert_filename: &Utf8Path,
) -> anyhow::Result<ParsedCertifiedKey> {
let certified_key = load_certified_key(key_filename, cert_filename).await?;
let expiration_time = parse_expiration_time(certified_key.end_entity_cert()?)?;
Ok(ParsedCertifiedKey {
certified_key,
expiration_time,
})
}
static CERT_EXPIRATION_TIME: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"tls_certs_expiration_time_seconds",
"Expiration time of the loaded certificate since unix epoch in seconds",
&["resolver_name"]
)
.expect("failed to define a metric")
});
static CERT_RELOAD_STARTED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"tls_certs_reload_started_total",
"Number of certificate reload loop iterations started",
&["resolver_name"]
)
.expect("failed to define a metric")
});
static CERT_RELOAD_UPDATED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"tls_certs_reload_updated_total",
"Number of times the certificate was updated to the new one",
&["resolver_name"]
)
.expect("failed to define a metric")
});
static CERT_RELOAD_FAILED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"tls_certs_reload_failed_total",
"Number of times the certificate reload failed",
&["resolver_name"]
)
.expect("failed to define a metric")
});
/// Implementation of [`rustls::server::ResolvesServerCert`] which reloads certificates from
/// the disk periodically.
#[derive(Debug)]
@@ -136,28 +63,16 @@ pub struct ReloadingCertificateResolver {
impl ReloadingCertificateResolver {
/// Creates a new Resolver by loading certificate and private key from FS and
/// creating tokio::task to reload them with provided reload_period.
/// resolver_name is used as metric's label.
pub async fn new(
resolver_name: &str,
key_filename: &Utf8Path,
cert_filename: &Utf8Path,
reload_period: Duration,
) -> anyhow::Result<Arc<Self>> {
// Create metrics for current resolver.
let cert_expiration_time = CERT_EXPIRATION_TIME.with_label_values(&[resolver_name]);
let cert_reload_started_counter =
CERT_RELOAD_STARTED_COUNTER.with_label_values(&[resolver_name]);
let cert_reload_updated_counter =
CERT_RELOAD_UPDATED_COUNTER.with_label_values(&[resolver_name]);
let cert_reload_failed_counter =
CERT_RELOAD_FAILED_COUNTER.with_label_values(&[resolver_name]);
let parsed_key = load_and_parse_certified_key(key_filename, cert_filename).await?;
let this = Arc::new(Self {
certified_key: ArcSwap::from_pointee(parsed_key.certified_key),
certified_key: ArcSwap::from_pointee(
load_certified_key(key_filename, cert_filename).await?,
),
});
cert_expiration_time.set(parsed_key.expiration_time.as_secs());
tokio::spawn({
let weak_this = Arc::downgrade(&this);
@@ -173,22 +88,17 @@ impl ReloadingCertificateResolver {
Some(this) => this,
None => break, // Resolver has been destroyed, exit.
};
cert_reload_started_counter.inc();
match load_and_parse_certified_key(&key_filename, &cert_filename).await {
Ok(parsed_key) => {
if parsed_key.certified_key.cert == this.certified_key.load().cert {
match load_certified_key(&key_filename, &cert_filename).await {
Ok(new_certified_key) => {
if new_certified_key.cert == this.certified_key.load().cert {
tracing::debug!("Certificate has not changed since last reloading");
} else {
tracing::info!("Certificate has been reloaded");
this.certified_key.store(Arc::new(parsed_key.certified_key));
cert_expiration_time.set(parsed_key.expiration_time.as_secs());
cert_reload_updated_counter.inc();
this.certified_key.store(Arc::new(new_certified_key));
}
last_reload_failed = false;
}
Err(err) => {
cert_reload_failed_counter.inc();
// Note: Reloading certs may fail if it conflicts with the script updating
// the files at the same time. Warn only if the error is persistent.
if last_reload_failed {

View File

@@ -180,7 +180,6 @@ pub struct ConfigToml {
#[serde(skip_serializing_if = "Option::is_none")]
pub generate_unarchival_heatmap: Option<bool>,
pub tracing: Option<Tracing>,
pub enable_tls_page_service_api: bool,
}
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -207,10 +206,6 @@ pub struct PageServicePipeliningConfigPipelined {
/// Causes runtime errors if larger than max get_vectored batch size.
pub max_batch_size: NonZeroUsize,
pub execution: PageServiceProtocolPipelinedExecutionStrategy,
// The default below is such that new versions of the software can start
// with the old configuration.
#[serde(default)]
pub batching: PageServiceProtocolPipelinedBatchingStrategy,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -220,19 +215,6 @@ pub enum PageServiceProtocolPipelinedExecutionStrategy {
Tasks,
}
#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(rename_all = "kebab-case")]
pub enum PageServiceProtocolPipelinedBatchingStrategy {
/// All get page requests in a batch will be at the same LSN
#[default]
UniformLsn,
/// Get page requests in a batch may be at different LSN
///
/// One key cannot be present more than once at different LSNs in
/// the same batch.
ScatteredLsn,
}
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(tag = "mode", rename_all = "kebab-case")]
pub enum GetVectoredConcurrentIo {
@@ -469,8 +451,6 @@ pub struct TenantConfigToml {
// gc-compaction related configs
/// Enable automatic gc-compaction trigger on this tenant.
pub gc_compaction_enabled: bool,
/// Enable verification of gc-compaction results.
pub gc_compaction_verification: bool,
/// The initial threshold for gc-compaction in KB. Once the total size of layers below the gc-horizon is above this threshold,
/// gc-compaction will be triggered.
pub gc_compaction_initial_threshold_kb: u64,
@@ -632,12 +612,9 @@ impl Default for ConfigToml {
page_service_pipelining: if !cfg!(test) {
PageServicePipeliningConfig::Serial
} else {
// Do not turn this into the default until scattered reads have been
// validated and rolled-out fully.
PageServicePipeliningConfig::Pipelined(PageServicePipeliningConfigPipelined {
max_batch_size: NonZeroUsize::new(32).unwrap(),
execution: PageServiceProtocolPipelinedExecutionStrategy::ConcurrentFutures,
batching: PageServiceProtocolPipelinedBatchingStrategy::ScatteredLsn,
})
},
get_vectored_concurrent_io: if !cfg!(test) {
@@ -654,7 +631,6 @@ impl Default for ConfigToml {
load_previous_heatmap: None,
generate_unarchival_heatmap: None,
tracing: None,
enable_tls_page_service_api: false,
}
}
}
@@ -714,7 +690,6 @@ pub mod tenant_conf_defaults {
// image layers should be created.
pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2;
pub const DEFAULT_GC_COMPACTION_ENABLED: bool = false;
pub const DEFAULT_GC_COMPACTION_VERIFICATION: bool = true;
pub const DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB: u64 = 5 * 1024 * 1024; // 5GB
pub const DEFAULT_GC_COMPACTION_RATIO_PERCENT: u64 = 100;
}
@@ -769,7 +744,6 @@ impl Default for TenantConfigToml {
wal_receiver_protocol_override: None,
rel_size_v2_enabled: false,
gc_compaction_enabled: DEFAULT_GC_COMPACTION_ENABLED,
gc_compaction_verification: DEFAULT_GC_COMPACTION_VERIFICATION,
gc_compaction_initial_threshold_kb: DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB,
gc_compaction_ratio_percent: DEFAULT_GC_COMPACTION_RATIO_PERCENT,
sampling_ratio: None,

View File

@@ -7,8 +7,7 @@ use std::time::{Duration, Instant};
/// API (`/control/v1` prefix). Implemented by the server
/// in [`storage_controller::http`]
use serde::{Deserialize, Serialize};
use utils::id::{NodeId, TenantId, TimelineId};
use utils::lsn::Lsn;
use utils::id::{NodeId, TenantId};
use crate::models::{PageserverUtilization, ShardParameters, TenantConfig};
use crate::shard::{ShardStripeSize, TenantShardId};
@@ -500,15 +499,6 @@ pub struct SafekeeperSchedulingPolicyRequest {
pub scheduling_policy: SkSchedulingPolicy,
}
/// Import request for safekeeper timelines.
#[derive(Serialize, Deserialize, Clone)]
pub struct TimelineImportRequest {
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
pub start_lsn: Lsn,
pub sk_set: Vec<NodeId>,
}
#[cfg(test)]
mod test {
use serde_json;

View File

@@ -927,7 +927,7 @@ impl Key {
/// Guaranteed to return `Ok()` if [`Self::is_rel_block_key`] returns `true` for `key`.
#[inline(always)]
pub fn to_rel_block(self) -> Result<(RelTag, BlockNumber), ToRelBlockError> {
pub fn to_rel_block(self) -> anyhow::Result<(RelTag, BlockNumber)> {
Ok(match self.field1 {
0x00 => (
RelTag {
@@ -938,7 +938,7 @@ impl Key {
},
self.field6,
),
_ => return Err(ToRelBlockError(self.field1)),
_ => anyhow::bail!("unexpected value kind 0x{:02x}", self.field1),
})
}
}
@@ -951,17 +951,6 @@ impl std::str::FromStr for Key {
}
}
#[derive(Debug)]
pub struct ToRelBlockError(u8);
impl fmt::Display for ToRelBlockError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "unexpected value kind 0x{:02x}", self.0)
}
}
impl std::error::Error for ToRelBlockError {}
#[cfg(test)]
mod tests {
use std::str::FromStr;

View File

@@ -613,7 +613,8 @@ mod tests {
use rand::{RngCore, SeedableRng};
use super::*;
use crate::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardNumber, ShardStripeSize};
use crate::models::ShardParameters;
use crate::shard::{ShardCount, ShardNumber};
// Helper function to create a key range.
//
@@ -963,8 +964,12 @@ mod tests {
}
#[test]
fn sharded_range_relation_gap() {
let shard_identity =
ShardIdentity::new(ShardNumber(0), ShardCount::new(4), DEFAULT_STRIPE_SIZE).unwrap();
let shard_identity = ShardIdentity::new(
ShardNumber(0),
ShardCount::new(4),
ShardParameters::DEFAULT_STRIPE_SIZE,
)
.unwrap();
let range = ShardedRange::new(
Range {
@@ -980,8 +985,12 @@ mod tests {
#[test]
fn shard_identity_keyspaces_single_key() {
let shard_identity =
ShardIdentity::new(ShardNumber(1), ShardCount::new(4), DEFAULT_STRIPE_SIZE).unwrap();
let shard_identity = ShardIdentity::new(
ShardNumber(1),
ShardCount::new(4),
ShardParameters::DEFAULT_STRIPE_SIZE,
)
.unwrap();
let range = ShardedRange::new(
Range {
@@ -1025,8 +1034,12 @@ mod tests {
#[test]
fn shard_identity_keyspaces_forkno_gap() {
let shard_identity =
ShardIdentity::new(ShardNumber(1), ShardCount::new(4), DEFAULT_STRIPE_SIZE).unwrap();
let shard_identity = ShardIdentity::new(
ShardNumber(1),
ShardCount::new(4),
ShardParameters::DEFAULT_STRIPE_SIZE,
)
.unwrap();
let range = ShardedRange::new(
Range {
@@ -1048,7 +1061,7 @@ mod tests {
let shard_identity = ShardIdentity::new(
ShardNumber(shard_number),
ShardCount::new(4),
DEFAULT_STRIPE_SIZE,
ShardParameters::DEFAULT_STRIPE_SIZE,
)
.unwrap();
@@ -1131,44 +1144,37 @@ mod tests {
/// for a single tenant.
#[test]
fn sharded_range_fragment_simple() {
const SHARD_COUNT: u8 = 4;
const STRIPE_SIZE: u32 = DEFAULT_STRIPE_SIZE.0;
let shard_identity = ShardIdentity::new(
ShardNumber(0),
ShardCount::new(SHARD_COUNT),
ShardStripeSize(STRIPE_SIZE),
ShardCount::new(4),
ShardParameters::DEFAULT_STRIPE_SIZE,
)
.unwrap();
// A range which we happen to know covers exactly one stripe which belongs to this shard
let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
let mut input_end = input_start;
input_end.field6 += STRIPE_SIZE; // field6 is block number
let input_end = Key::from_hex("000000067f00000001000000ae0000008000").unwrap();
// Ask for stripe_size blocks, we get the whole stripe
assert_eq!(
do_fragment(input_start, input_end, &shard_identity, STRIPE_SIZE),
(STRIPE_SIZE, vec![(STRIPE_SIZE, input_start..input_end)])
do_fragment(input_start, input_end, &shard_identity, 32768),
(32768, vec![(32768, input_start..input_end)])
);
// Ask for more, we still get the whole stripe
assert_eq!(
do_fragment(input_start, input_end, &shard_identity, 10 * STRIPE_SIZE),
(STRIPE_SIZE, vec![(STRIPE_SIZE, input_start..input_end)])
do_fragment(input_start, input_end, &shard_identity, 10000000),
(32768, vec![(32768, input_start..input_end)])
);
// Ask for target_nblocks of half the stripe size, we get two halves
assert_eq!(
do_fragment(input_start, input_end, &shard_identity, STRIPE_SIZE / 2),
do_fragment(input_start, input_end, &shard_identity, 16384),
(
STRIPE_SIZE,
32768,
vec![
(
STRIPE_SIZE / 2,
input_start..input_start.add(STRIPE_SIZE / 2)
),
(STRIPE_SIZE / 2, input_start.add(STRIPE_SIZE / 2)..input_end)
(16384, input_start..input_start.add(16384)),
(16384, input_start.add(16384)..input_end)
]
)
);
@@ -1176,53 +1182,40 @@ mod tests {
#[test]
fn sharded_range_fragment_multi_stripe() {
const SHARD_COUNT: u8 = 4;
const STRIPE_SIZE: u32 = DEFAULT_STRIPE_SIZE.0;
const RANGE_SIZE: u32 = SHARD_COUNT as u32 * STRIPE_SIZE;
let shard_identity = ShardIdentity::new(
ShardNumber(0),
ShardCount::new(SHARD_COUNT),
ShardStripeSize(STRIPE_SIZE),
ShardCount::new(4),
ShardParameters::DEFAULT_STRIPE_SIZE,
)
.unwrap();
// A range which covers multiple stripes, exactly one of which belongs to the current shard.
let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
let mut input_end = input_start;
input_end.field6 += RANGE_SIZE; // field6 is block number
let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap();
// Ask for all the blocks, get a fragment that covers the whole range but reports
// its size to be just the blocks belonging to our shard.
assert_eq!(
do_fragment(input_start, input_end, &shard_identity, RANGE_SIZE),
(STRIPE_SIZE, vec![(STRIPE_SIZE, input_start..input_end)])
do_fragment(input_start, input_end, &shard_identity, 131072),
(32768, vec![(32768, input_start..input_end)])
);
// Ask for a sub-stripe quantity that results in 3 fragments.
let limit = STRIPE_SIZE / 3 + 1;
// Ask for a sub-stripe quantity
assert_eq!(
do_fragment(input_start, input_end, &shard_identity, limit),
do_fragment(input_start, input_end, &shard_identity, 16000),
(
STRIPE_SIZE,
32768,
vec![
(limit, input_start..input_start.add(limit)),
(limit, input_start.add(limit)..input_start.add(2 * limit)),
(
STRIPE_SIZE - 2 * limit,
input_start.add(2 * limit)..input_end
),
(16000, input_start..input_start.add(16000)),
(16000, input_start.add(16000)..input_start.add(32000)),
(768, input_start.add(32000)..input_end),
]
)
);
// Try on a range that starts slightly after our owned stripe
assert_eq!(
do_fragment(input_start.add(1), input_end, &shard_identity, RANGE_SIZE),
(
STRIPE_SIZE - 1,
vec![(STRIPE_SIZE - 1, input_start.add(1)..input_end)]
)
do_fragment(input_start.add(1), input_end, &shard_identity, 131072),
(32767, vec![(32767, input_start.add(1)..input_end)])
);
}
@@ -1230,40 +1223,32 @@ mod tests {
/// a previous relation.
#[test]
fn sharded_range_fragment_starting_from_logical_size() {
const SHARD_COUNT: u8 = 4;
const STRIPE_SIZE: u32 = DEFAULT_STRIPE_SIZE.0;
const RANGE_SIZE: u32 = SHARD_COUNT as u32 * STRIPE_SIZE;
let input_start = Key::from_hex("000000067f00000001000000ae00ffffffff").unwrap();
let mut input_end = Key::from_hex("000000067f00000001000000ae0100000000").unwrap();
input_end.field6 += RANGE_SIZE; // field6 is block number
let input_end = Key::from_hex("000000067f00000001000000ae0100008000").unwrap();
// Shard 0 owns the first stripe in the relation, and the preceding logical size is shard local too
let shard_identity = ShardIdentity::new(
ShardNumber(0),
ShardCount::new(SHARD_COUNT),
ShardStripeSize(STRIPE_SIZE),
ShardCount::new(4),
ShardParameters::DEFAULT_STRIPE_SIZE,
)
.unwrap();
assert_eq!(
do_fragment(input_start, input_end, &shard_identity, 2 * STRIPE_SIZE),
(
STRIPE_SIZE + 1,
vec![(STRIPE_SIZE + 1, input_start..input_end)]
)
do_fragment(input_start, input_end, &shard_identity, 0x10000),
(0x8001, vec![(0x8001, input_start..input_end)])
);
// Shard 1 does not own the first stripe in the relation, but it does own the logical size (all shards
// store all logical sizes)
let shard_identity = ShardIdentity::new(
ShardNumber(1),
ShardCount::new(SHARD_COUNT),
ShardStripeSize(STRIPE_SIZE),
ShardCount::new(4),
ShardParameters::DEFAULT_STRIPE_SIZE,
)
.unwrap();
assert_eq!(
do_fragment(input_start, input_end, &shard_identity, 2 * STRIPE_SIZE),
(1, vec![(1, input_start..input_end)])
do_fragment(input_start, input_end, &shard_identity, 0x10000),
(0x1, vec![(0x1, input_start..input_end)])
);
}
@@ -1299,8 +1284,12 @@ mod tests {
);
// Same, but using a sharded identity
let shard_identity =
ShardIdentity::new(ShardNumber(0), ShardCount::new(4), DEFAULT_STRIPE_SIZE).unwrap();
let shard_identity = ShardIdentity::new(
ShardNumber(0),
ShardCount::new(4),
ShardParameters::DEFAULT_STRIPE_SIZE,
)
.unwrap();
assert_eq!(
do_fragment(input_start, input_end, &shard_identity, 0x8000),
(u32::MAX, vec![(u32::MAX, input_start..input_end),])
@@ -1342,7 +1331,7 @@ mod tests {
ShardIdentity::new(
ShardNumber((prng.next_u32() % shard_count) as u8),
ShardCount::new(shard_count as u8),
DEFAULT_STRIPE_SIZE,
ShardParameters::DEFAULT_STRIPE_SIZE,
)
.unwrap()
};

View File

@@ -26,7 +26,7 @@ use utils::{completion, serde_system_time};
use crate::config::Ratio;
use crate::key::{CompactKey, Key};
use crate::reltag::RelTag;
use crate::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardStripeSize, TenantShardId};
use crate::shard::{ShardCount, ShardStripeSize, TenantShardId};
/// The state of a tenant in this pageserver.
///
@@ -80,22 +80,10 @@ pub enum TenantState {
///
/// Transitions out of this state are possible through `set_broken()`.
Stopping {
/// The barrier can be used to wait for shutdown to complete. The first caller to set
/// Some(Barrier) is responsible for driving shutdown to completion. Subsequent callers
/// will wait for the first caller's existing barrier.
///
/// None is set when an attach is cancelled, to signal to shutdown that the attach has in
/// fact cancelled:
///
/// 1. `shutdown` sees `TenantState::Attaching`, and cancels the tenant.
/// 2. `attach` sets `TenantState::Stopping(None)` and exits.
/// 3. `set_stopping` waits for `TenantState::Stopping(None)` and sets
/// `TenantState::Stopping(Some)` to claim the barrier as the shutdown owner.
//
// Because of https://github.com/serde-rs/serde/issues/2105 this has to be a named field,
// otherwise it will not be skipped during deserialization
#[serde(skip)]
progress: Option<completion::Barrier>,
progress: completion::Barrier,
},
/// The tenant is recognized by the pageserver, but can no longer be used for
/// any operations.
@@ -438,6 +426,8 @@ pub struct ShardParameters {
}
impl ShardParameters {
pub const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
pub fn is_unsharded(&self) -> bool {
self.count.is_unsharded()
}
@@ -447,7 +437,7 @@ impl Default for ShardParameters {
fn default() -> Self {
Self {
count: ShardCount::new(0),
stripe_size: DEFAULT_STRIPE_SIZE,
stripe_size: Self::DEFAULT_STRIPE_SIZE,
}
}
}
@@ -576,8 +566,6 @@ pub struct TenantConfigPatch {
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub gc_compaction_enabled: FieldPatch<bool>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub gc_compaction_verification: FieldPatch<bool>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub gc_compaction_initial_threshold_kb: FieldPatch<u64>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub gc_compaction_ratio_percent: FieldPatch<u64>,
@@ -698,9 +686,6 @@ pub struct TenantConfig {
#[serde(skip_serializing_if = "Option::is_none")]
pub gc_compaction_enabled: Option<bool>,
#[serde(skip_serializing_if = "Option::is_none")]
pub gc_compaction_verification: Option<bool>,
#[serde(skip_serializing_if = "Option::is_none")]
pub gc_compaction_initial_threshold_kb: Option<u64>,
@@ -749,7 +734,6 @@ impl TenantConfig {
mut wal_receiver_protocol_override,
mut rel_size_v2_enabled,
mut gc_compaction_enabled,
mut gc_compaction_verification,
mut gc_compaction_initial_threshold_kb,
mut gc_compaction_ratio_percent,
mut sampling_ratio,
@@ -841,9 +825,6 @@ impl TenantConfig {
patch
.gc_compaction_enabled
.apply(&mut gc_compaction_enabled);
patch
.gc_compaction_verification
.apply(&mut gc_compaction_verification);
patch
.gc_compaction_initial_threshold_kb
.apply(&mut gc_compaction_initial_threshold_kb);
@@ -885,7 +866,6 @@ impl TenantConfig {
wal_receiver_protocol_override,
rel_size_v2_enabled,
gc_compaction_enabled,
gc_compaction_verification,
gc_compaction_initial_threshold_kb,
gc_compaction_ratio_percent,
sampling_ratio,
@@ -984,9 +964,6 @@ impl TenantConfig {
gc_compaction_enabled: self
.gc_compaction_enabled
.unwrap_or(global_conf.gc_compaction_enabled),
gc_compaction_verification: self
.gc_compaction_verification
.unwrap_or(global_conf.gc_compaction_verification),
gc_compaction_initial_threshold_kb: self
.gc_compaction_initial_threshold_kb
.unwrap_or(global_conf.gc_compaction_initial_threshold_kb),
@@ -1691,7 +1668,6 @@ pub struct SecondaryProgress {
pub struct TenantScanRemoteStorageShard {
pub tenant_shard_id: TenantShardId,
pub generation: Option<u32>,
pub stripe_size: Option<ShardStripeSize>,
}
#[derive(Serialize, Deserialize, Debug, Default)]
@@ -2743,15 +2719,10 @@ mod tests {
"Activating",
),
(line!(), TenantState::Active, "Active"),
(
line!(),
TenantState::Stopping { progress: None },
"Stopping",
),
(
line!(),
TenantState::Stopping {
progress: Some(completion::Barrier::default()),
progress: utils::completion::Barrier::default(),
},
"Stopping",
),

View File

@@ -58,8 +58,6 @@ pub enum NeonWalRecord {
/// to true. This record does not need the history WALs to reconstruct. See [`NeonWalRecord::will_init`] and
/// its references in `timeline.rs`.
will_init: bool,
/// Only append the record if the current image is the same as the one specified in this field.
only_if: Option<String>,
},
}
@@ -83,17 +81,6 @@ impl NeonWalRecord {
append: s.as_ref().to_string(),
clear: false,
will_init: false,
only_if: None,
}
}
#[cfg(feature = "testing")]
pub fn wal_append_conditional(s: impl AsRef<str>, only_if: impl AsRef<str>) -> Self {
Self::Test {
append: s.as_ref().to_string(),
clear: false,
will_init: false,
only_if: Some(only_if.as_ref().to_string()),
}
}
@@ -103,7 +90,6 @@ impl NeonWalRecord {
append: s.as_ref().to_string(),
clear: true,
will_init: false,
only_if: None,
}
}
@@ -113,7 +99,6 @@ impl NeonWalRecord {
append: s.as_ref().to_string(),
clear: true,
will_init: true,
only_if: None,
}
}
}

View File

@@ -78,12 +78,6 @@ impl Default for ShardStripeSize {
}
}
impl std::fmt::Display for ShardStripeSize {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
self.0.fmt(f)
}
}
/// Layout version: for future upgrades where we might change how the key->shard mapping works
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Hash, Debug)]
pub struct ShardLayout(u8);
@@ -92,11 +86,8 @@ const LAYOUT_V1: ShardLayout = ShardLayout(1);
/// ShardIdentity uses a magic layout value to indicate if it is unusable
const LAYOUT_BROKEN: ShardLayout = ShardLayout(255);
/// The default stripe size in pages. 16 MiB divided by 8 kiB page size.
///
/// A lower stripe size distributes ingest load better across shards, but reduces IO amortization.
/// 16 MiB appears to be a reasonable balance: <https://github.com/neondatabase/neon/pull/10510>.
pub const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(16 * 1024 / 8);
/// Default stripe size in pages: 256MiB divided by 8kiB page size.
const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
#[derive(thiserror::Error, Debug, PartialEq, Eq)]
pub enum ShardConfigError {
@@ -546,7 +537,7 @@ mod tests {
field6: 0x7d06,
};
let shard = key_to_shard_number(ShardCount(10), ShardStripeSize(32768), &key);
let shard = key_to_shard_number(ShardCount(10), DEFAULT_STRIPE_SIZE, &key);
assert_eq!(shard, ShardNumber(8));
}

View File

@@ -5,6 +5,7 @@
#![deny(unsafe_code)]
#![deny(clippy::undocumented_unsafe_blocks)]
use std::future::Future;
use std::io::ErrorKind;
use std::net::SocketAddr;
use std::os::fd::{AsRawFd, RawFd};
use std::pin::Pin;
@@ -226,7 +227,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> MaybeWriteOnly<IO> {
match self {
MaybeWriteOnly::Full(framed) => framed.read_startup_message().await,
MaybeWriteOnly::WriteOnly(_) => {
Err(io::Error::other("reading from write only half").into())
Err(io::Error::new(ErrorKind::Other, "reading from write only half").into())
}
MaybeWriteOnly::Broken => panic!("IO on invalid MaybeWriteOnly"),
}
@@ -236,7 +237,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> MaybeWriteOnly<IO> {
match self {
MaybeWriteOnly::Full(framed) => framed.read_message().await,
MaybeWriteOnly::WriteOnly(_) => {
Err(io::Error::other("reading from write only half").into())
Err(io::Error::new(ErrorKind::Other, "reading from write only half").into())
}
MaybeWriteOnly::Broken => panic!("IO on invalid MaybeWriteOnly"),
}
@@ -974,7 +975,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> AsyncWrite for CopyDataWriter<'_, IO> {
.write_message_noflush(&BeMessage::CopyData(buf))
// write_message only writes to the buffer, so it can fail iff the
// message is invaid, but CopyData can't be invalid.
.map_err(|_| io::Error::other("failed to serialize CopyData"))?;
.map_err(|_| io::Error::new(ErrorKind::Other, "failed to serialize CopyData"))?;
Poll::Ready(Ok(buf.len()))
}

View File

@@ -85,8 +85,8 @@ static KEY: Lazy<rustls::pki_types::PrivateKeyDer<'static>> = Lazy::new(|| {
static CERT: Lazy<rustls::pki_types::CertificateDer<'static>> = Lazy::new(|| {
let mut cursor = Cursor::new(include_bytes!("cert.pem"));
rustls_pemfile::certs(&mut cursor).next().unwrap().unwrap()
let cert = rustls_pemfile::certs(&mut cursor).next().unwrap().unwrap();
cert
});
// test that basic select with ssl works

View File

@@ -35,7 +35,7 @@ impl ConnectionError {
pub fn into_io_error(self) -> io::Error {
match self {
ConnectionError::Io(io) => io,
ConnectionError::Protocol(pe) => io::Error::other(pe.to_string()),
ConnectionError::Protocol(pe) => io::Error::new(io::ErrorKind::Other, pe.to_string()),
}
}
}

View File

@@ -257,7 +257,7 @@ pub enum ProtocolError {
impl ProtocolError {
/// Proxy stream.rs uses only io::Error; provide it.
pub fn into_io_error(self) -> io::Error {
io::Error::other(self.to_string())
io::Error::new(io::ErrorKind::Other, self.to_string())
}
}

View File

@@ -212,7 +212,7 @@ impl ScramSha256 {
password,
channel_binding,
} => (nonce, password, channel_binding),
_ => return Err(io::Error::other("invalid SCRAM state")),
_ => return Err(io::Error::new(io::ErrorKind::Other, "invalid SCRAM state")),
};
let message =
@@ -291,7 +291,7 @@ impl ScramSha256 {
server_key,
auth_message,
} => (server_key, auth_message),
_ => return Err(io::Error::other("invalid SCRAM state")),
_ => return Err(io::Error::new(io::ErrorKind::Other, "invalid SCRAM state")),
};
let message =
@@ -301,7 +301,10 @@ impl ScramSha256 {
let verifier = match parsed {
ServerFinalMessage::Error(e) => {
return Err(io::Error::other(format!("SCRAM error: {}", e)));
return Err(io::Error::new(
io::ErrorKind::Other,
format!("SCRAM error: {}", e),
));
}
ServerFinalMessage::Verifier(verifier) => verifier,
};

View File

@@ -1,13 +0,0 @@
[package]
name = "remote_keys"
version = "0.1.0"
edition = "2024"
license.workspace = true
[dependencies]
anyhow.workspace = true
utils.workspace = true
workspace_hack.workspace = true
[dev-dependencies]
rand.workspace = true

View File

@@ -1,42 +0,0 @@
//! A module that provides a KMS implementation that generates and unwraps the keys.
//!
/// A KMS implementation that does static wrapping and unwrapping of the keys.
pub struct NaiveKms {
account_id: String,
}
impl NaiveKms {
pub fn new(account_id: String) -> Self {
Self { account_id }
}
pub fn encrypt(&self, plain: &[u8]) -> anyhow::Result<Vec<u8>> {
let wrapped = [self.account_id.as_bytes(), "-wrapped-".as_bytes(), plain].concat();
Ok(wrapped)
}
pub fn decrypt(&self, wrapped: &[u8]) -> anyhow::Result<Vec<u8>> {
let Some(wrapped) = wrapped.strip_prefix(self.account_id.as_bytes()) else {
return Err(anyhow::anyhow!("invalid key"));
};
let Some(plain) = wrapped.strip_prefix(b"-wrapped-") else {
return Err(anyhow::anyhow!("invalid key"));
};
Ok(plain.to_vec())
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_generate_key() {
let kms = NaiveKms::new("test-tenant".to_string());
let data = rand::random::<[u8; 32]>().to_vec();
let encrypted = kms.encrypt(&data).unwrap();
let decrypted = kms.decrypt(&encrypted).unwrap();
assert_eq!(data, decrypted);
}
}

View File

@@ -13,7 +13,6 @@ aws-smithy-async.workspace = true
aws-smithy-types.workspace = true
aws-config.workspace = true
aws-sdk-s3.workspace = true
base64.workspace = true
bytes.workspace = true
camino = { workspace = true, features = ["serde1"] }
humantime-serde.workspace = true
@@ -28,9 +27,8 @@ tokio-util = { workspace = true, features = ["compat"] }
toml_edit.workspace = true
tracing.workspace = true
scopeguard.workspace = true
md5.workspace = true
metrics.workspace = true
utils = { path = "../utils", default-features = false }
utils.workspace = true
pin-project-lite.workspace = true
azure_core.workspace = true

View File

@@ -550,19 +550,6 @@ impl RemoteStorage for AzureBlobStorage {
self.download_for_builder(builder, timeout, cancel).await
}
#[allow(unused_variables)]
async fn upload_with_encryption(
&self,
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
data_size_bytes: usize,
to: &RemotePath,
metadata: Option<StorageMetadata>,
encryption_key: Option<&[u8]>,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
unimplemented!()
}
async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()> {
self.delete_objects(std::array::from_ref(path), cancel)
.await
@@ -814,7 +801,8 @@ where
// that support needs to be hacked in.
//
// including {self:?} into the message would be useful, but unsure how to unproject.
_ => std::task::Poll::Ready(Err(std::io::Error::other(
_ => std::task::Poll::Ready(Err(std::io::Error::new(
std::io::ErrorKind::Other,
"cloned or initial values cannot be read",
))),
}
@@ -867,7 +855,7 @@ where
};
Err(azure_core::error::Error::new(
azure_core::error::ErrorKind::Io,
std::io::Error::other(msg),
std::io::Error::new(std::io::ErrorKind::Other, msg),
))
}

View File

@@ -190,8 +190,6 @@ pub struct DownloadOpts {
/// timeouts: for something like an index/manifest/heatmap, we should time out faster than
/// for layer files
pub kind: DownloadKind,
/// The encryption key to use for the download.
pub encryption_key: Option<Vec<u8>>,
}
pub enum DownloadKind {
@@ -206,7 +204,6 @@ impl Default for DownloadOpts {
byte_start: Bound::Unbounded,
byte_end: Bound::Unbounded,
kind: DownloadKind::Large,
encryption_key: None,
}
}
}
@@ -244,15 +241,6 @@ impl DownloadOpts {
None => format!("bytes={start}-"),
})
}
pub fn with_encryption_key(mut self, encryption_key: Option<impl AsRef<[u8]>>) -> Self {
self.encryption_key = encryption_key.map(|k| k.as_ref().to_vec());
self
}
pub fn encryption_key(&self) -> Option<&[u8]> {
self.encryption_key.as_deref()
}
}
/// Storage (potentially remote) API to manage its state.
@@ -343,19 +331,6 @@ pub trait RemoteStorage: Send + Sync + 'static {
cancel: &CancellationToken,
) -> Result<Download, DownloadError>;
/// Same as upload, but with remote encryption if the backend supports it (e.g. SSE-C on AWS).
async fn upload_with_encryption(
&self,
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
// S3 PUT request requires the content length to be specified,
// otherwise it starts to fail with the concurrent connection count increasing.
data_size_bytes: usize,
to: &RemotePath,
metadata: Option<StorageMetadata>,
encryption_key: Option<&[u8]>,
cancel: &CancellationToken,
) -> anyhow::Result<()>;
/// Delete a single path from remote storage.
///
/// If the operation fails because of timeout or cancellation, the root cause of the error will be
@@ -640,63 +615,6 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
}
}
}
pub async fn upload_with_encryption(
&self,
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
data_size_bytes: usize,
to: &RemotePath,
metadata: Option<StorageMetadata>,
encryption_key: Option<&[u8]>,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
match self {
Self::LocalFs(s) => {
s.upload_with_encryption(
from,
data_size_bytes,
to,
metadata,
encryption_key,
cancel,
)
.await
}
Self::AwsS3(s) => {
s.upload_with_encryption(
from,
data_size_bytes,
to,
metadata,
encryption_key,
cancel,
)
.await
}
Self::AzureBlob(s) => {
s.upload_with_encryption(
from,
data_size_bytes,
to,
metadata,
encryption_key,
cancel,
)
.await
}
Self::Unreliable(s) => {
s.upload_with_encryption(
from,
data_size_bytes,
to,
metadata,
encryption_key,
cancel,
)
.await
}
}
}
}
impl GenericRemoteStorage {

View File

@@ -560,19 +560,6 @@ impl RemoteStorage for LocalFs {
}
}
#[allow(unused_variables)]
async fn upload_with_encryption(
&self,
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
data_size_bytes: usize,
to: &RemotePath,
metadata: Option<StorageMetadata>,
encryption_key: Option<&[u8]>,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
unimplemented!()
}
async fn delete_objects(
&self,
paths: &[RemotePath],

View File

@@ -66,10 +66,7 @@ struct GetObjectRequest {
key: String,
etag: Option<String>,
range: Option<String>,
/// Base64 encoded SSE-C key for server-side encryption.
sse_c_key: Option<Vec<u8>>,
}
impl S3Bucket {
/// Creates the S3 storage, errors if incorrect AWS S3 configuration provided.
pub async fn new(remote_storage_config: &S3Config, timeout: Duration) -> anyhow::Result<Self> {
@@ -260,13 +257,6 @@ impl S3Bucket {
builder = builder.if_none_match(etag);
}
if let Some(encryption_key) = request.sse_c_key {
builder = builder.sse_customer_algorithm("AES256");
builder = builder.sse_customer_key(base64::encode(&encryption_key));
builder = builder
.sse_customer_key_md5(base64::encode(md5::compute(&encryption_key).as_slice()));
}
let get_object = builder.send();
let get_object = tokio::select! {
@@ -703,13 +693,12 @@ impl RemoteStorage for S3Bucket {
})
}
async fn upload_with_encryption(
async fn upload(
&self,
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
from_size_bytes: usize,
to: &RemotePath,
metadata: Option<StorageMetadata>,
encryption_key: Option<&[u8]>,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
let kind = RequestKind::Put;
@@ -720,7 +709,7 @@ impl RemoteStorage for S3Bucket {
let body = StreamBody::new(from.map(|x| x.map(Frame::data)));
let bytes_stream = ByteStream::new(SdkBody::from_body_1_x(body));
let mut upload = self
let upload = self
.client
.put_object()
.bucket(self.bucket_name.clone())
@@ -728,17 +717,8 @@ impl RemoteStorage for S3Bucket {
.set_metadata(metadata.map(|m| m.0))
.set_storage_class(self.upload_storage_class.clone())
.content_length(from_size_bytes.try_into()?)
.body(bytes_stream);
if let Some(encryption_key) = encryption_key {
upload = upload.sse_customer_algorithm("AES256");
let base64_key = base64::encode(encryption_key);
upload = upload.sse_customer_key(&base64_key);
upload = upload
.sse_customer_key_md5(base64::encode(md5::compute(encryption_key).as_slice()));
}
let upload = upload.send();
.body(bytes_stream)
.send();
let upload = tokio::time::timeout(self.timeout, upload);
@@ -762,18 +742,6 @@ impl RemoteStorage for S3Bucket {
}
}
async fn upload(
&self,
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
data_size_bytes: usize,
to: &RemotePath,
metadata: Option<StorageMetadata>,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
self.upload_with_encryption(from, data_size_bytes, to, metadata, None, cancel)
.await
}
async fn copy(
&self,
from: &RemotePath,
@@ -833,7 +801,6 @@ impl RemoteStorage for S3Bucket {
key: self.relative_path_to_s3_object(from),
etag: opts.etag.as_ref().map(|e| e.to_string()),
range: opts.byte_range_header(),
sse_c_key: opts.encryption_key.clone(),
},
cancel,
)

View File

@@ -178,19 +178,6 @@ impl RemoteStorage for UnreliableWrapper {
self.inner.download(from, opts, cancel).await
}
#[allow(unused_variables)]
async fn upload_with_encryption(
&self,
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
data_size_bytes: usize,
to: &RemotePath,
metadata: Option<StorageMetadata>,
encryption_key: Option<&[u8]>,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
unimplemented!()
}
async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()> {
self.delete_inner(path, true, cancel).await
}

View File

@@ -421,7 +421,7 @@ async fn download_is_timeouted(ctx: &mut MaybeEnabledStorage) {
))
.unwrap();
let len = upload_large_enough_file(&ctx.client, &path, &cancel, None).await;
let len = upload_large_enough_file(&ctx.client, &path, &cancel).await;
let timeout = std::time::Duration::from_secs(5);
@@ -500,7 +500,7 @@ async fn download_is_cancelled(ctx: &mut MaybeEnabledStorage) {
))
.unwrap();
let file_len = upload_large_enough_file(&ctx.client, &path, &cancel, None).await;
let file_len = upload_large_enough_file(&ctx.client, &path, &cancel).await;
{
let stream = ctx
@@ -555,7 +555,6 @@ async fn upload_large_enough_file(
client: &GenericRemoteStorage,
path: &RemotePath,
cancel: &CancellationToken,
encryption_key: Option<&[u8]>,
) -> usize {
let header = bytes::Bytes::from_static("remote blob data content".as_bytes());
let body = bytes::Bytes::from(vec![0u8; 1024]);
@@ -566,54 +565,9 @@ async fn upload_large_enough_file(
let contents = futures::stream::iter(contents.map(std::io::Result::Ok));
client
.upload_with_encryption(contents, len, path, None, encryption_key, cancel)
.upload(contents, len, path, None, cancel)
.await
.expect("upload succeeds");
len
}
#[test_context(MaybeEnabledStorage)]
#[tokio::test]
async fn encryption_works(ctx: &mut MaybeEnabledStorage) {
let MaybeEnabledStorage::Enabled(ctx) = ctx else {
return;
};
let cancel = CancellationToken::new();
let path = RemotePath::new(Utf8Path::new(
format!("{}/file_to_copy", ctx.base_prefix).as_str(),
))
.unwrap();
let key = rand::random::<[u8; 32]>();
let file_len = upload_large_enough_file(&ctx.client, &path, &cancel, Some(&key)).await;
{
let download = ctx
.client
.download(
&path,
&DownloadOpts::default().with_encryption_key(Some(&key)),
&cancel,
)
.await
.expect("should succeed");
let vec = download_to_vec(download).await.expect("should succeed");
assert_eq!(vec.len(), file_len);
}
{
// Download without encryption key should fail
let download = ctx
.client
.download(&path, &DownloadOpts::default(), &cancel)
.await;
assert!(download.is_err());
}
let cancel = CancellationToken::new();
ctx.client.delete_objects(&[path], &cancel).await.unwrap();
}

View File

@@ -5,8 +5,7 @@ edition.workspace = true
license.workspace = true
[features]
default = ["rename_noreplace"]
rename_noreplace = []
default = []
# Enables test-only APIs, incuding failpoints. In particular, enables the `fail_point!` macro,
# which adds some runtime cost to run tests on outage conditions
testing = ["fail/failpoints"]
@@ -36,7 +35,7 @@ serde_with.workspace = true
serde_json.workspace = true
signal-hook.workspace = true
thiserror.workspace = true
tokio = { workspace = true, features = ["signal"] }
tokio.workspace = true
tokio-tar.workspace = true
tokio-util.workspace = true
toml_edit = { workspace = true, features = ["serde"] }

View File

@@ -173,7 +173,7 @@ impl std::fmt::Debug for JwtAuth {
}
// this function is used only for testing purposes in CLI e g generate tokens during init
pub fn encode_from_key_file<S: Serialize>(claims: &S, key_data: &[u8]) -> Result<String> {
pub fn encode_from_key_file(claims: &Claims, key_data: &[u8]) -> Result<String> {
let key = EncodingKey::from_ed_pem(key_data)?;
Ok(encode(&Header::new(STORAGE_TOKEN_ALGORITHM), claims, &key)?)
}

View File

@@ -81,9 +81,12 @@ pub fn path_with_suffix_extension(
}
pub fn fsync_file_and_parent(file_path: &Utf8Path) -> io::Result<()> {
let parent = file_path
.parent()
.ok_or_else(|| io::Error::other(format!("File {file_path:?} has no parent")))?;
let parent = file_path.parent().ok_or_else(|| {
io::Error::new(
io::ErrorKind::Other,
format!("File {file_path:?} has no parent"),
)
})?;
fsync(file_path)?;
fsync(parent)?;

View File

@@ -3,9 +3,7 @@ use std::{fs, io, path::Path};
use anyhow::Context;
#[cfg(feature = "rename_noreplace")]
mod rename_noreplace;
#[cfg(feature = "rename_noreplace")]
pub use rename_noreplace::rename_noreplace;
pub trait PathExt {

View File

@@ -8,7 +8,7 @@ pub fn rename_noreplace<P1: ?Sized + NixPath, P2: ?Sized + NixPath>(
dst: &P2,
) -> nix::Result<()> {
{
#[cfg(all(target_os = "linux", target_env = "gnu"))]
#[cfg(target_os = "linux")]
{
nix::fcntl::renameat2(
None,
@@ -29,7 +29,7 @@ pub fn rename_noreplace<P1: ?Sized + NixPath, P2: ?Sized + NixPath>(
})??;
nix::errno::Errno::result(res).map(drop)
}
#[cfg(not(any(all(target_os = "linux", target_env = "gnu"), target_os = "macos")))]
#[cfg(not(any(target_os = "linux", target_os = "macos")))]
{
std::compile_error!("OS does not support no-replace renames");
}

View File

@@ -1,8 +1,6 @@
pub use signal_hook::consts::TERM_SIGNALS;
pub use signal_hook::consts::signal::*;
use signal_hook::iterator::Signals;
use tokio::signal::unix::{SignalKind, signal};
use tracing::info;
pub enum Signal {
Quit,
@@ -38,30 +36,3 @@ impl ShutdownSignals {
Ok(())
}
}
/// Runs in a loop since we want to be responsive to multiple signals
/// even after triggering shutdown (e.g. a SIGQUIT after a slow SIGTERM shutdown)
/// <https://github.com/neondatabase/neon/issues/9740>
pub async fn signal_handler(token: tokio_util::sync::CancellationToken) {
let mut sigint = signal(SignalKind::interrupt()).unwrap();
let mut sigterm = signal(SignalKind::terminate()).unwrap();
let mut sigquit = signal(SignalKind::quit()).unwrap();
loop {
let signal = tokio::select! {
_ = sigquit.recv() => {
info!("Got signal SIGQUIT. Terminating in immediate shutdown mode.");
std::process::exit(111);
}
_ = sigint.recv() => "SIGINT",
_ = sigterm.recv() => "SIGTERM",
};
if !token.is_cancelled() {
info!("Got signal {signal}. Terminating gracefully in fast shutdown mode.");
token.cancel();
} else {
info!("Got signal {signal}. Already shutting down.");
}
}
}

View File

@@ -1,28 +0,0 @@
[package]
name = "object_storage"
version = "0.0.1"
edition.workspace = true
license.workspace = true
[dependencies]
anyhow.workspace = true
axum-extra.workspace = true
axum.workspace = true
camino.workspace = true
futures.workspace = true
jsonwebtoken.workspace = true
prometheus.workspace = true
remote_storage.workspace = true
serde.workspace = true
serde_json.workspace = true
tokio-util.workspace = true
tokio.workspace = true
tracing.workspace = true
utils = { path = "../libs/utils", default-features = false }
workspace_hack.workspace = true
[dev-dependencies]
camino-tempfile.workspace = true
http-body-util.workspace = true
itertools.workspace = true
rand.workspace = true
test-log.workspace = true
tower.workspace = true

View File

@@ -1,561 +0,0 @@
use anyhow::anyhow;
use axum::body::{Body, Bytes};
use axum::response::{IntoResponse, Response};
use axum::{Router, http::StatusCode};
use object_storage::{PrefixS3Path, S3Path, Storage, bad_request, internal_error, not_found, ok};
use remote_storage::TimeoutOrCancel;
use remote_storage::{DownloadError, DownloadOpts, GenericRemoteStorage, RemotePath};
use std::{sync::Arc, time::SystemTime, time::UNIX_EPOCH};
use tokio_util::sync::CancellationToken;
use tracing::{error, info};
use utils::backoff::retry;
pub fn app(state: Arc<Storage>) -> Router<()> {
use axum::routing::{delete as _delete, get as _get};
let delete_prefix = _delete(delete_prefix);
Router::new()
.route(
"/{tenant_id}/{timeline_id}/{endpoint_id}/{*path}",
_get(get).put(set).delete(delete),
)
.route(
"/{tenant_id}/{timeline_id}/{endpoint_id}",
delete_prefix.clone(),
)
.route("/{tenant_id}/{timeline_id}", delete_prefix.clone())
.route("/{tenant_id}", delete_prefix)
.route("/metrics", _get(metrics))
.route("/status", _get(async || StatusCode::OK.into_response()))
.with_state(state)
}
type Result = anyhow::Result<Response, Response>;
type State = axum::extract::State<Arc<Storage>>;
const CONTENT_TYPE: &str = "content-type";
const APPLICATION_OCTET_STREAM: &str = "application/octet-stream";
const WARN_THRESHOLD: u32 = 3;
const MAX_RETRIES: u32 = 10;
async fn metrics() -> Result {
prometheus::TextEncoder::new()
.encode_to_string(&prometheus::gather())
.map(|s| s.into_response())
.map_err(|e| internal_error(e, "/metrics", "collecting metrics"))
}
async fn get(S3Path { path }: S3Path, state: State) -> Result {
info!(%path, "downloading");
let download_err = |e| {
if let DownloadError::NotFound = e {
info!(%path, %e, "downloading"); // 404 is not an issue of _this_ service
return not_found(&path);
}
internal_error(e, &path, "downloading")
};
let cancel = state.cancel.clone();
let opts = &DownloadOpts::default();
let stream = retry(
async || state.storage.download(&path, opts, &cancel).await,
DownloadError::is_permanent,
WARN_THRESHOLD,
MAX_RETRIES,
"downloading",
&cancel,
)
.await
.unwrap_or(Err(DownloadError::Cancelled))
.map_err(download_err)?
.download_stream;
Response::builder()
.status(StatusCode::OK)
.header(CONTENT_TYPE, APPLICATION_OCTET_STREAM)
.body(Body::from_stream(stream))
.map_err(|e| internal_error(e, path, "reading response"))
}
// Best solution for files is multipart upload, but remote_storage doesn't support it,
// so we can either read Bytes in memory and push at once or forward BodyDataStream to
// remote_storage. The latter may seem more peformant, but BodyDataStream doesn't have a
// guaranteed size() which may produce issues while uploading to s3.
// So, currently we're going with an in-memory copy plus a boundary to prevent uploading
// very large files.
async fn set(S3Path { path }: S3Path, state: State, bytes: Bytes) -> Result {
info!(%path, "uploading");
let request_len = bytes.len();
let max_len = state.max_upload_file_limit;
if request_len > max_len {
return Err(bad_request(
anyhow!("File size {request_len} exceeds max {max_len}"),
"uploading",
));
}
let cancel = state.cancel.clone();
let fun = async || {
let stream = bytes_to_stream(bytes.clone());
state
.storage
.upload(stream, request_len, &path, None, &cancel)
.await
};
retry(
fun,
TimeoutOrCancel::caused_by_cancel,
WARN_THRESHOLD,
MAX_RETRIES,
"uploading",
&cancel,
)
.await
.unwrap_or(Err(anyhow!("uploading cancelled")))
.map_err(|e| internal_error(e, path, "reading response"))?;
Ok(ok())
}
async fn delete(S3Path { path }: S3Path, state: State) -> Result {
info!(%path, "deleting");
let cancel = state.cancel.clone();
retry(
async || state.storage.delete(&path, &cancel).await,
TimeoutOrCancel::caused_by_cancel,
WARN_THRESHOLD,
MAX_RETRIES,
"deleting",
&cancel,
)
.await
.unwrap_or(Err(anyhow!("deleting cancelled")))
.map_err(|e| internal_error(e, path, "deleting"))?;
Ok(ok())
}
async fn delete_prefix(PrefixS3Path { path }: PrefixS3Path, state: State) -> Result {
info!(%path, "deleting prefix");
let cancel = state.cancel.clone();
retry(
async || state.storage.delete_prefix(&path, &cancel).await,
TimeoutOrCancel::caused_by_cancel,
WARN_THRESHOLD,
MAX_RETRIES,
"deleting prefix",
&cancel,
)
.await
.unwrap_or(Err(anyhow!("deleting prefix cancelled")))
.map_err(|e| internal_error(e, path, "deleting prefix"))?;
Ok(ok())
}
pub async fn check_storage_permissions(
client: &GenericRemoteStorage,
cancel: CancellationToken,
) -> anyhow::Result<()> {
info!("storage permissions check");
// as_nanos() as multiple instances proxying same bucket may be started at once
let now = SystemTime::now()
.duration_since(UNIX_EPOCH)?
.as_nanos()
.to_string();
let path = RemotePath::from_string(&format!("write_access_{now}"))?;
info!(%path, "uploading");
let body = now.to_string();
let stream = bytes_to_stream(Bytes::from(body.clone()));
client
.upload(stream, body.len(), &path, None, &cancel)
.await?;
use tokio::io::AsyncReadExt;
info!(%path, "downloading");
let download_opts = DownloadOpts {
kind: remote_storage::DownloadKind::Small,
..Default::default()
};
let mut body_read_buf = Vec::new();
let stream = client
.download(&path, &download_opts, &cancel)
.await?
.download_stream;
tokio_util::io::StreamReader::new(stream)
.read_to_end(&mut body_read_buf)
.await?;
let body_read = String::from_utf8(body_read_buf)?;
if body != body_read {
error!(%body, %body_read, "File contents do not match");
anyhow::bail!("Read back file doesn't match original")
}
info!(%path, "removing");
client.delete(&path, &cancel).await
}
fn bytes_to_stream(bytes: Bytes) -> impl futures::Stream<Item = std::io::Result<Bytes>> {
futures::stream::once(futures::future::ready(Ok(bytes)))
}
#[cfg(test)]
mod tests {
use super::*;
use axum::{body::Body, extract::Request, response::Response};
use http_body_util::BodyExt;
use itertools::iproduct;
use std::env::var;
use std::sync::Arc;
use std::time::Duration;
use test_log::test as testlog;
use tower::{Service, util::ServiceExt};
use utils::id::{TenantId, TimelineId};
// see libs/remote_storage/tests/test_real_s3.rs
const REAL_S3_ENV: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";
const REAL_S3_BUCKET: &str = "REMOTE_STORAGE_S3_BUCKET";
const REAL_S3_REGION: &str = "REMOTE_STORAGE_S3_REGION";
async fn proxy() -> (Storage, Option<camino_tempfile::Utf8TempDir>) {
let cancel = CancellationToken::new();
let (dir, storage) = if var(REAL_S3_ENV).is_err() {
// tests execute in parallel and we need a new directory for each of them
let dir = camino_tempfile::tempdir().unwrap();
let fs =
remote_storage::LocalFs::new(dir.path().into(), Duration::from_secs(5)).unwrap();
(Some(dir), GenericRemoteStorage::LocalFs(fs))
} else {
// test_real_s3::create_s3_client is hard to reference, reimplementing here
let millis = SystemTime::now()
.duration_since(UNIX_EPOCH)
.unwrap()
.as_millis();
use rand::Rng;
let random = rand::thread_rng().r#gen::<u32>();
let s3_config = remote_storage::S3Config {
bucket_name: var(REAL_S3_BUCKET).unwrap(),
bucket_region: var(REAL_S3_REGION).unwrap(),
prefix_in_bucket: Some(format!("test_{millis}_{random:08x}/")),
endpoint: None,
concurrency_limit: std::num::NonZeroUsize::new(100).unwrap(),
max_keys_per_list_response: None,
upload_storage_class: None,
};
let bucket = remote_storage::S3Bucket::new(&s3_config, Duration::from_secs(1))
.await
.unwrap();
(None, GenericRemoteStorage::AwsS3(Arc::new(bucket)))
};
let proxy = Storage {
auth: object_storage::JwtAuth::new(TEST_PUB_KEY_ED25519).unwrap(),
storage,
cancel: cancel.clone(),
max_upload_file_limit: usize::MAX,
};
check_storage_permissions(&proxy.storage, cancel)
.await
.unwrap();
(proxy, dir)
}
// see libs/utils/src/auth.rs
const TEST_PUB_KEY_ED25519: &[u8] = b"
-----BEGIN PUBLIC KEY-----
MCowBQYDK2VwAyEARYwaNBayR+eGI0iXB4s3QxE3Nl2g1iWbr6KtLWeVD/w=
-----END PUBLIC KEY-----
";
const TEST_PRIV_KEY_ED25519: &[u8] = br#"
-----BEGIN PRIVATE KEY-----
MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
-----END PRIVATE KEY-----
"#;
async fn request(req: Request<Body>) -> Response<Body> {
let (proxy, _) = proxy().await;
app(Arc::new(proxy))
.into_service()
.oneshot(req)
.await
.unwrap()
}
#[testlog(tokio::test)]
async fn status() {
let res = Request::builder()
.uri("/status")
.body(Body::empty())
.map(request)
.unwrap()
.await;
assert_eq!(res.status(), StatusCode::OK);
}
fn routes() -> impl Iterator<Item = (&'static str, &'static str)> {
iproduct!(
vec!["/1", "/1/2", "/1/2/3", "/1/2/3/4"],
vec!["GET", "PUT", "DELETE"]
)
}
#[testlog(tokio::test)]
async fn no_token() {
for (uri, method) in routes() {
info!(%uri, %method);
let res = Request::builder()
.uri(uri)
.method(method)
.body(Body::empty())
.map(request)
.unwrap()
.await;
assert!(matches!(
res.status(),
StatusCode::METHOD_NOT_ALLOWED | StatusCode::BAD_REQUEST
));
}
}
#[testlog(tokio::test)]
async fn invalid_token() {
for (uri, method) in routes() {
info!(%uri, %method);
let status = Request::builder()
.uri(uri)
.header("Authorization", "Bearer 123")
.method(method)
.body(Body::empty())
.map(request)
.unwrap()
.await;
assert!(matches!(
status.status(),
StatusCode::METHOD_NOT_ALLOWED | StatusCode::BAD_REQUEST
));
}
}
const TENANT_ID: TenantId =
TenantId::from_array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6]);
const TIMELINE_ID: TimelineId =
TimelineId::from_array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 7]);
const ENDPOINT_ID: &str = "ep-winter-frost-a662z3vg";
fn token() -> String {
let claims = object_storage::Claims {
tenant_id: TENANT_ID,
timeline_id: TIMELINE_ID,
endpoint_id: ENDPOINT_ID.into(),
exp: u64::MAX,
};
let key = jsonwebtoken::EncodingKey::from_ed_pem(TEST_PRIV_KEY_ED25519).unwrap();
let header = jsonwebtoken::Header::new(object_storage::VALIDATION_ALGO);
jsonwebtoken::encode(&header, &claims, &key).unwrap()
}
#[testlog(tokio::test)]
async fn unauthorized() {
let (proxy, _) = proxy().await;
let mut app = app(Arc::new(proxy)).into_service();
let token = token();
let args = itertools::iproduct!(
vec![TENANT_ID.to_string(), TenantId::generate().to_string()],
vec![TIMELINE_ID.to_string(), TimelineId::generate().to_string()],
vec![ENDPOINT_ID, "ep-ololo"]
)
.skip(1);
for ((uri, method), (tenant, timeline, endpoint)) in iproduct!(routes(), args) {
info!(%uri, %method, %tenant, %timeline, %endpoint);
let request = Request::builder()
.uri(format!("/{tenant}/{timeline}/{endpoint}/sub/path/key"))
.method(method)
.header("Authorization", format!("Bearer {}", token))
.body(Body::empty())
.unwrap();
let status = ServiceExt::ready(&mut app)
.await
.unwrap()
.call(request)
.await
.unwrap()
.status();
assert_eq!(status, StatusCode::UNAUTHORIZED);
}
}
#[testlog(tokio::test)]
async fn method_not_allowed() {
let token = token();
let iter = iproduct!(vec!["", "/.."], vec!["GET", "PUT"]);
for (key, method) in iter {
let status = Request::builder()
.uri(format!("/{TENANT_ID}/{TIMELINE_ID}/{ENDPOINT_ID}{key}"))
.method(method)
.header("Authorization", format!("Bearer {token}"))
.body(Body::empty())
.map(request)
.unwrap()
.await
.status();
assert!(matches!(
status,
StatusCode::BAD_REQUEST | StatusCode::METHOD_NOT_ALLOWED
));
}
}
async fn requests_chain(
chain: impl Iterator<Item = (String, &str, &'static str, StatusCode, bool)>,
token: impl Fn(&str) -> String,
) {
let (proxy, _) = proxy().await;
let mut app = app(Arc::new(proxy)).into_service();
for (uri, method, body, expected_status, compare_body) in chain {
info!(%uri, %method, %body, %expected_status);
let bearer = format!("Bearer {}", token(&uri));
let request = Request::builder()
.uri(uri)
.method(method)
.header("Authorization", &bearer)
.body(Body::from(body))
.unwrap();
let response = ServiceExt::ready(&mut app)
.await
.unwrap()
.call(request)
.await
.unwrap();
assert_eq!(response.status(), expected_status);
if !compare_body {
continue;
}
let read_body = response.into_body().collect().await.unwrap().to_bytes();
assert_eq!(body, read_body);
}
}
#[testlog(tokio::test)]
async fn metrics() {
let uri = format!("/{TENANT_ID}/{TIMELINE_ID}/{ENDPOINT_ID}/key");
let req = vec![
(uri.clone(), "PUT", "body", StatusCode::OK, false),
(uri.clone(), "DELETE", "", StatusCode::OK, false),
];
requests_chain(req.into_iter(), |_| token()).await;
let res = Request::builder()
.uri("/metrics")
.body(Body::empty())
.map(request)
.unwrap()
.await;
assert_eq!(res.status(), StatusCode::OK);
let body = res.into_body().collect().await.unwrap().to_bytes();
let body = String::from_utf8_lossy(&body);
tracing::debug!(%body);
// Storage metrics are not gathered for LocalFs
if var(REAL_S3_ENV).is_ok() {
assert!(body.contains("remote_storage_s3_deleted_objects_total"));
}
assert!(body.contains("process_threads"));
}
#[testlog(tokio::test)]
async fn insert_retrieve_remove() {
let uri = format!("/{TENANT_ID}/{TIMELINE_ID}/{ENDPOINT_ID}/key");
let chain = vec![
(uri.clone(), "GET", "", StatusCode::NOT_FOUND, false),
(uri.clone(), "PUT", "пыщьпыщь", StatusCode::OK, false),
(uri.clone(), "GET", "пыщьпыщь", StatusCode::OK, true),
(uri.clone(), "DELETE", "", StatusCode::OK, false),
(uri, "GET", "", StatusCode::NOT_FOUND, false),
];
requests_chain(chain.into_iter(), |_| token()).await;
}
fn delete_prefix_token(uri: &str) -> String {
use serde::Serialize;
let parts = uri.split("/").collect::<Vec<&str>>();
#[derive(Serialize)]
struct PrefixClaims {
tenant_id: TenantId,
timeline_id: Option<TimelineId>,
endpoint_id: Option<object_storage::EndpointId>,
exp: u64,
}
let claims = PrefixClaims {
tenant_id: parts.get(1).map(|c| c.parse().unwrap()).unwrap(),
timeline_id: parts.get(2).map(|c| c.parse().unwrap()),
endpoint_id: parts.get(3).map(ToString::to_string),
exp: u64::MAX,
};
let key = jsonwebtoken::EncodingKey::from_ed_pem(TEST_PRIV_KEY_ED25519).unwrap();
let header = jsonwebtoken::Header::new(object_storage::VALIDATION_ALGO);
jsonwebtoken::encode(&header, &claims, &key).unwrap()
}
// Can't use single digit numbers as they won't be validated as TimelineId and EndpointId
#[testlog(tokio::test)]
async fn delete_prefix() {
let tenant_id =
TenantId::from_array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]).to_string();
let t2 = TimelineId::from_array([2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
let t3 = TimelineId::from_array([3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
let t4 = TimelineId::from_array([4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
let f = |timeline, path| format!("/{tenant_id}/{timeline}{path}");
// Why extra slash in string literals? Axum is weird with URIs:
// /1/2 and 1/2/ match different routes, thus first yields OK and second NOT_FOUND
// as it matches /tenant/timeline/endpoint, see https://stackoverflow.com/a/75355932
// The cost of removing trailing slash is suprisingly hard:
// * Add tower dependency with NormalizePath layer
// * wrap Router<()> in this layer https://github.com/tokio-rs/axum/discussions/2377
// * Rewrite make_service() -> into_make_service()
// * Rewrite oneshot() (not available for NormalizePath)
// I didn't manage to get it working correctly
let chain = vec![
// create 1/2/3/4, 1/2/3/5, delete prefix 1/2/3 -> empty
(f(t2, "/3/4"), "PUT", "", StatusCode::OK, false),
(f(t2, "/3/4"), "PUT", "", StatusCode::OK, false), // we can override file contents
(f(t2, "/3/5"), "PUT", "", StatusCode::OK, false),
(f(t2, "/3"), "DELETE", "", StatusCode::OK, false),
(f(t2, "/3/4"), "GET", "", StatusCode::NOT_FOUND, false),
(f(t2, "/3/5"), "GET", "", StatusCode::NOT_FOUND, false),
// create 1/2/3/4, 1/2/5/6, delete prefix 1/2/3 -> 1/2/5/6
(f(t2, "/3/4"), "PUT", "", StatusCode::OK, false),
(f(t2, "/5/6"), "PUT", "", StatusCode::OK, false),
(f(t2, "/3"), "DELETE", "", StatusCode::OK, false),
(f(t2, "/3/4"), "GET", "", StatusCode::NOT_FOUND, false),
(f(t2, "/5/6"), "GET", "", StatusCode::OK, false),
// create 1/2/3/4, 1/2/7/8, delete prefix 1/2 -> empty
(f(t2, "/3/4"), "PUT", "", StatusCode::OK, false),
(f(t2, "/7/8"), "PUT", "", StatusCode::OK, false),
(f(t2, ""), "DELETE", "", StatusCode::OK, false),
(f(t2, "/3/4"), "GET", "", StatusCode::NOT_FOUND, false),
(f(t2, "/7/8"), "GET", "", StatusCode::NOT_FOUND, false),
// create 1/2/3/4, 1/2/5/6, 1/3/8/9, delete prefix 1/2/3 -> 1/2/5/6, 1/3/8/9
(f(t2, "/3/4"), "PUT", "", StatusCode::OK, false),
(f(t2, "/5/6"), "PUT", "", StatusCode::OK, false),
(f(t3, "/8/9"), "PUT", "", StatusCode::OK, false),
(f(t2, "/3"), "DELETE", "", StatusCode::OK, false),
(f(t2, "/3/4"), "GET", "", StatusCode::NOT_FOUND, false),
(f(t2, "/5/6"), "GET", "", StatusCode::OK, false),
(f(t3, "/8/9"), "GET", "", StatusCode::OK, false),
// create 1/4/5/6, delete prefix 1/2 -> 1/3/8/9, 1/4/5/6
(f(t4, "/5/6"), "PUT", "", StatusCode::OK, false),
(f(t2, ""), "DELETE", "", StatusCode::OK, false),
(f(t2, "/3/4"), "GET", "", StatusCode::NOT_FOUND, false),
(f(t2, "/5/6"), "GET", "", StatusCode::NOT_FOUND, false),
(f(t3, "/8/9"), "GET", "", StatusCode::OK, false),
(f(t4, "/5/6"), "GET", "", StatusCode::OK, false),
// delete prefix 1 -> empty
(format!("/{tenant_id}"), "DELETE", "", StatusCode::OK, false),
(f(t2, "/3/4"), "GET", "", StatusCode::NOT_FOUND, false),
(f(t2, "/5/6"), "GET", "", StatusCode::NOT_FOUND, false),
(f(t3, "/8/9"), "GET", "", StatusCode::NOT_FOUND, false),
(f(t4, "/5/6"), "GET", "", StatusCode::NOT_FOUND, false),
];
requests_chain(chain.into_iter(), delete_prefix_token).await;
}
}

View File

@@ -1,344 +0,0 @@
use anyhow::Result;
use axum::extract::{FromRequestParts, Path};
use axum::response::{IntoResponse, Response};
use axum::{RequestPartsExt, http::StatusCode, http::request::Parts};
use axum_extra::TypedHeader;
use axum_extra::headers::{Authorization, authorization::Bearer};
use camino::Utf8PathBuf;
use jsonwebtoken::{DecodingKey, Validation};
use remote_storage::{GenericRemoteStorage, RemotePath};
use serde::{Deserialize, Serialize};
use std::fmt::Display;
use std::result::Result as StdResult;
use std::sync::Arc;
use tokio_util::sync::CancellationToken;
use tracing::{debug, error};
use utils::id::{TenantId, TimelineId};
// simplified version of utils::auth::JwtAuth
pub struct JwtAuth {
decoding_key: DecodingKey,
validation: Validation,
}
pub const VALIDATION_ALGO: jsonwebtoken::Algorithm = jsonwebtoken::Algorithm::EdDSA;
impl JwtAuth {
pub fn new(key: &[u8]) -> Result<Self> {
Ok(Self {
decoding_key: DecodingKey::from_ed_pem(key)?,
validation: Validation::new(VALIDATION_ALGO),
})
}
pub fn decode<T: serde::de::DeserializeOwned>(&self, token: &str) -> Result<T> {
Ok(jsonwebtoken::decode(token, &self.decoding_key, &self.validation).map(|t| t.claims)?)
}
}
fn normalize_key(key: &str) -> StdResult<Utf8PathBuf, String> {
let key = clean_utf8(&Utf8PathBuf::from(key));
if key.starts_with("..") || key == "." || key == "/" {
return Err(format!("invalid key {key}"));
}
match key.strip_prefix("/").map(Utf8PathBuf::from) {
Ok(p) => Ok(p),
_ => Ok(key),
}
}
// Copied from path_clean crate with PathBuf->Utf8PathBuf
fn clean_utf8(path: &camino::Utf8Path) -> Utf8PathBuf {
use camino::Utf8Component as Comp;
let mut out = Vec::new();
for comp in path.components() {
match comp {
Comp::CurDir => (),
Comp::ParentDir => match out.last() {
Some(Comp::RootDir) => (),
Some(Comp::Normal(_)) => {
out.pop();
}
None | Some(Comp::CurDir) | Some(Comp::ParentDir) | Some(Comp::Prefix(_)) => {
out.push(comp)
}
},
comp => out.push(comp),
}
}
if !out.is_empty() {
out.iter().collect()
} else {
Utf8PathBuf::from(".")
}
}
pub struct Storage {
pub auth: JwtAuth,
pub storage: GenericRemoteStorage,
pub cancel: CancellationToken,
pub max_upload_file_limit: usize,
}
pub type EndpointId = String; // If needed, reuse small string from proxy/src/types.rc
#[derive(Deserialize, Serialize, PartialEq)]
pub struct Claims {
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
pub endpoint_id: EndpointId,
pub exp: u64,
}
impl Display for Claims {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"Claims(tenant_id {} timeline_id {} endpoint_id {} exp {})",
self.tenant_id, self.timeline_id, self.endpoint_id, self.exp
)
}
}
#[derive(Deserialize, Serialize)]
struct KeyRequest {
tenant_id: TenantId,
timeline_id: TimelineId,
endpoint_id: EndpointId,
path: String,
}
#[derive(Debug, PartialEq)]
pub struct S3Path {
pub path: RemotePath,
}
impl TryFrom<&KeyRequest> for S3Path {
type Error = String;
fn try_from(req: &KeyRequest) -> StdResult<Self, Self::Error> {
let KeyRequest {
tenant_id,
timeline_id,
endpoint_id,
path,
} = &req;
let prefix = format!("{tenant_id}/{timeline_id}/{endpoint_id}",);
let path = Utf8PathBuf::from(prefix).join(normalize_key(path)?);
let path = RemotePath::new(&path).unwrap(); // unwrap() because the path is already relative
Ok(S3Path { path })
}
}
fn unauthorized(route: impl Display, claims: impl Display) -> Response {
debug!(%route, %claims, "route doesn't match claims");
StatusCode::UNAUTHORIZED.into_response()
}
pub fn bad_request(err: impl Display, desc: &'static str) -> Response {
debug!(%err, desc);
(StatusCode::BAD_REQUEST, err.to_string()).into_response()
}
pub fn ok() -> Response {
StatusCode::OK.into_response()
}
pub fn internal_error(err: impl Display, path: impl Display, desc: &'static str) -> Response {
error!(%err, %path, desc);
StatusCode::INTERNAL_SERVER_ERROR.into_response()
}
pub fn not_found(key: impl ToString) -> Response {
(StatusCode::NOT_FOUND, key.to_string()).into_response()
}
impl FromRequestParts<Arc<Storage>> for S3Path {
type Rejection = Response;
async fn from_request_parts(
parts: &mut Parts,
state: &Arc<Storage>,
) -> Result<Self, Self::Rejection> {
let Path(path): Path<KeyRequest> = parts
.extract()
.await
.map_err(|e| bad_request(e, "invalid route"))?;
let TypedHeader(Authorization(bearer)) = parts
.extract::<TypedHeader<Authorization<Bearer>>>()
.await
.map_err(|e| bad_request(e, "invalid token"))?;
let claims: Claims = state
.auth
.decode(bearer.token())
.map_err(|e| bad_request(e, "decoding token"))?;
let route = Claims {
tenant_id: path.tenant_id,
timeline_id: path.timeline_id,
endpoint_id: path.endpoint_id.clone(),
exp: claims.exp,
};
if route != claims {
return Err(unauthorized(route, claims));
}
(&path)
.try_into()
.map_err(|e| bad_request(e, "invalid route"))
}
}
#[derive(Deserialize, Serialize, PartialEq)]
pub struct PrefixKeyPath {
pub tenant_id: TenantId,
pub timeline_id: Option<TimelineId>,
pub endpoint_id: Option<EndpointId>,
}
impl Display for PrefixKeyPath {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"PrefixKeyPath(tenant_id {} timeline_id {} endpoint_id {})",
self.tenant_id,
self.timeline_id
.as_ref()
.map(ToString::to_string)
.unwrap_or("".to_string()),
self.endpoint_id
.as_ref()
.map(ToString::to_string)
.unwrap_or("".to_string())
)
}
}
#[derive(Debug, PartialEq)]
pub struct PrefixS3Path {
pub path: RemotePath,
}
impl From<&PrefixKeyPath> for PrefixS3Path {
fn from(path: &PrefixKeyPath) -> Self {
let timeline_id = path
.timeline_id
.as_ref()
.map(ToString::to_string)
.unwrap_or("".to_string());
let endpoint_id = path
.endpoint_id
.as_ref()
.map(ToString::to_string)
.unwrap_or("".to_string());
let path = Utf8PathBuf::from(path.tenant_id.to_string())
.join(timeline_id)
.join(endpoint_id);
let path = RemotePath::new(&path).unwrap(); // unwrap() because the path is already relative
PrefixS3Path { path }
}
}
impl FromRequestParts<Arc<Storage>> for PrefixS3Path {
type Rejection = Response;
async fn from_request_parts(
parts: &mut Parts,
state: &Arc<Storage>,
) -> Result<Self, Self::Rejection> {
let Path(path) = parts
.extract::<Path<PrefixKeyPath>>()
.await
.map_err(|e| bad_request(e, "invalid route"))?;
let TypedHeader(Authorization(bearer)) = parts
.extract::<TypedHeader<Authorization<Bearer>>>()
.await
.map_err(|e| bad_request(e, "invalid token"))?;
let claims: PrefixKeyPath = state
.auth
.decode(bearer.token())
.map_err(|e| bad_request(e, "invalid token"))?;
if path != claims {
return Err(unauthorized(path, claims));
}
Ok((&path).into())
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn normalize_key() {
let f = super::normalize_key;
assert_eq!(f("hello/world/..").unwrap(), Utf8PathBuf::from("hello"));
assert_eq!(
f("ololo/1/../../not_ololo").unwrap(),
Utf8PathBuf::from("not_ololo")
);
assert!(f("ololo/1/../../../").is_err());
assert!(f(".").is_err());
assert!(f("../").is_err());
assert!(f("").is_err());
assert_eq!(f("/1/2/3").unwrap(), Utf8PathBuf::from("1/2/3"));
assert!(f("/1/2/3/../../../").is_err());
assert!(f("/1/2/3/../../../../").is_err());
}
const TENANT_ID: TenantId =
TenantId::from_array([1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6]);
const TIMELINE_ID: TimelineId =
TimelineId::from_array([1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 7]);
const ENDPOINT_ID: &str = "ep-winter-frost-a662z3vg";
#[test]
fn s3_path() {
let auth = Claims {
tenant_id: TENANT_ID,
timeline_id: TIMELINE_ID,
endpoint_id: ENDPOINT_ID.into(),
exp: u64::MAX,
};
let s3_path = |key| {
let path = &format!("{TENANT_ID}/{TIMELINE_ID}/{ENDPOINT_ID}/{key}");
let path = RemotePath::from_string(path).unwrap();
S3Path { path }
};
let path = "cache_key".to_string();
let mut key_path = KeyRequest {
path,
tenant_id: auth.tenant_id,
timeline_id: auth.timeline_id,
endpoint_id: auth.endpoint_id,
};
assert_eq!(S3Path::try_from(&key_path).unwrap(), s3_path(key_path.path));
key_path.path = "we/can/have/nested/paths".to_string();
assert_eq!(S3Path::try_from(&key_path).unwrap(), s3_path(key_path.path));
key_path.path = "../error/hello/../".to_string();
assert!(S3Path::try_from(&key_path).is_err());
}
#[test]
fn prefix_s3_path() {
let mut path = PrefixKeyPath {
tenant_id: TENANT_ID,
timeline_id: None,
endpoint_id: None,
};
let prefix_path = |s: String| RemotePath::from_string(&s).unwrap();
assert_eq!(
PrefixS3Path::from(&path).path,
prefix_path(format!("{TENANT_ID}"))
);
path.timeline_id = Some(TIMELINE_ID);
assert_eq!(
PrefixS3Path::from(&path).path,
prefix_path(format!("{TENANT_ID}/{TIMELINE_ID}"))
);
path.endpoint_id = Some(ENDPOINT_ID.into());
assert_eq!(
PrefixS3Path::from(&path).path,
prefix_path(format!("{TENANT_ID}/{TIMELINE_ID}/{ENDPOINT_ID}"))
);
}
}

View File

@@ -1,65 +0,0 @@
//! `object_storage` is a service which provides API for uploading and downloading
//! files. It is used by compute and control plane for accessing LFC prewarm data.
//! This service is deployed either as a separate component or as part of compute image
//! for large computes.
mod app;
use anyhow::Context;
use tracing::info;
use utils::logging;
//see set()
const fn max_upload_file_limit() -> usize {
100 * 1024 * 1024
}
#[derive(serde::Deserialize)]
#[serde(tag = "type")]
struct Config {
listen: std::net::SocketAddr,
pemfile: camino::Utf8PathBuf,
#[serde(flatten)]
storage_config: remote_storage::RemoteStorageConfig,
#[serde(default = "max_upload_file_limit")]
max_upload_file_limit: usize,
}
#[tokio::main]
async fn main() -> anyhow::Result<()> {
logging::init(
logging::LogFormat::Plain,
logging::TracingErrorLayerEnablement::EnableWithRustLogFilter,
logging::Output::Stdout,
)?;
let config: String = std::env::args().skip(1).take(1).collect();
if config.is_empty() {
anyhow::bail!("Usage: object_storage config.json")
}
info!("Reading config from {config}");
let config = std::fs::read_to_string(config.clone())?;
let config: Config = serde_json::from_str(&config).context("parsing config")?;
info!("Reading pemfile from {}", config.pemfile.clone());
let pemfile = std::fs::read(config.pemfile.clone())?;
info!("Loading public key from {}", config.pemfile.clone());
let auth = object_storage::JwtAuth::new(&pemfile)?;
let listener = tokio::net::TcpListener::bind(config.listen).await.unwrap();
info!("listening on {}", listener.local_addr().unwrap());
let storage = remote_storage::GenericRemoteStorage::from_config(&config.storage_config).await?;
let cancel = tokio_util::sync::CancellationToken::new();
app::check_storage_permissions(&storage, cancel.clone()).await?;
let proxy = std::sync::Arc::new(object_storage::Storage {
auth,
storage,
cancel: cancel.clone(),
max_upload_file_limit: config.max_upload_file_limit,
});
tokio::spawn(utils::signals::signal_handler(cancel.clone()));
axum::serve(listener, app::app(proxy))
.with_graceful_shutdown(async move { cancel.cancelled().await })
.await?;
Ok(())
}

View File

@@ -10,8 +10,6 @@ default = []
# which adds some runtime cost to run tests on outage conditions
testing = ["fail/failpoints", "pageserver_api/testing", "wal_decoder/testing", "pageserver_client/testing"]
fuzz-read-path = ["testing"]
[dependencies]
anyhow.workspace = true
arc-swap.workspace = true

View File

@@ -126,7 +126,7 @@ async fn ingest(
max_concurrency: NonZeroUsize::new(1).unwrap(),
});
let (_desc, path) = layer
.write_to_disk(&ctx, None, l0_flush_state.inner(), &gate, cancel.clone())
.write_to_disk(&ctx, None, l0_flush_state.inner())
.await?
.unwrap();
tokio::fs::remove_file(path).await?;

View File

@@ -65,7 +65,7 @@ use bytes::{Buf, Bytes};
use criterion::{BenchmarkId, Criterion};
use once_cell::sync::Lazy;
use pageserver::config::PageServerConf;
use pageserver::walredo::{PostgresRedoManager, RedoAttemptType};
use pageserver::walredo::PostgresRedoManager;
use pageserver_api::key::Key;
use pageserver_api::record::NeonWalRecord;
use pageserver_api::shard::TenantShardId;
@@ -223,14 +223,7 @@ impl Request {
// TODO: avoid these clones
manager
.request_redo(
*key,
*lsn,
base_img.clone(),
records.clone(),
*pg_version,
RedoAttemptType::ReadPage,
)
.request_redo(*key, *lsn, base_img.clone(), records.clone(), *pg_version)
.await
.context("request_redo")
}

View File

@@ -34,7 +34,7 @@ use utils::lsn::Lsn;
use crate::context::RequestContext;
use crate::pgdatadir_mapping::Version;
use crate::tenant::storage_layer::IoConcurrency;
use crate::tenant::timeline::{GetVectoredError, VersionedKeySpaceQuery};
use crate::tenant::timeline::GetVectoredError;
use crate::tenant::{PageReconstructError, Timeline};
#[derive(Debug, thiserror::Error)]
@@ -353,10 +353,9 @@ where
let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar);
for part in slru_partitions.parts {
let query = VersionedKeySpaceQuery::uniform(part, self.lsn);
let blocks = self
.timeline
.get_vectored(query, self.io_concurrency.clone(), self.ctx)
.get_vectored(part, self.lsn, self.io_concurrency.clone(), self.ctx)
.await?;
for (key, block) in blocks {

View File

@@ -31,6 +31,7 @@ use pageserver::{
};
use postgres_backend::AuthType;
use remote_storage::GenericRemoteStorage;
use tokio::signal::unix::SignalKind;
use tokio::time::Instant;
use tokio_util::sync::CancellationToken;
use tracing::*;
@@ -452,24 +453,6 @@ fn start_pageserver(
info!("Using auth for http API: {:#?}", conf.http_auth_type);
info!("Using auth for pg connections: {:#?}", conf.pg_auth_type);
let tls_server_config = if conf.listen_https_addr.is_some() || conf.enable_tls_page_service_api
{
let resolver = BACKGROUND_RUNTIME.block_on(ReloadingCertificateResolver::new(
"main",
&conf.ssl_key_file,
&conf.ssl_cert_file,
conf.ssl_cert_reload_period,
))?;
let server_config = rustls::ServerConfig::builder()
.with_no_client_auth()
.with_cert_resolver(resolver);
Some(Arc::new(server_config))
} else {
None
};
match var("NEON_AUTH_TOKEN") {
Ok(v) => {
info!("Loaded JWT token for authentication with Safekeeper");
@@ -688,11 +671,17 @@ fn start_pageserver(
let https_task = match https_listener {
Some(https_listener) => {
let tls_server_config = tls_server_config
.clone()
.expect("tls_server_config is set earlier if https is enabled");
let resolver = MGMT_REQUEST_RUNTIME.block_on(ReloadingCertificateResolver::new(
&conf.ssl_key_file,
&conf.ssl_cert_file,
conf.ssl_cert_reload_period,
))?;
let tls_acceptor = tokio_rustls::TlsAcceptor::from(tls_server_config);
let server_config = rustls::ServerConfig::builder()
.with_no_client_auth()
.with_cert_resolver(resolver);
let tls_acceptor = tokio_rustls::TlsAcceptor::from(Arc::new(server_config));
let server =
http_utils::server::Server::new(service, https_listener, Some(tls_acceptor))?;
@@ -748,11 +737,6 @@ fn start_pageserver(
tokio::net::TcpListener::from_std(pageserver_listener)
.context("create tokio listener")?
},
if conf.enable_tls_page_service_api {
tls_server_config
} else {
None
},
);
// All started up! Now just sit and wait for shutdown signal.
@@ -760,7 +744,32 @@ fn start_pageserver(
let signal_token = CancellationToken::new();
let signal_cancel = signal_token.child_token();
tokio::spawn(utils::signals::signal_handler(signal_token));
// Spawn signal handlers. Runs in a loop since we want to be responsive to multiple signals
// even after triggering shutdown (e.g. a SIGQUIT after a slow SIGTERM shutdown). See:
// https://github.com/neondatabase/neon/issues/9740.
tokio::spawn(async move {
let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt()).unwrap();
let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate()).unwrap();
let mut sigquit = tokio::signal::unix::signal(SignalKind::quit()).unwrap();
loop {
let signal = tokio::select! {
_ = sigquit.recv() => {
info!("Got signal SIGQUIT. Terminating in immediate shutdown mode.");
std::process::exit(111);
}
_ = sigint.recv() => "SIGINT",
_ = sigterm.recv() => "SIGTERM",
};
if !signal_token.is_cancelled() {
info!("Got signal {signal}. Terminating gracefully in fast shutdown mode.");
signal_token.cancel();
} else {
info!("Got signal {signal}. Already shutting down.");
}
}
});
// Wait for cancellation signal and shut down the pageserver.
//

View File

@@ -219,11 +219,6 @@ pub struct PageServerConf {
pub generate_unarchival_heatmap: bool,
pub tracing: Option<pageserver_api::config::Tracing>,
/// Enable TLS in page service API.
/// Does not force TLS: the client negotiates TLS usage during the handshake.
/// Uses key and certificate from ssl_key_file/ssl_cert_file.
pub enable_tls_page_service_api: bool,
}
/// Token for authentication to safekeepers
@@ -396,7 +391,6 @@ impl PageServerConf {
load_previous_heatmap,
generate_unarchival_heatmap,
tracing,
enable_tls_page_service_api,
} = config_toml;
let mut conf = PageServerConf {
@@ -447,7 +441,6 @@ impl PageServerConf {
page_service_pipelining,
get_vectored_concurrent_io,
tracing,
enable_tls_page_service_api,
// ------------------------------------------------------------
// fields that require additional validation or custom handling

View File

@@ -212,12 +212,6 @@ paths:
schema:
type: string
format: date-time
"412":
description: No timestamp is found for given LSN, e.g. if there had been no commits till LSN
content:
application/json:
schema:
$ref: "#/components/schemas/PreconditionFailedError"
/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp:
parameters:

View File

@@ -67,7 +67,7 @@ use crate::tenant::mgr::{
};
use crate::tenant::remote_timeline_client::index::GcCompactionState;
use crate::tenant::remote_timeline_client::{
download_index_part, download_tenant_manifest, list_remote_tenant_shards, list_remote_timelines,
download_index_part, list_remote_tenant_shards, list_remote_timelines,
};
use crate::tenant::secondary::SecondaryController;
use crate::tenant::size::ModelInputs;
@@ -989,7 +989,7 @@ async fn get_lsn_by_timestamp_handler(
if !tenant_shard_id.is_shard_zero() {
// Requires SLRU contents, which are only stored on shard zero
return Err(ApiError::BadRequest(anyhow!(
"Lsn calculations by timestamp are only available on shard zero"
"Size calculations are only available on shard zero"
)));
}
@@ -1064,7 +1064,7 @@ async fn get_timestamp_of_lsn_handler(
if !tenant_shard_id.is_shard_zero() {
// Requires SLRU contents, which are only stored on shard zero
return Err(ApiError::BadRequest(anyhow!(
"Timestamp calculations by lsn are only available on shard zero"
"Size calculations are only available on shard zero"
)));
}
@@ -1090,8 +1090,8 @@ async fn get_timestamp_of_lsn_handler(
.to_string();
json_response(StatusCode::OK, time)
}
None => Err(ApiError::PreconditionFailed(
format!("Timestamp for lsn {} not found", lsn).into(),
None => Err(ApiError::NotFound(
anyhow::anyhow!("Timestamp for lsn {} not found", lsn).into(),
)),
}
}
@@ -2274,7 +2274,6 @@ async fn timeline_compact_handler(
if Some(true) == parse_query_param::<_, bool>(&request, "dry_run")? {
flags |= CompactFlags::DryRun;
}
// Manual compaction does not yield for L0.
let wait_until_uploaded =
parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);
@@ -2912,22 +2911,9 @@ async fn tenant_scan_remote_handler(
};
}
let result =
download_tenant_manifest(&state.remote_storage, &tenant_shard_id, generation, &cancel)
.instrument(info_span!("download_tenant_manifest",
tenant_id=%tenant_shard_id.tenant_id,
shard_id=%tenant_shard_id.shard_slug()))
.await;
let stripe_size = match result {
Ok((manifest, _, _)) => manifest.stripe_size,
Err(DownloadError::NotFound) => None,
Err(err) => return Err(ApiError::InternalServerError(anyhow!(err))),
};
response.shards.push(TenantScanRemoteStorageShard {
tenant_shard_id,
generation: generation.into(),
stripe_size,
});
}
@@ -3253,7 +3239,7 @@ async fn ingest_aux_files(
modification
.put_file(&fname, content.as_bytes(), &ctx)
.await
.map_err(|e| ApiError::InternalServerError(e.into()))?;
.map_err(ApiError::InternalServerError)?;
}
modification
.commit(&ctx)
@@ -3382,11 +3368,11 @@ async fn put_tenant_timeline_import_basebackup(
let broker_client = state.broker_client.clone();
let mut body = StreamReader::new(
request
.into_body()
.map(|res| res.map_err(|error| std::io::Error::other(anyhow::anyhow!(error)))),
);
let mut body = StreamReader::new(request.into_body().map(|res| {
res.map_err(|error| {
std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!(error))
})
}));
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
@@ -3460,7 +3446,7 @@ async fn put_tenant_timeline_import_wal(
let mut body = StreamReader::new(request.into_body().map(|res| {
res.map_err(|error| {
std::io::Error::other( anyhow::anyhow!(error))
std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!(error))
})
}));

View File

@@ -27,7 +27,7 @@ use crate::context::RequestContext;
use crate::metrics::WAL_INGEST;
use crate::pgdatadir_mapping::*;
use crate::tenant::Timeline;
use crate::walingest::{WalIngest, WalIngestErrorKind};
use crate::walingest::WalIngest;
// Returns checkpoint LSN from controlfile
pub fn get_lsn_from_controlfile(path: &Utf8Path) -> Result<Lsn> {
@@ -157,9 +157,9 @@ async fn import_rel(
.put_rel_creation(rel, nblocks as u32, ctx)
.await
{
match e.kind {
WalIngestErrorKind::RelationAlreadyExists(rel) => {
debug!("Relation {rel} already exists. We must be extending it.")
match e {
RelationError::AlreadyExists => {
debug!("Relation {} already exist. We must be extending it.", rel)
}
_ => return Err(e.into()),
}

View File

@@ -17,7 +17,7 @@ use metrics::{
use once_cell::sync::Lazy;
use pageserver_api::config::{
PageServicePipeliningConfig, PageServicePipeliningConfigPipelined,
PageServiceProtocolPipelinedBatchingStrategy, PageServiceProtocolPipelinedExecutionStrategy,
PageServiceProtocolPipelinedExecutionStrategy,
};
use pageserver_api::models::InMemoryLayerInfo;
use pageserver_api::shard::TenantShardId;
@@ -1714,28 +1714,6 @@ pub enum SmgrQueryType {
Test,
}
#[derive(
Debug,
Clone,
Copy,
IntoStaticStr,
strum_macros::EnumCount,
strum_macros::EnumIter,
strum_macros::FromRepr,
enum_map::Enum,
)]
#[strum(serialize_all = "snake_case")]
pub enum GetPageBatchBreakReason {
BatchFull,
NonBatchableRequest,
NonUniformLsn,
SamePageAtDifferentLsn,
NonUniformTimeline,
ExecutorSteal,
#[cfg(feature = "testing")]
NonUniformKey,
}
pub(crate) struct SmgrQueryTimePerTimeline {
global_started: [IntCounter; SmgrQueryType::COUNT],
global_latency: [Histogram; SmgrQueryType::COUNT],
@@ -1747,8 +1725,6 @@ pub(crate) struct SmgrQueryTimePerTimeline {
per_timeline_flush_in_progress_micros: IntCounter,
global_batch_wait_time: Histogram,
per_timeline_batch_wait_time: Histogram,
global_batch_break_reason: [IntCounter; GetPageBatchBreakReason::COUNT],
per_timeline_batch_break_reason: GetPageBatchBreakReasonTimelineMetrics,
throttling: Arc<tenant_throttling::Pagestream>,
}
@@ -1882,55 +1858,12 @@ static PAGE_SERVICE_BATCH_SIZE_PER_TENANT_TIMELINE: Lazy<HistogramVec> = Lazy::n
.expect("failed to define a metric")
});
static PAGE_SERVICE_BATCH_BREAK_REASON_GLOBAL: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
// it's a counter, but, name is prepared to extend it to a histogram of queue depth
"pageserver_page_service_batch_break_reason_global",
"Reason for breaking batches of get page requests",
&["reason"],
)
.expect("failed to define a metric")
});
struct GetPageBatchBreakReasonTimelineMetrics {
map: EnumMap<GetPageBatchBreakReason, IntCounter>,
}
impl GetPageBatchBreakReasonTimelineMetrics {
fn new(tenant_id: &str, shard_slug: &str, timeline_id: &str) -> Self {
GetPageBatchBreakReasonTimelineMetrics {
map: EnumMap::from_array(std::array::from_fn(|reason_idx| {
let reason = GetPageBatchBreakReason::from_usize(reason_idx);
PAGE_SERVICE_BATCH_BREAK_REASON_PER_TENANT_TIMELINE.with_label_values(&[
tenant_id,
shard_slug,
timeline_id,
reason.into(),
])
})),
}
}
fn inc(&self, reason: GetPageBatchBreakReason) {
self.map[reason].inc()
}
}
static PAGE_SERVICE_BATCH_BREAK_REASON_PER_TENANT_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_page_service_batch_break_reason",
"Reason for breaking batches of get page requests",
&["tenant_id", "shard_id", "timeline_id", "reason"],
)
.expect("failed to define a metric")
});
pub(crate) static PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
register_int_gauge_vec!(
"pageserver_page_service_config_max_batch_size",
"Configured maximum batch size for the server-side batching functionality of page_service. \
Labels expose more of the configuration parameters.",
&["mode", "execution", "batching"]
&["mode", "execution"]
)
.expect("failed to define a metric")
});
@@ -1938,11 +1871,10 @@ pub(crate) static PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE: Lazy<IntGaugeVec> = Lazy::
fn set_page_service_config_max_batch_size(conf: &PageServicePipeliningConfig) {
PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE.reset();
let (label_values, value) = match conf {
PageServicePipeliningConfig::Serial => (["serial", "-", "-"], 1),
PageServicePipeliningConfig::Serial => (["serial", "-"], 1),
PageServicePipeliningConfig::Pipelined(PageServicePipeliningConfigPipelined {
max_batch_size,
execution,
batching,
}) => {
let mode = "pipelined";
let execution = match execution {
@@ -1951,12 +1883,7 @@ fn set_page_service_config_max_batch_size(conf: &PageServicePipeliningConfig) {
}
PageServiceProtocolPipelinedExecutionStrategy::Tasks => "tasks",
};
let batching = match batching {
PageServiceProtocolPipelinedBatchingStrategy::UniformLsn => "uniform-lsn",
PageServiceProtocolPipelinedBatchingStrategy::ScatteredLsn => "scattered-lsn",
};
([mode, execution, batching], max_batch_size.get())
([mode, execution], max_batch_size.get())
}
};
PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE
@@ -2052,15 +1979,6 @@ impl SmgrQueryTimePerTimeline {
.get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id])
.unwrap();
let global_batch_break_reason = std::array::from_fn(|i| {
let reason = GetPageBatchBreakReason::from_usize(i);
PAGE_SERVICE_BATCH_BREAK_REASON_GLOBAL
.get_metric_with_label_values(&[reason.into()])
.unwrap()
});
let per_timeline_batch_break_reason =
GetPageBatchBreakReasonTimelineMetrics::new(&tenant_id, &shard_slug, &timeline_id);
let global_flush_in_progress_micros =
PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL.clone();
let per_timeline_flush_in_progress_micros = PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS
@@ -2078,8 +1996,6 @@ impl SmgrQueryTimePerTimeline {
per_timeline_flush_in_progress_micros,
global_batch_wait_time,
per_timeline_batch_wait_time,
global_batch_break_reason,
per_timeline_batch_break_reason,
throttling: pagestream_throttle_metrics,
}
}
@@ -2108,16 +2024,9 @@ impl SmgrQueryTimePerTimeline {
}
/// TODO: do something about this? seems odd, we have a similar call on SmgrOpTimer
pub(crate) fn observe_getpage_batch_start(
&self,
batch_size: usize,
break_reason: GetPageBatchBreakReason,
) {
pub(crate) fn observe_getpage_batch_start(&self, batch_size: usize) {
self.global_batch_size.observe(batch_size as f64);
self.per_timeline_batch_size.observe(batch_size as f64);
self.global_batch_break_reason[break_reason.into_usize()].inc();
self.per_timeline_batch_break_reason.inc(break_reason);
}
}
@@ -3483,15 +3392,6 @@ impl TimelineMetrics {
shard_id,
timeline_id,
]);
for reason in GetPageBatchBreakReason::iter() {
let _ = PAGE_SERVICE_BATCH_BREAK_REASON_PER_TENANT_TIMELINE.remove_label_values(&[
tenant_id,
shard_id,
timeline_id,
reason.into(),
]);
}
}
}
@@ -4370,7 +4270,6 @@ pub fn preinitialize_metrics(
[
&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT,
&SMGR_QUERY_STARTED_GLOBAL,
&PAGE_SERVICE_BATCH_BREAK_REASON_GLOBAL,
]
.into_iter()
.for_each(|c| {

View File

@@ -18,7 +18,7 @@ use itertools::Itertools;
use once_cell::sync::OnceCell;
use pageserver_api::config::{
PageServicePipeliningConfig, PageServicePipeliningConfigPipelined,
PageServiceProtocolPipelinedBatchingStrategy, PageServiceProtocolPipelinedExecutionStrategy,
PageServiceProtocolPipelinedExecutionStrategy,
};
use pageserver_api::key::rel_block_to_key;
use pageserver_api::models::{
@@ -58,8 +58,8 @@ use crate::context::{
DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder,
};
use crate::metrics::{
self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, GetPageBatchBreakReason, LIVE_CONNECTIONS,
SmgrOpTimer, TimelineMetrics,
self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, LIVE_CONNECTIONS, SmgrOpTimer,
TimelineMetrics,
};
use crate::pgdatadir_mapping::Version;
use crate::span::{
@@ -105,7 +105,6 @@ pub fn spawn(
pg_auth: Option<Arc<SwappableJwtAuth>>,
perf_trace_dispatch: Option<Dispatch>,
tcp_listener: tokio::net::TcpListener,
tls_config: Option<Arc<rustls::ServerConfig>>,
) -> Listener {
let cancel = CancellationToken::new();
let libpq_ctx = RequestContext::todo_child(
@@ -125,7 +124,6 @@ pub fn spawn(
perf_trace_dispatch,
tcp_listener,
conf.pg_auth_type,
tls_config,
conf.page_service_pipelining.clone(),
libpq_ctx,
cancel.clone(),
@@ -183,7 +181,6 @@ pub async fn libpq_listener_main(
perf_trace_dispatch: Option<Dispatch>,
listener: tokio::net::TcpListener,
auth_type: AuthType,
tls_config: Option<Arc<rustls::ServerConfig>>,
pipelining_config: PageServicePipeliningConfig,
listener_ctx: RequestContext,
listener_cancel: CancellationToken,
@@ -226,7 +223,6 @@ pub async fn libpq_listener_main(
local_auth,
socket,
auth_type,
tls_config.clone(),
pipelining_config.clone(),
connection_ctx,
connections_cancel.child_token(),
@@ -268,7 +264,6 @@ async fn page_service_conn_main(
auth: Option<Arc<SwappableJwtAuth>>,
socket: tokio::net::TcpStream,
auth_type: AuthType,
tls_config: Option<Arc<rustls::ServerConfig>>,
pipelining_config: PageServicePipeliningConfig,
connection_ctx: RequestContext,
cancel: CancellationToken,
@@ -339,8 +334,7 @@ async fn page_service_conn_main(
cancel.clone(),
gate_guard,
);
let pgbackend =
PostgresBackend::new_from_io(socket_fd, socket, peer_addr, auth_type, tls_config)?;
let pgbackend = PostgresBackend::new_from_io(socket_fd, socket, peer_addr, auth_type, None)?;
match pgbackend.run(&mut conn_handler, &cancel).await {
Ok(()) => {
@@ -641,7 +635,6 @@ impl std::fmt::Display for BatchedPageStreamError {
struct BatchedGetPageRequest {
req: PagestreamGetPageRequest,
timer: SmgrOpTimer,
effective_request_lsn: Lsn,
ctx: RequestContext,
}
@@ -671,8 +664,8 @@ enum BatchedFeMessage {
GetPage {
span: Span,
shard: timeline::handle::WeakHandle<TenantManagerTypes>,
effective_request_lsn: Lsn,
pages: smallvec::SmallVec<[BatchedGetPageRequest; 1]>,
batch_break_reason: GetPageBatchBreakReason,
},
DbSize {
span: Span,
@@ -725,119 +718,6 @@ impl BatchedFeMessage {
BatchedFeMessage::RespondError { .. } => {}
}
}
fn should_break_batch(
&self,
other: &BatchedFeMessage,
max_batch_size: NonZeroUsize,
batching_strategy: PageServiceProtocolPipelinedBatchingStrategy,
) -> Option<GetPageBatchBreakReason> {
match (self, other) {
(
BatchedFeMessage::GetPage {
shard: accum_shard,
pages: accum_pages,
..
},
BatchedFeMessage::GetPage {
shard: this_shard,
pages: this_pages,
..
},
) => {
assert_eq!(this_pages.len(), 1);
if accum_pages.len() >= max_batch_size.get() {
trace!(%max_batch_size, "stopping batching because of batch size");
assert_eq!(accum_pages.len(), max_batch_size.get());
return Some(GetPageBatchBreakReason::BatchFull);
}
if !accum_shard.is_same_handle_as(this_shard) {
trace!("stopping batching because timeline object mismatch");
// TODO: we _could_ batch & execute each shard seperately (and in parallel).
// But the current logic for keeping responses in order does not support that.
return Some(GetPageBatchBreakReason::NonUniformTimeline);
}
match batching_strategy {
PageServiceProtocolPipelinedBatchingStrategy::UniformLsn => {
if let Some(last_in_batch) = accum_pages.last() {
if last_in_batch.effective_request_lsn
!= this_pages[0].effective_request_lsn
{
trace!(
accum_lsn = %last_in_batch.effective_request_lsn,
this_lsn = %this_pages[0].effective_request_lsn,
"stopping batching because LSN changed"
);
return Some(GetPageBatchBreakReason::NonUniformLsn);
}
}
}
PageServiceProtocolPipelinedBatchingStrategy::ScatteredLsn => {
// The read path doesn't curently support serving the same page at different LSNs.
// While technically possible, it's uncertain if the complexity is worth it.
// Break the batch if such a case is encountered.
let same_page_different_lsn = accum_pages.iter().any(|batched| {
batched.req.rel == this_pages[0].req.rel
&& batched.req.blkno == this_pages[0].req.blkno
&& batched.effective_request_lsn
!= this_pages[0].effective_request_lsn
});
if same_page_different_lsn {
trace!(
rel=%this_pages[0].req.rel,
blkno=%this_pages[0].req.blkno,
lsn=%this_pages[0].effective_request_lsn,
"stopping batching because same page was requested at different LSNs"
);
return Some(GetPageBatchBreakReason::SamePageAtDifferentLsn);
}
}
}
None
}
#[cfg(feature = "testing")]
(
BatchedFeMessage::Test {
shard: accum_shard,
requests: accum_requests,
..
},
BatchedFeMessage::Test {
shard: this_shard,
requests: this_requests,
..
},
) => {
assert!(this_requests.len() == 1);
if accum_requests.len() >= max_batch_size.get() {
trace!(%max_batch_size, "stopping batching because of batch size");
assert_eq!(accum_requests.len(), max_batch_size.get());
return Some(GetPageBatchBreakReason::BatchFull);
}
if !accum_shard.is_same_handle_as(this_shard) {
trace!("stopping batching because timeline object mismatch");
// TODO: we _could_ batch & execute each shard seperately (and in parallel).
// But the current logic for keeping responses in order does not support that.
return Some(GetPageBatchBreakReason::NonUniformTimeline);
}
let this_batch_key = this_requests[0].req.batch_key;
let accum_batch_key = accum_requests[0].req.batch_key;
if this_requests[0].req.batch_key != accum_requests[0].req.batch_key {
trace!(%accum_batch_key, %this_batch_key, "stopping batching because batch key changed");
return Some(GetPageBatchBreakReason::NonUniformKey);
}
None
}
(_, _) => Some(GetPageBatchBreakReason::NonBatchableRequest),
}
}
}
impl PageServerHandler {
@@ -1139,32 +1019,34 @@ impl PageServerHandler {
.await?;
// We're holding the Handle
let effective_request_lsn = match Self::effective_request_lsn(
// TODO: if we actually need to wait for lsn here, it delays the entire batch which doesn't need to wait
let res = Self::wait_or_get_last_lsn(
&shard,
shard.get_last_record_lsn(),
req.hdr.request_lsn,
req.hdr.not_modified_since,
&shard.get_applied_gc_cutoff_lsn(),
) {
&ctx,
)
.maybe_perf_instrument(&ctx, |current_perf_span| {
info_span!(
target: PERF_TRACE_TARGET,
parent: current_perf_span,
"WAIT_LSN",
)
})
.await;
let effective_request_lsn = match res {
Ok(lsn) => lsn,
Err(e) => {
return respond_error!(span, e);
}
};
BatchedFeMessage::GetPage {
span,
shard: shard.downgrade(),
pages: smallvec::smallvec![BatchedGetPageRequest {
req,
timer,
effective_request_lsn,
ctx,
}],
// The executor grabs the batch when it becomes idle.
// Hence, [`GetPageBatchBreakReason::ExecutorSteal`] is the
// default reason for breaking the batch.
batch_break_reason: GetPageBatchBreakReason::ExecutorSteal,
effective_request_lsn,
pages: smallvec::smallvec![BatchedGetPageRequest { req, timer, ctx }],
}
}
#[cfg(feature = "testing")]
@@ -1190,7 +1072,6 @@ impl PageServerHandler {
#[instrument(skip_all, level = tracing::Level::TRACE)]
#[allow(clippy::boxed_local)]
fn pagestream_do_batch(
batching_strategy: PageServiceProtocolPipelinedBatchingStrategy,
max_batch_size: NonZeroUsize,
batch: &mut Result<BatchedFeMessage, QueryError>,
this_msg: Result<BatchedFeMessage, QueryError>,
@@ -1202,58 +1083,89 @@ impl PageServerHandler {
Err(e) => return Err(Err(e)),
};
let eligible_batch = match batch {
Ok(b) => b,
Err(_) => {
return Err(Ok(this_msg));
}
};
let batch_break =
eligible_batch.should_break_batch(&this_msg, max_batch_size, batching_strategy);
match batch_break {
Some(reason) => {
if let BatchedFeMessage::GetPage {
batch_break_reason, ..
} = eligible_batch
{
*batch_break_reason = reason;
match (&mut *batch, this_msg) {
// something batched already, let's see if we can add this message to the batch
(
Ok(BatchedFeMessage::GetPage {
span: _,
shard: accum_shard,
pages: accum_pages,
effective_request_lsn: accum_lsn,
}),
BatchedFeMessage::GetPage {
span: _,
shard: this_shard,
pages: this_pages,
effective_request_lsn: this_lsn,
},
) if (|| {
assert_eq!(this_pages.len(), 1);
if accum_pages.len() >= max_batch_size.get() {
trace!(%accum_lsn, %this_lsn, %max_batch_size, "stopping batching because of batch size");
assert_eq!(accum_pages.len(), max_batch_size.get());
return false;
}
Err(Ok(this_msg))
}
None => {
if !accum_shard.is_same_handle_as(&this_shard) {
trace!(%accum_lsn, %this_lsn, "stopping batching because timeline object mismatch");
// TODO: we _could_ batch & execute each shard seperately (and in parallel).
// But the current logic for keeping responses in order does not support that.
return false;
}
// the vectored get currently only supports a single LSN, so, bounce as soon
// as the effective request_lsn changes
if *accum_lsn != this_lsn {
trace!(%accum_lsn, %this_lsn, "stopping batching because LSN changed");
return false;
}
true
})() =>
{
// ok to batch
match (eligible_batch, this_msg) {
(
BatchedFeMessage::GetPage {
pages: accum_pages, ..
},
BatchedFeMessage::GetPage {
pages: this_pages, ..
},
) => {
accum_pages.extend(this_pages);
Ok(())
}
#[cfg(feature = "testing")]
(
BatchedFeMessage::Test {
requests: accum_requests,
..
},
BatchedFeMessage::Test {
requests: this_requests,
..
},
) => {
accum_requests.extend(this_requests);
Ok(())
}
// Shape guaranteed by [`BatchedFeMessage::should_break_batch`]
_ => unreachable!(),
accum_pages.extend(this_pages);
Ok(())
}
#[cfg(feature = "testing")]
(
Ok(BatchedFeMessage::Test {
shard: accum_shard,
requests: accum_requests,
..
}),
BatchedFeMessage::Test {
shard: this_shard,
requests: this_requests,
..
},
) if (|| {
assert!(this_requests.len() == 1);
if accum_requests.len() >= max_batch_size.get() {
trace!(%max_batch_size, "stopping batching because of batch size");
assert_eq!(accum_requests.len(), max_batch_size.get());
return false;
}
if !accum_shard.is_same_handle_as(&this_shard) {
trace!("stopping batching because timeline object mismatch");
// TODO: we _could_ batch & execute each shard seperately (and in parallel).
// But the current logic for keeping responses in order does not support that.
return false;
}
let this_batch_key = this_requests[0].req.batch_key;
let accum_batch_key = accum_requests[0].req.batch_key;
if this_requests[0].req.batch_key != accum_requests[0].req.batch_key {
trace!(%accum_batch_key, %this_batch_key, "stopping batching because batch key changed");
return false;
}
true
})() =>
{
// ok to batch
accum_requests.extend(this_requests);
Ok(())
}
// something batched already but this message is unbatchable
(_, this_msg) => {
// by default, don't continue batching
Err(Ok(this_msg))
}
}
}
@@ -1475,8 +1387,8 @@ impl PageServerHandler {
BatchedFeMessage::GetPage {
span,
shard,
effective_request_lsn,
pages,
batch_break_reason,
} => {
fail::fail_point!("ps::handle-pagerequest-message::getpage");
let (shard, ctx) = upgrade_handle_and_set_context!(shard);
@@ -1487,9 +1399,9 @@ impl PageServerHandler {
let res = self
.handle_get_page_at_lsn_request_batched(
&shard,
effective_request_lsn,
pages,
io_concurrency,
batch_break_reason,
&ctx,
)
.instrument(span.clone())
@@ -1806,7 +1718,6 @@ impl PageServerHandler {
let PageServicePipeliningConfigPipelined {
max_batch_size,
execution,
batching: batching_strategy,
} = pipelining_config;
// Macro to _define_ a pipeline stage.
@@ -1858,7 +1769,7 @@ impl PageServerHandler {
exit |= read_res.is_err();
let could_send = batch_tx
.send(read_res, |batch, res| {
Self::pagestream_do_batch(batching_strategy, max_batch_size, batch, res)
Self::pagestream_do_batch(max_batch_size, batch, res)
})
.await;
exit |= could_send.is_err();
@@ -1954,39 +1865,7 @@ impl PageServerHandler {
ctx: &RequestContext,
) -> Result<Lsn, PageStreamError> {
let last_record_lsn = timeline.get_last_record_lsn();
let effective_request_lsn = Self::effective_request_lsn(
timeline,
last_record_lsn,
request_lsn,
not_modified_since,
latest_gc_cutoff_lsn,
)?;
if effective_request_lsn > last_record_lsn {
timeline
.wait_lsn(
not_modified_since,
crate::tenant::timeline::WaitLsnWaiter::PageService,
timeline::WaitLsnTimeout::Default,
ctx,
)
.await?;
// Since we waited for 'effective_request_lsn' to arrive, that is now the last
// record LSN. (Or close enough for our purposes; the last-record LSN can
// advance immediately after we return anyway)
}
Ok(effective_request_lsn)
}
fn effective_request_lsn(
timeline: &Timeline,
last_record_lsn: Lsn,
request_lsn: Lsn,
not_modified_since: Lsn,
latest_gc_cutoff_lsn: &RcuReadGuard<Lsn>,
) -> Result<Lsn, PageStreamError> {
// Sanity check the request
if request_lsn < not_modified_since {
return Err(PageStreamError::BadRequest(
@@ -2021,7 +1900,19 @@ impl PageServerHandler {
}
}
// Wait for WAL up to 'not_modified_since' to arrive, if necessary
if not_modified_since > last_record_lsn {
timeline
.wait_lsn(
not_modified_since,
crate::tenant::timeline::WaitLsnWaiter::PageService,
timeline::WaitLsnTimeout::Default,
ctx,
)
.await?;
// Since we waited for 'not_modified_since' to arrive, that is now the last
// record LSN. (Or close enough for our purposes; the last-record LSN can
// advance immediately after we return anyway)
Ok(not_modified_since)
} else {
// It might be better to use max(not_modified_since, latest_gc_cutoff_lsn)
@@ -2176,16 +2067,16 @@ impl PageServerHandler {
async fn handle_get_page_at_lsn_request_batched(
&mut self,
timeline: &Timeline,
effective_lsn: Lsn,
requests: smallvec::SmallVec<[BatchedGetPageRequest; 1]>,
io_concurrency: IoConcurrency,
batch_break_reason: GetPageBatchBreakReason,
ctx: &RequestContext,
) -> Vec<Result<(PagestreamBeMessage, SmgrOpTimer), BatchedPageStreamError>> {
debug_assert_current_span_has_tenant_and_timeline_id();
timeline
.query_metrics
.observe_getpage_batch_start(requests.len(), batch_break_reason);
.observe_getpage_batch_start(requests.len());
// If a page trace is running, submit an event for this request.
if let Some(page_trace) = timeline.page_trace.load().as_ref() {
@@ -2195,81 +2086,20 @@ impl PageServerHandler {
// Ignore error (trace buffer may be full or tracer may have disconnected).
_ = page_trace.try_send(PageTraceEvent {
key,
effective_lsn: batch.effective_request_lsn,
effective_lsn,
time,
});
}
}
// If any request in the batch needs to wait for LSN, then do so now.
let mut perf_instrument = false;
let max_effective_lsn = requests
.iter()
.map(|req| {
if req.ctx.has_perf_span() {
perf_instrument = true;
}
req.effective_request_lsn
})
.max()
.expect("batch is never empty");
let ctx = match perf_instrument {
true => RequestContextBuilder::from(ctx)
.root_perf_span(|| {
info_span!(
target: PERF_TRACE_TARGET,
"GET_VECTORED",
tenant_id = %timeline.tenant_shard_id.tenant_id,
timeline_id = %timeline.timeline_id,
shard = %timeline.tenant_shard_id.shard_slug(),
%max_effective_lsn
)
})
.attached_child(),
false => ctx.attached_child(),
};
let last_record_lsn = timeline.get_last_record_lsn();
if max_effective_lsn > last_record_lsn {
if let Err(e) = timeline
.wait_lsn(
max_effective_lsn,
crate::tenant::timeline::WaitLsnWaiter::PageService,
timeline::WaitLsnTimeout::Default,
&ctx,
)
.maybe_perf_instrument(&ctx, |current_perf_span| {
info_span!(
target: PERF_TRACE_TARGET,
parent: current_perf_span,
"WAIT_LSN",
)
})
.await
{
return Vec::from_iter(requests.into_iter().map(|req| {
Err(BatchedPageStreamError {
err: PageStreamError::from(e.clone()),
req: req.req.hdr,
})
}));
}
}
let results = timeline
.get_rel_page_at_lsn_batched(
requests.iter().map(|p| {
(
&p.req.rel,
&p.req.blkno,
p.effective_request_lsn,
p.ctx.attached_child(),
)
}),
requests
.iter()
.map(|p| (&p.req.rel, &p.req.blkno, p.ctx.attached_child())),
effective_lsn,
io_concurrency,
&ctx,
ctx,
)
.await;
assert_eq!(results.len(), requests.len());

View File

@@ -6,14 +6,14 @@
//! walingest.rs handles a few things like implicit relation creation and extension.
//! Clarify that)
//!
use std::collections::{HashMap, HashSet, hash_map};
use std::collections::{BTreeMap, HashMap, HashSet, hash_map};
use std::ops::{ControlFlow, Range};
use crate::walingest::{WalIngestError, WalIngestErrorKind};
use crate::{PERF_TRACE_TARGET, ensure_walingest};
use anyhow::Context;
use crate::PERF_TRACE_TARGET;
use anyhow::{Context, ensure};
use bytes::{Buf, Bytes, BytesMut};
use enum_map::Enum;
use itertools::Itertools;
use pageserver_api::key::{
AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, CompactKey, DBDIR_KEY, Key, RelDirExists,
TWOPHASEDIR_KEY, dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range,
@@ -21,7 +21,7 @@ use pageserver_api::key::{
repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key,
slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
};
use pageserver_api::keyspace::{KeySpaceRandomAccum, SparseKeySpace};
use pageserver_api::keyspace::SparseKeySpace;
use pageserver_api::models::RelSizeMigration;
use pageserver_api::record::NeonWalRecord;
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
@@ -40,7 +40,7 @@ use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta};
use super::tenant::{PageReconstructError, Timeline};
use crate::aux_file;
use crate::context::{PerfInstrumentFutureExt, RequestContext};
use crate::context::{PerfInstrumentFutureExt, RequestContext, RequestContextBuilder};
use crate::keyspace::{KeySpace, KeySpaceAccum};
use crate::metrics::{
RELSIZE_CACHE_ENTRIES, RELSIZE_CACHE_HITS, RELSIZE_CACHE_MISSES, RELSIZE_CACHE_MISSES_OLD,
@@ -50,7 +50,7 @@ use crate::span::{
debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id,
};
use crate::tenant::storage_layer::IoConcurrency;
use crate::tenant::timeline::{GetVectoredError, VersionedKeySpaceQuery};
use crate::tenant::timeline::GetVectoredError;
/// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached.
pub const MAX_AUX_FILE_DELTAS: usize = 1024;
@@ -136,8 +136,12 @@ impl From<PageReconstructError> for CalculateLogicalSizeError {
#[derive(Debug, thiserror::Error)]
pub enum RelationError {
#[error("Relation Already Exists")]
AlreadyExists,
#[error("invalid relnode")]
InvalidRelnode,
#[error(transparent)]
Other(#[from] anyhow::Error),
}
///
@@ -206,9 +210,10 @@ impl Timeline {
let pages: smallvec::SmallVec<[_; 1]> = smallvec::smallvec![(tag, blknum)];
let res = self
.get_rel_page_at_lsn_batched(
pages.iter().map(|(tag, blknum)| {
(tag, blknum, effective_lsn, ctx.attached_child())
}),
pages
.iter()
.map(|(tag, blknum)| (tag, blknum, ctx.attached_child())),
effective_lsn,
io_concurrency.clone(),
ctx,
)
@@ -246,7 +251,8 @@ impl Timeline {
/// The ordering of the returned vec corresponds to the ordering of `pages`.
pub(crate) async fn get_rel_page_at_lsn_batched(
&self,
pages: impl ExactSizeIterator<Item = (&RelTag, &BlockNumber, Lsn, RequestContext)>,
pages: impl ExactSizeIterator<Item = (&RelTag, &BlockNumber, RequestContext)>,
effective_lsn: Lsn,
io_concurrency: IoConcurrency,
ctx: &RequestContext,
) -> Vec<Result<Bytes, PageReconstructError>> {
@@ -259,13 +265,11 @@ impl Timeline {
let mut result = Vec::with_capacity(pages.len());
let result_slots = result.spare_capacity_mut();
let mut keys_slots: HashMap<Key, smallvec::SmallVec<[(usize, RequestContext); 1]>> =
HashMap::with_capacity(pages.len());
let mut keys_slots: BTreeMap<Key, smallvec::SmallVec<[(usize, RequestContext); 1]>> =
BTreeMap::default();
let mut req_keyspaces: HashMap<Lsn, KeySpaceRandomAccum> =
HashMap::with_capacity(pages.len());
for (response_slot_idx, (tag, blknum, lsn, ctx)) in pages.enumerate() {
let mut perf_instrument = false;
for (response_slot_idx, (tag, blknum, ctx)) in pages.enumerate() {
if tag.relnode == 0 {
result_slots[response_slot_idx].write(Err(PageReconstructError::Other(
RelationError::InvalidRelnode.into(),
@@ -276,14 +280,14 @@ impl Timeline {
}
let nblocks = match self
.get_rel_size(*tag, Version::Lsn(lsn), &ctx)
.get_rel_size(*tag, Version::Lsn(effective_lsn), &ctx)
.maybe_perf_instrument(&ctx, |crnt_perf_span| {
info_span!(
target: PERF_TRACE_TARGET,
parent: crnt_perf_span,
"GET_REL_SIZE",
reltag=%tag,
lsn=%lsn,
lsn=%effective_lsn,
)
})
.await
@@ -299,7 +303,7 @@ impl Timeline {
if *blknum >= nblocks {
debug!(
"read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page",
tag, blknum, lsn, nblocks
tag, blknum, effective_lsn, nblocks
);
result_slots[response_slot_idx].write(Ok(ZERO_PAGE.clone()));
slots_filled += 1;
@@ -308,29 +312,46 @@ impl Timeline {
let key = rel_block_to_key(*tag, *blknum);
if ctx.has_perf_span() {
perf_instrument = true;
}
let key_slots = keys_slots.entry(key).or_default();
key_slots.push((response_slot_idx, ctx));
let acc = req_keyspaces.entry(lsn).or_default();
acc.add_key(key);
}
let query: Vec<(Lsn, KeySpace)> = req_keyspaces
.into_iter()
.map(|(lsn, acc)| (lsn, acc.to_keyspace()))
.collect();
let keyspace = {
// add_key requires monotonicity
let mut acc = KeySpaceAccum::new();
for key in keys_slots
.keys()
// in fact it requires strong monotonicity
.dedup()
{
acc.add_key(*key);
}
acc.to_keyspace()
};
let ctx = match perf_instrument {
true => RequestContextBuilder::from(ctx)
.root_perf_span(|| {
info_span!(
target: PERF_TRACE_TARGET,
"GET_VECTORED",
tenant_id = %self.tenant_shard_id.tenant_id,
timeline_id = %self.timeline_id,
lsn = %effective_lsn,
shard = %self.tenant_shard_id.shard_slug(),
)
})
.attached_child(),
false => ctx.attached_child(),
};
let query = VersionedKeySpaceQuery::scattered(query);
let res = self
.get_vectored(query, io_concurrency, ctx)
.maybe_perf_instrument(ctx, |current_perf_span| {
info_span!(
target: PERF_TRACE_TARGET,
parent: current_perf_span,
"GET_BATCH",
batch_size = %page_count,
)
})
.get_vectored(keyspace, effective_lsn, io_concurrency, &ctx)
.maybe_perf_instrument(&ctx, |current_perf_span| current_perf_span.clone())
.await;
match res {
@@ -360,12 +381,12 @@ impl Timeline {
// There is no standardized way to express that the batched span followed from N request spans.
// So, abuse the system and mark the request contexts as follows_from the batch span, so we get
// some linkage in our trace viewer. It allows us to answer: which GET_VECTORED did this GET_PAGE wait for.
req_ctx.perf_follows_from(ctx);
req_ctx.perf_follows_from(&ctx);
slots_filled += 1;
}
result_slots[first_slot].write(res);
first_req_ctx.perf_follows_from(ctx);
first_req_ctx.perf_follows_from(&ctx);
slots_filled += 1;
}
}
@@ -404,7 +425,7 @@ impl Timeline {
}
};
req_ctx.perf_follows_from(ctx);
req_ctx.perf_follows_from(&ctx);
result_slots[*slot].write(err);
}
@@ -643,9 +664,8 @@ impl Timeline {
let mut segment = BytesMut::with_capacity(n_blocks as usize * BLCKSZ as usize);
for batch in batches.parts {
let query = VersionedKeySpaceQuery::uniform(batch, lsn);
let blocks = self
.get_vectored(query, io_concurrency.clone(), ctx)
.get_vectored(batch, lsn, io_concurrency.clone(), ctx)
.await?;
for (_key, block) in blocks {
@@ -671,7 +691,7 @@ impl Timeline {
Ok(buf.get_u32_le())
}
/// Does the slru segment exist?
/// Get size of an SLRU segment
pub(crate) async fn get_slru_segment_exists(
&self,
kind: SlruKind,
@@ -824,9 +844,9 @@ impl Timeline {
.await
}
/// Obtain the timestamp for the given lsn.
/// Obtain the possible timestamp range for the given lsn.
///
/// If the lsn has no timestamps (e.g. no commits), returns None.
/// If the lsn has no timestamps, returns None. returns `(min, max, median)` if it has timestamps.
pub(crate) async fn get_timestamp_for_lsn(
&self,
probe_lsn: Lsn,
@@ -882,9 +902,8 @@ impl Timeline {
);
for batch in batches.parts.into_iter().rev() {
let query = VersionedKeySpaceQuery::uniform(batch, probe_lsn);
let blocks = self
.get_vectored(query, io_concurrency.clone(), ctx)
.get_vectored(batch, probe_lsn, io_concurrency.clone(), ctx)
.await?;
for (_key, clog_page) in blocks.into_iter().rev() {
@@ -1459,8 +1478,8 @@ impl DatadirModification<'_> {
}
/// Set the current lsn
pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> Result<(), WalIngestError> {
ensure_walingest!(
pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> {
ensure!(
lsn >= self.lsn,
"setting an older lsn {} than {} is not allowed",
lsn,
@@ -1559,7 +1578,7 @@ impl DatadirModification<'_> {
&mut self,
rel: RelTag,
ctx: &RequestContext,
) -> Result<u32, WalIngestError> {
) -> Result<u32, PageReconstructError> {
// Get current size and put rel creation if rel doesn't exist
//
// NOTE: we check the cache first even though get_rel_exists and get_rel_size would
@@ -1574,13 +1593,14 @@ impl DatadirModification<'_> {
.await?
{
// create it with 0 size initially, the logic below will extend it
self.put_rel_creation(rel, 0, ctx).await?;
self.put_rel_creation(rel, 0, ctx)
.await
.context("Relation Error")?;
Ok(0)
} else {
Ok(self
.tline
self.tline
.get_rel_size(rel, Version::Modified(self), ctx)
.await?)
.await
}
}
@@ -1617,14 +1637,11 @@ impl DatadirModification<'_> {
// TODO(vlad): remove this argument and replace the shard check with is_key_local
shard: &ShardIdentity,
ctx: &RequestContext,
) -> Result<(), WalIngestError> {
) -> anyhow::Result<()> {
let mut gaps_at_lsns = Vec::default();
for meta in batch.metadata.iter() {
let key = Key::from_compact(meta.key());
let (rel, blkno) = key
.to_rel_block()
.map_err(|_| WalIngestErrorKind::InvalidKey(key, meta.lsn()))?;
let (rel, blkno) = Key::from_compact(meta.key()).to_rel_block()?;
let new_nblocks = blkno + 1;
let old_nblocks = self.create_relation_if_required(rel, ctx).await?;
@@ -1666,8 +1683,8 @@ impl DatadirModification<'_> {
rel: RelTag,
blknum: BlockNumber,
rec: NeonWalRecord,
) -> Result<(), WalIngestError> {
ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);
) -> anyhow::Result<()> {
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
self.put(rel_block_to_key(rel, blknum), Value::WalRecord(rec));
Ok(())
}
@@ -1679,7 +1696,7 @@ impl DatadirModification<'_> {
segno: u32,
blknum: BlockNumber,
rec: NeonWalRecord,
) -> Result<(), WalIngestError> {
) -> anyhow::Result<()> {
if !self.tline.tenant_shard_id.is_shard_zero() {
return Ok(());
}
@@ -1697,11 +1714,14 @@ impl DatadirModification<'_> {
rel: RelTag,
blknum: BlockNumber,
img: Bytes,
) -> Result<(), WalIngestError> {
ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);
) -> anyhow::Result<()> {
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
let key = rel_block_to_key(rel, blknum);
if !key.is_valid_key_on_write_path() {
Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?;
anyhow::bail!(
"the request contains data not supported by pageserver at {}",
key
);
}
self.put(rel_block_to_key(rel, blknum), Value::Image(img));
Ok(())
@@ -1713,12 +1733,15 @@ impl DatadirModification<'_> {
segno: u32,
blknum: BlockNumber,
img: Bytes,
) -> Result<(), WalIngestError> {
) -> anyhow::Result<()> {
assert!(self.tline.tenant_shard_id.is_shard_zero());
let key = slru_block_to_key(kind, segno, blknum);
if !key.is_valid_key_on_write_path() {
Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?;
anyhow::bail!(
"the request contains data not supported by pageserver at {}",
key
);
}
self.put(key, Value::Image(img));
Ok(())
@@ -1728,11 +1751,15 @@ impl DatadirModification<'_> {
&mut self,
rel: RelTag,
blknum: BlockNumber,
) -> Result<(), WalIngestError> {
ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);
) -> anyhow::Result<()> {
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
let key = rel_block_to_key(rel, blknum);
if !key.is_valid_key_on_write_path() {
Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?;
anyhow::bail!(
"the request contains data not supported by pageserver: {} @ {}",
key,
self.lsn
);
}
let batch = self
@@ -1749,11 +1776,15 @@ impl DatadirModification<'_> {
kind: SlruKind,
segno: u32,
blknum: BlockNumber,
) -> Result<(), WalIngestError> {
) -> anyhow::Result<()> {
assert!(self.tline.tenant_shard_id.is_shard_zero());
let key = slru_block_to_key(kind, segno, blknum);
if !key.is_valid_key_on_write_path() {
Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?;
anyhow::bail!(
"the request contains data not supported by pageserver: {} @ {}",
key,
self.lsn
);
}
let batch = self
@@ -1801,10 +1832,8 @@ impl DatadirModification<'_> {
dbnode: Oid,
img: Bytes,
ctx: &RequestContext,
) -> Result<(), WalIngestError> {
let v2_enabled = self
.maybe_enable_rel_size_v2()
.map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?;
) -> anyhow::Result<()> {
let v2_enabled = self.maybe_enable_rel_size_v2()?;
// Add it to the directory (if it doesn't exist already)
let buf = self.get(DBDIR_KEY, ctx).await?;
@@ -1845,13 +1874,13 @@ impl DatadirModification<'_> {
xid: u64,
img: Bytes,
ctx: &RequestContext,
) -> Result<(), WalIngestError> {
) -> anyhow::Result<()> {
// Add it to the directory entry
let dirbuf = self.get(TWOPHASEDIR_KEY, ctx).await?;
let newdirbuf = if self.tline.pg_version >= 17 {
let mut dir = TwoPhaseDirectoryV17::des(&dirbuf)?;
if !dir.xids.insert(xid) {
Err(WalIngestErrorKind::FileAlreadyExists(xid))?;
anyhow::bail!("twophase file for xid {} already exists", xid);
}
self.pending_directory_entries.push((
DirectoryKind::TwoPhase,
@@ -1862,7 +1891,7 @@ impl DatadirModification<'_> {
let xid = xid as u32;
let mut dir = TwoPhaseDirectory::des(&dirbuf)?;
if !dir.xids.insert(xid) {
Err(WalIngestErrorKind::FileAlreadyExists(xid.into()))?;
anyhow::bail!("twophase file for xid {} already exists", xid);
}
self.pending_directory_entries.push((
DirectoryKind::TwoPhase,
@@ -1880,22 +1909,22 @@ impl DatadirModification<'_> {
&mut self,
origin_id: RepOriginId,
origin_lsn: Lsn,
) -> Result<(), WalIngestError> {
) -> anyhow::Result<()> {
let key = repl_origin_key(origin_id);
self.put(key, Value::Image(origin_lsn.ser().unwrap().into()));
Ok(())
}
pub async fn drop_replorigin(&mut self, origin_id: RepOriginId) -> Result<(), WalIngestError> {
pub async fn drop_replorigin(&mut self, origin_id: RepOriginId) -> anyhow::Result<()> {
self.set_replorigin(origin_id, Lsn::INVALID).await
}
pub fn put_control_file(&mut self, img: Bytes) -> Result<(), WalIngestError> {
pub fn put_control_file(&mut self, img: Bytes) -> anyhow::Result<()> {
self.put(CONTROLFILE_KEY, Value::Image(img));
Ok(())
}
pub fn put_checkpoint(&mut self, img: Bytes) -> Result<(), WalIngestError> {
pub fn put_checkpoint(&mut self, img: Bytes) -> anyhow::Result<()> {
self.put(CHECKPOINT_KEY, Value::Image(img));
Ok(())
}
@@ -1905,7 +1934,7 @@ impl DatadirModification<'_> {
spcnode: Oid,
dbnode: Oid,
ctx: &RequestContext,
) -> Result<(), WalIngestError> {
) -> anyhow::Result<()> {
let total_blocks = self
.tline
.get_db_size(spcnode, dbnode, Version::Modified(self), ctx)
@@ -1944,21 +1973,20 @@ impl DatadirModification<'_> {
rel: RelTag,
nblocks: BlockNumber,
ctx: &RequestContext,
) -> Result<(), WalIngestError> {
) -> Result<(), RelationError> {
if rel.relnode == 0 {
Err(WalIngestErrorKind::LogicalError(anyhow::anyhow!(
"invalid relnode"
)))?;
return Err(RelationError::InvalidRelnode);
}
// It's possible that this is the first rel for this db in this
// tablespace. Create the reldir entry for it if so.
let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await?)?;
let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await.context("read db")?)
.context("deserialize db")?;
let dbdir_exists =
if let hash_map::Entry::Vacant(e) = dbdir.dbdirs.entry((rel.spcnode, rel.dbnode)) {
// Didn't exist. Update dbdir
e.insert(false);
let buf = DbDirectory::ser(&dbdir)?;
let buf = DbDirectory::ser(&dbdir).context("serialize db")?;
self.pending_directory_entries.push((
DirectoryKind::Db,
MetricsUpdate::Set(dbdir.dbdirs.len() as u64),
@@ -1975,25 +2003,27 @@ impl DatadirModification<'_> {
RelDirectory::default()
} else {
// reldir already exists, fetch it
RelDirectory::des(&self.get(rel_dir_key, ctx).await?)?
RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?)
.context("deserialize db")?
};
let v2_enabled = self
.maybe_enable_rel_size_v2()
.map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?;
let v2_enabled = self.maybe_enable_rel_size_v2()?;
if v2_enabled {
if rel_dir.rels.contains(&(rel.relnode, rel.forknum)) {
Err(WalIngestErrorKind::RelationAlreadyExists(rel))?;
return Err(RelationError::AlreadyExists);
}
let sparse_rel_dir_key =
rel_tag_sparse_key(rel.spcnode, rel.dbnode, rel.relnode, rel.forknum);
// check if the rel_dir_key exists in v2
let val = self.sparse_get(sparse_rel_dir_key, ctx).await?;
let val = self
.sparse_get(sparse_rel_dir_key, ctx)
.await
.map_err(|e| RelationError::Other(e.into()))?;
let val = RelDirExists::decode_option(val)
.map_err(|_| WalIngestErrorKind::InvalidRelDirKey(sparse_rel_dir_key))?;
.map_err(|_| RelationError::Other(anyhow::anyhow!("invalid reldir key")))?;
if val == RelDirExists::Exists {
Err(WalIngestErrorKind::RelationAlreadyExists(rel))?;
return Err(RelationError::AlreadyExists);
}
self.put(
sparse_rel_dir_key,
@@ -2009,7 +2039,9 @@ impl DatadirModification<'_> {
// will be key not found errors if we don't create an empty one for rel_size_v2.
self.put(
rel_dir_key,
Value::Image(Bytes::from(RelDirectory::ser(&RelDirectory::default())?)),
Value::Image(Bytes::from(
RelDirectory::ser(&RelDirectory::default()).context("serialize")?,
)),
);
}
self.pending_directory_entries
@@ -2017,7 +2049,7 @@ impl DatadirModification<'_> {
} else {
// Add the new relation to the rel directory entry, and write it back
if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
Err(WalIngestErrorKind::RelationAlreadyExists(rel))?;
return Err(RelationError::AlreadyExists);
}
if !dbdir_exists {
self.pending_directory_entries
@@ -2027,7 +2059,9 @@ impl DatadirModification<'_> {
.push((DirectoryKind::Rel, MetricsUpdate::Add(1)));
self.put(
rel_dir_key,
Value::Image(Bytes::from(RelDirectory::ser(&rel_dir)?)),
Value::Image(Bytes::from(
RelDirectory::ser(&rel_dir).context("serialize")?,
)),
);
}
@@ -2052,8 +2086,8 @@ impl DatadirModification<'_> {
rel: RelTag,
nblocks: BlockNumber,
ctx: &RequestContext,
) -> Result<(), WalIngestError> {
ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);
) -> anyhow::Result<()> {
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
if self
.tline
.get_rel_exists(rel, Version::Modified(self), ctx)
@@ -2083,8 +2117,8 @@ impl DatadirModification<'_> {
rel: RelTag,
nblocks: BlockNumber,
ctx: &RequestContext,
) -> Result<(), WalIngestError> {
ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);
) -> anyhow::Result<()> {
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
// Put size
let size_key = rel_size_to_key(rel);
@@ -2108,10 +2142,8 @@ impl DatadirModification<'_> {
&mut self,
drop_relations: HashMap<(u32, u32), Vec<RelTag>>,
ctx: &RequestContext,
) -> Result<(), WalIngestError> {
let v2_enabled = self
.maybe_enable_rel_size_v2()
.map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?;
) -> anyhow::Result<()> {
let v2_enabled = self.maybe_enable_rel_size_v2()?;
for ((spc_node, db_node), rel_tags) in drop_relations {
let dir_key = rel_dir_to_key(spc_node, db_node);
let buf = self.get(dir_key, ctx).await?;
@@ -2131,7 +2163,7 @@ impl DatadirModification<'_> {
let key =
rel_tag_sparse_key(spc_node, db_node, rel_tag.relnode, rel_tag.forknum);
let val = RelDirExists::decode_option(self.sparse_get(key, ctx).await?)
.map_err(|_| WalIngestErrorKind::InvalidKey(key, self.lsn))?;
.map_err(|_| RelationError::Other(anyhow::anyhow!("invalid reldir key")))?;
if val == RelDirExists::Exists {
self.pending_directory_entries
.push((DirectoryKind::RelV2, MetricsUpdate::Sub(1)));
@@ -2174,7 +2206,7 @@ impl DatadirModification<'_> {
segno: u32,
nblocks: BlockNumber,
ctx: &RequestContext,
) -> Result<(), WalIngestError> {
) -> anyhow::Result<()> {
assert!(self.tline.tenant_shard_id.is_shard_zero());
// Add it to the directory entry
@@ -2183,7 +2215,7 @@ impl DatadirModification<'_> {
let mut dir = SlruSegmentDirectory::des(&buf)?;
if !dir.segments.insert(segno) {
Err(WalIngestErrorKind::SlruAlreadyExists(kind, segno))?;
anyhow::bail!("slru segment {kind:?}/{segno} already exists");
}
self.pending_directory_entries.push((
DirectoryKind::SlruSegment(kind),
@@ -2210,7 +2242,7 @@ impl DatadirModification<'_> {
kind: SlruKind,
segno: u32,
nblocks: BlockNumber,
) -> Result<(), WalIngestError> {
) -> anyhow::Result<()> {
assert!(self.tline.tenant_shard_id.is_shard_zero());
// Put size
@@ -2226,7 +2258,7 @@ impl DatadirModification<'_> {
kind: SlruKind,
segno: u32,
ctx: &RequestContext,
) -> Result<(), WalIngestError> {
) -> anyhow::Result<()> {
// Remove it from the directory entry
let dir_key = slru_dir_to_key(kind);
let buf = self.get(dir_key, ctx).await?;
@@ -2251,7 +2283,7 @@ impl DatadirModification<'_> {
}
/// Drop a relmapper file (pg_filenode.map)
pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> Result<(), WalIngestError> {
pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> anyhow::Result<()> {
// TODO
Ok(())
}
@@ -2261,7 +2293,7 @@ impl DatadirModification<'_> {
&mut self,
xid: u64,
ctx: &RequestContext,
) -> Result<(), WalIngestError> {
) -> anyhow::Result<()> {
// Remove it from the directory entry
let buf = self.get(TWOPHASEDIR_KEY, ctx).await?;
let newdirbuf = if self.tline.pg_version >= 17 {
@@ -2276,8 +2308,7 @@ impl DatadirModification<'_> {
));
Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?)
} else {
let xid: u32 = u32::try_from(xid)
.map_err(|e| WalIngestErrorKind::LogicalError(anyhow::Error::from(e)))?;
let xid: u32 = u32::try_from(xid)?;
let mut dir = TwoPhaseDirectory::des(&buf)?;
if !dir.xids.remove(&xid) {
@@ -2302,7 +2333,7 @@ impl DatadirModification<'_> {
path: &str,
content: &[u8],
ctx: &RequestContext,
) -> Result<(), WalIngestError> {
) -> anyhow::Result<()> {
let key = aux_file::encode_aux_file_key(path);
// retrieve the key from the engine
let old_val = match self.get(key, ctx).await {
@@ -2311,7 +2342,7 @@ impl DatadirModification<'_> {
Err(e) => return Err(e.into()),
};
let files: Vec<(&str, &[u8])> = if let Some(ref old_val) = old_val {
aux_file::decode_file_value(old_val).map_err(WalIngestErrorKind::EncodeAuxFileError)?
aux_file::decode_file_value(old_val)?
} else {
Vec::new()
};
@@ -2356,8 +2387,7 @@ impl DatadirModification<'_> {
}
(None, true) => warn!("removing non-existing aux file: {}", path),
}
let new_val = aux_file::encode_file_value(&new_files)
.map_err(WalIngestErrorKind::EncodeAuxFileError)?;
let new_val = aux_file::encode_file_value(&new_files)?;
self.put(key, Value::Image(new_val.into()));
Ok(())

File diff suppressed because it is too large Load Diff

View File

@@ -15,14 +15,13 @@
//! len >= 128: 1CCCXXXX XXXXXXXX XXXXXXXX XXXXXXXX
//!
use std::cmp::min;
use std::io::Error;
use std::io::{Error, ErrorKind};
use async_compression::Level;
use bytes::{BufMut, BytesMut};
use pageserver_api::models::ImageCompressionAlgorithm;
use tokio::io::AsyncWriteExt;
use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice};
use tokio_util::sync::CancellationToken;
use tracing::warn;
use crate::context::RequestContext;
@@ -170,13 +169,7 @@ pub struct BlobWriter<const BUFFERED: bool> {
}
impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
pub fn new(
inner: VirtualFile,
start_offset: u64,
_gate: &utils::sync::gate::Gate,
_cancel: CancellationToken,
_ctx: &RequestContext,
) -> Self {
pub fn new(inner: VirtualFile, start_offset: u64) -> Self {
Self {
inner,
offset: start_offset,
@@ -338,7 +331,10 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
return (
(
io_buf.slice_len(),
Err(Error::other(format!("blob too large ({len} bytes)"))),
Err(Error::new(
ErrorKind::Other,
format!("blob too large ({len} bytes)"),
)),
),
srcbuf,
);
@@ -439,14 +435,12 @@ pub(crate) mod tests {
) -> Result<(Utf8TempDir, Utf8PathBuf, Vec<u64>), Error> {
let temp_dir = camino_tempfile::tempdir()?;
let pathbuf = temp_dir.path().join("file");
let gate = utils::sync::gate::Gate::default();
let cancel = CancellationToken::new();
// Write part (in block to drop the file)
let mut offsets = Vec::new();
{
let file = VirtualFile::create(pathbuf.as_path(), ctx).await?;
let mut wtr = BlobWriter::<BUFFERED>::new(file, 0, &gate, cancel.clone(), ctx);
let mut wtr = BlobWriter::<BUFFERED>::new(file, 0);
for blob in blobs.iter() {
let (_, res) = if compression {
let res = wtr

View File

@@ -216,8 +216,12 @@ impl<'a> FileBlockReader<'a> {
match cache
.read_immutable_buf(self.file_id, blknum, ctx)
.await
.map_err(|e| std::io::Error::other(format!("Failed to read immutable buf: {e:#}")))?
{
.map_err(|e| {
std::io::Error::new(
std::io::ErrorKind::Other,
format!("Failed to read immutable buf: {e:#}"),
)
})? {
ReadBufResult::Found(guard) => Ok(guard.into()),
ReadBufResult::NotFound(write_guard) => {
// Read the page from disk into the buffer

View File

@@ -714,7 +714,7 @@ impl LayerMap {
true
}
pub fn iter_historic_layers(&self) -> impl ExactSizeIterator<Item = Arc<PersistentLayerDesc>> {
pub fn iter_historic_layers(&self) -> impl '_ + Iterator<Item = Arc<PersistentLayerDesc>> {
self.historic.iter()
}

View File

@@ -504,7 +504,7 @@ impl<Value: Clone> BufferedHistoricLayerCoverage<Value> {
}
/// Iterate all the layers
pub fn iter(&self) -> impl ExactSizeIterator<Item = Value> {
pub fn iter(&self) -> impl '_ + Iterator<Item = Value> {
// NOTE we can actually perform this without rebuilding,
// but it's not necessary for now.
if !self.buffer.is_empty() {

View File

@@ -1,33 +1,21 @@
use chrono::NaiveDateTime;
use pageserver_api::shard::ShardStripeSize;
use serde::{Deserialize, Serialize};
use utils::id::TimelineId;
use utils::lsn::Lsn;
/// Tenant shard manifest, stored in remote storage. Contains offloaded timelines and other tenant
/// shard-wide information that must be persisted in remote storage.
///
/// The manifest is always updated on tenant attach, and as needed.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
/// Tenant-shard scoped manifest
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct TenantManifest {
/// The manifest version. Incremented on manifest format changes, even non-breaking ones.
/// Manifests must generally always be backwards and forwards compatible for one release, to
/// allow release rollbacks.
/// Debugging aid describing the version of this manifest.
/// Can also be used for distinguishing breaking changes later on.
pub version: usize,
/// This tenant's stripe size. This is only advisory, and used to recover tenant data from
/// remote storage. The autoritative source is the storage controller. If None, assume the
/// original default value of 32768 blocks (256 MB).
#[serde(skip_serializing_if = "Option::is_none")]
pub stripe_size: Option<ShardStripeSize>,
/// The list of offloaded timelines together with enough information
/// to not have to actually load them.
///
/// Note: the timelines mentioned in this list might be deleted, i.e.
/// we don't hold an invariant that the references aren't dangling.
/// Existence of index-part.json is the actual indicator of timeline existence.
#[serde(default)]
pub offloaded_timelines: Vec<OffloadedTimelineManifest>,
}
@@ -36,7 +24,7 @@ pub struct TenantManifest {
/// Very similar to [`pageserver_api::models::OffloadedTimelineInfo`],
/// but the two datastructures serve different needs, this is for a persistent disk format
/// that must be backwards compatible, while the other is only for informative purposes.
#[derive(Clone, Debug, Serialize, Deserialize, Copy, PartialEq, Eq)]
#[derive(Clone, Serialize, Deserialize, Copy, PartialEq, Eq)]
pub struct OffloadedTimelineManifest {
pub timeline_id: TimelineId,
/// Whether the timeline has a parent it has been branched off from or not
@@ -47,166 +35,20 @@ pub struct OffloadedTimelineManifest {
pub archived_at: NaiveDateTime,
}
/// The newest manifest version. This should be incremented on changes, even non-breaking ones. We
/// do not use deny_unknown_fields, so new fields are not breaking.
///
/// 1: initial version
/// 2: +stripe_size
///
/// When adding new versions, also add a parse_vX test case below.
pub const LATEST_TENANT_MANIFEST_VERSION: usize = 2;
pub const LATEST_TENANT_MANIFEST_VERSION: usize = 1;
impl TenantManifest {
/// Returns true if the manifests are equal, ignoring the version number. This avoids
/// re-uploading all manifests just because the version number is bumped.
pub fn eq_ignoring_version(&self, other: &Self) -> bool {
// Fast path: if the version is equal, just compare directly.
if self.version == other.version {
return self == other;
pub(crate) fn empty() -> Self {
Self {
version: LATEST_TENANT_MANIFEST_VERSION,
offloaded_timelines: vec![],
}
// We could alternatively just clone and modify the version here.
let Self {
version: _, // ignore version
stripe_size,
offloaded_timelines,
} = self;
stripe_size == &other.stripe_size && offloaded_timelines == &other.offloaded_timelines
}
/// Decodes a manifest from JSON.
pub fn from_json_bytes(bytes: &[u8]) -> Result<Self, serde_json::Error> {
serde_json::from_slice(bytes)
serde_json::from_slice::<Self>(bytes)
}
/// Encodes a manifest as JSON.
pub fn to_json_bytes(&self) -> serde_json::Result<Vec<u8>> {
pub(crate) fn to_json_bytes(&self) -> serde_json::Result<Vec<u8>> {
serde_json::to_vec(self)
}
}
#[cfg(test)]
mod tests {
use std::str::FromStr;
use utils::id::TimelineId;
use super::*;
/// Empty manifests should be parsed. Version is required.
#[test]
fn parse_empty() -> anyhow::Result<()> {
let json = r#"{
"version": 0
}"#;
let expected = TenantManifest {
version: 0,
stripe_size: None,
offloaded_timelines: Vec::new(),
};
assert_eq!(expected, TenantManifest::from_json_bytes(json.as_bytes())?);
Ok(())
}
/// Unknown fields should be ignored, for forwards compatibility.
#[test]
fn parse_unknown_fields() -> anyhow::Result<()> {
let json = r#"{
"version": 1,
"foo": "bar"
}"#;
let expected = TenantManifest {
version: 1,
stripe_size: None,
offloaded_timelines: Vec::new(),
};
assert_eq!(expected, TenantManifest::from_json_bytes(json.as_bytes())?);
Ok(())
}
/// v1 manifests should be parsed, for backwards compatibility.
#[test]
fn parse_v1() -> anyhow::Result<()> {
let json = r#"{
"version": 1,
"offloaded_timelines": [
{
"timeline_id": "5c4df612fd159e63c1b7853fe94d97da",
"archived_at": "2025-03-07T11:07:11.373105434"
},
{
"timeline_id": "f3def5823ad7080d2ea538d8e12163fa",
"ancestor_timeline_id": "5c4df612fd159e63c1b7853fe94d97da",
"ancestor_retain_lsn": "0/1F79038",
"archived_at": "2025-03-05T11:10:22.257901390"
}
]
}"#;
let expected = TenantManifest {
version: 1,
stripe_size: None,
offloaded_timelines: vec![
OffloadedTimelineManifest {
timeline_id: TimelineId::from_str("5c4df612fd159e63c1b7853fe94d97da")?,
ancestor_timeline_id: None,
ancestor_retain_lsn: None,
archived_at: NaiveDateTime::from_str("2025-03-07T11:07:11.373105434")?,
},
OffloadedTimelineManifest {
timeline_id: TimelineId::from_str("f3def5823ad7080d2ea538d8e12163fa")?,
ancestor_timeline_id: Some(TimelineId::from_str(
"5c4df612fd159e63c1b7853fe94d97da",
)?),
ancestor_retain_lsn: Some(Lsn::from_str("0/1F79038")?),
archived_at: NaiveDateTime::from_str("2025-03-05T11:10:22.257901390")?,
},
],
};
assert_eq!(expected, TenantManifest::from_json_bytes(json.as_bytes())?);
Ok(())
}
/// v2 manifests should be parsed, for backwards compatibility.
#[test]
fn parse_v2() -> anyhow::Result<()> {
let json = r#"{
"version": 2,
"stripe_size": 32768,
"offloaded_timelines": [
{
"timeline_id": "5c4df612fd159e63c1b7853fe94d97da",
"archived_at": "2025-03-07T11:07:11.373105434"
},
{
"timeline_id": "f3def5823ad7080d2ea538d8e12163fa",
"ancestor_timeline_id": "5c4df612fd159e63c1b7853fe94d97da",
"ancestor_retain_lsn": "0/1F79038",
"archived_at": "2025-03-05T11:10:22.257901390"
}
]
}"#;
let expected = TenantManifest {
version: 2,
stripe_size: Some(ShardStripeSize(32768)),
offloaded_timelines: vec![
OffloadedTimelineManifest {
timeline_id: TimelineId::from_str("5c4df612fd159e63c1b7853fe94d97da")?,
ancestor_timeline_id: None,
ancestor_retain_lsn: None,
archived_at: NaiveDateTime::from_str("2025-03-07T11:07:11.373105434")?,
},
OffloadedTimelineManifest {
timeline_id: TimelineId::from_str("f3def5823ad7080d2ea538d8e12163fa")?,
ancestor_timeline_id: Some(TimelineId::from_str(
"5c4df612fd159e63c1b7853fe94d97da",
)?),
ancestor_retain_lsn: Some(Lsn::from_str("0/1F79038")?),
archived_at: NaiveDateTime::from_str("2025-03-05T11:10:22.257901390")?,
},
],
};
assert_eq!(expected, TenantManifest::from_json_bytes(json.as_bytes())?);
Ok(())
}
}

View File

@@ -61,7 +61,6 @@ pub(crate) async fn upload_index_part(
.await
.with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'"))
}
/// Serializes and uploads the given tenant manifest data to the remote storage.
pub(crate) async fn upload_tenant_manifest(
storage: &GenericRemoteStorage,
@@ -77,14 +76,16 @@ pub(crate) async fn upload_tenant_manifest(
});
pausable_failpoint!("before-upload-manifest-pausable");
let serialized = Bytes::from(tenant_manifest.to_json_bytes()?);
let tenant_manifest_size = serialized.len();
let remote_path = remote_tenant_manifest_path(tenant_shard_id, generation);
let serialized = tenant_manifest.to_json_bytes()?;
let serialized = Bytes::from(serialized);
let tenant_manifest_site = serialized.len();
let remote_path = remote_tenant_manifest_path(tenant_shard_id, generation);
storage
.upload_storage_object(
futures::stream::once(futures::future::ready(Ok(serialized))),
tenant_manifest_size,
tenant_manifest_site,
&remote_path,
cancel,
)

View File

@@ -715,34 +715,13 @@ pub(crate) enum LayerId {
}
/// Uniquely identify a layer visit by the layer
/// and LSN range of the reads. Note that the end of the range is exclusive.
///
/// The layer itself is not enough since we may have different LSN lower
/// bounds for delta layer reads. Scenarios where this can happen are:
///
/// 1. Layer overlaps: imagine an image layer inside and in-memory layer
/// and a query that only partially hits the image layer. Part of the query
/// needs to read the whole in-memory layer and the other part needs to read
/// only up to the image layer. Hence, they'll have different LSN floor values
/// for the read.
///
/// 2. Scattered reads: the read path supports starting at different LSNs. Imagine
/// The start LSN for one range is inside a layer and the start LSN for another range
/// Is above the layer (includes all of it). Both ranges need to read the layer all the
/// Way to the end but starting at different points. Hence, they'll have different LSN
/// Ceil values.
///
/// The implication is that we might visit the same layer multiple times
/// in order to read different LSN ranges from it. In practice, this isn't very concerning
/// because:
/// 1. Layer overlaps are rare and generally not intended
/// 2. Scattered reads will stabilise after the first few layers provided their starting LSNs
/// are grouped tightly enough (likely the case).
/// and LSN floor (or start LSN) of the reads.
/// The layer itself is not enough since we may
/// have different LSN lower bounds for delta layer reads.
#[derive(Debug, PartialEq, Eq, Clone, Hash)]
struct LayerToVisitId {
layer_id: LayerId,
lsn_floor: Lsn,
lsn_ceil: Lsn,
}
#[derive(Debug, PartialEq, Eq, Hash)]
@@ -826,7 +805,6 @@ impl LayerFringe {
let layer_to_visit_id = LayerToVisitId {
layer_id: layer.id(),
lsn_floor: lsn_range.start,
lsn_ceil: lsn_range.end,
};
let entry = self.visit_reads.entry(layer_to_visit_id.clone());

Some files were not shown because too many files have changed in this diff Show More