Mirror of https://github.com/neondatabase/neon.git (synced 2026-02-18 01:50:37 +00:00)

Compare commits: luist18/pg...skyzh/feat (60 commits)
SHA1: 7f27254392, 2f338daf17, 75638230b2, 1ad48b2eaf, a338984dc7, 8936a7abd8, 946e971df8, d109bf8c1d,
4f7b2cdd4f, 66f56ddaec, fd16caa7d0, ff5a527167, c66444ea15, 88f01c1ca1, a6937a3281, 3c8565a194,
979fa0682b, 8884865bca, 4c4e33bc2e, 342607473a, 9c37bfc90a, 52dee408dc, 5487a20b72, f06d721a98,
2e35f23085, 5063151271, 0122d97f95, fae7528adb, 8a72e6f888, a04e33ceb6, af0be11503, 405a17bf0b,
63ee8e2181, 2c21a65b0b, ec66b788e2, af12647b9d, 1c237d0c6d, afd34291ca, 66f80e77ba, 72832b3214,
d11f23a341, e7502a3d63, ef8101a9be, d2825e72ad, a6ff8ec3d4, cf62017a5b, c610f3584d, c9ca8b7c4a,
7679b63a2c, d177654e5f, a09c933de3, 6138d61592, a7142f3bc6, 7791a49dd4, 8a6d0dccaa, 7ffcbfde9a,
b2a0b2e9dd, 0875dacce0, 99d8788756, 26c5c7e942
@@ -19,6 +19,7 @@
!pageserver/
!pgxn/
!proxy/
!object_storage/
!storage_scrubber/
!safekeeper/
!storage_broker/

.gitignore (vendored): 1 line changed
@@ -1,3 +1,4 @@
/artifact_cache
/pg_install
/target
/tmp_check

Cargo.lock (generated): 56 lines changed
@@ -2837,6 +2837,7 @@ dependencies = [
"utils",
"uuid",
"workspace_hack",
"x509-cert",
]

[[package]]
@@ -3991,6 +3992,33 @@ dependencies = [
"memchr",
]

[[package]]
name = "object_storage"
version = "0.0.1"
dependencies = [
"anyhow",
"axum",
"axum-extra",
"camino",
"camino-tempfile",
"futures",
"http-body-util",
"itertools 0.10.5",
"jsonwebtoken",
"prometheus",
"rand 0.8.5",
"remote_storage",
"serde",
"serde_json",
"test-log",
"tokio",
"tokio-util",
"tower 0.5.2",
"tracing",
"utils",
"workspace_hack",
]

[[package]]
name = "once_cell"
version = "1.20.2"
@@ -4693,7 +4721,7 @@ dependencies = [
[[package]]
name = "postgres-protocol"
version = "0.6.6"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#1f21e7959a96a34dcfbfce1b14b73286cdadffe9"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#f3cf448febde5fd298071d54d568a9c875a7a62b"
dependencies = [
"base64 0.22.1",
"byteorder",
@@ -4727,7 +4755,7 @@ dependencies = [
[[package]]
name = "postgres-types"
version = "0.2.6"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#1f21e7959a96a34dcfbfce1b14b73286cdadffe9"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#f3cf448febde5fd298071d54d568a9c875a7a62b"
dependencies = [
"bytes",
"chrono",
@@ -6925,6 +6953,28 @@ dependencies = [
"syn 2.0.100",
]

[[package]]
name = "test-log"
version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e7f46083d221181166e5b6f6b1e5f1d499f3a76888826e6cb1d057554157cd0f"
dependencies = [
"env_logger",
"test-log-macros",
"tracing-subscriber",
]

[[package]]
name = "test-log-macros"
version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "888d0c3c6db53c0fdab160d2ed5e12ba745383d3e85813f2ea0f2b1475ab553f"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.100",
]

[[package]]
name = "thiserror"
version = "1.0.69"
@@ -7172,7 +7222,7 @@ dependencies = [
[[package]]
name = "tokio-postgres"
version = "0.7.10"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#1f21e7959a96a34dcfbfce1b14b73286cdadffe9"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#f3cf448febde5fd298071d54d568a9c875a7a62b"
dependencies = [
"async-trait",
"byteorder",
@@ -40,6 +40,7 @@ members = [
"libs/proxy/postgres-protocol2",
"libs/proxy/postgres-types2",
"libs/proxy/tokio-postgres2",
"object_storage",
]

[workspace.package]
@@ -208,6 +209,7 @@ tracing-opentelemetry = "0.28"
tracing-serde = "0.2.0"
tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
try-lock = "0.2.5"
test-log = { version = "0.2.17", default-features = false, features = ["log"] }
twox-hash = { version = "1.6.3", default-features = false }
typed-json = "0.1"
url = "2.2"
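The workspace hunk above adds `test-log` (default features off, `log` feature on) alongside the new `object_storage` member. Below is a minimal sketch of how such a dependency is typically used; the attribute wrapper is the crate's documented entry point, but its exact use inside `object_storage` is an assumption, and the test name here is hypothetical. It needs `test-log` and `log` as dev-dependencies.

```rust
// Hedged sketch: how a crate in this workspace might use the newly added
// `test-log` dependency. `#[test_log::test]` initializes a logger before the
// test body runs, so `log::...!` output is captured per test.
#[cfg(test)]
mod tests {
    #[test_log::test]
    fn logging_is_initialized() {
        log::info!("visible on failure or with `cargo test -- --nocapture`");
        assert_eq!(2 + 2, 4);
    }
}
```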
@@ -89,6 +89,7 @@ RUN set -e \
--bin storage_broker \
--bin storage_controller \
--bin proxy \
--bin object_storage \
--bin neon_local \
--bin storage_scrubber \
--locked --release
@@ -121,6 +122,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper
COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_controller /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/object_storage /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_scrubber /usr/local/bin

@@ -1527,51 +1527,6 @@ COPY --from=pgauditlogtofile-src /ext-src/ /ext-src/
WORKDIR /ext-src/pgauditlogtofile-src
RUN make install USE_PGXS=1 -j $(getconf _NPROCESSORS_ONLN)

#########################################################################################
#
# Layer "pg_rest-build"
# compile pg_rest extension
#
#########################################################################################
FROM build-deps AS pg_rest-src
ARG PG_VERSION
ARG PG_REST_VERSION=3.0.1

# Only supported for PostgreSQL v17
RUN if [ "${PG_VERSION:?}" != "v17" ]; then \
echo "pg_rest extension is only supported for PostgreSQL v17" && exit 0; \
fi

WORKDIR /ext-src
RUN mkdir -p pg_rest-src && cd pg_rest-src && \
wget https://github.com/ruslantalpa/foxfirebase/raw/main/pg_rest_pg17-${PG_REST_VERSION}_neon-debian-bookworm_aarch64.deb -O pg_rest_pg17-${PG_REST_VERSION}_aarch64.deb && \
wget https://github.com/ruslantalpa/foxfirebase/raw/main/pg_rest_pg17-${PG_REST_VERSION}_neon-debian-bookworm_amd64.deb -O pg_rest_pg17-${PG_REST_VERSION}_amd64.deb

FROM pg-build AS pg_rest-build
ARG PG_REST_VERSION=3.0.1
ARG PG_VERSION
COPY --from=pg_rest-src /ext-src/ /ext-src/
WORKDIR /ext-src/pg_rest-src
RUN if [ "${PG_VERSION:?}" = "v17" ]; then \
export ARCH=$(uname -m) && \
echo "ARCH: $ARCH" && \
# Map architecture names to package names
if [ "$ARCH" = "x86_64" ]; then \
PACKAGE_ARCH="amd64"; \
else \
PACKAGE_ARCH="$ARCH"; \
fi && \
echo "Using package architecture: $PACKAGE_ARCH" && \
apt-get update && \
apt-get install -y ./pg_rest_pg17-${PG_REST_VERSION}_${PACKAGE_ARCH}.deb && \
sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_rest.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_rest.control && \
# Clean up
apt-get clean && rm -rf /var/lib/apt/lists/*; \
else \
echo "pg_rest extension is only supported for PostgreSQL v17, skipping build"; \
fi

#########################################################################################
#
# Layer "neon-ext-build"
@@ -1667,7 +1622,6 @@ COPY --from=pg_duckdb-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_repack-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pgaudit-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pgauditlogtofile-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_rest-build /usr/local/pgsql/ /usr/local/pgsql/

#########################################################################################
#
@@ -1844,6 +1798,7 @@ COPY --from=pg_repack-src /ext-src/ /ext-src/
COPY --from=pg_repack-build /usr/local/pgsql/ /usr/local/pgsql/
COPY compute/patches/pg_repack.patch /ext-src
RUN cd /ext-src/pg_repack-src && patch -p1 </ext-src/pg_repack.patch && rm -f /ext-src/pg_repack.patch

COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh
RUN apt-get update && apt-get install -y libtap-parser-sourcehandler-pgtap-perl\
&& apt clean && rm -rf /ext-src/*.tar.gz /var/lib/apt/lists/*
@@ -1976,4 +1931,3 @@ RUN mkdir /var/run/rsyslogd && \
ENV LANG=en_US.utf8
USER postgres
ENTRYPOINT ["/usr/local/bin/compute_ctl"]

@@ -29,13 +29,12 @@
//! ```sh
//! compute_ctl -D /var/db/postgres/compute \
//! -C 'postgresql://cloud_admin@localhost/postgres' \
//! -S /var/db/postgres/specs/current.json \
//! -c /var/db/postgres/configs/config.json \
//! -b /usr/local/bin/postgres \
//! -r http://pg-ext-s3-gateway \
//! ```
use std::ffi::OsString;
use std::fs::File;
use std::path::Path;
use std::process::exit;
use std::sync::mpsc;
use std::thread;
@@ -43,8 +42,7 @@ use std::time::Duration;

use anyhow::{Context, Result};
use clap::Parser;
use compute_api::responses::ComputeCtlConfig;
use compute_api::spec::ComputeSpec;
use compute_api::responses::ComputeConfig;
use compute_tools::compute::{
BUILD_TAG, ComputeNode, ComputeNodeParams, forward_termination_signal,
};
@@ -118,16 +116,21 @@ struct Cli {
#[arg(long)]
pub set_disk_quota_for_fs: Option<String>,

#[arg(short = 's', long = "spec", group = "spec")]
pub spec_json: Option<String>,

#[arg(short = 'S', long, group = "spec-path")]
pub spec_path: Option<OsString>,
// TODO(tristan957): remove alias after compatibility tests are no longer
// an issue
#[arg(short = 'c', long, alias = "spec-path")]
pub config: Option<OsString>,

#[arg(short = 'i', long, group = "compute-id")]
pub compute_id: String,

#[arg(short = 'p', long, conflicts_with_all = ["spec", "spec-path"], value_name = "CONTROL_PLANE_API_BASE_URL")]
#[arg(
short = 'p',
long,
conflicts_with = "config",
value_name = "CONTROL_PLANE_API_BASE_URL",
requires = "compute-id"
)]
pub control_plane_uri: Option<String>,
}

@@ -152,7 +155,7 @@ fn main() -> Result<()> {

let connstr = Url::parse(&cli.connstr).context("cannot parse connstr as a URL")?;

let cli_spec = try_spec_from_cli(&cli)?;
let cli_spec = get_config(&cli)?;

let compute_node = ComputeNode::new(
ComputeNodeParams {
@@ -172,7 +175,6 @@ fn main() -> Result<()> {
cgroup: cli.cgroup,
#[cfg(target_os = "linux")]
vm_monitor_addr: cli.vm_monitor_addr,
live_config_allowed: cli_spec.live_config_allowed,
},
cli_spec.spec,
cli_spec.compute_ctl_config,
@@ -200,37 +202,17 @@ async fn init() -> Result<()> {
Ok(())
}

fn try_spec_from_cli(cli: &Cli) -> Result<CliSpecParams> {
// First, try to get cluster spec from the cli argument
if let Some(ref spec_json) = cli.spec_json {
info!("got spec from cli argument {}", spec_json);
return Ok(CliSpecParams {
spec: Some(serde_json::from_str(spec_json)?),
compute_ctl_config: ComputeCtlConfig::default(),
live_config_allowed: false,
});
fn get_config(cli: &Cli) -> Result<ComputeConfig> {
// First, read the config from the path if provided
if let Some(ref config) = cli.config {
let file = File::open(config)?;
return Ok(serde_json::from_reader(&file)?);
}

// Second, try to read it from the file if path is provided
if let Some(ref spec_path) = cli.spec_path {
let file = File::open(Path::new(spec_path))?;
return Ok(CliSpecParams {
spec: Some(serde_json::from_reader(file)?),
compute_ctl_config: ComputeCtlConfig::default(),
live_config_allowed: true,
});
}

if cli.control_plane_uri.is_none() {
panic!("must specify --control-plane-uri");
};

match get_spec_from_control_plane(cli.control_plane_uri.as_ref().unwrap(), &cli.compute_id) {
Ok(resp) => Ok(CliSpecParams {
spec: resp.0,
compute_ctl_config: resp.1,
live_config_allowed: true,
}),
// If the config wasn't provided in the CLI arguments, then retrieve it from
// the control plane
match get_config_from_control_plane(cli.control_plane_uri.as_ref().unwrap(), &cli.compute_id) {
Ok(config) => Ok(config),
Err(e) => {
error!(
"cannot get response from control plane: {}\n\
@@ -242,14 +224,6 @@ fn try_spec_from_cli(cli: &Cli) -> Result<CliSpecParams> {
}
}

struct CliSpecParams {
/// If a spec was provided via CLI or file, the [`ComputeSpec`]
spec: Option<ComputeSpec>,
#[allow(dead_code)]
compute_ctl_config: ComputeCtlConfig,
live_config_allowed: bool,
}

fn deinit_and_exit(exit_code: Option<i32>) -> ! {
// Shutdown trace pipeline gracefully, so that it has a chance to send any
// pending traces before we exit. Shutting down OTEL tracing provider may
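For orientation, here is a minimal, self-contained sketch of the file-loading path that the new `get_config` takes when `--config` (or the temporary `--spec-path` alias) is given. The real `ComputeConfig` lives in `compute_api` and is not shown in this diff; the `serde_json::Value` stand-ins below are assumptions used only to keep the example runnable (it needs the serde, serde_json, and anyhow crates).

```rust
// Hedged sketch of loading the compute_ctl config file introduced above.
// `spec` and `compute_ctl_config` mirror the field names used in the diff;
// their concrete types here are stand-ins for the real compute_api types.
use std::{fs::File, path::Path};

use serde::Deserialize;
use serde_json::Value;

#[derive(Debug, Deserialize)]
struct ComputeConfig {
    spec: Option<Value>,
    compute_ctl_config: Value,
}

fn get_config(path: &Path) -> anyhow::Result<ComputeConfig> {
    // Mirrors the diff: open the file and deserialize the whole config.
    let file = File::open(path)?;
    Ok(serde_json::from_reader(&file)?)
}

fn main() -> anyhow::Result<()> {
    let config = get_config(Path::new("config.json"))?;
    println!("spec present: {}", config.spec.is_some());
    Ok(())
}
```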
@@ -98,13 +98,15 @@ pub async fn get_database_schema(
.kill_on_drop(true)
.spawn()?;

let stdout = cmd.stdout.take().ok_or_else(|| {
std::io::Error::new(std::io::ErrorKind::Other, "Failed to capture stdout.")
})?;
let stdout = cmd
.stdout
.take()
.ok_or_else(|| std::io::Error::other("Failed to capture stdout."))?;

let stderr = cmd.stderr.take().ok_or_else(|| {
std::io::Error::new(std::io::ErrorKind::Other, "Failed to capture stderr.")
})?;
let stderr = cmd
.stderr
.take()
.ok_or_else(|| std::io::Error::other("Failed to capture stderr."))?;

let mut stdout_reader = FramedRead::new(stdout, BytesCodec::new());
let stderr_reader = BufReader::new(stderr);
@@ -128,8 +130,7 @@ pub async fn get_database_schema(
}
});

return Err(SchemaDumpError::IO(std::io::Error::new(
std::io::ErrorKind::Other,
return Err(SchemaDumpError::IO(std::io::Error::other(
"failed to start pg_dump",
)));
}
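The hunks above swap `std::io::Error::new(std::io::ErrorKind::Other, ...)` for the shorter `std::io::Error::other(...)`. A tiny standalone illustration of the equivalence (not project code):

```rust
use std::io;

fn main() {
    // `io::Error::other` (stable since Rust 1.74) is shorthand for
    // constructing an error with `ErrorKind::Other`.
    let old_style = io::Error::new(io::ErrorKind::Other, "failed to start pg_dump");
    let new_style = io::Error::other("failed to start pg_dump");
    assert_eq!(old_style.kind(), new_style.kind());
    println!("{new_style}");
}
```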
@@ -93,20 +93,6 @@ pub struct ComputeNodeParams {

/// the address of extension storage proxy gateway
pub ext_remote_storage: Option<String>,

/// We should only allow live re- / configuration of the compute node if
/// it uses 'pull model', i.e. it can go to control-plane and fetch
/// the latest configuration. Otherwise, there could be a case:
/// - we start compute with some spec provided as argument
/// - we push new spec and it does reconfiguration
/// - but then something happens and compute pod / VM is destroyed,
/// so k8s controller starts it again with the **old** spec
///
/// and the same for empty computes:
/// - we started compute without any spec
/// - we push spec and it does configuration
/// - but then it is restarted without any spec again
pub live_config_allowed: bool,
}

/// Compute node info shared across several `compute_ctl` threads.
@@ -537,11 +523,14 @@ impl ComputeNode {

let pspec = compute_state.pspec.as_ref().expect("spec must be set");
info!(
"starting compute for project {}, operation {}, tenant {}, timeline {}, features {:?}, spec.remote_extensions {:?}",
"starting compute for project {}, operation {}, tenant {}, timeline {}, project {}, branch {}, endpoint {}, features {:?}, spec.remote_extensions {:?}",
pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None"),
pspec.spec.operation_uuid.as_deref().unwrap_or("None"),
pspec.tenant_id,
pspec.timeline_id,
pspec.spec.project_id.as_deref().unwrap_or("None"),
pspec.spec.branch_id.as_deref().unwrap_or("None"),
pspec.spec.endpoint_id.as_deref().unwrap_or("None"),
pspec.spec.features,
pspec.spec.remote_extensions,
);
@@ -645,31 +634,28 @@ impl ComputeNode {
});
}

// Configure and start rsyslog for HIPAA if necessary
if let ComputeAudit::Hipaa = pspec.spec.audit_log_level {
let remote_endpoint = std::env::var("AUDIT_LOGGING_ENDPOINT").unwrap_or("".to_string());
if remote_endpoint.is_empty() {
anyhow::bail!("AUDIT_LOGGING_ENDPOINT is empty");
// Configure and start rsyslog for compliance audit logging
match pspec.spec.audit_log_level {
ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => {
let remote_endpoint =
std::env::var("AUDIT_LOGGING_ENDPOINT").unwrap_or("".to_string());
if remote_endpoint.is_empty() {
anyhow::bail!("AUDIT_LOGGING_ENDPOINT is empty");
}

let log_directory_path = Path::new(&self.params.pgdata).join("log");
let log_directory_path = log_directory_path.to_string_lossy().to_string();
configure_audit_rsyslog(log_directory_path.clone(), "hipaa", &remote_endpoint)?;

// Launch a background task to clean up the audit logs
launch_pgaudit_gc(log_directory_path);
}

let log_directory_path = Path::new(&self.params.pgdata).join("log");
let log_directory_path = log_directory_path.to_string_lossy().to_string();
configure_audit_rsyslog(log_directory_path.clone(), "hipaa", &remote_endpoint)?;

// Launch a background task to clean up the audit logs
launch_pgaudit_gc(log_directory_path);
_ => {}
}

// Configure and start rsyslog for Postgres logs export
if self.has_feature(ComputeFeature::PostgresLogsExport) {
if let Some(ref project_id) = pspec.spec.cluster.cluster_id {
let host = PostgresLogsRsyslogConfig::default_host(project_id);
let conf = PostgresLogsRsyslogConfig::new(Some(&host));
configure_postgres_logs_export(conf)?;
} else {
warn!("not configuring rsyslog for Postgres logs export: project ID is missing")
}
}
let conf = PostgresLogsRsyslogConfig::new(pspec.spec.logs_export_host.as_deref());
configure_postgres_logs_export(conf)?;

// Launch remaining service threads
let _monitor_handle = launch_monitor(self);
@@ -1573,6 +1559,10 @@ impl ComputeNode {
});
}

// Reconfigure rsyslog for Postgres logs export
let conf = PostgresLogsRsyslogConfig::new(spec.logs_export_host.as_deref());
configure_postgres_logs_export(conf)?;

// Write new config
let pgdata_path = Path::new(&self.params.pgdata);
config::write_postgres_conf(

@@ -7,7 +7,7 @@ use std::io::prelude::*;
use std::path::Path;

use compute_api::responses::TlsConfig;
use compute_api::spec::{ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, GenericOption};
use compute_api::spec::{ComputeAudit, ComputeMode, ComputeSpec, GenericOption};

use crate::pg_helpers::{
GenericOptionExt, GenericOptionsSearch, PgOptionsSerialize, escape_conf_value,
@@ -89,6 +89,15 @@ pub fn write_postgres_conf(
escape_conf_value(&s.to_string())
)?;
}
if let Some(s) = &spec.project_id {
writeln!(file, "neon.project_id={}", escape_conf_value(s))?;
}
if let Some(s) = &spec.branch_id {
writeln!(file, "neon.branch_id={}", escape_conf_value(s))?;
}
if let Some(s) = &spec.endpoint_id {
writeln!(file, "neon.endpoint_id={}", escape_conf_value(s))?;
}

// tls
if let Some(tls_config) = tls_config {
@@ -169,7 +178,7 @@ pub fn write_postgres_conf(
// and don't allow the user or the control plane admin to change them.
match spec.audit_log_level {
ComputeAudit::Disabled => {}
ComputeAudit::Log => {
ComputeAudit::Log | ComputeAudit::Base => {
writeln!(file, "# Managed by compute_ctl base audit settings: start")?;
writeln!(file, "pgaudit.log='ddl,role'")?;
// Disable logging of catalog queries to reduce the noise
@@ -193,16 +202,20 @@ pub fn write_postgres_conf(
}
writeln!(file, "# Managed by compute_ctl base audit settings: end")?;
}
ComputeAudit::Hipaa => {
ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => {
writeln!(
file,
"# Managed by compute_ctl compliance audit settings: begin"
)?;
// This log level is very verbose
// but this is necessary for HIPAA compliance.
// Exclude 'misc' category, because it doesn't contain anythig relevant.
writeln!(file, "pgaudit.log='all, -misc'")?;
writeln!(file, "pgaudit.log_parameter=on")?;
// Enable logging of parameters.
// This is very verbose and may contain sensitive data.
if spec.audit_log_level == ComputeAudit::Full {
writeln!(file, "pgaudit.log_parameter=on")?;
writeln!(file, "pgaudit.log='all'")?;
} else {
writeln!(file, "pgaudit.log_parameter=off")?;
writeln!(file, "pgaudit.log='all, -misc'")?;
}
// Disable logging of catalog queries
// The catalog doesn't contain sensitive data, so we don't need to audit it.
writeln!(file, "pgaudit.log_catalog=off")?;
@@ -255,7 +268,7 @@ pub fn write_postgres_conf(

// We need Postgres to send logs to rsyslog so that we can forward them
// further to customers' log aggregation systems.
if spec.features.contains(&ComputeFeature::PostgresLogsExport) {
if spec.logs_export_host.is_some() {
writeln!(file, "log_destination='stderr,syslog'")?;
}

@@ -6,20 +6,15 @@ use axum_extra::{
TypedHeader,
headers::{Authorization, authorization::Bearer},
};
use compute_api::requests::ComputeClaims;
use futures::future::BoxFuture;
use http::{Request, Response, StatusCode};
use jsonwebtoken::{Algorithm, DecodingKey, TokenData, Validation, jwk::JwkSet};
use serde::Deserialize;
use tower_http::auth::AsyncAuthorizeRequest;
use tracing::warn;

use crate::http::{JsonResponse, extract::RequestId};

#[derive(Clone, Debug, Deserialize)]
pub(in crate::http) struct Claims {
compute_id: String,
}

#[derive(Clone, Debug)]
pub(in crate::http) struct Authorize {
compute_id: String,
@@ -112,7 +107,11 @@ impl AsyncAuthorizeRequest<Body> for Authorize {

impl Authorize {
/// Verify the token using the JSON Web Key set and return the token data.
fn verify(jwks: &JwkSet, token: &str, validation: &Validation) -> Result<TokenData<Claims>> {
fn verify(
jwks: &JwkSet,
token: &str,
validation: &Validation,
) -> Result<TokenData<ComputeClaims>> {
for jwk in jwks.keys.iter() {
let decoding_key = match DecodingKey::from_jwk(jwk) {
Ok(key) => key,
@@ -127,7 +126,7 @@ impl Authorize {
}
};

match jsonwebtoken::decode::<Claims>(token, &decoding_key, validation) {
match jsonwebtoken::decode::<ComputeClaims>(token, &decoding_key, validation) {
Ok(data) => return Ok(data),
Err(e) => {
warn!(

@@ -306,36 +306,6 @@ paths:
schema:
$ref: "#/components/schemas/GenericError"

/configure_telemetry:
post:
tags:
- Configure
summary: Configure rsyslog
description: |
This API endpoint configures rsyslog to forward Postgres logs
to a specified otel collector.
operationId: configureTelemetry
requestBody:
required: true
content:
application/json:
schema:
type: object
properties:
logs_export_host:
type: string
description: |
Hostname and the port of the otel collector. Leave empty to disable logs forwarding.
Example: config-shy-breeze-123-collector-monitoring.neon-telemetry.svc.cluster.local:54526
responses:
204:
description: "Telemetry configured successfully"
500:
content:
application/json:
schema:
$ref: "#/components/schemas/GenericError"

components:
securitySchemes:
JWT:

@@ -1,11 +1,9 @@
use std::sync::Arc;

use axum::body::Body;
use axum::extract::State;
use axum::response::Response;
use compute_api::requests::{ConfigurationRequest, ConfigureTelemetryRequest};
use compute_api::requests::ConfigurationRequest;
use compute_api::responses::{ComputeStatus, ComputeStatusResponse};
use compute_api::spec::ComputeFeature;
use http::StatusCode;
use tokio::task;
use tracing::info;
@@ -13,7 +11,6 @@ use tracing::info;
use crate::compute::{ComputeNode, ParsedSpec};
use crate::http::JsonResponse;
use crate::http::extract::Json;
use crate::rsyslog::{PostgresLogsRsyslogConfig, configure_postgres_logs_export};

// Accept spec in JSON format and request compute configuration. If anything
// goes wrong after we set the compute status to `ConfigurationPending` and
@@ -25,13 +22,6 @@ pub(in crate::http) async fn configure(
State(compute): State<Arc<ComputeNode>>,
request: Json<ConfigurationRequest>,
) -> Response {
if !compute.params.live_config_allowed {
return JsonResponse::error(
StatusCode::PRECONDITION_FAILED,
"live configuration is not allowed for this compute node".to_string(),
);
}

let pspec = match ParsedSpec::try_from(request.spec.clone()) {
Ok(p) => p,
Err(e) => return JsonResponse::error(StatusCode::BAD_REQUEST, e),
@@ -95,25 +85,3 @@ pub(in crate::http) async fn configure(

JsonResponse::success(StatusCode::OK, body)
}

pub(in crate::http) async fn configure_telemetry(
State(compute): State<Arc<ComputeNode>>,
request: Json<ConfigureTelemetryRequest>,
) -> Response {
if !compute.has_feature(ComputeFeature::PostgresLogsExport) {
return JsonResponse::error(
StatusCode::PRECONDITION_FAILED,
"Postgres logs export feature is not enabled".to_string(),
);
}

let conf = PostgresLogsRsyslogConfig::new(request.logs_export_host.as_deref());
if let Err(err) = configure_postgres_logs_export(conf) {
return JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, err.to_string());
}

Response::builder()
.status(StatusCode::NO_CONTENT)
.body(Body::from(""))
.unwrap()
}

@@ -87,7 +87,6 @@ impl From<&Server> for Router<Arc<ComputeNode>> {
let authenticated_router = Router::<Arc<ComputeNode>>::new()
.route("/check_writability", post(check_writability::is_writable))
.route("/configure", post(configure::configure))
.route("/configure_telemetry", post(configure::configure_telemetry))
.route("/database_schema", get(database_schema::get_schema_dump))
.route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects))
.route("/insights", get(insights::get_insights))

@@ -19,13 +19,13 @@ pub(crate) static INSTALLED_EXTENSIONS: Lazy<UIntGaugeVec> = Lazy::new(|| {
// but for all our APIs we defined a 'slug'/method/operationId in the OpenAPI spec.
// And it's fair to call it a 'RPC' (Remote Procedure Call).
pub enum CPlaneRequestRPC {
GetSpec,
GetConfig,
}

impl CPlaneRequestRPC {
pub fn as_str(&self) -> &str {
match self {
CPlaneRequestRPC::GetSpec => "GetSpec",
CPlaneRequestRPC::GetConfig => "GetConfig",
}
}
}

@@ -119,16 +119,9 @@ impl<'a> PostgresLogsRsyslogConfig<'a> {
};
Ok(config_content)
}

/// Returns the default host for otel collector that receives Postgres logs
pub fn default_host(project_id: &str) -> String {
format!(
"config-{}-collector.neon-telemetry.svc.cluster.local:10514",
project_id
)
}
}

/// Writes rsyslogd configuration for Postgres logs export and restarts rsyslog.
pub fn configure_postgres_logs_export(conf: PostgresLogsRsyslogConfig) -> Result<()> {
let new_config = conf.build()?;
let current_config = PostgresLogsRsyslogConfig::current_config()?;
@@ -261,16 +254,5 @@ mod tests {
let res = conf.build();
assert!(res.is_err());
}

{
// Verify config with default host
let host = PostgresLogsRsyslogConfig::default_host("shy-breeze-123");
let conf = PostgresLogsRsyslogConfig::new(Some(&host));
let res = conf.build();
assert!(res.is_ok());
let conf_str = res.unwrap();
assert!(conf_str.contains(r#"shy-breeze-123"#));
assert!(conf_str.contains(r#"port="10514""#));
}
}
}
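The `configure_postgres_logs_export` function shown above builds the new rsyslog config and also reads the current one; a common reason to do both is to make reconfiguration idempotent, rewriting the file and restarting the daemon only when the content actually changed. A generic sketch of that pattern (an assumption, not the project's implementation; the path and restart step are hypothetical):

```rust
use std::{fs, io, path::Path};

/// Hedged sketch: write a config file only if its content changed, so the
/// caller can skip a needless daemon restart when nothing differs.
fn write_if_changed(path: &Path, new_config: &str) -> io::Result<bool> {
    let current = fs::read_to_string(path).unwrap_or_default();
    if current == new_config {
        return Ok(false); // nothing to do
    }
    fs::write(path, new_config)?;
    Ok(true)
}

fn main() -> io::Result<()> {
    let path = Path::new("/tmp/postgres_logs.conf");
    let cfg = "action(type=\"omfwd\" target=\"collector\" port=\"10514\")\n";
    if write_if_changed(path, cfg)? {
        println!("config changed; a real implementation would restart rsyslogd here");
    }
    Ok(())
}
```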
@@ -3,9 +3,8 @@ use std::path::Path;
|
||||
|
||||
use anyhow::{Result, anyhow, bail};
|
||||
use compute_api::responses::{
|
||||
ComputeCtlConfig, ControlPlaneComputeStatus, ControlPlaneSpecResponse,
|
||||
ComputeConfig, ControlPlaneComputeStatus, ControlPlaneConfigResponse,
|
||||
};
|
||||
use compute_api::spec::ComputeSpec;
|
||||
use reqwest::StatusCode;
|
||||
use tokio_postgres::Client;
|
||||
use tracing::{error, info, instrument};
|
||||
@@ -21,7 +20,7 @@ use crate::params::PG_HBA_ALL_MD5;
|
||||
fn do_control_plane_request(
|
||||
uri: &str,
|
||||
jwt: &str,
|
||||
) -> Result<ControlPlaneSpecResponse, (bool, String, String)> {
|
||||
) -> Result<ControlPlaneConfigResponse, (bool, String, String)> {
|
||||
let resp = reqwest::blocking::Client::new()
|
||||
.get(uri)
|
||||
.header("Authorization", format!("Bearer {}", jwt))
|
||||
@@ -29,14 +28,14 @@ fn do_control_plane_request(
|
||||
.map_err(|e| {
|
||||
(
|
||||
true,
|
||||
format!("could not perform spec request to control plane: {:?}", e),
|
||||
format!("could not perform request to control plane: {:?}", e),
|
||||
UNKNOWN_HTTP_STATUS.to_string(),
|
||||
)
|
||||
})?;
|
||||
|
||||
let status = resp.status();
|
||||
match status {
|
||||
StatusCode::OK => match resp.json::<ControlPlaneSpecResponse>() {
|
||||
StatusCode::OK => match resp.json::<ControlPlaneConfigResponse>() {
|
||||
Ok(spec_resp) => Ok(spec_resp),
|
||||
Err(e) => Err((
|
||||
true,
|
||||
@@ -69,40 +68,35 @@ fn do_control_plane_request(
|
||||
}
|
||||
}
|
||||
|
||||
/// Request spec from the control-plane by compute_id. If `NEON_CONTROL_PLANE_TOKEN`
|
||||
/// env variable is set, it will be used for authorization.
|
||||
pub fn get_spec_from_control_plane(
|
||||
base_uri: &str,
|
||||
compute_id: &str,
|
||||
) -> Result<(Option<ComputeSpec>, ComputeCtlConfig)> {
|
||||
/// Request config from the control-plane by compute_id. If
|
||||
/// `NEON_CONTROL_PLANE_TOKEN` env variable is set, it will be used for
|
||||
/// authorization.
|
||||
pub fn get_config_from_control_plane(base_uri: &str, compute_id: &str) -> Result<ComputeConfig> {
|
||||
let cp_uri = format!("{base_uri}/compute/api/v2/computes/{compute_id}/spec");
|
||||
let jwt: String = match std::env::var("NEON_CONTROL_PLANE_TOKEN") {
|
||||
Ok(v) => v,
|
||||
Err(_) => "".to_string(),
|
||||
};
|
||||
let jwt: String = std::env::var("NEON_CONTROL_PLANE_TOKEN").unwrap_or_default();
|
||||
let mut attempt = 1;
|
||||
|
||||
info!("getting spec from control plane: {}", cp_uri);
|
||||
info!("getting config from control plane: {}", cp_uri);
|
||||
|
||||
// Do 3 attempts to get spec from the control plane using the following logic:
|
||||
// - network error -> then retry
|
||||
// - compute id is unknown or any other error -> bail out
|
||||
// - no spec for compute yet (Empty state) -> return Ok(None)
|
||||
// - got spec -> return Ok(Some(spec))
|
||||
// - got config -> return Ok(Some(config))
|
||||
while attempt < 4 {
|
||||
let result = match do_control_plane_request(&cp_uri, &jwt) {
|
||||
Ok(spec_resp) => {
|
||||
Ok(config_resp) => {
|
||||
CPLANE_REQUESTS_TOTAL
|
||||
.with_label_values(&[
|
||||
CPlaneRequestRPC::GetSpec.as_str(),
|
||||
CPlaneRequestRPC::GetConfig.as_str(),
|
||||
&StatusCode::OK.to_string(),
|
||||
])
|
||||
.inc();
|
||||
match spec_resp.status {
|
||||
ControlPlaneComputeStatus::Empty => Ok((None, spec_resp.compute_ctl_config)),
|
||||
match config_resp.status {
|
||||
ControlPlaneComputeStatus::Empty => Ok(config_resp.into()),
|
||||
ControlPlaneComputeStatus::Attached => {
|
||||
if let Some(spec) = spec_resp.spec {
|
||||
Ok((Some(spec), spec_resp.compute_ctl_config))
|
||||
if config_resp.spec.is_some() {
|
||||
Ok(config_resp.into())
|
||||
} else {
|
||||
bail!("compute is attached, but spec is empty")
|
||||
}
|
||||
@@ -111,7 +105,7 @@ pub fn get_spec_from_control_plane(
|
||||
}
|
||||
Err((retry, msg, status)) => {
|
||||
CPLANE_REQUESTS_TOTAL
|
||||
.with_label_values(&[CPlaneRequestRPC::GetSpec.as_str(), &status])
|
||||
.with_label_values(&[CPlaneRequestRPC::GetConfig.as_str(), &status])
|
||||
.inc();
|
||||
if retry {
|
||||
Err(anyhow!(msg))
|
||||
@@ -122,7 +116,7 @@ pub fn get_spec_from_control_plane(
|
||||
};
|
||||
|
||||
if let Err(e) = &result {
|
||||
error!("attempt {} to get spec failed with: {}", attempt, e);
|
||||
error!("attempt {} to get config failed with: {}", attempt, e);
|
||||
} else {
|
||||
return result;
|
||||
}
|
||||
@@ -133,13 +127,13 @@ pub fn get_spec_from_control_plane(
|
||||
|
||||
// All attempts failed, return error.
|
||||
Err(anyhow::anyhow!(
|
||||
"Exhausted all attempts to retrieve the spec from the control plane"
|
||||
"Exhausted all attempts to retrieve the config from the control plane"
|
||||
))
|
||||
}
|
||||
|
||||
/// Check `pg_hba.conf` and update if needed to allow external connections.
|
||||
pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> {
|
||||
// XXX: consider making it a part of spec.json
|
||||
// XXX: consider making it a part of config.json
|
||||
let pghba_path = pgdata_path.join("pg_hba.conf");
|
||||
|
||||
if config::line_in_file(&pghba_path, PG_HBA_ALL_MD5)? {
|
||||
@@ -153,7 +147,7 @@ pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> {
|
||||
|
||||
/// Create a standby.signal file
|
||||
pub fn add_standby_signal(pgdata_path: &Path) -> Result<()> {
|
||||
// XXX: consider making it a part of spec.json
|
||||
// XXX: consider making it a part of config.json
|
||||
let signalfile = pgdata_path.join("standby.signal");
|
||||
|
||||
if !signalfile.exists() {
|
||||
|
||||
@@ -278,12 +278,12 @@ impl ComputeNode {
|
||||
// so that all config operations are audit logged.
|
||||
match spec.audit_log_level
|
||||
{
|
||||
ComputeAudit::Hipaa => {
|
||||
ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => {
|
||||
phases.push(CreatePgauditExtension);
|
||||
phases.push(CreatePgauditlogtofileExtension);
|
||||
phases.push(DisablePostgresDBPgAudit);
|
||||
}
|
||||
ComputeAudit::Log => {
|
||||
ComputeAudit::Log | ComputeAudit::Base => {
|
||||
phases.push(CreatePgauditExtension);
|
||||
phases.push(DisablePostgresDBPgAudit);
|
||||
}
|
||||
|
||||
@@ -20,8 +20,10 @@ use compute_api::spec::ComputeMode;
|
||||
use control_plane::endpoint::ComputeControlPlane;
|
||||
use control_plane::local_env::{
|
||||
InitForceMode, LocalEnv, NeonBroker, NeonLocalInitConf, NeonLocalInitPageserverConf,
|
||||
SafekeeperConf,
|
||||
ObjectStorageConf, SafekeeperConf,
|
||||
};
|
||||
use control_plane::object_storage::OBJECT_STORAGE_DEFAULT_PORT;
|
||||
use control_plane::object_storage::ObjectStorage;
|
||||
use control_plane::pageserver::PageServerNode;
|
||||
use control_plane::safekeeper::SafekeeperNode;
|
||||
use control_plane::storage_controller::{
|
||||
@@ -39,7 +41,7 @@ use pageserver_api::controller_api::{
|
||||
use pageserver_api::models::{
|
||||
ShardParameters, TenantConfigRequest, TimelineCreateRequest, TimelineInfo,
|
||||
};
|
||||
use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId};
|
||||
use pageserver_api::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardStripeSize, TenantShardId};
|
||||
use postgres_backend::AuthType;
|
||||
use postgres_connection::parse_host_port;
|
||||
use safekeeper_api::membership::SafekeeperGeneration;
|
||||
@@ -91,6 +93,8 @@ enum NeonLocalCmd {
|
||||
#[command(subcommand)]
|
||||
Safekeeper(SafekeeperCmd),
|
||||
#[command(subcommand)]
|
||||
ObjectStorage(ObjectStorageCmd),
|
||||
#[command(subcommand)]
|
||||
Endpoint(EndpointCmd),
|
||||
#[command(subcommand)]
|
||||
Mappings(MappingsCmd),
|
||||
@@ -454,6 +458,32 @@ enum SafekeeperCmd {
|
||||
Restart(SafekeeperRestartCmdArgs),
|
||||
}
|
||||
|
||||
#[derive(clap::Subcommand)]
|
||||
#[clap(about = "Manage object storage")]
|
||||
enum ObjectStorageCmd {
|
||||
Start(ObjectStorageStartCmd),
|
||||
Stop(ObjectStorageStopCmd),
|
||||
}
|
||||
|
||||
#[derive(clap::Args)]
|
||||
#[clap(about = "Start object storage")]
|
||||
struct ObjectStorageStartCmd {
|
||||
#[clap(short = 't', long, help = "timeout until we fail the command")]
|
||||
#[arg(default_value = "10s")]
|
||||
start_timeout: humantime::Duration,
|
||||
}
|
||||
|
||||
#[derive(clap::Args)]
|
||||
#[clap(about = "Stop object storage")]
|
||||
struct ObjectStorageStopCmd {
|
||||
#[arg(value_enum, default_value = "fast")]
|
||||
#[clap(
|
||||
short = 'm',
|
||||
help = "If 'immediate', don't flush repository data at shutdown"
|
||||
)]
|
||||
stop_mode: StopMode,
|
||||
}
|
||||
|
||||
#[derive(clap::Args)]
|
||||
#[clap(about = "Start local safekeeper")]
|
||||
struct SafekeeperStartCmdArgs {
|
||||
@@ -759,6 +789,7 @@ fn main() -> Result<()> {
|
||||
}
|
||||
NeonLocalCmd::StorageBroker(subcmd) => rt.block_on(handle_storage_broker(&subcmd, env)),
|
||||
NeonLocalCmd::Safekeeper(subcmd) => rt.block_on(handle_safekeeper(&subcmd, env)),
|
||||
NeonLocalCmd::ObjectStorage(subcmd) => rt.block_on(handle_object_storage(&subcmd, env)),
|
||||
NeonLocalCmd::Endpoint(subcmd) => rt.block_on(handle_endpoint(&subcmd, env)),
|
||||
NeonLocalCmd::Mappings(subcmd) => handle_mappings(&subcmd, env),
|
||||
};
|
||||
@@ -975,6 +1006,9 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result<LocalEnv> {
|
||||
}
|
||||
})
|
||||
.collect(),
|
||||
object_storage: ObjectStorageConf {
|
||||
port: OBJECT_STORAGE_DEFAULT_PORT,
|
||||
},
|
||||
pg_distrib_dir: None,
|
||||
neon_distrib_dir: None,
|
||||
default_tenant_id: TenantId::from_array(std::array::from_fn(|_| 0)),
|
||||
@@ -1083,7 +1117,7 @@ async fn handle_tenant(subcmd: &TenantCmd, env: &mut local_env::LocalEnv) -> any
|
||||
stripe_size: args
|
||||
.shard_stripe_size
|
||||
.map(ShardStripeSize)
|
||||
.unwrap_or(ShardParameters::DEFAULT_STRIPE_SIZE),
|
||||
.unwrap_or(DEFAULT_STRIPE_SIZE),
|
||||
},
|
||||
placement_policy: args.placement_policy.clone(),
|
||||
config: tenant_conf,
|
||||
@@ -1396,7 +1430,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
|
||||
vec![(parsed.0, parsed.1.unwrap_or(5432))],
|
||||
// If caller is telling us what pageserver to use, this is not a tenant which is
|
||||
// full managed by storage controller, therefore not sharded.
|
||||
ShardParameters::DEFAULT_STRIPE_SIZE,
|
||||
DEFAULT_STRIPE_SIZE,
|
||||
)
|
||||
} else {
|
||||
// Look up the currently attached location of the tenant, and its striping metadata,
|
||||
@@ -1683,6 +1717,41 @@ async fn handle_safekeeper(subcmd: &SafekeeperCmd, env: &local_env::LocalEnv) ->
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn handle_object_storage(subcmd: &ObjectStorageCmd, env: &local_env::LocalEnv) -> Result<()> {
|
||||
use ObjectStorageCmd::*;
|
||||
let storage = ObjectStorage::from_env(env);
|
||||
|
||||
// In tests like test_forward_compatibility or test_graceful_cluster_restart
|
||||
// old neon binaries (without object_storage) are present
|
||||
if !storage.bin.exists() {
|
||||
eprintln!(
|
||||
"{} binary not found. Ignore if this is a compatibility test",
|
||||
storage.bin
|
||||
);
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
match subcmd {
|
||||
Start(ObjectStorageStartCmd { start_timeout }) => {
|
||||
if let Err(e) = storage.start(start_timeout).await {
|
||||
eprintln!("object_storage start failed: {e}");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
Stop(ObjectStorageStopCmd { stop_mode }) => {
|
||||
let immediate = match stop_mode {
|
||||
StopMode::Fast => false,
|
||||
StopMode::Immediate => true,
|
||||
};
|
||||
if let Err(e) = storage.stop(immediate) {
|
||||
eprintln!("proxy stop failed: {e}");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
};
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn handle_storage_broker(subcmd: &StorageBrokerCmd, env: &local_env::LocalEnv) -> Result<()> {
|
||||
match subcmd {
|
||||
StorageBrokerCmd::Start(args) => {
|
||||
@@ -1777,6 +1846,13 @@ async fn handle_start_all_impl(
|
||||
.map_err(|e| e.context(format!("start safekeeper {}", safekeeper.id)))
|
||||
});
|
||||
}
|
||||
|
||||
js.spawn(async move {
|
||||
ObjectStorage::from_env(env)
|
||||
.start(&retry_timeout)
|
||||
.await
|
||||
.map_err(|e| e.context("start object_storage"))
|
||||
});
|
||||
})();
|
||||
|
||||
let mut errors = Vec::new();
|
||||
@@ -1874,6 +1950,11 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
|
||||
}
|
||||
}
|
||||
|
||||
let storage = ObjectStorage::from_env(env);
|
||||
if let Err(e) = storage.stop(immediate) {
|
||||
eprintln!("object_storage stop failed: {:#}", e);
|
||||
}
|
||||
|
||||
for ps_conf in &env.pageservers {
|
||||
let pageserver = PageServerNode::from_env(env, ps_conf);
|
||||
if let Err(e) = pageserver.stop(immediate) {
|
||||
|
||||
@@ -29,7 +29,7 @@
|
||||
//! compute.log - log output of `compute_ctl` and `postgres`
|
||||
//! endpoint.json - serialized `EndpointConf` struct
|
||||
//! postgresql.conf - postgresql settings
|
||||
//! spec.json - passed to `compute_ctl`
|
||||
//! config.json - passed to `compute_ctl`
|
||||
//! pgdata/
|
||||
//! postgresql.conf - copy of postgresql.conf created by `compute_ctl`
|
||||
//! zenith.signal
|
||||
@@ -46,7 +46,9 @@ use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
|
||||
|
||||
use anyhow::{Context, Result, anyhow, bail};
|
||||
use compute_api::requests::ConfigurationRequest;
|
||||
use compute_api::responses::{ComputeCtlConfig, ComputeStatus, ComputeStatusResponse};
|
||||
use compute_api::responses::{
|
||||
ComputeConfig, ComputeCtlConfig, ComputeStatus, ComputeStatusResponse,
|
||||
};
|
||||
use compute_api::spec::{
|
||||
Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PgIdent,
|
||||
RemoteExtSpec, Role,
|
||||
@@ -619,86 +621,101 @@ impl Endpoint {
|
||||
remote_extensions = None;
|
||||
};
|
||||
|
||||
// Create spec file
|
||||
let mut spec = ComputeSpec {
|
||||
skip_pg_catalog_updates: self.skip_pg_catalog_updates,
|
||||
format_version: 1.0,
|
||||
operation_uuid: None,
|
||||
features: self.features.clone(),
|
||||
swap_size_bytes: None,
|
||||
disk_quota_bytes: None,
|
||||
disable_lfc_resizing: None,
|
||||
cluster: Cluster {
|
||||
cluster_id: None, // project ID: not used
|
||||
name: None, // project name: not used
|
||||
state: None,
|
||||
roles: if create_test_user {
|
||||
vec![Role {
|
||||
// Create config file
|
||||
let config = {
|
||||
let mut spec = ComputeSpec {
|
||||
skip_pg_catalog_updates: self.skip_pg_catalog_updates,
|
||||
format_version: 1.0,
|
||||
operation_uuid: None,
|
||||
features: self.features.clone(),
|
||||
swap_size_bytes: None,
|
||||
disk_quota_bytes: None,
|
||||
disable_lfc_resizing: None,
|
||||
cluster: Cluster {
|
||||
cluster_id: None, // project ID: not used
|
||||
name: None, // project name: not used
|
||||
state: None,
|
||||
roles: if create_test_user {
|
||||
vec![Role {
|
||||
name: PgIdent::from_str("test").unwrap(),
|
||||
encrypted_password: None,
|
||||
options: None,
|
||||
}]
|
||||
} else {
|
||||
Vec::new()
|
||||
},
|
||||
databases: if create_test_user {
|
||||
vec![Database {
|
||||
name: PgIdent::from_str("neondb").unwrap(),
|
||||
owner: PgIdent::from_str("test").unwrap(),
|
||||
options: None,
|
||||
restrict_conn: false,
|
||||
invalid: false,
|
||||
}]
|
||||
} else {
|
||||
Vec::new()
|
||||
},
|
||||
settings: None,
|
||||
postgresql_conf: Some(postgresql_conf.clone()),
|
||||
},
|
||||
delta_operations: None,
|
||||
tenant_id: Some(self.tenant_id),
|
||||
timeline_id: Some(self.timeline_id),
|
||||
project_id: None,
|
||||
branch_id: None,
|
||||
endpoint_id: Some(self.endpoint_id.clone()),
|
||||
mode: self.mode,
|
||||
pageserver_connstring: Some(pageserver_connstring),
|
||||
safekeepers_generation: safekeepers_generation.map(|g| g.into_inner()),
|
||||
safekeeper_connstrings,
|
||||
storage_auth_token: auth_token.clone(),
|
||||
remote_extensions,
|
||||
pgbouncer_settings: None,
|
||||
shard_stripe_size: Some(shard_stripe_size),
|
||||
local_proxy_config: None,
|
||||
reconfigure_concurrency: self.reconfigure_concurrency,
|
||||
drop_subscriptions_before_start: self.drop_subscriptions_before_start,
|
||||
audit_log_level: ComputeAudit::Disabled,
|
||||
logs_export_host: None::<String>,
|
||||
};
|
||||
|
||||
// this strange code is needed to support respec() in tests
|
||||
if self.cluster.is_some() {
|
||||
debug!("Cluster is already set in the endpoint spec, using it");
|
||||
spec.cluster = self.cluster.clone().unwrap();
|
||||
|
||||
debug!("spec.cluster {:?}", spec.cluster);
|
||||
|
||||
// fill missing fields again
|
||||
if create_test_user {
|
||||
spec.cluster.roles.push(Role {
|
||||
name: PgIdent::from_str("test").unwrap(),
|
||||
encrypted_password: None,
|
||||
options: None,
|
||||
}]
|
||||
} else {
|
||||
Vec::new()
|
||||
},
|
||||
databases: if create_test_user {
|
||||
vec![Database {
|
||||
});
|
||||
spec.cluster.databases.push(Database {
|
||||
name: PgIdent::from_str("neondb").unwrap(),
|
||||
owner: PgIdent::from_str("test").unwrap(),
|
||||
options: None,
|
||||
restrict_conn: false,
|
||||
invalid: false,
|
||||
}]
|
||||
} else {
|
||||
Vec::new()
|
||||
},
|
||||
settings: None,
|
||||
postgresql_conf: Some(postgresql_conf.clone()),
|
||||
},
|
||||
delta_operations: None,
|
||||
tenant_id: Some(self.tenant_id),
|
||||
timeline_id: Some(self.timeline_id),
|
||||
mode: self.mode,
|
||||
pageserver_connstring: Some(pageserver_connstring),
|
||||
safekeepers_generation: safekeepers_generation.map(|g| g.into_inner()),
|
||||
safekeeper_connstrings,
|
||||
storage_auth_token: auth_token.clone(),
|
||||
remote_extensions,
|
||||
pgbouncer_settings: None,
|
||||
shard_stripe_size: Some(shard_stripe_size),
|
||||
local_proxy_config: None,
|
||||
reconfigure_concurrency: self.reconfigure_concurrency,
|
||||
drop_subscriptions_before_start: self.drop_subscriptions_before_start,
|
||||
audit_log_level: ComputeAudit::Disabled,
|
||||
});
|
||||
}
|
||||
spec.cluster.postgresql_conf = Some(postgresql_conf);
|
||||
}
|
||||
|
||||
ComputeConfig {
|
||||
spec: Some(spec),
|
||||
compute_ctl_config: ComputeCtlConfig::default(),
|
||||
}
|
||||
};
|
||||
|
||||
// this strange code is needed to support respec() in tests
|
||||
if self.cluster.is_some() {
|
||||
debug!("Cluster is already set in the endpoint spec, using it");
|
||||
spec.cluster = self.cluster.clone().unwrap();
|
||||
|
||||
debug!("spec.cluster {:?}", spec.cluster);
|
||||
|
||||
// fill missing fields again
|
||||
if create_test_user {
|
||||
spec.cluster.roles.push(Role {
|
||||
name: PgIdent::from_str("test").unwrap(),
|
||||
encrypted_password: None,
|
||||
options: None,
|
||||
});
|
||||
spec.cluster.databases.push(Database {
|
||||
name: PgIdent::from_str("neondb").unwrap(),
|
||||
owner: PgIdent::from_str("test").unwrap(),
|
||||
options: None,
|
||||
restrict_conn: false,
|
||||
invalid: false,
|
||||
});
|
||||
}
|
||||
spec.cluster.postgresql_conf = Some(postgresql_conf);
|
||||
}
|
||||
|
||||
// TODO(tristan957): Remove the write to spec.json after compatibility
|
||||
// tests work themselves out
|
||||
let spec_path = self.endpoint_path().join("spec.json");
|
||||
std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
|
||||
std::fs::write(spec_path, serde_json::to_string_pretty(&config.spec)?)?;
|
||||
let config_path = self.endpoint_path().join("config.json");
|
||||
std::fs::write(config_path, serde_json::to_string_pretty(&config)?)?;
|
||||
|
||||
// Open log file. We'll redirect the stdout and stderr of `compute_ctl` to it.
|
||||
let logfile = std::fs::OpenOptions::new()
|
||||
@@ -706,6 +723,16 @@ impl Endpoint {
|
||||
.append(true)
|
||||
.open(self.endpoint_path().join("compute.log"))?;
|
||||
|
||||
// TODO(tristan957): Remove when compatibility tests are no longer an
|
||||
// issue
|
||||
let old_compute_ctl = {
|
||||
let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl"));
|
||||
let help_output = cmd.arg("--help").output()?;
|
||||
let help_output = String::from_utf8_lossy(&help_output.stdout);
|
||||
|
||||
!help_output.contains("--config")
|
||||
};
|
||||
|
||||
// Launch compute_ctl
|
||||
let conn_str = self.connstr("cloud_admin", "postgres");
|
||||
println!("Starting postgres node at '{}'", conn_str);
|
||||
@@ -724,9 +751,18 @@ impl Endpoint {
|
||||
])
|
||||
.args(["--pgdata", self.pgdata().to_str().unwrap()])
|
||||
.args(["--connstr", &conn_str])
|
||||
// TODO(tristan957): Change this to --config when compatibility tests
|
||||
// are no longer an issue
|
||||
.args([
|
||||
"--spec-path",
|
||||
self.endpoint_path().join("spec.json").to_str().unwrap(),
|
||||
self.endpoint_path()
|
||||
.join(if old_compute_ctl {
|
||||
"spec.json"
|
||||
} else {
|
||||
"config.json"
|
||||
})
|
||||
.to_str()
|
||||
.unwrap(),
|
||||
])
|
||||
.args([
|
||||
"--pgbin",
|
||||
@@ -869,10 +905,12 @@ impl Endpoint {
|
||||
stripe_size: Option<ShardStripeSize>,
|
||||
safekeepers: Option<Vec<NodeId>>,
|
||||
) -> Result<()> {
|
||||
let mut spec: ComputeSpec = {
|
||||
let spec_path = self.endpoint_path().join("spec.json");
|
||||
let file = std::fs::File::open(spec_path)?;
|
||||
serde_json::from_reader(file)?
|
||||
let (mut spec, compute_ctl_config) = {
|
||||
let config_path = self.endpoint_path().join("config.json");
|
||||
let file = std::fs::File::open(config_path)?;
|
||||
let config: ComputeConfig = serde_json::from_reader(file)?;
|
||||
|
||||
(config.spec.unwrap(), config.compute_ctl_config)
|
||||
};
|
||||
|
||||
let postgresql_conf = self.read_postgresql_conf()?;
|
||||
@@ -922,7 +960,7 @@ impl Endpoint {
|
||||
.body(
|
||||
serde_json::to_string(&ConfigurationRequest {
|
||||
spec,
|
||||
compute_ctl_config: ComputeCtlConfig::default(),
|
||||
compute_ctl_config,
|
||||
})
|
||||
.unwrap(),
|
||||
)
|
||||
|
||||
@@ -10,6 +10,7 @@ mod background_process;
|
||||
pub mod broker;
|
||||
pub mod endpoint;
|
||||
pub mod local_env;
|
||||
pub mod object_storage;
|
||||
pub mod pageserver;
|
||||
pub mod postgresql_conf;
|
||||
pub mod safekeeper;
|
||||
|
||||
@@ -15,9 +15,10 @@ use clap::ValueEnum;
|
||||
use postgres_backend::AuthType;
|
||||
use reqwest::Url;
|
||||
use serde::{Deserialize, Serialize};
use utils::auth::{Claims, encode_from_key_file};
use utils::auth::encode_from_key_file;
use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId};

use crate::object_storage::{OBJECT_STORAGE_REMOTE_STORAGE_DIR, ObjectStorage};
use crate::pageserver::{PAGESERVER_REMOTE_STORAGE_DIR, PageServerNode};
use crate::safekeeper::SafekeeperNode;

@@ -55,6 +56,7 @@ pub struct LocalEnv {

// used to issue tokens during e.g. pg start
pub private_key_path: PathBuf,
pub public_key_path: PathBuf,

pub broker: NeonBroker,

@@ -68,6 +70,8 @@ pub struct LocalEnv {

pub safekeepers: Vec<SafekeeperConf>,

pub object_storage: ObjectStorageConf,

// Control plane upcall API for pageserver: if None, we will not run storage_controller. If set, this will
// be propagated into each pageserver's configuration.
pub control_plane_api: Url,
@@ -95,6 +99,7 @@ pub struct OnDiskConfig {
pub neon_distrib_dir: PathBuf,
pub default_tenant_id: Option<TenantId>,
pub private_key_path: PathBuf,
pub public_key_path: PathBuf,
pub broker: NeonBroker,
pub storage_controller: NeonStorageControllerConf,
#[serde(
@@ -103,6 +108,7 @@ pub struct OnDiskConfig {
)]
pub pageservers: Vec<PageServerConf>,
pub safekeepers: Vec<SafekeeperConf>,
pub object_storage: ObjectStorageConf,
pub control_plane_api: Option<Url>,
pub control_plane_hooks_api: Option<Url>,
pub control_plane_compute_hook_api: Option<Url>,
@@ -136,11 +142,18 @@ pub struct NeonLocalInitConf {
pub storage_controller: Option<NeonStorageControllerConf>,
pub pageservers: Vec<NeonLocalInitPageserverConf>,
pub safekeepers: Vec<SafekeeperConf>,
pub object_storage: ObjectStorageConf,
pub control_plane_api: Option<Url>,
pub control_plane_hooks_api: Option<Url>,
pub generate_local_ssl_certs: bool,
}

#[derive(Serialize, Default, Deserialize, PartialEq, Eq, Clone, Debug)]
#[serde(default)]
pub struct ObjectStorageConf {
pub port: u16,
}

/// Broker config for cluster internal communication.
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
#[serde(default)]
@@ -398,6 +411,10 @@ impl LocalEnv {
self.pg_dir(pg_version, "lib")
}

pub fn object_storage_bin(&self) -> PathBuf {
self.neon_distrib_dir.join("object_storage")
}

pub fn pageserver_bin(&self) -> PathBuf {
self.neon_distrib_dir.join("pageserver")
}
@@ -431,6 +448,10 @@ impl LocalEnv {
self.base_data_dir.join("safekeepers").join(data_dir_name)
}

pub fn object_storage_data_dir(&self) -> PathBuf {
self.base_data_dir.join("object_storage")
}

pub fn get_pageserver_conf(&self, id: NodeId) -> anyhow::Result<&PageServerConf> {
if let Some(conf) = self.pageservers.iter().find(|node| node.id == id) {
Ok(conf)
@@ -582,6 +603,7 @@ impl LocalEnv {
neon_distrib_dir,
default_tenant_id,
private_key_path,
public_key_path,
broker,
storage_controller,
pageservers,
@@ -591,6 +613,7 @@ impl LocalEnv {
control_plane_compute_hook_api: _,
branch_name_mappings,
generate_local_ssl_certs,
object_storage,
} = on_disk_config;
LocalEnv {
base_data_dir: repopath.to_owned(),
@@ -598,6 +621,7 @@ impl LocalEnv {
neon_distrib_dir,
default_tenant_id,
private_key_path,
public_key_path,
broker,
storage_controller,
pageservers,
@@ -606,6 +630,7 @@ impl LocalEnv {
control_plane_hooks_api,
branch_name_mappings,
generate_local_ssl_certs,
object_storage,
}
};

@@ -705,6 +730,7 @@ impl LocalEnv {
neon_distrib_dir: self.neon_distrib_dir.clone(),
default_tenant_id: self.default_tenant_id,
private_key_path: self.private_key_path.clone(),
public_key_path: self.public_key_path.clone(),
broker: self.broker.clone(),
storage_controller: self.storage_controller.clone(),
pageservers: vec![], // it's skip_serializing anyway
@@ -714,6 +740,7 @@ impl LocalEnv {
control_plane_compute_hook_api: None,
branch_name_mappings: self.branch_name_mappings.clone(),
generate_local_ssl_certs: self.generate_local_ssl_certs,
object_storage: self.object_storage.clone(),
},
)
}
@@ -730,7 +757,7 @@ impl LocalEnv {
}

// this function is used only for testing purposes in CLI, e.g. to generate tokens during init
pub fn generate_auth_token(&self, claims: &Claims) -> anyhow::Result<String> {
pub fn generate_auth_token<S: Serialize>(&self, claims: &S) -> anyhow::Result<String> {
let private_key_path = self.get_private_key_path();
let key_data = fs::read(private_key_path)?;
encode_from_key_file(claims, &key_data)
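The `generate_auth_token` signature above becomes generic over any `S: Serialize` instead of the storage `Claims` struct. A minimal sketch of what that allows, using a hypothetical claims shape (the struct and field below are illustrative, not types from this change):

```rust
use serde::Serialize;

// Hypothetical claim shape; any Serialize type can be passed to the generic method.
#[derive(Serialize)]
struct ExampleClaims<'a> {
    compute_id: &'a str,
}

fn mint_example_token(env: &LocalEnv) -> anyhow::Result<String> {
    // Same call path as the existing storage `Claims` call sites; only the
    // claim type differs.
    env.generate_auth_token(&ExampleClaims { compute_id: "compute-illustrative" })
}
```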
@@ -797,6 +824,7 @@ impl LocalEnv {
control_plane_api,
generate_local_ssl_certs,
control_plane_hooks_api,
object_storage,
} = conf;

// Find postgres binaries.
@@ -828,6 +856,7 @@ impl LocalEnv {
)
.context("generate auth keys")?;
let private_key_path = PathBuf::from("auth_private_key.pem");
let public_key_path = PathBuf::from("auth_public_key.pem");

// create the runtime type because the remaining initialization code below needs
// a LocalEnv instance to operate on
@@ -838,6 +867,7 @@ impl LocalEnv {
neon_distrib_dir,
default_tenant_id: Some(default_tenant_id),
private_key_path,
public_key_path,
broker,
storage_controller: storage_controller.unwrap_or_default(),
pageservers: pageservers.iter().map(Into::into).collect(),
@@ -846,6 +876,7 @@ impl LocalEnv {
control_plane_hooks_api,
branch_name_mappings: Default::default(),
generate_local_ssl_certs,
object_storage,
};

if generate_local_ssl_certs {
@@ -873,8 +904,13 @@ impl LocalEnv {
.context("pageserver init failed")?;
}

ObjectStorage::from_env(&env)
.init()
.context("object storage init failed")?;

// set up the remote location for the default LocalFs remote storage
std::fs::create_dir_all(env.base_data_dir.join(PAGESERVER_REMOTE_STORAGE_DIR))?;
std::fs::create_dir_all(env.base_data_dir.join(OBJECT_STORAGE_REMOTE_STORAGE_DIR))?;

env.persist_config()
}
@@ -944,7 +980,7 @@ fn generate_ssl_ca_cert(cert_path: &Path, key_path: &Path) -> anyhow::Result<()>
// -out rootCA.crt -keyout rootCA.key
let keygen_output = Command::new("openssl")
.args([
"req", "-x509", "-newkey", "rsa:2048", "-nodes", "-days", "36500",
"req", "-x509", "-newkey", "ed25519", "-nodes", "-days", "36500",
])
.args(["-subj", "/CN=Neon Local CA"])
.args(["-out", cert_path.to_str().unwrap()])
@@ -974,7 +1010,7 @@ fn generate_ssl_cert(
// -subj "/CN=localhost" -addext "subjectAltName=DNS:localhost,IP:127.0.0.1"
let keygen_output = Command::new("openssl")
.args(["req", "-new", "-nodes"])
.args(["-newkey", "rsa:2048"])
.args(["-newkey", "ed25519"])
.args(["-subj", "/CN=localhost"])
.args(["-addext", "subjectAltName=DNS:localhost,IP:127.0.0.1"])
.args(["-keyout", key_path.to_str().unwrap()])
control_plane/src/object_storage.rs (new file, 107 lines)
@@ -0,0 +1,107 @@
use crate::background_process::{self, start_process, stop_process};
use crate::local_env::LocalEnv;
use anyhow::anyhow;
use anyhow::{Context, Result};
use camino::Utf8PathBuf;
use std::io::Write;
use std::time::Duration;

/// Directory within .neon which will be used by default for LocalFs remote storage.
pub const OBJECT_STORAGE_REMOTE_STORAGE_DIR: &str = "local_fs_remote_storage/object_storage";
pub const OBJECT_STORAGE_DEFAULT_PORT: u16 = 9993;

pub struct ObjectStorage {
    pub bin: Utf8PathBuf,
    pub data_dir: Utf8PathBuf,
    pub pemfile: Utf8PathBuf,
    pub port: u16,
}

impl ObjectStorage {
    pub fn from_env(env: &LocalEnv) -> ObjectStorage {
        ObjectStorage {
            bin: Utf8PathBuf::from_path_buf(env.object_storage_bin()).unwrap(),
            data_dir: Utf8PathBuf::from_path_buf(env.object_storage_data_dir()).unwrap(),
            pemfile: Utf8PathBuf::from_path_buf(env.public_key_path.clone()).unwrap(),
            port: env.object_storage.port,
        }
    }

    fn config_path(&self) -> Utf8PathBuf {
        self.data_dir.join("object_storage.json")
    }

    fn listen_addr(&self) -> Utf8PathBuf {
        format!("127.0.0.1:{}", self.port).into()
    }

    pub fn init(&self) -> Result<()> {
        println!("Initializing object storage in {:?}", self.data_dir);
        let parent = self.data_dir.parent().unwrap();

        #[derive(serde::Serialize)]
        struct Cfg {
            listen: Utf8PathBuf,
            pemfile: Utf8PathBuf,
            local_path: Utf8PathBuf,
            r#type: String,
        }
        let cfg = Cfg {
            listen: self.listen_addr(),
            pemfile: parent.join(self.pemfile.clone()),
            local_path: parent.join(OBJECT_STORAGE_REMOTE_STORAGE_DIR),
            r#type: "LocalFs".to_string(),
        };
        std::fs::create_dir_all(self.config_path().parent().unwrap())?;
        std::fs::write(self.config_path(), serde_json::to_string(&cfg)?)
            .context("write object storage config")?;
        Ok(())
    }

    pub async fn start(&self, retry_timeout: &Duration) -> Result<()> {
        println!("Starting s3 proxy at {}", self.listen_addr());
        std::io::stdout().flush().context("flush stdout")?;

        let process_status_check = || async {
            tokio::time::sleep(Duration::from_millis(500)).await;
            let res = reqwest::Client::new()
                .get(format!("http://{}/metrics", self.listen_addr()))
                .send()
                .await;
            match res {
                Ok(response) if response.status().is_success() => Ok(true),
                Ok(_) => Err(anyhow!("Failed to query /metrics")),
                Err(e) => Err(anyhow!("Failed to check node status: {e}")),
            }
        };

        let res = start_process(
            "object_storage",
            &self.data_dir.clone().into_std_path_buf(),
            &self.bin.clone().into_std_path_buf(),
            vec![self.config_path().to_string()],
            vec![("RUST_LOG".into(), "debug".into())],
            background_process::InitialPidFile::Create(self.pid_file()),
            retry_timeout,
            process_status_check,
        )
        .await;
        if res.is_err() {
            eprintln!("Logs:\n{}", std::fs::read_to_string(self.log_file())?);
        }

        res
    }

    pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
        stop_process(immediate, "object_storage", &self.pid_file())
    }

    fn log_file(&self) -> Utf8PathBuf {
        self.data_dir.join("object_storage.log")
    }

    fn pid_file(&self) -> Utf8PathBuf {
        self.data_dir.join("object_storage.pid")
    }
}
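For reference, a sketch of the config file that `init()` above writes to `object_storage.json`. The paths and port below are placeholders for whatever the local `.neon` layout and `ObjectStorageConf::port` resolve to, not values taken from this change:

```rust
// Standalone sketch mirroring the `Cfg` struct above; running it prints the
// shape of the JSON handed to the object_storage binary on startup.
use serde::Serialize;

#[derive(Serialize)]
struct Cfg {
    listen: String,
    pemfile: String,
    local_path: String,
    r#type: String,
}

fn main() -> Result<(), serde_json::Error> {
    let cfg = Cfg {
        listen: "127.0.0.1:9993".to_string(), // OBJECT_STORAGE_DEFAULT_PORT
        pemfile: "/placeholder/.neon/auth_public_key.pem".to_string(),
        local_path: "/placeholder/.neon/local_fs_remote_storage/object_storage".to_string(),
        r#type: "LocalFs".to_string(),
    };
    // e.g. {"listen":"127.0.0.1:9993","pemfile":"...","local_path":"...","type":"LocalFs"}
    println!("{}", serde_json::to_string_pretty(&cfg)?);
    Ok(())
}
```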
@@ -535,6 +535,11 @@ impl PageServerNode {
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'gc_compaction_enabled' as bool")?,
gc_compaction_verification: settings
.remove("gc_compaction_verification")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'gc_compaction_verification' as bool")?,
gc_compaction_initial_threshold_kb: settings
.remove("gc_compaction_initial_threshold_kb")
.map(|x| x.parse::<u64>())
@@ -13,7 +13,9 @@ use pageserver_api::controller_api::{
NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest,
TenantCreateResponse, TenantLocateResponse,
};
use pageserver_api::models::{TenantConfigRequest, TimelineCreateRequest, TimelineInfo};
use pageserver_api::models::{
TenantConfig, TenantConfigRequest, TimelineCreateRequest, TimelineInfo,
};
use pageserver_api::shard::TenantShardId;
use pageserver_client::mgmt_api::ResponseErrorMessageExt;
use postgres_backend::AuthType;
@@ -82,7 +84,8 @@ impl NeonStorageControllerStopArgs {
pub struct AttachHookRequest {
pub tenant_shard_id: TenantShardId,
pub node_id: Option<NodeId>,
pub generation_override: Option<i32>,
pub generation_override: Option<i32>, // only new tenants
pub config: Option<TenantConfig>, // only new tenants
}

#[derive(Serialize, Deserialize)]
@@ -805,6 +808,7 @@ impl StorageController {
tenant_shard_id,
node_id: Some(pageserver_id),
generation_override: None,
config: None,
};

let response = self
@@ -941,7 +941,7 @@ async fn main() -> anyhow::Result<()> {
let mut node_to_fill_descs = Vec::new();

for desc in node_descs {
let to_drain = nodes.iter().any(|id| *id == desc.id);
let to_drain = nodes.contains(&desc.id);
if to_drain {
node_to_drain_descs.push(desc);
} else {
@@ -11,8 +11,8 @@ generate_id() {

PG_VERSION=${PG_VERSION:-14}

SPEC_FILE_ORG=/var/db/postgres/specs/spec.json
SPEC_FILE=/tmp/spec.json
CONFIG_FILE_ORG=/var/db/postgres/configs/config.json
CONFIG_FILE=/tmp/config.json

echo "Waiting pageserver become ready."
while ! nc -z pageserver 6400; do
@@ -20,7 +20,7 @@ while ! nc -z pageserver 6400; do
done
echo "Page server is ready."

cp ${SPEC_FILE_ORG} ${SPEC_FILE}
cp ${CONFIG_FILE_ORG} ${CONFIG_FILE}

if [ -n "${TENANT_ID:-}" ] && [ -n "${TIMELINE_ID:-}" ]; then
tenant_id=${TENANT_ID}
@@ -73,17 +73,27 @@ else
ulid_extension=ulid
fi
echo "Adding pgx_ulid"
shared_libraries=$(jq -r '.cluster.settings[] | select(.name=="shared_preload_libraries").value' ${SPEC_FILE})
sed -i "s/${shared_libraries}/${shared_libraries},${ulid_extension}/" ${SPEC_FILE}
shared_libraries=$(jq -r '.spec.cluster.settings[] | select(.name=="shared_preload_libraries").value' ${CONFIG_FILE})
sed -i "s/${shared_libraries}/${shared_libraries},${ulid_extension}/" ${CONFIG_FILE}
echo "Overwrite tenant id and timeline id in spec file"
sed -i "s/TENANT_ID/${tenant_id}/" ${SPEC_FILE}
sed -i "s/TIMELINE_ID/${timeline_id}/" ${SPEC_FILE}
sed -i "s/TENANT_ID/${tenant_id}/" ${CONFIG_FILE}
sed -i "s/TIMELINE_ID/${timeline_id}/" ${CONFIG_FILE}

cat ${SPEC_FILE}
cat ${CONFIG_FILE}

# TODO(tristan957): Remove these workarounds for backwards compatibility after
# the next compute release. That includes these next few lines and the
# --spec-path in the compute_ctl invocation.
if compute_ctl --help | grep --quiet -- '--config'; then
SPEC_PATH="$CONFIG_FILE"
else
jq '.spec' < "$CONFIG_FILE" > /tmp/spec.json
SPEC_PATH=/tmp/spec.json
fi

echo "Start compute node"
/usr/local/bin/compute_ctl --pgdata /var/db/postgres/compute \
-C "postgresql://cloud_admin@localhost:55433/postgres" \
-b /usr/local/bin/postgres \
--compute-id "compute-$RANDOM" \
-S ${SPEC_FILE}
--spec-path "$SPEC_PATH"
@@ -0,0 +1,148 @@
{
  "spec": {
    "format_version": 1.0,

    "timestamp": "2022-10-12T18:00:00.000Z",
    "operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8c",

    "cluster": {
      "cluster_id": "docker_compose",
      "name": "docker_compose_test",
      "state": "restarted",
      "roles": [
        { "name": "cloud_admin", "encrypted_password": "b093c0d3b281ba6da1eacc608620abd8", "options": null }
      ],
      "databases": [],
      "settings": [
        { "name": "fsync", "value": "off", "vartype": "bool" },
        { "name": "wal_level", "value": "logical", "vartype": "enum" },
        { "name": "wal_log_hints", "value": "on", "vartype": "bool" },
        { "name": "log_connections", "value": "on", "vartype": "bool" },
        { "name": "port", "value": "55433", "vartype": "integer" },
        { "name": "shared_buffers", "value": "1MB", "vartype": "string" },
        { "name": "max_connections", "value": "100", "vartype": "integer" },
        { "name": "listen_addresses", "value": "0.0.0.0", "vartype": "string" },
        { "name": "max_wal_senders", "value": "10", "vartype": "integer" },
        { "name": "max_replication_slots", "value": "10", "vartype": "integer" },
        { "name": "wal_sender_timeout", "value": "5s", "vartype": "string" },
        { "name": "wal_keep_size", "value": "0", "vartype": "integer" },
        { "name": "password_encryption", "value": "md5", "vartype": "enum" },
        { "name": "restart_after_crash", "value": "off", "vartype": "bool" },
        { "name": "synchronous_standby_names", "value": "walproposer", "vartype": "string" },
        { "name": "shared_preload_libraries", "value": "neon,pg_cron,timescaledb,pg_stat_statements", "vartype": "string" },
        { "name": "neon.safekeepers", "value": "safekeeper1:5454,safekeeper2:5454,safekeeper3:5454", "vartype": "string" },
        { "name": "neon.timeline_id", "value": "TIMELINE_ID", "vartype": "string" },
        { "name": "neon.tenant_id", "value": "TENANT_ID", "vartype": "string" },
        { "name": "neon.pageserver_connstring", "value": "host=pageserver port=6400", "vartype": "string" },
        { "name": "max_replication_write_lag", "value": "500MB", "vartype": "string" },
        { "name": "max_replication_flush_lag", "value": "10GB", "vartype": "string" },
        { "name": "cron.database", "value": "postgres", "vartype": "string" }
      ]
    },

    "delta_operations": []
  },
  "compute_ctl_config": {
    "jwks": {
      "keys": []
    }
  }
}
@@ -1,141 +0,0 @@
{
  "format_version": 1.0,

  "timestamp": "2022-10-12T18:00:00.000Z",
  "operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8c",

  "cluster": {
    "cluster_id": "docker_compose",
    "name": "docker_compose_test",
    "state": "restarted",
    "roles": [
      { "name": "cloud_admin", "encrypted_password": "b093c0d3b281ba6da1eacc608620abd8", "options": null }
    ],
    "databases": [],
    "settings": [
      { "name": "fsync", "value": "off", "vartype": "bool" },
      { "name": "wal_level", "value": "logical", "vartype": "enum" },
      { "name": "wal_log_hints", "value": "on", "vartype": "bool" },
      { "name": "log_connections", "value": "on", "vartype": "bool" },
      { "name": "port", "value": "55433", "vartype": "integer" },
      { "name": "shared_buffers", "value": "1MB", "vartype": "string" },
      { "name": "max_connections", "value": "100", "vartype": "integer" },
      { "name": "listen_addresses", "value": "0.0.0.0", "vartype": "string" },
      { "name": "max_wal_senders", "value": "10", "vartype": "integer" },
      { "name": "max_replication_slots", "value": "10", "vartype": "integer" },
      { "name": "wal_sender_timeout", "value": "5s", "vartype": "string" },
      { "name": "wal_keep_size", "value": "0", "vartype": "integer" },
      { "name": "password_encryption", "value": "md5", "vartype": "enum" },
      { "name": "restart_after_crash", "value": "off", "vartype": "bool" },
      { "name": "synchronous_standby_names", "value": "walproposer", "vartype": "string" },
      { "name": "shared_preload_libraries", "value": "neon,pg_cron,timescaledb,pg_stat_statements", "vartype": "string" },
      { "name": "neon.safekeepers", "value": "safekeeper1:5454,safekeeper2:5454,safekeeper3:5454", "vartype": "string" },
      { "name": "neon.timeline_id", "value": "TIMELINE_ID", "vartype": "string" },
      { "name": "neon.tenant_id", "value": "TENANT_ID", "vartype": "string" },
      { "name": "neon.pageserver_connstring", "value": "host=pageserver port=6400", "vartype": "string" },
      { "name": "max_replication_write_lag", "value": "500MB", "vartype": "string" },
      { "name": "max_replication_flush_lag", "value": "10GB", "vartype": "string" },
      { "name": "cron.database", "value": "postgres", "vartype": "string" }
    ]
  },

  "delta_operations": []
}
@@ -159,7 +159,7 @@ services:
#- RUST_BACKTRACE=1
# Mount the test files directly, for faster editing cycle.
volumes:
- ./compute_wrapper/var/db/postgres/specs/:/var/db/postgres/specs/
- ./compute_wrapper/var/db/postgres/configs/:/var/db/postgres/configs/
- ./compute_wrapper/shell/:/shell/
ports:
- 55433:55433 # pg protocol handler
docs/rfcs/2025-04-14-storage-keys.md (new file, 242 lines)
@@ -0,0 +1,242 @@

# Storage Encryption Key Management

## Summary

As a precursor to adding new encryption capabilities to Neon's storage services, this RFC proposes
mechanisms for creating and storing fine-grained encryption keys for user data in Neon. We aim
to provide at least tenant granularity, but will use timeline granularity when it is simpler to do
so.

Out of scope:
- We describe an abstract KMS interface, but not particular platform implementations (such as how
  to authenticate with KMS).

## Terminology

_wrapped/unwrapped_: a wrapped encryption key is a key encrypted by another key. For example, the key for
encrypting a timeline's pageserver data might be wrapped by some "root" key for the tenant's user account, stored in a KMS system.

_key hierarchy_: the relationships between keys which wrap each other. For example, a layer file key might
be wrapped by a pageserver tenant key, which is wrapped by a tenant's root key.

## Design Choices

Storage: S3 will be the store of record for wrapped keys.

Separate keys: Safekeeper and Pageserver will use independent keys.

AES256: rather than building a generic system for keys, we will assume that all the keys
we manage are AES256 keys -- this is the de-facto standard for enterprise data storage.

Per-object keys: rather than encrypting data objects (layer files and segment files) with
the tenant keys directly, they will be encrypted with separate keys. This avoids cryptographic
safety issues from re-using the same key for large quantities of potentially repetitive plaintext.

S3 objects are self-contained: each encrypted file will have a metadata block in the file itself
storing the KMS-wrapped key to decrypt itself.

Key storage is optional at a per-tenant granularity: eventually this would be on by default, but:
- initially only some environments will have a KMS set up.
- Encryption has some overhead and it may be that some tenants don't want or need it.

## Design

### Summary of format changes

- Pageserver layer files and safekeeper segment objects are split into blocks and each
  block is encrypted by the layer key.
- Pageserver layer files and safekeeper segment objects get new metadata fields to
  store wrapped layer key and the KMS-wrapped timeline key.

### Summary of API changes

- Pageserver TenantConf API gets a new field for account ID
- Pageserver TenantConf API gets a new field for encryption mode
- Safekeeper timeline creation API gets a new field for account ID
- Controller, pageserver & safekeeper get a new timeline-scoped `rotate_key` API

### KMS interface

Neon will interoperate with different KMS APIs on different platforms. We will implement a generic interface,
similar to how `remote_storage` wraps different object storage APIs:
- `generate(accountId, keyType, alias) -> (wrapped key, plaintext key)`
- `unwrap(accountId, ciphertext key) -> plaintext key`

Hereafter, when we talk about generating or unwrapping a key, this means a call into the KMS API.

The KMS deals with abstract "account IDs", which are not equal to tenant IDs and may not be
1:1 with tenants. The account ID will be provided as part of tenant configuration, along
with a field to identify an encryption mode.
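A minimal sketch of what such a generic interface could look like in Rust, in the spirit of the `remote_storage` abstraction; the trait and type names here are illustrative only and are not fixed by this RFC:

```rust
/// Illustrative only: names and signatures are not part of the RFC.
pub struct WrappedKey(pub Vec<u8>);
pub struct PlaintextKey(pub [u8; 32]); // AES256 key material

pub enum KeyType {
    Aes256,
}

#[async_trait::async_trait]
pub trait Kms: Send + Sync {
    /// Generate a new key under the given account, returning both the
    /// KMS-wrapped form (safe to persist) and the plaintext form (held in memory).
    async fn generate(
        &self,
        account_id: &str,
        key_type: KeyType,
        alias: &str,
    ) -> anyhow::Result<(WrappedKey, PlaintextKey)>;

    /// Unwrap a previously generated key.
    async fn unwrap(
        &self,
        account_id: &str,
        wrapped: &WrappedKey,
    ) -> anyhow::Result<PlaintextKey>;
}
```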
### Pageserver Layer File Format

Encryption blocks are the minimum unit of read: to read any part of the data within an encryption block,
we must decrypt the whole block. All encryption blocks share the same layer key within the layer (is this safe?).

Image layers: each image is one encryption block.

Delta layers: for the first stage of the project, each delta is encrypted separately; in the future, we can batch
several small deltas into a single encryption block.

Indices: each B+ tree node is an encryption block.

Layer format:

```
| Data Block | Data Block | Data Block | ... | Index Block | Index Block | Index Block | Metadata |
Data block = encrypt(data, layer_key)
Index block = encrypt(index, layer_key); the index maps a key to an offset of the data block inside the layer file.
Metadata = wrap(layer_key, timeline_key), wrap_kms(tenant_key), and other metadata we want to store in the future
```

Note that we generate a random layer_key for each layer. We store the layer key wrapped by the current
tenant key (described in later sections) and the KMS-wrapped tenant key in the layer.

If data compression is enabled, the data is compressed first before being encrypted (is this safe?).

This file format is used across both object storage and local storage. We do not decrypt when downloading
the layer file to the disk. Decryption is done when reading the layer.
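A sketch of the metadata block this format implies, i.e. what a v2 layer would carry to stay self-contained; the struct and field names are placeholders, not a committed on-disk format:

```rust
/// Illustrative trailer for a v2 layer file; not a committed format.
pub struct LayerFileMetadataBlock {
    /// Random per-layer AES256 key, wrapped (encrypted) with the current tenant key.
    pub wrapped_layer_key: Vec<u8>,
    /// The tenant key itself, wrapped by the KMS under the tenant's account ID.
    /// Keeping it in the object lets the layer be decrypted without a separate lookup.
    pub kms_wrapped_tenant_key: Vec<u8>,
    /// Room for future metadata (e.g. key version or cipher parameters).
    pub extra: Vec<u8>,
}
```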
### Layer File Format Migration

We record the file format for each layer file in both the index_part and the layer file name (suffix v2?).
The layer file format version will be passed into the layer readers. The re-keying operation (described below)
will migrate all layer files automatically to v2.

### Safekeeper Segment Format

TBD

### Pageserver Timeline Index

We will add a `created_at` field for each layer file so that during re-keying (described in later sections)
we can determine which layer files to rewrite. We also record the offset of the metadata block so that it is
possible to obtain more information about the layer file without downloading the full layer file (i.e., the
exact timeline key being used to encrypt the layer file).

```
# LayerFileMetadata
{
  "format": 2,
  "created_at": "<time>",
  "metadata_block_offset": u64,
}
```

TODO: create an index for safekeeper so that it's faster to determine what files to re-key? Or we can scan all
files.

### Pageserver Key Cache

We keep a hashmap from KMS-wrapped tenant key to plaintext key for each tenant so that we do not need to repeatedly
unwrap the same key.
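A sketch of that cache, assuming the KMS-wrapped key bytes serve as the lookup key; illustrative only, not a proposed module layout:

```rust
use std::collections::HashMap;
use std::sync::Mutex;

/// Illustrative per-pageserver cache: KMS-wrapped tenant key -> plaintext AES256 key.
/// It exists to avoid a KMS `unwrap` call on every layer read.
#[derive(Default)]
pub struct TenantKeyCache {
    inner: Mutex<HashMap<Vec<u8>, [u8; 32]>>,
}

impl TenantKeyCache {
    pub fn get(&self, wrapped: &[u8]) -> Option<[u8; 32]> {
        self.inner.lock().unwrap().get(wrapped).copied()
    }

    pub fn insert(&self, wrapped: Vec<u8>, plaintext: [u8; 32]) {
        self.inner.lock().unwrap().insert(wrapped, plaintext);
    }
}
```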
### Key rotation

Each tenant stores a tenant key in memory to encrypt all layer files generated across all timelines within
its active period. When the key rotation API gets called, we rotate the timeline key in memory by calling the
KMS API to generate a new key-pair, and all new layer files' layer keys will be encrypted using this key.

### Re-keying

While re-keying and key-rotation are sometimes used synonymously, we distinguish them:
- Key rotation is generating a new key to use for new data
- Re-keying is rewriting existing data so that old keys are no longer used at all

Re-keying is a bulk data operation, and not fully defined in this RFC: it can be defined
quite simply as "For object in objects, if object key version is < the rekeying horizon,
then do a read/write cycle on the object using latest key". This is a simple but potentially very
expensive operation, so we discuss efficiency here.
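A sketch of that loop; the `Object` type and the rewrite step are placeholders for whatever the storage services actually expose, which this RFC deliberately leaves open:

```rust
/// Placeholder for a stored object (layer file or segment) plus the version of
/// the key that encrypted it.
pub struct Object {
    pub key_version: u64,
}

/// Placeholder: read/decrypt with the old key, re-encrypt with the latest key, write back.
async fn rewrite_with_latest_key(_object: &Object) -> anyhow::Result<()> {
    Ok(())
}

/// Illustrative outline of bulk re-keying, following the RFC's one-line definition.
pub async fn rekey(objects: Vec<Object>, rekey_horizon: u64) -> anyhow::Result<()> {
    for object in objects {
        if object.key_version < rekey_horizon {
            rewrite_with_latest_key(&object).await?;
        }
    }
    Ok(())
}
```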
#### Pageserver re-key

For pageservers, occasional rekeying may be implemented efficiently if one tolerates using
the last few keys and doesn't insist on the latest, because pageservers periodically rewrite
their data for GC-compaction anyway. Thus an API call to re-key any data with an overly old
key would often be a no-op because all data was rewritten recently anyway.

When object versioning is enabled in storage, re-keying is not fully accomplished by just
re-writing live data: old versions would still contain user data encrypted with older keys. To
fully re-key, an extra step is needed to purge old objects. Ideally, we should only purge
old objects which were encrypted using old keys. To this end, it would be useful to store
the encryption key version as metadata on objects, so that a scrub of deleted object versions
can efficiently select those objects that should be purged during re-key.

Checks on object versions should not only be on deleted objects: because pageserver can emit
"orphan" objects not referenced in the index under some circumstances, re-key must also
check non-deleted objects.

To summarize, the pageserver re-key operation is:
- Iterate over the index of layer files, select those with a too-old key and rewrite them
- Iterate over all versions in object storage, select those with a too-old key version
  in their metadata and purge them (with a safety check that these are not referenced
  by the latest index).

It would be wise to combine the re-key procedure with an exhaustive read of a timeline's data,
to ensure that when testing & rolling this feature out we are not rendering anything unreadable
due to bugs in implementation. Since we are deleting old versions in object storage, our
time travel recovery tool will not be any help if we get something wrong in this process.

#### Safekeeper re-key

Re-keying a safekeeper timeline requires an exhaustive walk of segment objects, reading
metadata on each one and deciding whether it requires a rewrite.

Safekeeper currently keeps historic objects forever, so re-keying this data will get
more expensive as time goes on. This would be a good time to add cleanup of old safekeeper
segments, but doing so is beyond the scope of this RFC.

### Enabling encryption for existing tenants

To enable encryption for an existing tenant, we may simply call the key-rotation API (to generate a key),
and then the re-key API (to rewrite existing data using this key).

## Observability

- To enable some external service to implement re-keying, we should publish per-timeline metrics
  on the age of the latest encryption key (a sketch follows this list).
- Calls to KMS should be tracked with typical request rate/result/latency histograms to enable
  detection of a slow KMS server and/or errors.
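A sketch of the per-timeline gauge from the first bullet, using the same `metrics` macros this change set already uses for the TLS certificate and HTTP connection counters; the metric name is a placeholder:

```rust
use metrics::{UIntGaugeVec, register_uint_gauge_vec};
use once_cell::sync::Lazy;

// Placeholder metric name: exposes the creation timestamp of the newest
// encryption key per timeline, from which an external service can derive key age.
static TIMELINE_KEY_CREATED_AT: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_timeline_encryption_key_created_at_seconds",
        "Unix timestamp of the latest encryption key for a timeline",
        &["tenant_id", "timeline_id"]
    )
    .expect("failed to define a metric")
});

fn record_key_rotation(tenant_id: &str, timeline_id: &str, created_at_unix: u64) {
    TIMELINE_KEY_CREATED_AT
        .with_label_values(&[tenant_id, timeline_id])
        .set(created_at_unix);
}
```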
## Alternatives considered

### Use same tenant key for safekeeper and pageserver

We could halve the number of keys in circulation by having the safekeeper and pageserver
share a key rather than working independently.

However, this would be substantially more complex to implement, as safekeepers and pageservers
currently share no storage, so some new communication path would be needed. There is minimal
upside in sharing a key.

### No KMS dependency

We could choose to do all key management ourselves. However, the industry standard approach
to enabling users of cloud SaaS software to self-manage keys is to use the KMS as the intermediary
between our system and the user's control of their key. Although this RFC does not propose user-managed keys, we should design with this in mind.

### Do all key generation/wrapping in KMS service

We could avoid generating and wrapping/unwrapping object keys in our storage
services by delegating all responsibility for key operations to the KMS. However,
KMS services have limited throughput and in some cases may charge per operation, so
it is useful to avoid doing KMS operations per-object, and restrict them to per-timeline
frequency.

### Per-tenant instead of per-timeline pageserver keys

For tenants with many timelines, we may reduce load on KMS service by
using per-tenant instead of per-timeline keys, so that we may do operations
such as creating a timeline without needing to do a KMS unwrap operation.

However, per-timeline key management is much simpler to implement on the safekeeper,
which currently has no concept of a tenant (other than as a namespace for timelines).
It is also slightly simpler to implement on the pageserver, as it avoids implementing
a tenant-scoped creation operation to initialize keys (instead, we may initialize keys
during timeline creation).

As a side benefit, per-timeline key management also enables implementing secure deletion in future
at a per-timeline granularity.
@@ -151,7 +151,7 @@ Example body:
```
{
  "tenant_id": "1f359dd625e519a1a4e8d7509690f6fc",
  "stripe_size": 32768,
  "stripe_size": 2048,
  "shards": [
    {"node_id": 344, "shard_number": 0},
    {"node_id": 722, "shard_number": 1},
@@ -5,6 +5,14 @@ use crate::privilege::Privilege;
use crate::responses::ComputeCtlConfig;
use crate::spec::{ComputeSpec, ExtVersion, PgIdent};

/// When making requests to the `compute_ctl` external HTTP server, the client
/// must specify a set of claims in `Authorization` header JWTs such that
/// `compute_ctl` can authorize the request.
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct ComputeClaims {
pub compute_id: String,
}

/// Request of the /configure API
///
/// We now pass only `spec` in the configuration request, but later we can
@@ -30,9 +38,3 @@ pub struct SetRoleGrantsRequest {
pub privileges: Vec<Privilege>,
pub role: PgIdent,
}

/// Request of the /configure_telemetry API
#[derive(Debug, Deserialize, Serialize)]
pub struct ConfigureTelemetryRequest {
pub logs_export_host: Option<String>,
}
@@ -14,6 +14,32 @@ pub struct GenericAPIError {
pub error: String,
}

/// All configuration parameters necessary for a compute. When
/// [`ComputeConfig::spec`] is provided, it means that the compute is attached
/// to a tenant. [`ComputeConfig::compute_ctl_config`] will always be provided
/// and contains parameters necessary for operating `compute_ctl` independently
/// of whether a tenant is attached to the compute or not.
///
/// This also happens to be the body of `compute_ctl`'s /configure request.
#[derive(Debug, Deserialize, Serialize)]
pub struct ComputeConfig {
/// The compute spec
pub spec: Option<ComputeSpec>,

/// The compute_ctl configuration
#[allow(dead_code)]
pub compute_ctl_config: ComputeCtlConfig,
}

impl From<ControlPlaneConfigResponse> for ComputeConfig {
fn from(value: ControlPlaneConfigResponse) -> Self {
Self {
spec: value.spec,
compute_ctl_config: value.compute_ctl_config,
}
}
}
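A sketch of how a consumer of the new type could decode a `config.json`, assuming the `ComputeConfig` and `ComputeCtlConfig` types above; the JSON literal mirrors the shape of the docker-compose `config.json` added earlier in this change, trimmed to its two top-level fields:

```rust
// Minimal decode sketch; relies on the ComputeConfig/ComputeCtlConfig types above
// and serde_json.
fn parse_example() -> Result<(), serde_json::Error> {
    let body = r#"{
        "spec": null,
        "compute_ctl_config": { "jwks": { "keys": [] } }
    }"#;
    let config: ComputeConfig = serde_json::from_str(body)?;
    // With "spec": null the compute starts unattached; a spec can be supplied
    // later through the /configure request whose body is this same type.
    assert!(config.spec.is_none());
    Ok(())
}
```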
#[derive(Debug, Clone, Serialize)]
pub struct ExtensionInstallResponse {
pub extension: PgIdent,
@@ -161,7 +187,7 @@ pub struct TlsConfig {

/// Response of the `/computes/{compute_id}/spec` control-plane API.
#[derive(Deserialize, Debug)]
pub struct ControlPlaneSpecResponse {
pub struct ControlPlaneConfigResponse {
pub spec: Option<ComputeSpec>,
pub status: ControlPlaneComputeStatus,
pub compute_ctl_config: ComputeCtlConfig,
@@ -1,8 +1,8 @@
//! `ComputeSpec` represents the contents of the spec.json file.
//!
//! The spec.json file is used to pass information to 'compute_ctl'. It contains
//! all the information needed to start up the right version of PostgreSQL,
//! and connect it to the storage nodes.
//! The ComputeSpec contains all the information needed to start up
//! the right version of PostgreSQL, and connect it to the storage nodes.
//! It can be passed as part of the `config.json`, or the control plane can
//! provide it by calling the compute_ctl's `/compute_ctl` endpoint, or
//! compute_ctl can fetch it by calling the control plane's API.
use std::collections::HashMap;

use indexmap::IndexMap;
@@ -104,6 +104,12 @@ pub struct ComputeSpec {
pub timeline_id: Option<TimelineId>,
pub pageserver_connstring: Option<String>,

// More neon ids that we expose to the compute_ctl
// and to postgres as neon extension GUCs.
pub project_id: Option<String>,
pub branch_id: Option<String>,
pub endpoint_id: Option<String>,

/// Safekeeper membership config generation. It is put in
/// neon.safekeepers GUC and serves two purposes:
/// 1) Non zero value forces walproposer to use membership configurations.
@@ -159,15 +165,13 @@ pub struct ComputeSpec {
#[serde(default)] // Default false
pub drop_subscriptions_before_start: bool,

/// Log level for audit logging:
///
/// Disabled - no audit logging. This is the default.
/// log - log masked statements to the postgres log using pgaudit extension
/// hipaa - log unmasked statements to the file using pgaudit and pgauditlogtofile extension
///
/// Extensions should be present in shared_preload_libraries
/// Log level for compute audit logging
#[serde(default)]
pub audit_log_level: ComputeAudit,

/// Hostname and the port of the otel collector. Leave empty to disable Postgres logs forwarding.
/// Example: config-shy-breeze-123-collector-monitoring.neon-telemetry.svc.cluster.local:10514
pub logs_export_host: Option<String>,
}

/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
@@ -179,9 +183,6 @@ pub enum ComputeFeature {
/// track short-lived connections as user activity.
ActivityMonitorExperimental,

/// Allow to configure rsyslog for Postgres logs export
PostgresLogsExport,

/// This is a special feature flag that is used to represent unknown feature flags.
/// Basically all unknown to enum flags are represented as this one. See unit test
/// `parse_unknown_features()` for more details.
@@ -288,14 +289,25 @@ impl ComputeMode {
}

/// Log level for audit logging
/// Disabled, log, hipaa
/// Default is Disabled
#[derive(Clone, Debug, Default, Eq, PartialEq, Deserialize, Serialize)]
pub enum ComputeAudit {
#[default]
Disabled,
// Deprecated, use Base instead
Log,
// (pgaudit.log = 'ddl', pgaudit.log_parameter='off')
// logged to the standard postgresql log stream
Base,
// Deprecated, use Full or Extended instead
Hipaa,
// (pgaudit.log = 'all, -misc', pgaudit.log_parameter='off')
// logged to separate files collected by rsyslog
// into dedicated log storage with strict access
Extended,
// (pgaudit.log='all', pgaudit.log_parameter='on'),
// logged to separate files collected by rsyslog
// into dedicated log storage with strict access.
Full,
}
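The variant comments above spell out the intended pgaudit settings for `Base`, `Extended`, and `Full`. A hedged sketch of a mapping function built only from those comments; treating the deprecated `Log` and `Hipaa` variants as aliases of `Base` and `Full` is an assumption here, and the real wiring lives elsewhere in compute_ctl:

```rust
/// Illustrative mapping from audit level to (pgaudit.log, pgaudit.log_parameter),
/// derived from the variant comments above; not the actual compute_ctl code path.
fn pgaudit_settings(level: &ComputeAudit) -> Option<(&'static str, &'static str)> {
    match level {
        ComputeAudit::Disabled => None,
        // Assumption: deprecated variants behave like their replacements.
        ComputeAudit::Log | ComputeAudit::Base => Some(("ddl", "off")),
        ComputeAudit::Extended => Some(("all, -misc", "off")),
        ComputeAudit::Hipaa | ComputeAudit::Full => Some(("all", "on")),
    }
}
```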
#[derive(Clone, Debug, Default, Deserialize, Serialize, PartialEq, Eq)]

@@ -30,6 +30,7 @@ tokio.workspace = true
tracing.workspace = true
url.workspace = true
uuid.workspace = true
x509-cert.workspace = true

# to use tokio channels as streams, this is faster to compile than async_stream
# why is it only here? no other crate should use it, streams are rarely needed.
@@ -4,6 +4,8 @@ use futures::StreamExt;
use futures::stream::FuturesUnordered;
use hyper0::Body;
use hyper0::server::conn::Http;
use metrics::{IntCounterVec, register_int_counter_vec};
use once_cell::sync::Lazy;
use routerify::{RequestService, RequestServiceBuilder};
use tokio::io::{AsyncRead, AsyncWrite};
use tokio_rustls::TlsAcceptor;
@@ -26,6 +28,24 @@ pub struct Server {
tls_acceptor: Option<TlsAcceptor>,
}

static CONNECTION_STARTED_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"http_server_connection_started_total",
"Number of established http/https connections",
&["scheme"]
)
.expect("failed to define a metric")
});

static CONNECTION_ERROR_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"http_server_connection_errors_total",
"Number of occured connection errors by type",
&["type"]
)
.expect("failed to define a metric")
});

impl Server {
pub fn new(
request_service: Arc<RequestServiceBuilder<Body, ApiError>>,
@@ -60,6 +80,15 @@ impl Server {
false
}

let tcp_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["tcp"]);
let tls_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["tls"]);
let http_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["http"]);
let https_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["https"]);
let panic_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["panic"]);

let http_connection_cnt = CONNECTION_STARTED_COUNT.with_label_values(&["http"]);
let https_connection_cnt = CONNECTION_STARTED_COUNT.with_label_values(&["https"]);

let mut connections = FuturesUnordered::new();
loop {
tokio::select! {
@@ -67,6 +96,7 @@ impl Server {
let (tcp_stream, remote_addr) = match stream {
Ok(stream) => stream,
Err(err) => {
tcp_error_cnt.inc();
if !suppress_io_error(&err) {
info!("Failed to accept TCP connection: {err:#}");
}
@@ -78,11 +108,18 @@ impl Server {
let tls_acceptor = self.tls_acceptor.clone();
let cancel = cancel.clone();

let tls_error_cnt = tls_error_cnt.clone();
let http_error_cnt = http_error_cnt.clone();
let https_error_cnt = https_error_cnt.clone();
let http_connection_cnt = http_connection_cnt.clone();
let https_connection_cnt = https_connection_cnt.clone();

connections.push(tokio::spawn(
async move {
match tls_acceptor {
Some(tls_acceptor) => {
// Handle HTTPS connection.
https_connection_cnt.inc();
let tls_stream = tokio::select! {
tls_stream = tls_acceptor.accept(tcp_stream) => tls_stream,
_ = cancel.cancelled() => return,
@@ -90,6 +127,7 @@ impl Server {
let tls_stream = match tls_stream {
Ok(tls_stream) => tls_stream,
Err(err) => {
tls_error_cnt.inc();
if !suppress_io_error(&err) {
info!(%remote_addr, "Failed to accept TLS connection: {err:#}");
}
@@ -97,6 +135,7 @@ impl Server {
}
};
if let Err(err) = Self::serve_connection(tls_stream, service, cancel).await {
https_error_cnt.inc();
if !suppress_hyper_error(&err) {
info!(%remote_addr, "Failed to serve HTTPS connection: {err:#}");
}
@@ -104,7 +143,9 @@ impl Server {
}
None => {
// Handle HTTP connection.
http_connection_cnt.inc();
if let Err(err) = Self::serve_connection(tcp_stream, service, cancel).await {
http_error_cnt.inc();
if !suppress_hyper_error(&err) {
info!(%remote_addr, "Failed to serve HTTP connection: {err:#}");
}
@@ -115,6 +156,7 @@ impl Server {
}
Some(conn) = connections.next() => {
if let Err(err) = conn {
panic_error_cnt.inc();
error!("Connection panicked: {err:#}");
}
}
@@ -122,6 +164,7 @@ impl Server {
// Wait for graceful shutdown of all connections.
while let Some(conn) = connections.next().await {
if let Err(err) = conn {
panic_error_cnt.inc();
error!("Connection panicked: {err:#}");
}
}
@@ -3,11 +3,14 @@ use std::{sync::Arc, time::Duration};
use anyhow::Context;
use arc_swap::ArcSwap;
use camino::Utf8Path;
use metrics::{IntCounterVec, UIntGaugeVec, register_int_counter_vec, register_uint_gauge_vec};
use once_cell::sync::Lazy;
use rustls::{
pki_types::{CertificateDer, PrivateKeyDer},
pki_types::{CertificateDer, PrivateKeyDer, UnixTime},
server::{ClientHello, ResolvesServerCert},
sign::CertifiedKey,
};
use x509_cert::der::Reader;

pub async fn load_cert_chain(filename: &Utf8Path) -> anyhow::Result<Vec<CertificateDer<'static>>> {
let cert_data = tokio::fs::read(filename)
@@ -53,6 +56,76 @@ pub async fn load_certified_key(
Ok(certified_key)
}

/// rustls's CertifiedKey with extra parsed fields used for metrics.
struct ParsedCertifiedKey {
certified_key: CertifiedKey,
expiration_time: UnixTime,
}

/// Parse expiration time from an X509 certificate.
fn parse_expiration_time(cert: &CertificateDer<'_>) -> anyhow::Result<UnixTime> {
let parsed_cert = x509_cert::der::SliceReader::new(cert)
.context("Failed to parse cerficiate")?
.decode::<x509_cert::Certificate>()
.context("Failed to parse cerficiate")?;

Ok(UnixTime::since_unix_epoch(
parsed_cert
.tbs_certificate
.validity
.not_after
.to_unix_duration(),
))
}

async fn load_and_parse_certified_key(
key_filename: &Utf8Path,
cert_filename: &Utf8Path,
) -> anyhow::Result<ParsedCertifiedKey> {
let certified_key = load_certified_key(key_filename, cert_filename).await?;
let expiration_time = parse_expiration_time(certified_key.end_entity_cert()?)?;
Ok(ParsedCertifiedKey {
certified_key,
expiration_time,
})
}

static CERT_EXPIRATION_TIME: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"tls_certs_expiration_time_seconds",
"Expiration time of the loaded certificate since unix epoch in seconds",
&["resolver_name"]
)
.expect("failed to define a metric")
});

static CERT_RELOAD_STARTED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"tls_certs_reload_started_total",
"Number of certificate reload loop iterations started",
&["resolver_name"]
)
.expect("failed to define a metric")
});

static CERT_RELOAD_UPDATED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"tls_certs_reload_updated_total",
"Number of times the certificate was updated to the new one",
&["resolver_name"]
)
.expect("failed to define a metric")
});

static CERT_RELOAD_FAILED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"tls_certs_reload_failed_total",
"Number of times the certificate reload failed",
&["resolver_name"]
)
.expect("failed to define a metric")
});

/// Implementation of [`rustls::server::ResolvesServerCert`] which reloads certificates from
/// the disk periodically.
#[derive(Debug)]
@@ -63,16 +136,28 @@ pub struct ReloadingCertificateResolver {
impl ReloadingCertificateResolver {
/// Creates a new Resolver by loading certificate and private key from FS and
/// creating tokio::task to reload them with provided reload_period.
/// resolver_name is used as metric's label.
pub async fn new(
resolver_name: &str,
key_filename: &Utf8Path,
cert_filename: &Utf8Path,
reload_period: Duration,
) -> anyhow::Result<Arc<Self>> {
// Create metrics for current resolver.
let cert_expiration_time = CERT_EXPIRATION_TIME.with_label_values(&[resolver_name]);
let cert_reload_started_counter =
CERT_RELOAD_STARTED_COUNTER.with_label_values(&[resolver_name]);
let cert_reload_updated_counter =
CERT_RELOAD_UPDATED_COUNTER.with_label_values(&[resolver_name]);
let cert_reload_failed_counter =
CERT_RELOAD_FAILED_COUNTER.with_label_values(&[resolver_name]);

let parsed_key = load_and_parse_certified_key(key_filename, cert_filename).await?;

let this = Arc::new(Self {
certified_key: ArcSwap::from_pointee(
load_certified_key(key_filename, cert_filename).await?,
),
certified_key: ArcSwap::from_pointee(parsed_key.certified_key),
});
cert_expiration_time.set(parsed_key.expiration_time.as_secs());

tokio::spawn({
let weak_this = Arc::downgrade(&this);
@@ -88,17 +173,22 @@ impl ReloadingCertificateResolver {
Some(this) => this,
None => break, // Resolver has been destroyed, exit.
};
match load_certified_key(&key_filename, &cert_filename).await {
Ok(new_certified_key) => {
if new_certified_key.cert == this.certified_key.load().cert {
cert_reload_started_counter.inc();

match load_and_parse_certified_key(&key_filename, &cert_filename).await {
Ok(parsed_key) => {
if parsed_key.certified_key.cert == this.certified_key.load().cert {
tracing::debug!("Certificate has not changed since last reloading");
} else {
tracing::info!("Certificate has been reloaded");
this.certified_key.store(Arc::new(new_certified_key));
this.certified_key.store(Arc::new(parsed_key.certified_key));
cert_expiration_time.set(parsed_key.expiration_time.as_secs());
cert_reload_updated_counter.inc();
}
last_reload_failed = false;
}
Err(err) => {
cert_reload_failed_counter.inc();
// Note: Reloading certs may fail if it conflicts with the script updating
// the files at the same time. Warn only if the error is persistent.
if last_reload_failed {
@@ -180,6 +180,7 @@ pub struct ConfigToml {
#[serde(skip_serializing_if = "Option::is_none")]
pub generate_unarchival_heatmap: Option<bool>,
pub tracing: Option<Tracing>,
pub enable_tls_page_service_api: bool,
}

#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -206,6 +207,10 @@ pub struct PageServicePipeliningConfigPipelined {
/// Causes runtime errors if larger than max get_vectored batch size.
pub max_batch_size: NonZeroUsize,
pub execution: PageServiceProtocolPipelinedExecutionStrategy,
// The default below is such that new versions of the software can start
// with the old configuration.
#[serde(default)]
pub batching: PageServiceProtocolPipelinedBatchingStrategy,
}

#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -215,6 +220,19 @@ pub enum PageServiceProtocolPipelinedExecutionStrategy {
Tasks,
}

#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(rename_all = "kebab-case")]
pub enum PageServiceProtocolPipelinedBatchingStrategy {
/// All get page requests in a batch will be at the same LSN
#[default]
UniformLsn,
/// Get page requests in a batch may be at different LSN
///
/// One key cannot be present more than once at different LSNs in
/// the same batch.
ScatteredLsn,
}

#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(tag = "mode", rename_all = "kebab-case")]
pub enum GetVectoredConcurrentIo {
@@ -451,6 +469,8 @@ pub struct TenantConfigToml {
// gc-compaction related configs
/// Enable automatic gc-compaction trigger on this tenant.
pub gc_compaction_enabled: bool,
/// Enable verification of gc-compaction results.
pub gc_compaction_verification: bool,
/// The initial threshold for gc-compaction in KB. Once the total size of layers below the gc-horizon is above this threshold,
/// gc-compaction will be triggered.
pub gc_compaction_initial_threshold_kb: u64,
@@ -612,9 +632,12 @@ impl Default for ConfigToml {
page_service_pipelining: if !cfg!(test) {
PageServicePipeliningConfig::Serial
} else {
// Do not turn this into the default until scattered reads have been
// validated and rolled-out fully.
PageServicePipeliningConfig::Pipelined(PageServicePipeliningConfigPipelined {
max_batch_size: NonZeroUsize::new(32).unwrap(),
execution: PageServiceProtocolPipelinedExecutionStrategy::ConcurrentFutures,
batching: PageServiceProtocolPipelinedBatchingStrategy::ScatteredLsn,
})
},
get_vectored_concurrent_io: if !cfg!(test) {
@@ -631,6 +654,7 @@ impl Default for ConfigToml {
load_previous_heatmap: None,
generate_unarchival_heatmap: None,
tracing: None,
enable_tls_page_service_api: false,
}
}
}
@@ -690,6 +714,7 @@ pub mod tenant_conf_defaults {
// image layers should be created.
pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2;
pub const DEFAULT_GC_COMPACTION_ENABLED: bool = false;
pub const DEFAULT_GC_COMPACTION_VERIFICATION: bool = true;
pub const DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB: u64 = 5 * 1024 * 1024; // 5GB
pub const DEFAULT_GC_COMPACTION_RATIO_PERCENT: u64 = 100;
}
@@ -744,6 +769,7 @@ impl Default for TenantConfigToml {
wal_receiver_protocol_override: None,
rel_size_v2_enabled: false,
gc_compaction_enabled: DEFAULT_GC_COMPACTION_ENABLED,
gc_compaction_verification: DEFAULT_GC_COMPACTION_VERIFICATION,
gc_compaction_initial_threshold_kb: DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB,
gc_compaction_ratio_percent: DEFAULT_GC_COMPACTION_RATIO_PERCENT,
sampling_ratio: None,
@@ -7,7 +7,8 @@ use std::time::{Duration, Instant};
/// API (`/control/v1` prefix). Implemented by the server
/// in [`storage_controller::http`]
use serde::{Deserialize, Serialize};
use utils::id::{NodeId, TenantId};
use utils::id::{NodeId, TenantId, TimelineId};
use utils::lsn::Lsn;

use crate::models::{PageserverUtilization, ShardParameters, TenantConfig};
use crate::shard::{ShardStripeSize, TenantShardId};
@@ -499,6 +500,15 @@ pub struct SafekeeperSchedulingPolicyRequest {
pub scheduling_policy: SkSchedulingPolicy,
}

/// Import request for safekeeper timelines.
#[derive(Serialize, Deserialize, Clone)]
pub struct TimelineImportRequest {
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
pub start_lsn: Lsn,
pub sk_set: Vec<NodeId>,
}

#[cfg(test)]
mod test {
use serde_json;
@@ -927,7 +927,7 @@ impl Key {

/// Guaranteed to return `Ok()` if [`Self::is_rel_block_key`] returns `true` for `key`.
#[inline(always)]
pub fn to_rel_block(self) -> anyhow::Result<(RelTag, BlockNumber)> {
pub fn to_rel_block(self) -> Result<(RelTag, BlockNumber), ToRelBlockError> {
Ok(match self.field1 {
0x00 => (
RelTag {
@@ -938,7 +938,7 @@ impl Key {
},
self.field6,
),
_ => anyhow::bail!("unexpected value kind 0x{:02x}", self.field1),
_ => return Err(ToRelBlockError(self.field1)),
})
}
}
@@ -951,6 +951,17 @@ impl std::str::FromStr for Key {
}
}

#[derive(Debug)]
pub struct ToRelBlockError(u8);

impl fmt::Display for ToRelBlockError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "unexpected value kind 0x{:02x}", self.0)
}
}

impl std::error::Error for ToRelBlockError {}
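A short sketch of what the typed error buys a caller: it still converts into `anyhow` when wanted, but can carry extra context; `key` stands for any pageserver `Key` value here:

```rust
use anyhow::Context;

// Illustrative caller, assuming the Key/RelTag/BlockNumber types from this file.
fn rel_block_of(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
    key.to_rel_block()
        // ToRelBlockError implements std::error::Error, so it converts cleanly
        // into anyhow with context added at the call site.
        .with_context(|| format!("{key:?} is not a relation block key"))
}
```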
#[cfg(test)]
mod tests {
use std::str::FromStr;
@@ -613,8 +613,7 @@ mod tests {
use rand::{RngCore, SeedableRng};

use super::*;
use crate::models::ShardParameters;
use crate::shard::{ShardCount, ShardNumber};
use crate::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardNumber, ShardStripeSize};

// Helper function to create a key range.
//
@@ -964,12 +963,8 @@ mod tests {
}
#[test]
fn sharded_range_relation_gap() {
let shard_identity = ShardIdentity::new(
ShardNumber(0),
ShardCount::new(4),
ShardParameters::DEFAULT_STRIPE_SIZE,
)
.unwrap();
let shard_identity =
ShardIdentity::new(ShardNumber(0), ShardCount::new(4), DEFAULT_STRIPE_SIZE).unwrap();

let range = ShardedRange::new(
Range {
@@ -985,12 +980,8 @@ mod tests {

#[test]
fn shard_identity_keyspaces_single_key() {
let shard_identity = ShardIdentity::new(
ShardNumber(1),
ShardCount::new(4),
ShardParameters::DEFAULT_STRIPE_SIZE,
)
.unwrap();
let shard_identity =
ShardIdentity::new(ShardNumber(1), ShardCount::new(4), DEFAULT_STRIPE_SIZE).unwrap();

let range = ShardedRange::new(
Range {
@@ -1034,12 +1025,8 @@ mod tests {

#[test]
fn shard_identity_keyspaces_forkno_gap() {
let shard_identity = ShardIdentity::new(
ShardNumber(1),
ShardCount::new(4),
ShardParameters::DEFAULT_STRIPE_SIZE,
)
.unwrap();
let shard_identity =
ShardIdentity::new(ShardNumber(1), ShardCount::new(4), DEFAULT_STRIPE_SIZE).unwrap();

let range = ShardedRange::new(
Range {
@@ -1061,7 +1048,7 @@ mod tests {
let shard_identity = ShardIdentity::new(
ShardNumber(shard_number),
ShardCount::new(4),
ShardParameters::DEFAULT_STRIPE_SIZE,
DEFAULT_STRIPE_SIZE,
)
.unwrap();

@@ -1144,37 +1131,44 @@ mod tests {
/// for a single tenant.
#[test]
fn sharded_range_fragment_simple() {
const SHARD_COUNT: u8 = 4;
const STRIPE_SIZE: u32 = DEFAULT_STRIPE_SIZE.0;

let shard_identity = ShardIdentity::new(
ShardNumber(0),
ShardCount::new(4),
ShardParameters::DEFAULT_STRIPE_SIZE,
ShardCount::new(SHARD_COUNT),
ShardStripeSize(STRIPE_SIZE),
)
.unwrap();

// A range which we happen to know covers exactly one stripe which belongs to this shard
let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
let input_end = Key::from_hex("000000067f00000001000000ae0000008000").unwrap();
let mut input_end = input_start;
input_end.field6 += STRIPE_SIZE; // field6 is block number

// Ask for stripe_size blocks, we get the whole stripe
assert_eq!(
do_fragment(input_start, input_end, &shard_identity, 32768),
(32768, vec![(32768, input_start..input_end)])
do_fragment(input_start, input_end, &shard_identity, STRIPE_SIZE),
(STRIPE_SIZE, vec![(STRIPE_SIZE, input_start..input_end)])
);

// Ask for more, we still get the whole stripe
assert_eq!(
do_fragment(input_start, input_end, &shard_identity, 10000000),
(32768, vec![(32768, input_start..input_end)])
do_fragment(input_start, input_end, &shard_identity, 10 * STRIPE_SIZE),
(STRIPE_SIZE, vec![(STRIPE_SIZE, input_start..input_end)])
);

// Ask for target_nblocks of half the stripe size, we get two halves
assert_eq!(
do_fragment(input_start, input_end, &shard_identity, 16384),
do_fragment(input_start, input_end, &shard_identity, STRIPE_SIZE / 2),
(
32768,
STRIPE_SIZE,
vec![
(16384, input_start..input_start.add(16384)),
(16384, input_start.add(16384)..input_end)
(
STRIPE_SIZE / 2,
|
||||
input_start..input_start.add(STRIPE_SIZE / 2)
|
||||
),
|
||||
(STRIPE_SIZE / 2, input_start.add(STRIPE_SIZE / 2)..input_end)
|
||||
]
|
||||
)
|
||||
);
|
||||
@@ -1182,40 +1176,53 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn sharded_range_fragment_multi_stripe() {
|
||||
const SHARD_COUNT: u8 = 4;
|
||||
const STRIPE_SIZE: u32 = DEFAULT_STRIPE_SIZE.0;
|
||||
const RANGE_SIZE: u32 = SHARD_COUNT as u32 * STRIPE_SIZE;
|
||||
|
||||
let shard_identity = ShardIdentity::new(
|
||||
ShardNumber(0),
|
||||
ShardCount::new(4),
|
||||
ShardParameters::DEFAULT_STRIPE_SIZE,
|
||||
ShardCount::new(SHARD_COUNT),
|
||||
ShardStripeSize(STRIPE_SIZE),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
// A range which covers multiple stripes, exactly one of which belongs to the current shard.
|
||||
let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
|
||||
let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap();
|
||||
let mut input_end = input_start;
|
||||
input_end.field6 += RANGE_SIZE; // field6 is block number
|
||||
|
||||
// Ask for all the blocks, get a fragment that covers the whole range but reports
|
||||
// its size to be just the blocks belonging to our shard.
|
||||
assert_eq!(
|
||||
do_fragment(input_start, input_end, &shard_identity, 131072),
|
||||
(32768, vec![(32768, input_start..input_end)])
|
||||
do_fragment(input_start, input_end, &shard_identity, RANGE_SIZE),
|
||||
(STRIPE_SIZE, vec![(STRIPE_SIZE, input_start..input_end)])
|
||||
);
|
||||
|
||||
// Ask for a sub-stripe quantity
|
||||
// Ask for a sub-stripe quantity that results in 3 fragments.
|
||||
let limit = STRIPE_SIZE / 3 + 1;
|
||||
assert_eq!(
|
||||
do_fragment(input_start, input_end, &shard_identity, 16000),
|
||||
do_fragment(input_start, input_end, &shard_identity, limit),
|
||||
(
|
||||
32768,
|
||||
STRIPE_SIZE,
|
||||
vec![
|
||||
(16000, input_start..input_start.add(16000)),
|
||||
(16000, input_start.add(16000)..input_start.add(32000)),
|
||||
(768, input_start.add(32000)..input_end),
|
||||
(limit, input_start..input_start.add(limit)),
|
||||
(limit, input_start.add(limit)..input_start.add(2 * limit)),
|
||||
(
|
||||
STRIPE_SIZE - 2 * limit,
|
||||
input_start.add(2 * limit)..input_end
|
||||
),
|
||||
]
|
||||
)
|
||||
);
|
||||
|
||||
// Try on a range that starts slightly after our owned stripe
|
||||
assert_eq!(
|
||||
do_fragment(input_start.add(1), input_end, &shard_identity, 131072),
|
||||
(32767, vec![(32767, input_start.add(1)..input_end)])
|
||||
do_fragment(input_start.add(1), input_end, &shard_identity, RANGE_SIZE),
|
||||
(
|
||||
STRIPE_SIZE - 1,
|
||||
vec![(STRIPE_SIZE - 1, input_start.add(1)..input_end)]
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
@@ -1223,32 +1230,40 @@ mod tests {
|
||||
/// a previous relation.
|
||||
#[test]
|
||||
fn sharded_range_fragment_starting_from_logical_size() {
|
||||
const SHARD_COUNT: u8 = 4;
|
||||
const STRIPE_SIZE: u32 = DEFAULT_STRIPE_SIZE.0;
|
||||
const RANGE_SIZE: u32 = SHARD_COUNT as u32 * STRIPE_SIZE;
|
||||
|
||||
let input_start = Key::from_hex("000000067f00000001000000ae00ffffffff").unwrap();
|
||||
let input_end = Key::from_hex("000000067f00000001000000ae0100008000").unwrap();
|
||||
let mut input_end = Key::from_hex("000000067f00000001000000ae0100000000").unwrap();
|
||||
input_end.field6 += RANGE_SIZE; // field6 is block number
|
||||
|
||||
// Shard 0 owns the first stripe in the relation, and the preceding logical size is shard local too
|
||||
let shard_identity = ShardIdentity::new(
|
||||
ShardNumber(0),
|
||||
ShardCount::new(4),
|
||||
ShardParameters::DEFAULT_STRIPE_SIZE,
|
||||
ShardCount::new(SHARD_COUNT),
|
||||
ShardStripeSize(STRIPE_SIZE),
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
do_fragment(input_start, input_end, &shard_identity, 0x10000),
|
||||
(0x8001, vec![(0x8001, input_start..input_end)])
|
||||
do_fragment(input_start, input_end, &shard_identity, 2 * STRIPE_SIZE),
|
||||
(
|
||||
STRIPE_SIZE + 1,
|
||||
vec![(STRIPE_SIZE + 1, input_start..input_end)]
|
||||
)
|
||||
);
|
||||
|
||||
// Shard 1 does not own the first stripe in the relation, but it does own the logical size (all shards
|
||||
// store all logical sizes)
|
||||
let shard_identity = ShardIdentity::new(
|
||||
ShardNumber(1),
|
||||
ShardCount::new(4),
|
||||
ShardParameters::DEFAULT_STRIPE_SIZE,
|
||||
ShardCount::new(SHARD_COUNT),
|
||||
ShardStripeSize(STRIPE_SIZE),
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
do_fragment(input_start, input_end, &shard_identity, 0x10000),
|
||||
(0x1, vec![(0x1, input_start..input_end)])
|
||||
do_fragment(input_start, input_end, &shard_identity, 2 * STRIPE_SIZE),
|
||||
(1, vec![(1, input_start..input_end)])
|
||||
);
|
||||
}
|
||||
|
||||
@@ -1284,12 +1299,8 @@ mod tests {
|
||||
);
|
||||
|
||||
// Same, but using a sharded identity
|
||||
let shard_identity = ShardIdentity::new(
|
||||
ShardNumber(0),
|
||||
ShardCount::new(4),
|
||||
ShardParameters::DEFAULT_STRIPE_SIZE,
|
||||
)
|
||||
.unwrap();
|
||||
let shard_identity =
|
||||
ShardIdentity::new(ShardNumber(0), ShardCount::new(4), DEFAULT_STRIPE_SIZE).unwrap();
|
||||
assert_eq!(
|
||||
do_fragment(input_start, input_end, &shard_identity, 0x8000),
|
||||
(u32::MAX, vec![(u32::MAX, input_start..input_end),])
|
||||
@@ -1331,7 +1342,7 @@ mod tests {
|
||||
ShardIdentity::new(
|
||||
ShardNumber((prng.next_u32() % shard_count) as u8),
|
||||
ShardCount::new(shard_count as u8),
|
||||
ShardParameters::DEFAULT_STRIPE_SIZE,
|
||||
DEFAULT_STRIPE_SIZE,
|
||||
)
|
||||
.unwrap()
|
||||
};
|
||||
|
||||
@@ -26,7 +26,7 @@ use utils::{completion, serde_system_time};
|
||||
use crate::config::Ratio;
|
||||
use crate::key::{CompactKey, Key};
|
||||
use crate::reltag::RelTag;
|
||||
use crate::shard::{ShardCount, ShardStripeSize, TenantShardId};
|
||||
use crate::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardStripeSize, TenantShardId};
|
||||
|
||||
/// The state of a tenant in this pageserver.
|
||||
///
|
||||
@@ -80,10 +80,22 @@ pub enum TenantState {
|
||||
///
|
||||
/// Transitions out of this state are possible through `set_broken()`.
|
||||
Stopping {
|
||||
/// The barrier can be used to wait for shutdown to complete. The first caller to set
|
||||
/// Some(Barrier) is responsible for driving shutdown to completion. Subsequent callers
|
||||
/// will wait for the first caller's existing barrier.
|
||||
///
|
||||
/// None is set when an attach is cancelled, to signal to shutdown that the attach has in
|
||||
/// fact been cancelled:
|
||||
///
|
||||
/// 1. `shutdown` sees `TenantState::Attaching`, and cancels the tenant.
|
||||
/// 2. `attach` sets `TenantState::Stopping(None)` and exits.
|
||||
/// 3. `set_stopping` waits for `TenantState::Stopping(None)` and sets
|
||||
/// `TenantState::Stopping(Some)` to claim the barrier as the shutdown owner.
|
||||
//
|
||||
// Because of https://github.com/serde-rs/serde/issues/2105 this has to be a named field,
|
||||
// otherwise it will not be skipped during deserialization
|
||||
#[serde(skip)]
|
||||
progress: completion::Barrier,
|
||||
progress: Option<completion::Barrier>,
|
||||
},
|
||||
/// The tenant is recognized by the pageserver, but can no longer be used for
|
||||
/// any operations.
|
||||
@@ -426,8 +438,6 @@ pub struct ShardParameters {
|
||||
}
|
||||
|
||||
impl ShardParameters {
|
||||
pub const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
|
||||
|
||||
pub fn is_unsharded(&self) -> bool {
|
||||
self.count.is_unsharded()
|
||||
}
|
||||
@@ -437,7 +447,7 @@ impl Default for ShardParameters {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
count: ShardCount::new(0),
|
||||
stripe_size: Self::DEFAULT_STRIPE_SIZE,
|
||||
stripe_size: DEFAULT_STRIPE_SIZE,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -566,6 +576,8 @@ pub struct TenantConfigPatch {
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub gc_compaction_enabled: FieldPatch<bool>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub gc_compaction_verification: FieldPatch<bool>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub gc_compaction_initial_threshold_kb: FieldPatch<u64>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub gc_compaction_ratio_percent: FieldPatch<u64>,
|
||||
@@ -686,6 +698,9 @@ pub struct TenantConfig {
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub gc_compaction_enabled: Option<bool>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub gc_compaction_verification: Option<bool>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub gc_compaction_initial_threshold_kb: Option<u64>,
|
||||
|
||||
@@ -734,6 +749,7 @@ impl TenantConfig {
|
||||
mut wal_receiver_protocol_override,
|
||||
mut rel_size_v2_enabled,
|
||||
mut gc_compaction_enabled,
|
||||
mut gc_compaction_verification,
|
||||
mut gc_compaction_initial_threshold_kb,
|
||||
mut gc_compaction_ratio_percent,
|
||||
mut sampling_ratio,
|
||||
@@ -825,6 +841,9 @@ impl TenantConfig {
|
||||
patch
|
||||
.gc_compaction_enabled
|
||||
.apply(&mut gc_compaction_enabled);
|
||||
patch
|
||||
.gc_compaction_verification
|
||||
.apply(&mut gc_compaction_verification);
|
||||
patch
|
||||
.gc_compaction_initial_threshold_kb
|
||||
.apply(&mut gc_compaction_initial_threshold_kb);
|
||||
@@ -866,6 +885,7 @@ impl TenantConfig {
|
||||
wal_receiver_protocol_override,
|
||||
rel_size_v2_enabled,
|
||||
gc_compaction_enabled,
|
||||
gc_compaction_verification,
|
||||
gc_compaction_initial_threshold_kb,
|
||||
gc_compaction_ratio_percent,
|
||||
sampling_ratio,
|
||||
@@ -964,6 +984,9 @@ impl TenantConfig {
|
||||
gc_compaction_enabled: self
|
||||
.gc_compaction_enabled
|
||||
.unwrap_or(global_conf.gc_compaction_enabled),
|
||||
gc_compaction_verification: self
|
||||
.gc_compaction_verification
|
||||
.unwrap_or(global_conf.gc_compaction_verification),
|
||||
gc_compaction_initial_threshold_kb: self
|
||||
.gc_compaction_initial_threshold_kb
|
||||
.unwrap_or(global_conf.gc_compaction_initial_threshold_kb),
|
||||
@@ -1668,6 +1691,7 @@ pub struct SecondaryProgress {
|
||||
pub struct TenantScanRemoteStorageShard {
|
||||
pub tenant_shard_id: TenantShardId,
|
||||
pub generation: Option<u32>,
|
||||
pub stripe_size: Option<ShardStripeSize>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Default)]
|
||||
@@ -2719,10 +2743,15 @@ mod tests {
|
||||
"Activating",
|
||||
),
|
||||
(line!(), TenantState::Active, "Active"),
|
||||
(
|
||||
line!(),
|
||||
TenantState::Stopping { progress: None },
|
||||
"Stopping",
|
||||
),
|
||||
(
|
||||
line!(),
|
||||
TenantState::Stopping {
|
||||
progress: utils::completion::Barrier::default(),
|
||||
progress: Some(completion::Barrier::default()),
|
||||
},
|
||||
"Stopping",
|
||||
),
|
||||
|
||||
@@ -58,6 +58,8 @@ pub enum NeonWalRecord {
|
||||
/// to true. This record does not need the history WALs to reconstruct. See [`NeonWalRecord::will_init`] and
|
||||
/// its references in `timeline.rs`.
|
||||
will_init: bool,
|
||||
/// Only append the record if the current image is the same as the one specified in this field.
|
||||
only_if: Option<String>,
|
||||
},
|
||||
}
|
||||
|
||||
@@ -81,6 +83,17 @@ impl NeonWalRecord {
|
||||
append: s.as_ref().to_string(),
|
||||
clear: false,
|
||||
will_init: false,
|
||||
only_if: None,
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
pub fn wal_append_conditional(s: impl AsRef<str>, only_if: impl AsRef<str>) -> Self {
|
||||
Self::Test {
|
||||
append: s.as_ref().to_string(),
|
||||
clear: false,
|
||||
will_init: false,
|
||||
only_if: Some(only_if.as_ref().to_string()),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -90,6 +103,7 @@ impl NeonWalRecord {
|
||||
append: s.as_ref().to_string(),
|
||||
clear: true,
|
||||
will_init: false,
|
||||
only_if: None,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -99,6 +113,7 @@ impl NeonWalRecord {
|
||||
append: s.as_ref().to_string(),
|
||||
clear: true,
|
||||
will_init: true,
|
||||
only_if: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -78,6 +78,12 @@ impl Default for ShardStripeSize {
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for ShardStripeSize {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
self.0.fmt(f)
|
||||
}
|
||||
}
|
||||
|
||||
/// Layout version: for future upgrades where we might change how the key->shard mapping works
|
||||
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Hash, Debug)]
|
||||
pub struct ShardLayout(u8);
|
||||
@@ -86,8 +92,11 @@ const LAYOUT_V1: ShardLayout = ShardLayout(1);
|
||||
/// ShardIdentity uses a magic layout value to indicate if it is unusable
|
||||
const LAYOUT_BROKEN: ShardLayout = ShardLayout(255);
|
||||
|
||||
/// Default stripe size in pages: 256MiB divided by 8kiB page size.
|
||||
const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
|
||||
/// The default stripe size in pages. 16 MiB divided by 8 kiB page size.
|
||||
///
|
||||
/// A lower stripe size distributes ingest load better across shards, but reduces IO amortization.
|
||||
/// 16 MiB appears to be a reasonable balance: <https://github.com/neondatabase/neon/pull/10510>.
|
||||
pub const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(16 * 1024 / 8);
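A quick illustrative check of the arithmetic (a sketch, not taken from the patch): with 8 KiB pages, the new and old defaults work out to 2048 and 32768 pages per stripe respectively.

const PAGE_SIZE_KIB: u32 = 8;
// 16 MiB / 8 KiB pages = 2048 pages per stripe (the new default above)
const NEW_DEFAULT_STRIPE_PAGES: u32 = 16 * 1024 / PAGE_SIZE_KIB;
// 256 MiB / 8 KiB pages = 32768 pages per stripe (the previous default)
const OLD_DEFAULT_STRIPE_PAGES: u32 = 256 * 1024 / PAGE_SIZE_KIB;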
|
||||
|
||||
#[derive(thiserror::Error, Debug, PartialEq, Eq)]
|
||||
pub enum ShardConfigError {
|
||||
@@ -537,7 +546,7 @@ mod tests {
|
||||
field6: 0x7d06,
|
||||
};
|
||||
|
||||
let shard = key_to_shard_number(ShardCount(10), DEFAULT_STRIPE_SIZE, &key);
|
||||
let shard = key_to_shard_number(ShardCount(10), ShardStripeSize(32768), &key);
|
||||
assert_eq!(shard, ShardNumber(8));
|
||||
}
|
||||
|
||||
|
||||
@@ -5,7 +5,6 @@
|
||||
#![deny(unsafe_code)]
|
||||
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||
use std::future::Future;
|
||||
use std::io::ErrorKind;
|
||||
use std::net::SocketAddr;
|
||||
use std::os::fd::{AsRawFd, RawFd};
|
||||
use std::pin::Pin;
|
||||
@@ -227,7 +226,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> MaybeWriteOnly<IO> {
|
||||
match self {
|
||||
MaybeWriteOnly::Full(framed) => framed.read_startup_message().await,
|
||||
MaybeWriteOnly::WriteOnly(_) => {
|
||||
Err(io::Error::new(ErrorKind::Other, "reading from write only half").into())
|
||||
Err(io::Error::other("reading from write only half").into())
|
||||
}
|
||||
MaybeWriteOnly::Broken => panic!("IO on invalid MaybeWriteOnly"),
|
||||
}
|
||||
@@ -237,7 +236,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> MaybeWriteOnly<IO> {
|
||||
match self {
|
||||
MaybeWriteOnly::Full(framed) => framed.read_message().await,
|
||||
MaybeWriteOnly::WriteOnly(_) => {
|
||||
Err(io::Error::new(ErrorKind::Other, "reading from write only half").into())
|
||||
Err(io::Error::other("reading from write only half").into())
|
||||
}
|
||||
MaybeWriteOnly::Broken => panic!("IO on invalid MaybeWriteOnly"),
|
||||
}
|
||||
@@ -975,7 +974,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> AsyncWrite for CopyDataWriter<'_, IO> {
|
||||
.write_message_noflush(&BeMessage::CopyData(buf))
|
||||
// write_message only writes to the buffer, so it can fail iff the
|
||||
// message is invalid, but CopyData can't be invalid.
|
||||
.map_err(|_| io::Error::new(ErrorKind::Other, "failed to serialize CopyData"))?;
|
||||
.map_err(|_| io::Error::other("failed to serialize CopyData"))?;
|
||||
|
||||
Poll::Ready(Ok(buf.len()))
|
||||
}
|
||||
|
||||
@@ -85,8 +85,8 @@ static KEY: Lazy<rustls::pki_types::PrivateKeyDer<'static>> = Lazy::new(|| {
|
||||
|
||||
static CERT: Lazy<rustls::pki_types::CertificateDer<'static>> = Lazy::new(|| {
|
||||
let mut cursor = Cursor::new(include_bytes!("cert.pem"));
|
||||
let cert = rustls_pemfile::certs(&mut cursor).next().unwrap().unwrap();
|
||||
cert
|
||||
|
||||
rustls_pemfile::certs(&mut cursor).next().unwrap().unwrap()
|
||||
});
|
||||
|
||||
// test that basic select with ssl works
|
||||
|
||||
@@ -35,7 +35,7 @@ impl ConnectionError {
|
||||
pub fn into_io_error(self) -> io::Error {
|
||||
match self {
|
||||
ConnectionError::Io(io) => io,
|
||||
ConnectionError::Protocol(pe) => io::Error::new(io::ErrorKind::Other, pe.to_string()),
|
||||
ConnectionError::Protocol(pe) => io::Error::other(pe.to_string()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -257,7 +257,7 @@ pub enum ProtocolError {
|
||||
impl ProtocolError {
|
||||
/// Proxy stream.rs uses only io::Error; provide it.
|
||||
pub fn into_io_error(self) -> io::Error {
|
||||
io::Error::new(io::ErrorKind::Other, self.to_string())
|
||||
io::Error::other(self.to_string())
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -212,7 +212,7 @@ impl ScramSha256 {
|
||||
password,
|
||||
channel_binding,
|
||||
} => (nonce, password, channel_binding),
|
||||
_ => return Err(io::Error::new(io::ErrorKind::Other, "invalid SCRAM state")),
|
||||
_ => return Err(io::Error::other("invalid SCRAM state")),
|
||||
};
|
||||
|
||||
let message =
|
||||
@@ -291,7 +291,7 @@ impl ScramSha256 {
|
||||
server_key,
|
||||
auth_message,
|
||||
} => (server_key, auth_message),
|
||||
_ => return Err(io::Error::new(io::ErrorKind::Other, "invalid SCRAM state")),
|
||||
_ => return Err(io::Error::other("invalid SCRAM state")),
|
||||
};
|
||||
|
||||
let message =
|
||||
@@ -301,10 +301,7 @@ impl ScramSha256 {
|
||||
|
||||
let verifier = match parsed {
|
||||
ServerFinalMessage::Error(e) => {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::Other,
|
||||
format!("SCRAM error: {}", e),
|
||||
));
|
||||
return Err(io::Error::other(format!("SCRAM error: {}", e)));
|
||||
}
|
||||
ServerFinalMessage::Verifier(verifier) => verifier,
|
||||
};
|
||||
|
||||
@@ -28,7 +28,7 @@ toml_edit.workspace = true
|
||||
tracing.workspace = true
|
||||
scopeguard.workspace = true
|
||||
metrics.workspace = true
|
||||
utils.workspace = true
|
||||
utils = { path = "../utils", default-features = false }
|
||||
pin-project-lite.workspace = true
|
||||
|
||||
azure_core.workspace = true
|
||||
|
||||
@@ -801,8 +801,7 @@ where
|
||||
// that support needs to be hacked in.
|
||||
//
|
||||
// including {self:?} into the message would be useful, but unsure how to unproject.
|
||||
_ => std::task::Poll::Ready(Err(std::io::Error::new(
|
||||
std::io::ErrorKind::Other,
|
||||
_ => std::task::Poll::Ready(Err(std::io::Error::other(
|
||||
"cloned or initial values cannot be read",
|
||||
))),
|
||||
}
|
||||
@@ -855,7 +854,7 @@ where
|
||||
};
|
||||
Err(azure_core::error::Error::new(
|
||||
azure_core::error::ErrorKind::Io,
|
||||
std::io::Error::new(std::io::ErrorKind::Other, msg),
|
||||
std::io::Error::other(msg),
|
||||
))
|
||||
}
|
||||
|
||||
|
||||
@@ -5,7 +5,8 @@ edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[features]
|
||||
default = []
|
||||
default = ["rename_noreplace"]
|
||||
rename_noreplace = []
|
||||
# Enables test-only APIs, including failpoints. In particular, enables the `fail_point!` macro,
|
||||
# which adds some runtime cost to run tests on outage conditions
|
||||
testing = ["fail/failpoints"]
|
||||
@@ -35,7 +36,7 @@ serde_with.workspace = true
|
||||
serde_json.workspace = true
|
||||
signal-hook.workspace = true
|
||||
thiserror.workspace = true
|
||||
tokio.workspace = true
|
||||
tokio = { workspace = true, features = ["signal"] }
|
||||
tokio-tar.workspace = true
|
||||
tokio-util.workspace = true
|
||||
toml_edit = { workspace = true, features = ["serde"] }
|
||||
|
||||
@@ -173,7 +173,7 @@ impl std::fmt::Debug for JwtAuth {
|
||||
}
|
||||
|
||||
// this function is used only for testing purposes in the CLI, e.g. to generate tokens during init
|
||||
pub fn encode_from_key_file(claims: &Claims, key_data: &[u8]) -> Result<String> {
|
||||
pub fn encode_from_key_file<S: Serialize>(claims: &S, key_data: &[u8]) -> Result<String> {
|
||||
let key = EncodingKey::from_ed_pem(key_data)?;
|
||||
Ok(encode(&Header::new(STORAGE_TOKEN_ALGORITHM), claims, &key)?)
|
||||
}
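A hedged usage sketch of the widened signature (the struct and function below are illustrative, not part of the patch, and assume `Result` is this module's `anyhow::Result` alias): any `Serialize` type can now be encoded, not just the fixed `Claims` struct.

#[derive(serde::Serialize)]
struct CustomClaims {
    tenant_id: String,
    exp: u64,
}

fn make_custom_token(key_pem: &[u8]) -> anyhow::Result<String> {
    let claims = CustomClaims {
        tenant_id: "example-tenant".into(),
        exp: u64::MAX,
    };
    // works because the bound is now S: Serialize rather than the concrete Claims type
    encode_from_key_file(&claims, key_pem)
}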
|
||||
|
||||
@@ -81,12 +81,9 @@ pub fn path_with_suffix_extension(
|
||||
}
|
||||
|
||||
pub fn fsync_file_and_parent(file_path: &Utf8Path) -> io::Result<()> {
|
||||
let parent = file_path.parent().ok_or_else(|| {
|
||||
io::Error::new(
|
||||
io::ErrorKind::Other,
|
||||
format!("File {file_path:?} has no parent"),
|
||||
)
|
||||
})?;
|
||||
let parent = file_path
|
||||
.parent()
|
||||
.ok_or_else(|| io::Error::other(format!("File {file_path:?} has no parent")))?;
|
||||
|
||||
fsync(file_path)?;
|
||||
fsync(parent)?;
|
||||
|
||||
@@ -3,7 +3,9 @@ use std::{fs, io, path::Path};
|
||||
|
||||
use anyhow::Context;
|
||||
|
||||
#[cfg(feature = "rename_noreplace")]
|
||||
mod rename_noreplace;
|
||||
#[cfg(feature = "rename_noreplace")]
|
||||
pub use rename_noreplace::rename_noreplace;
|
||||
|
||||
pub trait PathExt {
|
||||
|
||||
@@ -8,7 +8,7 @@ pub fn rename_noreplace<P1: ?Sized + NixPath, P2: ?Sized + NixPath>(
|
||||
dst: &P2,
|
||||
) -> nix::Result<()> {
|
||||
{
|
||||
#[cfg(target_os = "linux")]
|
||||
#[cfg(all(target_os = "linux", target_env = "gnu"))]
|
||||
{
|
||||
nix::fcntl::renameat2(
|
||||
None,
|
||||
@@ -29,7 +29,7 @@ pub fn rename_noreplace<P1: ?Sized + NixPath, P2: ?Sized + NixPath>(
|
||||
})??;
|
||||
nix::errno::Errno::result(res).map(drop)
|
||||
}
|
||||
#[cfg(not(any(target_os = "linux", target_os = "macos")))]
|
||||
#[cfg(not(any(all(target_os = "linux", target_env = "gnu"), target_os = "macos")))]
|
||||
{
|
||||
std::compile_error!("OS does not support no-replace renames");
|
||||
}
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
pub use signal_hook::consts::TERM_SIGNALS;
|
||||
pub use signal_hook::consts::signal::*;
|
||||
use signal_hook::iterator::Signals;
|
||||
use tokio::signal::unix::{SignalKind, signal};
|
||||
use tracing::info;
|
||||
|
||||
pub enum Signal {
|
||||
Quit,
|
||||
@@ -36,3 +38,30 @@ impl ShutdownSignals {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Runs in a loop since we want to be responsive to multiple signals
|
||||
/// even after triggering shutdown (e.g. a SIGQUIT after a slow SIGTERM shutdown)
|
||||
/// <https://github.com/neondatabase/neon/issues/9740>
|
||||
pub async fn signal_handler(token: tokio_util::sync::CancellationToken) {
|
||||
let mut sigint = signal(SignalKind::interrupt()).unwrap();
|
||||
let mut sigterm = signal(SignalKind::terminate()).unwrap();
|
||||
let mut sigquit = signal(SignalKind::quit()).unwrap();
|
||||
|
||||
loop {
|
||||
let signal = tokio::select! {
|
||||
_ = sigquit.recv() => {
|
||||
info!("Got signal SIGQUIT. Terminating in immediate shutdown mode.");
|
||||
std::process::exit(111);
|
||||
}
|
||||
_ = sigint.recv() => "SIGINT",
|
||||
_ = sigterm.recv() => "SIGTERM",
|
||||
};
|
||||
|
||||
if !token.is_cancelled() {
|
||||
info!("Got signal {signal}. Terminating gracefully in fast shutdown mode.");
|
||||
token.cancel();
|
||||
} else {
|
||||
info!("Got signal {signal}. Already shutting down.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
28
object_storage/Cargo.toml
Normal file
@@ -0,0 +1,28 @@
|
||||
[package]
|
||||
name = "object_storage"
|
||||
version = "0.0.1"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
axum-extra.workspace = true
|
||||
axum.workspace = true
|
||||
camino.workspace = true
|
||||
futures.workspace = true
|
||||
jsonwebtoken.workspace = true
|
||||
prometheus.workspace = true
|
||||
remote_storage.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
tokio-util.workspace = true
|
||||
tokio.workspace = true
|
||||
tracing.workspace = true
|
||||
utils = { path = "../libs/utils", default-features = false }
|
||||
workspace_hack.workspace = true
|
||||
[dev-dependencies]
|
||||
camino-tempfile.workspace = true
|
||||
http-body-util.workspace = true
|
||||
itertools.workspace = true
|
||||
rand.workspace = true
|
||||
test-log.workspace = true
|
||||
tower.workspace = true
|
||||
561
object_storage/src/app.rs
Normal file
@@ -0,0 +1,561 @@
|
||||
use anyhow::anyhow;
|
||||
use axum::body::{Body, Bytes};
|
||||
use axum::response::{IntoResponse, Response};
|
||||
use axum::{Router, http::StatusCode};
|
||||
use object_storage::{PrefixS3Path, S3Path, Storage, bad_request, internal_error, not_found, ok};
|
||||
use remote_storage::TimeoutOrCancel;
|
||||
use remote_storage::{DownloadError, DownloadOpts, GenericRemoteStorage, RemotePath};
|
||||
use std::{sync::Arc, time::SystemTime, time::UNIX_EPOCH};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{error, info};
|
||||
use utils::backoff::retry;
|
||||
|
||||
pub fn app(state: Arc<Storage>) -> Router<()> {
|
||||
use axum::routing::{delete as _delete, get as _get};
|
||||
let delete_prefix = _delete(delete_prefix);
|
||||
Router::new()
|
||||
.route(
|
||||
"/{tenant_id}/{timeline_id}/{endpoint_id}/{*path}",
|
||||
_get(get).put(set).delete(delete),
|
||||
)
|
||||
.route(
|
||||
"/{tenant_id}/{timeline_id}/{endpoint_id}",
|
||||
delete_prefix.clone(),
|
||||
)
|
||||
.route("/{tenant_id}/{timeline_id}", delete_prefix.clone())
|
||||
.route("/{tenant_id}", delete_prefix)
|
||||
.route("/metrics", _get(metrics))
|
||||
.route("/status", _get(async || StatusCode::OK.into_response()))
|
||||
.with_state(state)
|
||||
}
|
||||
|
||||
type Result = anyhow::Result<Response, Response>;
|
||||
type State = axum::extract::State<Arc<Storage>>;
|
||||
|
||||
const CONTENT_TYPE: &str = "content-type";
|
||||
const APPLICATION_OCTET_STREAM: &str = "application/octet-stream";
|
||||
const WARN_THRESHOLD: u32 = 3;
|
||||
const MAX_RETRIES: u32 = 10;
|
||||
|
||||
async fn metrics() -> Result {
|
||||
prometheus::TextEncoder::new()
|
||||
.encode_to_string(&prometheus::gather())
|
||||
.map(|s| s.into_response())
|
||||
.map_err(|e| internal_error(e, "/metrics", "collecting metrics"))
|
||||
}
|
||||
|
||||
async fn get(S3Path { path }: S3Path, state: State) -> Result {
|
||||
info!(%path, "downloading");
|
||||
let download_err = |e| {
|
||||
if let DownloadError::NotFound = e {
|
||||
info!(%path, %e, "downloading"); // 404 is not an issue of _this_ service
|
||||
return not_found(&path);
|
||||
}
|
||||
internal_error(e, &path, "downloading")
|
||||
};
|
||||
let cancel = state.cancel.clone();
|
||||
let opts = &DownloadOpts::default();
|
||||
|
||||
let stream = retry(
|
||||
async || state.storage.download(&path, opts, &cancel).await,
|
||||
DownloadError::is_permanent,
|
||||
WARN_THRESHOLD,
|
||||
MAX_RETRIES,
|
||||
"downloading",
|
||||
&cancel,
|
||||
)
|
||||
.await
|
||||
.unwrap_or(Err(DownloadError::Cancelled))
|
||||
.map_err(download_err)?
|
||||
.download_stream;
|
||||
|
||||
Response::builder()
|
||||
.status(StatusCode::OK)
|
||||
.header(CONTENT_TYPE, APPLICATION_OCTET_STREAM)
|
||||
.body(Body::from_stream(stream))
|
||||
.map_err(|e| internal_error(e, path, "reading response"))
|
||||
}
|
||||
|
||||
// Best solution for files is multipart upload, but remote_storage doesn't support it,
|
||||
// so we can either read Bytes in memory and push at once or forward BodyDataStream to
|
||||
// remote_storage. The latter may seem more performant, but BodyDataStream doesn't have a
|
||||
// guaranteed size() which may produce issues while uploading to s3.
|
||||
// So, currently we're going with an in-memory copy plus a size limit to prevent uploading
|
||||
// very large files.
|
||||
async fn set(S3Path { path }: S3Path, state: State, bytes: Bytes) -> Result {
|
||||
info!(%path, "uploading");
|
||||
let request_len = bytes.len();
|
||||
let max_len = state.max_upload_file_limit;
|
||||
if request_len > max_len {
|
||||
return Err(bad_request(
|
||||
anyhow!("File size {request_len} exceeds max {max_len}"),
|
||||
"uploading",
|
||||
));
|
||||
}
|
||||
|
||||
let cancel = state.cancel.clone();
|
||||
let fun = async || {
|
||||
let stream = bytes_to_stream(bytes.clone());
|
||||
state
|
||||
.storage
|
||||
.upload(stream, request_len, &path, None, &cancel)
|
||||
.await
|
||||
};
|
||||
retry(
|
||||
fun,
|
||||
TimeoutOrCancel::caused_by_cancel,
|
||||
WARN_THRESHOLD,
|
||||
MAX_RETRIES,
|
||||
"uploading",
|
||||
&cancel,
|
||||
)
|
||||
.await
|
||||
.unwrap_or(Err(anyhow!("uploading cancelled")))
|
||||
.map_err(|e| internal_error(e, path, "reading response"))?;
|
||||
Ok(ok())
|
||||
}
|
||||
|
||||
async fn delete(S3Path { path }: S3Path, state: State) -> Result {
|
||||
info!(%path, "deleting");
|
||||
let cancel = state.cancel.clone();
|
||||
retry(
|
||||
async || state.storage.delete(&path, &cancel).await,
|
||||
TimeoutOrCancel::caused_by_cancel,
|
||||
WARN_THRESHOLD,
|
||||
MAX_RETRIES,
|
||||
"deleting",
|
||||
&cancel,
|
||||
)
|
||||
.await
|
||||
.unwrap_or(Err(anyhow!("deleting cancelled")))
|
||||
.map_err(|e| internal_error(e, path, "deleting"))?;
|
||||
Ok(ok())
|
||||
}
|
||||
|
||||
async fn delete_prefix(PrefixS3Path { path }: PrefixS3Path, state: State) -> Result {
|
||||
info!(%path, "deleting prefix");
|
||||
let cancel = state.cancel.clone();
|
||||
retry(
|
||||
async || state.storage.delete_prefix(&path, &cancel).await,
|
||||
TimeoutOrCancel::caused_by_cancel,
|
||||
WARN_THRESHOLD,
|
||||
MAX_RETRIES,
|
||||
"deleting prefix",
|
||||
&cancel,
|
||||
)
|
||||
.await
|
||||
.unwrap_or(Err(anyhow!("deleting prefix cancelled")))
|
||||
.map_err(|e| internal_error(e, path, "deleting prefix"))?;
|
||||
Ok(ok())
|
||||
}
|
||||
|
||||
pub async fn check_storage_permissions(
|
||||
client: &GenericRemoteStorage,
|
||||
cancel: CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
info!("storage permissions check");
|
||||
|
||||
// use as_nanos() since multiple instances proxying the same bucket may be started at once
|
||||
let now = SystemTime::now()
|
||||
.duration_since(UNIX_EPOCH)?
|
||||
.as_nanos()
|
||||
.to_string();
|
||||
|
||||
let path = RemotePath::from_string(&format!("write_access_{now}"))?;
|
||||
info!(%path, "uploading");
|
||||
|
||||
let body = now.to_string();
|
||||
let stream = bytes_to_stream(Bytes::from(body.clone()));
|
||||
client
|
||||
.upload(stream, body.len(), &path, None, &cancel)
|
||||
.await?;
|
||||
|
||||
use tokio::io::AsyncReadExt;
|
||||
info!(%path, "downloading");
|
||||
let download_opts = DownloadOpts {
|
||||
kind: remote_storage::DownloadKind::Small,
|
||||
..Default::default()
|
||||
};
|
||||
let mut body_read_buf = Vec::new();
|
||||
let stream = client
|
||||
.download(&path, &download_opts, &cancel)
|
||||
.await?
|
||||
.download_stream;
|
||||
tokio_util::io::StreamReader::new(stream)
|
||||
.read_to_end(&mut body_read_buf)
|
||||
.await?;
|
||||
let body_read = String::from_utf8(body_read_buf)?;
|
||||
if body != body_read {
|
||||
error!(%body, %body_read, "File contents do not match");
|
||||
anyhow::bail!("Read back file doesn't match original")
|
||||
}
|
||||
|
||||
info!(%path, "removing");
|
||||
client.delete(&path, &cancel).await
|
||||
}
|
||||
|
||||
fn bytes_to_stream(bytes: Bytes) -> impl futures::Stream<Item = std::io::Result<Bytes>> {
|
||||
futures::stream::once(futures::future::ready(Ok(bytes)))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use axum::{body::Body, extract::Request, response::Response};
|
||||
use http_body_util::BodyExt;
|
||||
use itertools::iproduct;
|
||||
use std::env::var;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use test_log::test as testlog;
|
||||
use tower::{Service, util::ServiceExt};
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
// see libs/remote_storage/tests/test_real_s3.rs
|
||||
const REAL_S3_ENV: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";
|
||||
const REAL_S3_BUCKET: &str = "REMOTE_STORAGE_S3_BUCKET";
|
||||
const REAL_S3_REGION: &str = "REMOTE_STORAGE_S3_REGION";
|
||||
|
||||
async fn proxy() -> (Storage, Option<camino_tempfile::Utf8TempDir>) {
|
||||
let cancel = CancellationToken::new();
|
||||
let (dir, storage) = if var(REAL_S3_ENV).is_err() {
|
||||
// tests execute in parallel and we need a new directory for each of them
|
||||
let dir = camino_tempfile::tempdir().unwrap();
|
||||
let fs =
|
||||
remote_storage::LocalFs::new(dir.path().into(), Duration::from_secs(5)).unwrap();
|
||||
(Some(dir), GenericRemoteStorage::LocalFs(fs))
|
||||
} else {
|
||||
// test_real_s3::create_s3_client is hard to reference, reimplementing here
|
||||
let millis = SystemTime::now()
|
||||
.duration_since(UNIX_EPOCH)
|
||||
.unwrap()
|
||||
.as_millis();
|
||||
use rand::Rng;
|
||||
let random = rand::thread_rng().r#gen::<u32>();
|
||||
|
||||
let s3_config = remote_storage::S3Config {
|
||||
bucket_name: var(REAL_S3_BUCKET).unwrap(),
|
||||
bucket_region: var(REAL_S3_REGION).unwrap(),
|
||||
prefix_in_bucket: Some(format!("test_{millis}_{random:08x}/")),
|
||||
endpoint: None,
|
||||
concurrency_limit: std::num::NonZeroUsize::new(100).unwrap(),
|
||||
max_keys_per_list_response: None,
|
||||
upload_storage_class: None,
|
||||
};
|
||||
let bucket = remote_storage::S3Bucket::new(&s3_config, Duration::from_secs(1))
|
||||
.await
|
||||
.unwrap();
|
||||
(None, GenericRemoteStorage::AwsS3(Arc::new(bucket)))
|
||||
};
|
||||
|
||||
let proxy = Storage {
|
||||
auth: object_storage::JwtAuth::new(TEST_PUB_KEY_ED25519).unwrap(),
|
||||
storage,
|
||||
cancel: cancel.clone(),
|
||||
max_upload_file_limit: usize::MAX,
|
||||
};
|
||||
check_storage_permissions(&proxy.storage, cancel)
|
||||
.await
|
||||
.unwrap();
|
||||
(proxy, dir)
|
||||
}
|
||||
|
||||
// see libs/utils/src/auth.rs
|
||||
const TEST_PUB_KEY_ED25519: &[u8] = b"
|
||||
-----BEGIN PUBLIC KEY-----
|
||||
MCowBQYDK2VwAyEARYwaNBayR+eGI0iXB4s3QxE3Nl2g1iWbr6KtLWeVD/w=
|
||||
-----END PUBLIC KEY-----
|
||||
";
|
||||
|
||||
const TEST_PRIV_KEY_ED25519: &[u8] = br#"
|
||||
-----BEGIN PRIVATE KEY-----
|
||||
MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
|
||||
-----END PRIVATE KEY-----
|
||||
"#;
|
||||
|
||||
async fn request(req: Request<Body>) -> Response<Body> {
|
||||
let (proxy, _) = proxy().await;
|
||||
app(Arc::new(proxy))
|
||||
.into_service()
|
||||
.oneshot(req)
|
||||
.await
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
#[testlog(tokio::test)]
|
||||
async fn status() {
|
||||
let res = Request::builder()
|
||||
.uri("/status")
|
||||
.body(Body::empty())
|
||||
.map(request)
|
||||
.unwrap()
|
||||
.await;
|
||||
assert_eq!(res.status(), StatusCode::OK);
|
||||
}
|
||||
|
||||
fn routes() -> impl Iterator<Item = (&'static str, &'static str)> {
|
||||
iproduct!(
|
||||
vec!["/1", "/1/2", "/1/2/3", "/1/2/3/4"],
|
||||
vec!["GET", "PUT", "DELETE"]
|
||||
)
|
||||
}
|
||||
|
||||
#[testlog(tokio::test)]
|
||||
async fn no_token() {
|
||||
for (uri, method) in routes() {
|
||||
info!(%uri, %method);
|
||||
let res = Request::builder()
|
||||
.uri(uri)
|
||||
.method(method)
|
||||
.body(Body::empty())
|
||||
.map(request)
|
||||
.unwrap()
|
||||
.await;
|
||||
assert!(matches!(
|
||||
res.status(),
|
||||
StatusCode::METHOD_NOT_ALLOWED | StatusCode::BAD_REQUEST
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
#[testlog(tokio::test)]
|
||||
async fn invalid_token() {
|
||||
for (uri, method) in routes() {
|
||||
info!(%uri, %method);
|
||||
let status = Request::builder()
|
||||
.uri(uri)
|
||||
.header("Authorization", "Bearer 123")
|
||||
.method(method)
|
||||
.body(Body::empty())
|
||||
.map(request)
|
||||
.unwrap()
|
||||
.await;
|
||||
assert!(matches!(
|
||||
status.status(),
|
||||
StatusCode::METHOD_NOT_ALLOWED | StatusCode::BAD_REQUEST
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
const TENANT_ID: TenantId =
|
||||
TenantId::from_array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6]);
|
||||
const TIMELINE_ID: TimelineId =
|
||||
TimelineId::from_array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 7]);
|
||||
const ENDPOINT_ID: &str = "ep-winter-frost-a662z3vg";
|
||||
fn token() -> String {
|
||||
let claims = object_storage::Claims {
|
||||
tenant_id: TENANT_ID,
|
||||
timeline_id: TIMELINE_ID,
|
||||
endpoint_id: ENDPOINT_ID.into(),
|
||||
exp: u64::MAX,
|
||||
};
|
||||
let key = jsonwebtoken::EncodingKey::from_ed_pem(TEST_PRIV_KEY_ED25519).unwrap();
|
||||
let header = jsonwebtoken::Header::new(object_storage::VALIDATION_ALGO);
|
||||
jsonwebtoken::encode(&header, &claims, &key).unwrap()
|
||||
}
|
||||
|
||||
#[testlog(tokio::test)]
|
||||
async fn unauthorized() {
|
||||
let (proxy, _) = proxy().await;
|
||||
let mut app = app(Arc::new(proxy)).into_service();
|
||||
let token = token();
|
||||
let args = itertools::iproduct!(
|
||||
vec![TENANT_ID.to_string(), TenantId::generate().to_string()],
|
||||
vec![TIMELINE_ID.to_string(), TimelineId::generate().to_string()],
|
||||
vec![ENDPOINT_ID, "ep-ololo"]
|
||||
)
|
||||
.skip(1);
|
||||
|
||||
for ((uri, method), (tenant, timeline, endpoint)) in iproduct!(routes(), args) {
|
||||
info!(%uri, %method, %tenant, %timeline, %endpoint);
|
||||
let request = Request::builder()
|
||||
.uri(format!("/{tenant}/{timeline}/{endpoint}/sub/path/key"))
|
||||
.method(method)
|
||||
.header("Authorization", format!("Bearer {}", token))
|
||||
.body(Body::empty())
|
||||
.unwrap();
|
||||
let status = ServiceExt::ready(&mut app)
|
||||
.await
|
||||
.unwrap()
|
||||
.call(request)
|
||||
.await
|
||||
.unwrap()
|
||||
.status();
|
||||
assert_eq!(status, StatusCode::UNAUTHORIZED);
|
||||
}
|
||||
}
|
||||
|
||||
#[testlog(tokio::test)]
|
||||
async fn method_not_allowed() {
|
||||
let token = token();
|
||||
let iter = iproduct!(vec!["", "/.."], vec!["GET", "PUT"]);
|
||||
for (key, method) in iter {
|
||||
let status = Request::builder()
|
||||
.uri(format!("/{TENANT_ID}/{TIMELINE_ID}/{ENDPOINT_ID}{key}"))
|
||||
.method(method)
|
||||
.header("Authorization", format!("Bearer {token}"))
|
||||
.body(Body::empty())
|
||||
.map(request)
|
||||
.unwrap()
|
||||
.await
|
||||
.status();
|
||||
assert!(matches!(
|
||||
status,
|
||||
StatusCode::BAD_REQUEST | StatusCode::METHOD_NOT_ALLOWED
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
async fn requests_chain(
|
||||
chain: impl Iterator<Item = (String, &str, &'static str, StatusCode, bool)>,
|
||||
token: impl Fn(&str) -> String,
|
||||
) {
|
||||
let (proxy, _) = proxy().await;
|
||||
let mut app = app(Arc::new(proxy)).into_service();
|
||||
for (uri, method, body, expected_status, compare_body) in chain {
|
||||
info!(%uri, %method, %body, %expected_status);
|
||||
let bearer = format!("Bearer {}", token(&uri));
|
||||
let request = Request::builder()
|
||||
.uri(uri)
|
||||
.method(method)
|
||||
.header("Authorization", &bearer)
|
||||
.body(Body::from(body))
|
||||
.unwrap();
|
||||
let response = ServiceExt::ready(&mut app)
|
||||
.await
|
||||
.unwrap()
|
||||
.call(request)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(response.status(), expected_status);
|
||||
if !compare_body {
|
||||
continue;
|
||||
}
|
||||
let read_body = response.into_body().collect().await.unwrap().to_bytes();
|
||||
assert_eq!(body, read_body);
|
||||
}
|
||||
}
|
||||
|
||||
#[testlog(tokio::test)]
|
||||
async fn metrics() {
|
||||
let uri = format!("/{TENANT_ID}/{TIMELINE_ID}/{ENDPOINT_ID}/key");
|
||||
let req = vec![
|
||||
(uri.clone(), "PUT", "body", StatusCode::OK, false),
|
||||
(uri.clone(), "DELETE", "", StatusCode::OK, false),
|
||||
];
|
||||
requests_chain(req.into_iter(), |_| token()).await;
|
||||
|
||||
let res = Request::builder()
|
||||
.uri("/metrics")
|
||||
.body(Body::empty())
|
||||
.map(request)
|
||||
.unwrap()
|
||||
.await;
|
||||
assert_eq!(res.status(), StatusCode::OK);
|
||||
let body = res.into_body().collect().await.unwrap().to_bytes();
|
||||
let body = String::from_utf8_lossy(&body);
|
||||
tracing::debug!(%body);
|
||||
// Storage metrics are not gathered for LocalFs
|
||||
if var(REAL_S3_ENV).is_ok() {
|
||||
assert!(body.contains("remote_storage_s3_deleted_objects_total"));
|
||||
}
|
||||
assert!(body.contains("process_threads"));
|
||||
}
|
||||
|
||||
#[testlog(tokio::test)]
|
||||
async fn insert_retrieve_remove() {
|
||||
let uri = format!("/{TENANT_ID}/{TIMELINE_ID}/{ENDPOINT_ID}/key");
|
||||
let chain = vec![
|
||||
(uri.clone(), "GET", "", StatusCode::NOT_FOUND, false),
|
||||
(uri.clone(), "PUT", "пыщьпыщь", StatusCode::OK, false),
|
||||
(uri.clone(), "GET", "пыщьпыщь", StatusCode::OK, true),
|
||||
(uri.clone(), "DELETE", "", StatusCode::OK, false),
|
||||
(uri, "GET", "", StatusCode::NOT_FOUND, false),
|
||||
];
|
||||
requests_chain(chain.into_iter(), |_| token()).await;
|
||||
}
|
||||
|
||||
fn delete_prefix_token(uri: &str) -> String {
|
||||
use serde::Serialize;
|
||||
let parts = uri.split("/").collect::<Vec<&str>>();
|
||||
#[derive(Serialize)]
|
||||
struct PrefixClaims {
|
||||
tenant_id: TenantId,
|
||||
timeline_id: Option<TimelineId>,
|
||||
endpoint_id: Option<object_storage::EndpointId>,
|
||||
exp: u64,
|
||||
}
|
||||
let claims = PrefixClaims {
|
||||
tenant_id: parts.get(1).map(|c| c.parse().unwrap()).unwrap(),
|
||||
timeline_id: parts.get(2).map(|c| c.parse().unwrap()),
|
||||
endpoint_id: parts.get(3).map(ToString::to_string),
|
||||
exp: u64::MAX,
|
||||
};
|
||||
let key = jsonwebtoken::EncodingKey::from_ed_pem(TEST_PRIV_KEY_ED25519).unwrap();
|
||||
let header = jsonwebtoken::Header::new(object_storage::VALIDATION_ALGO);
|
||||
jsonwebtoken::encode(&header, &claims, &key).unwrap()
|
||||
}
|
||||
|
||||
// Can't use single digit numbers as they won't be validated as TimelineId and EndpointId
|
||||
#[testlog(tokio::test)]
|
||||
async fn delete_prefix() {
|
||||
let tenant_id =
|
||||
TenantId::from_array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]).to_string();
|
||||
let t2 = TimelineId::from_array([2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
|
||||
let t3 = TimelineId::from_array([3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
|
||||
let t4 = TimelineId::from_array([4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
|
||||
let f = |timeline, path| format!("/{tenant_id}/{timeline}{path}");
|
||||
// Why extra slash in string literals? Axum is weird with URIs:
|
||||
// /1/2 and 1/2/ match different routes, thus the first yields OK and the second NOT_FOUND
|
||||
// as it matches /tenant/timeline/endpoint, see https://stackoverflow.com/a/75355932
|
||||
// Removing the trailing slash is surprisingly hard:
|
||||
// * Add tower dependency with NormalizePath layer
|
||||
// * wrap Router<()> in this layer https://github.com/tokio-rs/axum/discussions/2377
|
||||
// * Rewrite make_service() -> into_make_service()
|
||||
// * Rewrite oneshot() (not available for NormalizePath)
|
||||
// I didn't manage to get it working correctly
|
||||
let chain = vec![
|
||||
// create 1/2/3/4, 1/2/3/5, delete prefix 1/2/3 -> empty
|
||||
(f(t2, "/3/4"), "PUT", "", StatusCode::OK, false),
|
||||
(f(t2, "/3/4"), "PUT", "", StatusCode::OK, false), // we can override file contents
|
||||
(f(t2, "/3/5"), "PUT", "", StatusCode::OK, false),
|
||||
(f(t2, "/3"), "DELETE", "", StatusCode::OK, false),
|
||||
(f(t2, "/3/4"), "GET", "", StatusCode::NOT_FOUND, false),
|
||||
(f(t2, "/3/5"), "GET", "", StatusCode::NOT_FOUND, false),
|
||||
// create 1/2/3/4, 1/2/5/6, delete prefix 1/2/3 -> 1/2/5/6
|
||||
(f(t2, "/3/4"), "PUT", "", StatusCode::OK, false),
|
||||
(f(t2, "/5/6"), "PUT", "", StatusCode::OK, false),
|
||||
(f(t2, "/3"), "DELETE", "", StatusCode::OK, false),
|
||||
(f(t2, "/3/4"), "GET", "", StatusCode::NOT_FOUND, false),
|
||||
(f(t2, "/5/6"), "GET", "", StatusCode::OK, false),
|
||||
// create 1/2/3/4, 1/2/7/8, delete prefix 1/2 -> empty
|
||||
(f(t2, "/3/4"), "PUT", "", StatusCode::OK, false),
|
||||
(f(t2, "/7/8"), "PUT", "", StatusCode::OK, false),
|
||||
(f(t2, ""), "DELETE", "", StatusCode::OK, false),
|
||||
(f(t2, "/3/4"), "GET", "", StatusCode::NOT_FOUND, false),
|
||||
(f(t2, "/7/8"), "GET", "", StatusCode::NOT_FOUND, false),
|
||||
// create 1/2/3/4, 1/2/5/6, 1/3/8/9, delete prefix 1/2/3 -> 1/2/5/6, 1/3/8/9
|
||||
(f(t2, "/3/4"), "PUT", "", StatusCode::OK, false),
|
||||
(f(t2, "/5/6"), "PUT", "", StatusCode::OK, false),
|
||||
(f(t3, "/8/9"), "PUT", "", StatusCode::OK, false),
|
||||
(f(t2, "/3"), "DELETE", "", StatusCode::OK, false),
|
||||
(f(t2, "/3/4"), "GET", "", StatusCode::NOT_FOUND, false),
|
||||
(f(t2, "/5/6"), "GET", "", StatusCode::OK, false),
|
||||
(f(t3, "/8/9"), "GET", "", StatusCode::OK, false),
|
||||
// create 1/4/5/6, delete prefix 1/2 -> 1/3/8/9, 1/4/5/6
|
||||
(f(t4, "/5/6"), "PUT", "", StatusCode::OK, false),
|
||||
(f(t2, ""), "DELETE", "", StatusCode::OK, false),
|
||||
(f(t2, "/3/4"), "GET", "", StatusCode::NOT_FOUND, false),
|
||||
(f(t2, "/5/6"), "GET", "", StatusCode::NOT_FOUND, false),
|
||||
(f(t3, "/8/9"), "GET", "", StatusCode::OK, false),
|
||||
(f(t4, "/5/6"), "GET", "", StatusCode::OK, false),
|
||||
// delete prefix 1 -> empty
|
||||
(format!("/{tenant_id}"), "DELETE", "", StatusCode::OK, false),
|
||||
(f(t2, "/3/4"), "GET", "", StatusCode::NOT_FOUND, false),
|
||||
(f(t2, "/5/6"), "GET", "", StatusCode::NOT_FOUND, false),
|
||||
(f(t3, "/8/9"), "GET", "", StatusCode::NOT_FOUND, false),
|
||||
(f(t4, "/5/6"), "GET", "", StatusCode::NOT_FOUND, false),
|
||||
];
|
||||
requests_chain(chain.into_iter(), delete_prefix_token).await;
|
||||
}
|
||||
}
|
||||
344
object_storage/src/lib.rs
Normal file
@@ -0,0 +1,344 @@
|
||||
use anyhow::Result;
|
||||
use axum::extract::{FromRequestParts, Path};
|
||||
use axum::response::{IntoResponse, Response};
|
||||
use axum::{RequestPartsExt, http::StatusCode, http::request::Parts};
|
||||
use axum_extra::TypedHeader;
|
||||
use axum_extra::headers::{Authorization, authorization::Bearer};
|
||||
use camino::Utf8PathBuf;
|
||||
use jsonwebtoken::{DecodingKey, Validation};
|
||||
use remote_storage::{GenericRemoteStorage, RemotePath};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::fmt::Display;
|
||||
use std::result::Result as StdResult;
|
||||
use std::sync::Arc;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, error};
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
// simplified version of utils::auth::JwtAuth
|
||||
pub struct JwtAuth {
|
||||
decoding_key: DecodingKey,
|
||||
validation: Validation,
|
||||
}
|
||||
|
||||
pub const VALIDATION_ALGO: jsonwebtoken::Algorithm = jsonwebtoken::Algorithm::EdDSA;
|
||||
impl JwtAuth {
|
||||
pub fn new(key: &[u8]) -> Result<Self> {
|
||||
Ok(Self {
|
||||
decoding_key: DecodingKey::from_ed_pem(key)?,
|
||||
validation: Validation::new(VALIDATION_ALGO),
|
||||
})
|
||||
}
|
||||
|
||||
pub fn decode<T: serde::de::DeserializeOwned>(&self, token: &str) -> Result<T> {
|
||||
Ok(jsonwebtoken::decode(token, &self.decoding_key, &self.validation).map(|t| t.claims)?)
|
||||
}
|
||||
}
|
||||
|
||||
fn normalize_key(key: &str) -> StdResult<Utf8PathBuf, String> {
|
||||
let key = clean_utf8(&Utf8PathBuf::from(key));
|
||||
if key.starts_with("..") || key == "." || key == "/" {
|
||||
return Err(format!("invalid key {key}"));
|
||||
}
|
||||
match key.strip_prefix("/").map(Utf8PathBuf::from) {
|
||||
Ok(p) => Ok(p),
|
||||
_ => Ok(key),
|
||||
}
|
||||
}
|
||||
|
||||
// Copied from path_clean crate with PathBuf->Utf8PathBuf
|
||||
fn clean_utf8(path: &camino::Utf8Path) -> Utf8PathBuf {
|
||||
use camino::Utf8Component as Comp;
|
||||
let mut out = Vec::new();
|
||||
for comp in path.components() {
|
||||
match comp {
|
||||
Comp::CurDir => (),
|
||||
Comp::ParentDir => match out.last() {
|
||||
Some(Comp::RootDir) => (),
|
||||
Some(Comp::Normal(_)) => {
|
||||
out.pop();
|
||||
}
|
||||
None | Some(Comp::CurDir) | Some(Comp::ParentDir) | Some(Comp::Prefix(_)) => {
|
||||
out.push(comp)
|
||||
}
|
||||
},
|
||||
comp => out.push(comp),
|
||||
}
|
||||
}
|
||||
if !out.is_empty() {
|
||||
out.iter().collect()
|
||||
} else {
|
||||
Utf8PathBuf::from(".")
|
||||
}
|
||||
}
|
||||
|
||||
pub struct Storage {
|
||||
pub auth: JwtAuth,
|
||||
pub storage: GenericRemoteStorage,
|
||||
pub cancel: CancellationToken,
|
||||
pub max_upload_file_limit: usize,
|
||||
}
|
||||
|
||||
pub type EndpointId = String; // If needed, reuse small string from proxy/src/types.rs
|
||||
|
||||
#[derive(Deserialize, Serialize, PartialEq)]
|
||||
pub struct Claims {
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: TimelineId,
|
||||
pub endpoint_id: EndpointId,
|
||||
pub exp: u64,
|
||||
}
|
||||
|
||||
impl Display for Claims {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"Claims(tenant_id {} timeline_id {} endpoint_id {} exp {})",
|
||||
self.tenant_id, self.timeline_id, self.endpoint_id, self.exp
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Serialize)]
|
||||
struct KeyRequest {
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
endpoint_id: EndpointId,
|
||||
path: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq)]
|
||||
pub struct S3Path {
|
||||
pub path: RemotePath,
|
||||
}
|
||||
|
||||
impl TryFrom<&KeyRequest> for S3Path {
|
||||
type Error = String;
|
||||
fn try_from(req: &KeyRequest) -> StdResult<Self, Self::Error> {
|
||||
let KeyRequest {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
endpoint_id,
|
||||
path,
|
||||
} = &req;
|
||||
let prefix = format!("{tenant_id}/{timeline_id}/{endpoint_id}",);
|
||||
let path = Utf8PathBuf::from(prefix).join(normalize_key(path)?);
|
||||
let path = RemotePath::new(&path).unwrap(); // unwrap() because the path is already relative
|
||||
Ok(S3Path { path })
|
||||
}
|
||||
}
|
||||
|
||||
fn unauthorized(route: impl Display, claims: impl Display) -> Response {
|
||||
debug!(%route, %claims, "route doesn't match claims");
|
||||
StatusCode::UNAUTHORIZED.into_response()
|
||||
}
|
||||
pub fn bad_request(err: impl Display, desc: &'static str) -> Response {
    debug!(%err, desc);
    (StatusCode::BAD_REQUEST, err.to_string()).into_response()
}

pub fn ok() -> Response {
    StatusCode::OK.into_response()
}

pub fn internal_error(err: impl Display, path: impl Display, desc: &'static str) -> Response {
    error!(%err, %path, desc);
    StatusCode::INTERNAL_SERVER_ERROR.into_response()
}

pub fn not_found(key: impl ToString) -> Response {
    (StatusCode::NOT_FOUND, key.to_string()).into_response()
}

impl FromRequestParts<Arc<Storage>> for S3Path {
    type Rejection = Response;
    async fn from_request_parts(
        parts: &mut Parts,
        state: &Arc<Storage>,
    ) -> Result<Self, Self::Rejection> {
        let Path(path): Path<KeyRequest> = parts
            .extract()
            .await
            .map_err(|e| bad_request(e, "invalid route"))?;
        let TypedHeader(Authorization(bearer)) = parts
            .extract::<TypedHeader<Authorization<Bearer>>>()
            .await
            .map_err(|e| bad_request(e, "invalid token"))?;
        let claims: Claims = state
            .auth
            .decode(bearer.token())
            .map_err(|e| bad_request(e, "decoding token"))?;
        let route = Claims {
            tenant_id: path.tenant_id,
            timeline_id: path.timeline_id,
            endpoint_id: path.endpoint_id.clone(),
            exp: claims.exp,
        };
        if route != claims {
            return Err(unauthorized(route, claims));
        }
        (&path)
            .try_into()
            .map_err(|e| bad_request(e, "invalid route"))
    }
}

#[derive(Deserialize, Serialize, PartialEq)]
pub struct PrefixKeyPath {
    pub tenant_id: TenantId,
    pub timeline_id: Option<TimelineId>,
    pub endpoint_id: Option<EndpointId>,
}

impl Display for PrefixKeyPath {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
            f,
            "PrefixKeyPath(tenant_id {} timeline_id {} endpoint_id {})",
            self.tenant_id,
            self.timeline_id
                .as_ref()
                .map(ToString::to_string)
                .unwrap_or("".to_string()),
            self.endpoint_id
                .as_ref()
                .map(ToString::to_string)
                .unwrap_or("".to_string())
        )
    }
}

#[derive(Debug, PartialEq)]
pub struct PrefixS3Path {
    pub path: RemotePath,
}

impl From<&PrefixKeyPath> for PrefixS3Path {
    fn from(path: &PrefixKeyPath) -> Self {
        let timeline_id = path
            .timeline_id
            .as_ref()
            .map(ToString::to_string)
            .unwrap_or("".to_string());
        let endpoint_id = path
            .endpoint_id
            .as_ref()
            .map(ToString::to_string)
            .unwrap_or("".to_string());
        let path = Utf8PathBuf::from(path.tenant_id.to_string())
            .join(timeline_id)
            .join(endpoint_id);
        let path = RemotePath::new(&path).unwrap(); // unwrap() because the path is already relative
        PrefixS3Path { path }
    }
}

impl FromRequestParts<Arc<Storage>> for PrefixS3Path {
    type Rejection = Response;
    async fn from_request_parts(
        parts: &mut Parts,
        state: &Arc<Storage>,
    ) -> Result<Self, Self::Rejection> {
        let Path(path) = parts
            .extract::<Path<PrefixKeyPath>>()
            .await
            .map_err(|e| bad_request(e, "invalid route"))?;
        let TypedHeader(Authorization(bearer)) = parts
            .extract::<TypedHeader<Authorization<Bearer>>>()
            .await
            .map_err(|e| bad_request(e, "invalid token"))?;
        let claims: PrefixKeyPath = state
            .auth
            .decode(bearer.token())
            .map_err(|e| bad_request(e, "invalid token"))?;
        if path != claims {
            return Err(unauthorized(path, claims));
        }
        Ok((&path).into())
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn normalize_key() {
        let f = super::normalize_key;
        assert_eq!(f("hello/world/..").unwrap(), Utf8PathBuf::from("hello"));
        assert_eq!(
            f("ololo/1/../../not_ololo").unwrap(),
            Utf8PathBuf::from("not_ololo")
        );
        assert!(f("ololo/1/../../../").is_err());
        assert!(f(".").is_err());
        assert!(f("../").is_err());
        assert!(f("").is_err());
        assert_eq!(f("/1/2/3").unwrap(), Utf8PathBuf::from("1/2/3"));
        assert!(f("/1/2/3/../../../").is_err());
        assert!(f("/1/2/3/../../../../").is_err());
    }

    const TENANT_ID: TenantId =
        TenantId::from_array([1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6]);
    const TIMELINE_ID: TimelineId =
        TimelineId::from_array([1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 7]);
    const ENDPOINT_ID: &str = "ep-winter-frost-a662z3vg";

    #[test]
    fn s3_path() {
        let auth = Claims {
            tenant_id: TENANT_ID,
            timeline_id: TIMELINE_ID,
            endpoint_id: ENDPOINT_ID.into(),
            exp: u64::MAX,
        };
        let s3_path = |key| {
            let path = &format!("{TENANT_ID}/{TIMELINE_ID}/{ENDPOINT_ID}/{key}");
            let path = RemotePath::from_string(path).unwrap();
            S3Path { path }
        };

        let path = "cache_key".to_string();
        let mut key_path = KeyRequest {
            path,
            tenant_id: auth.tenant_id,
            timeline_id: auth.timeline_id,
            endpoint_id: auth.endpoint_id,
        };
        assert_eq!(S3Path::try_from(&key_path).unwrap(), s3_path(key_path.path));

        key_path.path = "we/can/have/nested/paths".to_string();
        assert_eq!(S3Path::try_from(&key_path).unwrap(), s3_path(key_path.path));

        key_path.path = "../error/hello/../".to_string();
        assert!(S3Path::try_from(&key_path).is_err());
    }

    #[test]
    fn prefix_s3_path() {
        let mut path = PrefixKeyPath {
            tenant_id: TENANT_ID,
            timeline_id: None,
            endpoint_id: None,
        };
        let prefix_path = |s: String| RemotePath::from_string(&s).unwrap();
        assert_eq!(
            PrefixS3Path::from(&path).path,
            prefix_path(format!("{TENANT_ID}"))
        );

        path.timeline_id = Some(TIMELINE_ID);
        assert_eq!(
            PrefixS3Path::from(&path).path,
            prefix_path(format!("{TENANT_ID}/{TIMELINE_ID}"))
        );

        path.endpoint_id = Some(ENDPOINT_ID.into());
        assert_eq!(
            PrefixS3Path::from(&path).path,
            prefix_path(format!("{TENANT_ID}/{TIMELINE_ID}/{ENDPOINT_ID}"))
        );
    }
}
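For orientation, the bearer-token claims that the `S3Path` extractor compares against the route carry the same fields as the `Claims` struct used above. The sketch below builds such a payload from the test constants; the hex encoding of the tenant/timeline IDs and the exact wire format are assumptions, not something this diff shows.

// Hypothetical claims payload; field names follow the `Claims` struct above.
fn example_claims() -> serde_json::Value {
    serde_json::json!({
        "tenant_id": "01010203040506070809010203040506",   // assumed hex encoding of TENANT_ID
        "timeline_id": "01010203040506070809010203040507", // assumed hex encoding of TIMELINE_ID
        "endpoint_id": "ep-winter-frost-a662z3vg",
        "exp": u64::MAX, // far-future expiry, as in the tests
    })
}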

object_storage/src/main.rs (new file, 65 lines)
@@ -0,0 +1,65 @@
//! `object_storage` is a service which provides API for uploading and downloading
//! files. It is used by compute and control plane for accessing LFC prewarm data.
//! This service is deployed either as a separate component or as part of compute image
//! for large computes.
mod app;
use anyhow::Context;
use tracing::info;
use utils::logging;

// see set()
const fn max_upload_file_limit() -> usize {
    100 * 1024 * 1024
}

#[derive(serde::Deserialize)]
#[serde(tag = "type")]
struct Config {
    listen: std::net::SocketAddr,
    pemfile: camino::Utf8PathBuf,
    #[serde(flatten)]
    storage_config: remote_storage::RemoteStorageConfig,
    #[serde(default = "max_upload_file_limit")]
    max_upload_file_limit: usize,
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    logging::init(
        logging::LogFormat::Plain,
        logging::TracingErrorLayerEnablement::EnableWithRustLogFilter,
        logging::Output::Stdout,
    )?;

    let config: String = std::env::args().skip(1).take(1).collect();
    if config.is_empty() {
        anyhow::bail!("Usage: object_storage config.json")
    }
    info!("Reading config from {config}");
    let config = std::fs::read_to_string(config.clone())?;
    let config: Config = serde_json::from_str(&config).context("parsing config")?;
    info!("Reading pemfile from {}", config.pemfile.clone());
    let pemfile = std::fs::read(config.pemfile.clone())?;
    info!("Loading public key from {}", config.pemfile.clone());
    let auth = object_storage::JwtAuth::new(&pemfile)?;

    let listener = tokio::net::TcpListener::bind(config.listen).await.unwrap();
    info!("listening on {}", listener.local_addr().unwrap());

    let storage = remote_storage::GenericRemoteStorage::from_config(&config.storage_config).await?;
    let cancel = tokio_util::sync::CancellationToken::new();
    app::check_storage_permissions(&storage, cancel.clone()).await?;

    let proxy = std::sync::Arc::new(object_storage::Storage {
        auth,
        storage,
        cancel: cancel.clone(),
        max_upload_file_limit: config.max_upload_file_limit,
    });

    tokio::spawn(utils::signals::signal_handler(cancel.clone()));
    axum::serve(listener, app::app(proxy))
        .with_graceful_shutdown(async move { cancel.cancelled().await })
        .await?;
    Ok(())
}
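The binary takes a single JSON config path (`object_storage config.json`). A minimal sketch of a config that the `Config` struct above could accept follows; the `local_path` field is an assumption about the flattened `remote_storage::RemoteStorageConfig` (a hypothetical LocalFs backend), and the shape is not verified against the serde attributes in this diff.

// Illustrative config payload; only listen/pemfile/max_upload_file_limit mirror the struct above,
// the remote-storage field is an assumption.
fn example_config() -> serde_json::Value {
    serde_json::json!({
        "listen": "127.0.0.1:51243",
        "pemfile": "/etc/object_storage/jwt_public_key.pem",
        "local_path": "/var/lib/object_storage",   // flattened RemoteStorageConfig (assumed LocalFs)
        "max_upload_file_limit": 104857600         // optional; defaults to 100 MiB via max_upload_file_limit()
    })
}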
@@ -65,7 +65,7 @@ use bytes::{Buf, Bytes};
use criterion::{BenchmarkId, Criterion};
use once_cell::sync::Lazy;
use pageserver::config::PageServerConf;
use pageserver::walredo::PostgresRedoManager;
use pageserver::walredo::{PostgresRedoManager, RedoAttemptType};
use pageserver_api::key::Key;
use pageserver_api::record::NeonWalRecord;
use pageserver_api::shard::TenantShardId;
@@ -223,7 +223,14 @@ impl Request {

// TODO: avoid these clones
manager
.request_redo(*key, *lsn, base_img.clone(), records.clone(), *pg_version)
.request_redo(
*key,
*lsn,
base_img.clone(),
records.clone(),
*pg_version,
RedoAttemptType::ReadPage,
)
.await
.context("request_redo")
}

@@ -34,7 +34,7 @@ use utils::lsn::Lsn;
use crate::context::RequestContext;
use crate::pgdatadir_mapping::Version;
use crate::tenant::storage_layer::IoConcurrency;
use crate::tenant::timeline::GetVectoredError;
use crate::tenant::timeline::{GetVectoredError, VersionedKeySpaceQuery};
use crate::tenant::{PageReconstructError, Timeline};

#[derive(Debug, thiserror::Error)]
@@ -353,9 +353,10 @@ where
let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar);

for part in slru_partitions.parts {
let query = VersionedKeySpaceQuery::uniform(part, self.lsn);
let blocks = self
.timeline
.get_vectored(part, self.lsn, self.io_concurrency.clone(), self.ctx)
.get_vectored(query, self.io_concurrency.clone(), self.ctx)
.await?;

for (key, block) in blocks {

@@ -31,7 +31,6 @@ use pageserver::{
};
use postgres_backend::AuthType;
use remote_storage::GenericRemoteStorage;
use tokio::signal::unix::SignalKind;
use tokio::time::Instant;
use tokio_util::sync::CancellationToken;
use tracing::*;
@@ -453,6 +452,24 @@ fn start_pageserver(
info!("Using auth for http API: {:#?}", conf.http_auth_type);
info!("Using auth for pg connections: {:#?}", conf.pg_auth_type);

let tls_server_config = if conf.listen_https_addr.is_some() || conf.enable_tls_page_service_api
{
let resolver = BACKGROUND_RUNTIME.block_on(ReloadingCertificateResolver::new(
"main",
&conf.ssl_key_file,
&conf.ssl_cert_file,
conf.ssl_cert_reload_period,
))?;

let server_config = rustls::ServerConfig::builder()
.with_no_client_auth()
.with_cert_resolver(resolver);

Some(Arc::new(server_config))
} else {
None
};

match var("NEON_AUTH_TOKEN") {
Ok(v) => {
info!("Loaded JWT token for authentication with Safekeeper");
@@ -671,17 +688,11 @@ fn start_pageserver(

let https_task = match https_listener {
Some(https_listener) => {
let resolver = MGMT_REQUEST_RUNTIME.block_on(ReloadingCertificateResolver::new(
&conf.ssl_key_file,
&conf.ssl_cert_file,
conf.ssl_cert_reload_period,
))?;
let tls_server_config = tls_server_config
.clone()
.expect("tls_server_config is set earlier if https is enabled");

let server_config = rustls::ServerConfig::builder()
.with_no_client_auth()
.with_cert_resolver(resolver);

let tls_acceptor = tokio_rustls::TlsAcceptor::from(Arc::new(server_config));
let tls_acceptor = tokio_rustls::TlsAcceptor::from(tls_server_config);

let server =
http_utils::server::Server::new(service, https_listener, Some(tls_acceptor))?;
@@ -737,6 +748,11 @@ fn start_pageserver(
tokio::net::TcpListener::from_std(pageserver_listener)
.context("create tokio listener")?
},
if conf.enable_tls_page_service_api {
tls_server_config
} else {
None
},
);

// All started up! Now just sit and wait for shutdown signal.
@@ -744,32 +760,7 @@ fn start_pageserver(
let signal_token = CancellationToken::new();
let signal_cancel = signal_token.child_token();

// Spawn signal handlers. Runs in a loop since we want to be responsive to multiple signals
// even after triggering shutdown (e.g. a SIGQUIT after a slow SIGTERM shutdown). See:
// https://github.com/neondatabase/neon/issues/9740.
tokio::spawn(async move {
let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt()).unwrap();
let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate()).unwrap();
let mut sigquit = tokio::signal::unix::signal(SignalKind::quit()).unwrap();

loop {
let signal = tokio::select! {
_ = sigquit.recv() => {
info!("Got signal SIGQUIT. Terminating in immediate shutdown mode.");
std::process::exit(111);
}
_ = sigint.recv() => "SIGINT",
_ = sigterm.recv() => "SIGTERM",
};

if !signal_token.is_cancelled() {
info!("Got signal {signal}. Terminating gracefully in fast shutdown mode.");
signal_token.cancel();
} else {
info!("Got signal {signal}. Already shutting down.");
}
}
});
tokio::spawn(utils::signals::signal_handler(signal_token));

// Wait for cancellation signal and shut down the pageserver.
//

@@ -219,6 +219,11 @@ pub struct PageServerConf {
pub generate_unarchival_heatmap: bool,

pub tracing: Option<pageserver_api::config::Tracing>,

/// Enable TLS in page service API.
/// Does not force TLS: the client negotiates TLS usage during the handshake.
/// Uses key and certificate from ssl_key_file/ssl_cert_file.
pub enable_tls_page_service_api: bool,
}

/// Token for authentication to safekeepers
@@ -391,6 +396,7 @@ impl PageServerConf {
load_previous_heatmap,
generate_unarchival_heatmap,
tracing,
enable_tls_page_service_api,
} = config_toml;

let mut conf = PageServerConf {
@@ -441,6 +447,7 @@ impl PageServerConf {
page_service_pipelining,
get_vectored_concurrent_io,
tracing,
enable_tls_page_service_api,

// ------------------------------------------------------------
// fields that require additional validation or custom handling

@@ -212,6 +212,12 @@ paths:
schema:
type: string
format: date-time
"412":
description: No timestamp is found for given LSN, e.g. if there had been no commits till LSN
content:
application/json:
schema:
$ref: "#/components/schemas/PreconditionFailedError"

/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp:
parameters:
@@ -67,7 +67,7 @@ use crate::tenant::mgr::{
};
use crate::tenant::remote_timeline_client::index::GcCompactionState;
use crate::tenant::remote_timeline_client::{
download_index_part, list_remote_tenant_shards, list_remote_timelines,
download_index_part, download_tenant_manifest, list_remote_tenant_shards, list_remote_timelines,
};
use crate::tenant::secondary::SecondaryController;
use crate::tenant::size::ModelInputs;
@@ -989,7 +989,7 @@ async fn get_lsn_by_timestamp_handler(
if !tenant_shard_id.is_shard_zero() {
// Requires SLRU contents, which are only stored on shard zero
return Err(ApiError::BadRequest(anyhow!(
"Size calculations are only available on shard zero"
"Lsn calculations by timestamp are only available on shard zero"
)));
}

@@ -1064,7 +1064,7 @@ async fn get_timestamp_of_lsn_handler(
if !tenant_shard_id.is_shard_zero() {
// Requires SLRU contents, which are only stored on shard zero
return Err(ApiError::BadRequest(anyhow!(
"Size calculations are only available on shard zero"
"Timestamp calculations by lsn are only available on shard zero"
)));
}

@@ -1090,8 +1090,8 @@ async fn get_timestamp_of_lsn_handler(
.to_string();
json_response(StatusCode::OK, time)
}
None => Err(ApiError::NotFound(
anyhow::anyhow!("Timestamp for lsn {} not found", lsn).into(),
None => Err(ApiError::PreconditionFailed(
format!("Timestamp for lsn {} not found", lsn).into(),
)),
}
}
@@ -2274,6 +2274,7 @@ async fn timeline_compact_handler(
if Some(true) == parse_query_param::<_, bool>(&request, "dry_run")? {
flags |= CompactFlags::DryRun;
}
// Manual compaction does not yield for L0.

let wait_until_uploaded =
parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);
@@ -2911,9 +2912,22 @@ async fn tenant_scan_remote_handler(
};
}

let result =
download_tenant_manifest(&state.remote_storage, &tenant_shard_id, generation, &cancel)
.instrument(info_span!("download_tenant_manifest",
tenant_id=%tenant_shard_id.tenant_id,
shard_id=%tenant_shard_id.shard_slug()))
.await;
let stripe_size = match result {
Ok((manifest, _, _)) => manifest.stripe_size,
Err(DownloadError::NotFound) => None,
Err(err) => return Err(ApiError::InternalServerError(anyhow!(err))),
};

response.shards.push(TenantScanRemoteStorageShard {
tenant_shard_id,
generation: generation.into(),
stripe_size,
});
}

@@ -3239,7 +3253,7 @@ async fn ingest_aux_files(
modification
.put_file(&fname, content.as_bytes(), &ctx)
.await
.map_err(ApiError::InternalServerError)?;
.map_err(|e| ApiError::InternalServerError(e.into()))?;
}
modification
.commit(&ctx)
@@ -3368,11 +3382,11 @@ async fn put_tenant_timeline_import_basebackup(

let broker_client = state.broker_client.clone();

let mut body = StreamReader::new(request.into_body().map(|res| {
res.map_err(|error| {
std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!(error))
})
}));
let mut body = StreamReader::new(
request
.into_body()
.map(|res| res.map_err(|error| std::io::Error::other(anyhow::anyhow!(error)))),
);

tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;

@@ -3446,7 +3460,7 @@ async fn put_tenant_timeline_import_wal(

let mut body = StreamReader::new(request.into_body().map(|res| {
res.map_err(|error| {
std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!(error))
std::io::Error::other(anyhow::anyhow!(error))
})
}));

@@ -27,7 +27,7 @@ use crate::context::RequestContext;
use crate::metrics::WAL_INGEST;
use crate::pgdatadir_mapping::*;
use crate::tenant::Timeline;
use crate::walingest::WalIngest;
use crate::walingest::{WalIngest, WalIngestErrorKind};

// Returns checkpoint LSN from controlfile
pub fn get_lsn_from_controlfile(path: &Utf8Path) -> Result<Lsn> {
@@ -157,9 +157,9 @@ async fn import_rel(
.put_rel_creation(rel, nblocks as u32, ctx)
.await
{
match e {
RelationError::AlreadyExists => {
debug!("Relation {} already exist. We must be extending it.", rel)
match e.kind {
WalIngestErrorKind::RelationAlreadyExists(rel) => {
debug!("Relation {rel} already exists. We must be extending it.")
}
_ => return Err(e.into()),
}
@@ -17,7 +17,7 @@ use metrics::{
use once_cell::sync::Lazy;
use pageserver_api::config::{
PageServicePipeliningConfig, PageServicePipeliningConfigPipelined,
PageServiceProtocolPipelinedExecutionStrategy,
PageServiceProtocolPipelinedBatchingStrategy, PageServiceProtocolPipelinedExecutionStrategy,
};
use pageserver_api::models::InMemoryLayerInfo;
use pageserver_api::shard::TenantShardId;
@@ -1863,7 +1863,7 @@ pub(crate) static PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE: Lazy<IntGaugeVec> = Lazy::
"pageserver_page_service_config_max_batch_size",
"Configured maximum batch size for the server-side batching functionality of page_service. \
Labels expose more of the configuration parameters.",
&["mode", "execution"]
&["mode", "execution", "batching"]
)
.expect("failed to define a metric")
});
@@ -1871,10 +1871,11 @@ pub(crate) static PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE: Lazy<IntGaugeVec> = Lazy::
fn set_page_service_config_max_batch_size(conf: &PageServicePipeliningConfig) {
PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE.reset();
let (label_values, value) = match conf {
PageServicePipeliningConfig::Serial => (["serial", "-"], 1),
PageServicePipeliningConfig::Serial => (["serial", "-", "-"], 1),
PageServicePipeliningConfig::Pipelined(PageServicePipeliningConfigPipelined {
max_batch_size,
execution,
batching,
}) => {
let mode = "pipelined";
let execution = match execution {
@@ -1883,7 +1884,12 @@ fn set_page_service_config_max_batch_size(conf: &PageServicePipeliningConfig) {
}
PageServiceProtocolPipelinedExecutionStrategy::Tasks => "tasks",
};
([mode, execution], max_batch_size.get())
let batching = match batching {
PageServiceProtocolPipelinedBatchingStrategy::UniformLsn => "uniform-lsn",
PageServiceProtocolPipelinedBatchingStrategy::ScatteredLsn => "scattered-lsn",
};

([mode, execution, batching], max_batch_size.get())
}
};
PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE

@@ -18,7 +18,7 @@ use itertools::Itertools;
use once_cell::sync::OnceCell;
use pageserver_api::config::{
PageServicePipeliningConfig, PageServicePipeliningConfigPipelined,
PageServiceProtocolPipelinedExecutionStrategy,
PageServiceProtocolPipelinedBatchingStrategy, PageServiceProtocolPipelinedExecutionStrategy,
};
use pageserver_api::key::rel_block_to_key;
use pageserver_api::models::{
@@ -105,6 +105,7 @@ pub fn spawn(
pg_auth: Option<Arc<SwappableJwtAuth>>,
perf_trace_dispatch: Option<Dispatch>,
tcp_listener: tokio::net::TcpListener,
tls_config: Option<Arc<rustls::ServerConfig>>,
) -> Listener {
let cancel = CancellationToken::new();
let libpq_ctx = RequestContext::todo_child(
@@ -124,6 +125,7 @@ pub fn spawn(
perf_trace_dispatch,
tcp_listener,
conf.pg_auth_type,
tls_config,
conf.page_service_pipelining.clone(),
libpq_ctx,
cancel.clone(),
@@ -181,6 +183,7 @@ pub async fn libpq_listener_main(
perf_trace_dispatch: Option<Dispatch>,
listener: tokio::net::TcpListener,
auth_type: AuthType,
tls_config: Option<Arc<rustls::ServerConfig>>,
pipelining_config: PageServicePipeliningConfig,
listener_ctx: RequestContext,
listener_cancel: CancellationToken,
@@ -223,6 +226,7 @@ pub async fn libpq_listener_main(
local_auth,
socket,
auth_type,
tls_config.clone(),
pipelining_config.clone(),
connection_ctx,
connections_cancel.child_token(),
@@ -264,6 +268,7 @@ async fn page_service_conn_main(
auth: Option<Arc<SwappableJwtAuth>>,
socket: tokio::net::TcpStream,
auth_type: AuthType,
tls_config: Option<Arc<rustls::ServerConfig>>,
pipelining_config: PageServicePipeliningConfig,
connection_ctx: RequestContext,
cancel: CancellationToken,
@@ -334,7 +339,8 @@ async fn page_service_conn_main(
cancel.clone(),
gate_guard,
);
let pgbackend = PostgresBackend::new_from_io(socket_fd, socket, peer_addr, auth_type, None)?;
let pgbackend =
PostgresBackend::new_from_io(socket_fd, socket, peer_addr, auth_type, tls_config)?;

match pgbackend.run(&mut conn_handler, &cancel).await {
Ok(()) => {
@@ -635,6 +641,7 @@ impl std::fmt::Display for BatchedPageStreamError {
struct BatchedGetPageRequest {
req: PagestreamGetPageRequest,
timer: SmgrOpTimer,
effective_request_lsn: Lsn,
ctx: RequestContext,
}
@@ -664,7 +671,6 @@ enum BatchedFeMessage {
GetPage {
span: Span,
shard: timeline::handle::WeakHandle<TenantManagerTypes>,
effective_request_lsn: Lsn,
pages: smallvec::SmallVec<[BatchedGetPageRequest; 1]>,
},
DbSize {
@@ -1019,34 +1025,28 @@ impl PageServerHandler {
.await?;

// We're holding the Handle
// TODO: if we actually need to wait for lsn here, it delays the entire batch which doesn't need to wait
let res = Self::wait_or_get_last_lsn(
let effective_request_lsn = match Self::effective_request_lsn(
&shard,
shard.get_last_record_lsn(),
req.hdr.request_lsn,
req.hdr.not_modified_since,
&shard.get_applied_gc_cutoff_lsn(),
&ctx,
)
.maybe_perf_instrument(&ctx, |current_perf_span| {
info_span!(
target: PERF_TRACE_TARGET,
parent: current_perf_span,
"WAIT_LSN",
)
})
.await;

let effective_request_lsn = match res {
) {
Ok(lsn) => lsn,
Err(e) => {
return respond_error!(span, e);
}
};

BatchedFeMessage::GetPage {
span,
shard: shard.downgrade(),
effective_request_lsn,
pages: smallvec::smallvec![BatchedGetPageRequest { req, timer, ctx }],
pages: smallvec::smallvec![BatchedGetPageRequest {
req,
timer,
effective_request_lsn,
ctx,
}],
}
}
#[cfg(feature = "testing")]
@@ -1072,6 +1072,7 @@ impl PageServerHandler {
#[instrument(skip_all, level = tracing::Level::TRACE)]
#[allow(clippy::boxed_local)]
fn pagestream_do_batch(
batching_strategy: PageServiceProtocolPipelinedBatchingStrategy,
max_batch_size: NonZeroUsize,
batch: &mut Result<BatchedFeMessage, QueryError>,
this_msg: Result<BatchedFeMessage, QueryError>,
@@ -1090,33 +1091,61 @@ impl PageServerHandler {
span: _,
shard: accum_shard,
pages: accum_pages,
effective_request_lsn: accum_lsn,
}),
BatchedFeMessage::GetPage {
span: _,
shard: this_shard,
pages: this_pages,
effective_request_lsn: this_lsn,
},
) if (|| {
assert_eq!(this_pages.len(), 1);
if accum_pages.len() >= max_batch_size.get() {
trace!(%accum_lsn, %this_lsn, %max_batch_size, "stopping batching because of batch size");
trace!(%max_batch_size, "stopping batching because of batch size");
assert_eq!(accum_pages.len(), max_batch_size.get());
return false;
}
if !accum_shard.is_same_handle_as(&this_shard) {
trace!(%accum_lsn, %this_lsn, "stopping batching because timeline object mismatch");
trace!("stopping batching because timeline object mismatch");
// TODO: we _could_ batch & execute each shard separately (and in parallel).
// But the current logic for keeping responses in order does not support that.
return false;
}
// the vectored get currently only supports a single LSN, so, bounce as soon
// as the effective request_lsn changes
if *accum_lsn != this_lsn {
trace!(%accum_lsn, %this_lsn, "stopping batching because LSN changed");
return false;

match batching_strategy {
PageServiceProtocolPipelinedBatchingStrategy::UniformLsn => {
if let Some(last_in_batch) = accum_pages.last() {
if last_in_batch.effective_request_lsn
!= this_pages[0].effective_request_lsn
{
return false;
}
}
}
PageServiceProtocolPipelinedBatchingStrategy::ScatteredLsn => {
// The read path doesn't currently support serving the same page at different LSNs.
// While technically possible, it's uncertain if the complexity is worth it.
// Break the batch if such a case is encountered.
//
// TODO(vlad): Include a metric for batch breaks with a reason label.
let same_page_different_lsn = accum_pages.iter().any(|batched| {
batched.req.rel == this_pages[0].req.rel
&& batched.req.blkno == this_pages[0].req.blkno
&& batched.effective_request_lsn
!= this_pages[0].effective_request_lsn
});

if same_page_different_lsn {
trace!(
rel=%this_pages[0].req.rel,
blkno=%this_pages[0].req.blkno,
lsn=%this_pages[0].effective_request_lsn,
"stopping batching because same page was requested at different LSNs"
);
return false;
}
}
}

true
})() =>
{
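As a reading aid for the guard above, the sketch below restates the batch-break rules in a standalone form. The types are hypothetical stand-ins for the pageserver's request structs, and the shard-handle check and batch-size limit from the real guard are omitted here.

// Simplified, hypothetical model of the batching decision above.
#[derive(PartialEq, Clone, Copy)]
struct PageRef {
    rel: u32,
    blkno: u32,
}

struct QueuedGetPage {
    page: PageRef,
    effective_request_lsn: u64,
}

enum BatchingStrategy {
    UniformLsn,
    ScatteredLsn,
}

/// Can `next` be appended to `batch`? (Shard and batch-size checks omitted.)
fn may_join_batch(strategy: &BatchingStrategy, batch: &[QueuedGetPage], next: &QueuedGetPage) -> bool {
    match strategy {
        // UniformLsn: every request in the batch must share one effective LSN.
        BatchingStrategy::UniformLsn => batch
            .last()
            .map_or(true, |last| last.effective_request_lsn == next.effective_request_lsn),
        // ScatteredLsn: mixed LSNs are allowed, but the same page at two different
        // LSNs breaks the batch, since the read path serves each page at a single LSN.
        BatchingStrategy::ScatteredLsn => !batch.iter().any(|queued| {
            queued.page == next.page && queued.effective_request_lsn != next.effective_request_lsn
        }),
    }
}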
@@ -1384,12 +1413,7 @@ impl PageServerHandler {
span,
)
}
BatchedFeMessage::GetPage {
span,
shard,
effective_request_lsn,
pages,
} => {
BatchedFeMessage::GetPage { span, shard, pages } => {
fail::fail_point!("ps::handle-pagerequest-message::getpage");
let (shard, ctx) = upgrade_handle_and_set_context!(shard);
(
@@ -1399,7 +1423,6 @@ impl PageServerHandler {
let res = self
.handle_get_page_at_lsn_request_batched(
&shard,
effective_request_lsn,
pages,
io_concurrency,
&ctx,
@@ -1718,6 +1741,7 @@ impl PageServerHandler {
let PageServicePipeliningConfigPipelined {
max_batch_size,
execution,
batching: batching_strategy,
} = pipelining_config;

// Macro to _define_ a pipeline stage.
@@ -1769,7 +1793,7 @@ impl PageServerHandler {
exit |= read_res.is_err();
let could_send = batch_tx
.send(read_res, |batch, res| {
Self::pagestream_do_batch(max_batch_size, batch, res)
Self::pagestream_do_batch(batching_strategy, max_batch_size, batch, res)
})
.await;
exit |= could_send.is_err();
@@ -1865,7 +1889,39 @@ impl PageServerHandler {
ctx: &RequestContext,
) -> Result<Lsn, PageStreamError> {
let last_record_lsn = timeline.get_last_record_lsn();
let effective_request_lsn = Self::effective_request_lsn(
timeline,
last_record_lsn,
request_lsn,
not_modified_since,
latest_gc_cutoff_lsn,
)?;

if effective_request_lsn > last_record_lsn {
timeline
.wait_lsn(
not_modified_since,
crate::tenant::timeline::WaitLsnWaiter::PageService,
timeline::WaitLsnTimeout::Default,
ctx,
)
.await?;

// Since we waited for 'effective_request_lsn' to arrive, that is now the last
// record LSN. (Or close enough for our purposes; the last-record LSN can
// advance immediately after we return anyway)
}

Ok(effective_request_lsn)
}

fn effective_request_lsn(
timeline: &Timeline,
last_record_lsn: Lsn,
request_lsn: Lsn,
not_modified_since: Lsn,
latest_gc_cutoff_lsn: &RcuReadGuard<Lsn>,
) -> Result<Lsn, PageStreamError> {
// Sanity check the request
if request_lsn < not_modified_since {
return Err(PageStreamError::BadRequest(
@@ -1900,19 +1956,7 @@ impl PageServerHandler {
}
}

// Wait for WAL up to 'not_modified_since' to arrive, if necessary
if not_modified_since > last_record_lsn {
timeline
.wait_lsn(
not_modified_since,
crate::tenant::timeline::WaitLsnWaiter::PageService,
timeline::WaitLsnTimeout::Default,
ctx,
)
.await?;
// Since we waited for 'not_modified_since' to arrive, that is now the last
// record LSN. (Or close enough for our purposes; the last-record LSN can
// advance immediately after we return anyway)
Ok(not_modified_since)
} else {
// It might be better to use max(not_modified_since, latest_gc_cutoff_lsn)
@@ -2067,7 +2111,6 @@ impl PageServerHandler {
async fn handle_get_page_at_lsn_request_batched(
&mut self,
timeline: &Timeline,
effective_lsn: Lsn,
requests: smallvec::SmallVec<[BatchedGetPageRequest; 1]>,
io_concurrency: IoConcurrency,
ctx: &RequestContext,
@@ -2086,20 +2129,81 @@ impl PageServerHandler {
// Ignore error (trace buffer may be full or tracer may have disconnected).
_ = page_trace.try_send(PageTraceEvent {
key,
effective_lsn,
effective_lsn: batch.effective_request_lsn,
time,
});
}
}

// If any request in the batch needs to wait for LSN, then do so now.
let mut perf_instrument = false;
let max_effective_lsn = requests
.iter()
.map(|req| {
if req.ctx.has_perf_span() {
perf_instrument = true;
}

req.effective_request_lsn
})
.max()
.expect("batch is never empty");

let ctx = match perf_instrument {
true => RequestContextBuilder::from(ctx)
.root_perf_span(|| {
info_span!(
target: PERF_TRACE_TARGET,
"GET_VECTORED",
tenant_id = %timeline.tenant_shard_id.tenant_id,
timeline_id = %timeline.timeline_id,
shard = %timeline.tenant_shard_id.shard_slug(),
%max_effective_lsn
)
})
.attached_child(),
false => ctx.attached_child(),
};

let last_record_lsn = timeline.get_last_record_lsn();
if max_effective_lsn > last_record_lsn {
if let Err(e) = timeline
.wait_lsn(
max_effective_lsn,
crate::tenant::timeline::WaitLsnWaiter::PageService,
timeline::WaitLsnTimeout::Default,
&ctx,
)
.maybe_perf_instrument(&ctx, |current_perf_span| {
info_span!(
target: PERF_TRACE_TARGET,
parent: current_perf_span,
"WAIT_LSN",
)
})
.await
{
return Vec::from_iter(requests.into_iter().map(|req| {
Err(BatchedPageStreamError {
err: PageStreamError::from(e.clone()),
req: req.req.hdr,
})
}));
}
}

let results = timeline
.get_rel_page_at_lsn_batched(
requests
.iter()
.map(|p| (&p.req.rel, &p.req.blkno, p.ctx.attached_child())),
effective_lsn,
requests.iter().map(|p| {
(
&p.req.rel,
&p.req.blkno,
p.effective_request_lsn,
p.ctx.attached_child(),
)
}),
io_concurrency,
ctx,
&ctx,
)
.await;
assert_eq!(results.len(), requests.len());
@@ -6,14 +6,14 @@
//! walingest.rs handles a few things like implicit relation creation and extension.
//! Clarify that)
//!
use std::collections::{BTreeMap, HashMap, HashSet, hash_map};
use std::collections::{HashMap, HashSet, hash_map};
use std::ops::{ControlFlow, Range};

use crate::PERF_TRACE_TARGET;
use anyhow::{Context, ensure};
use crate::walingest::{WalIngestError, WalIngestErrorKind};
use crate::{PERF_TRACE_TARGET, ensure_walingest};
use anyhow::Context;
use bytes::{Buf, Bytes, BytesMut};
use enum_map::Enum;
use itertools::Itertools;
use pageserver_api::key::{
AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, CompactKey, DBDIR_KEY, Key, RelDirExists,
TWOPHASEDIR_KEY, dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range,
@@ -21,7 +21,7 @@ use pageserver_api::key::{
repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key,
slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
};
use pageserver_api::keyspace::SparseKeySpace;
use pageserver_api::keyspace::{KeySpaceRandomAccum, SparseKeySpace};
use pageserver_api::models::RelSizeMigration;
use pageserver_api::record::NeonWalRecord;
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
@@ -40,7 +40,7 @@ use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta};

use super::tenant::{PageReconstructError, Timeline};
use crate::aux_file;
use crate::context::{PerfInstrumentFutureExt, RequestContext, RequestContextBuilder};
use crate::context::{PerfInstrumentFutureExt, RequestContext};
use crate::keyspace::{KeySpace, KeySpaceAccum};
use crate::metrics::{
RELSIZE_CACHE_ENTRIES, RELSIZE_CACHE_HITS, RELSIZE_CACHE_MISSES, RELSIZE_CACHE_MISSES_OLD,
@@ -50,7 +50,7 @@ use crate::span::{
debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id,
};
use crate::tenant::storage_layer::IoConcurrency;
use crate::tenant::timeline::GetVectoredError;
use crate::tenant::timeline::{GetVectoredError, VersionedKeySpaceQuery};

/// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached.
pub const MAX_AUX_FILE_DELTAS: usize = 1024;
@@ -136,12 +136,8 @@ impl From<PageReconstructError> for CalculateLogicalSizeError {

#[derive(Debug, thiserror::Error)]
pub enum RelationError {
#[error("Relation Already Exists")]
AlreadyExists,
#[error("invalid relnode")]
InvalidRelnode,
#[error(transparent)]
Other(#[from] anyhow::Error),
}

///
@@ -210,10 +206,9 @@ impl Timeline {
let pages: smallvec::SmallVec<[_; 1]> = smallvec::smallvec![(tag, blknum)];
let res = self
.get_rel_page_at_lsn_batched(
pages
.iter()
.map(|(tag, blknum)| (tag, blknum, ctx.attached_child())),
effective_lsn,
pages.iter().map(|(tag, blknum)| {
(tag, blknum, effective_lsn, ctx.attached_child())
}),
io_concurrency.clone(),
ctx,
)
@@ -251,8 +246,7 @@ impl Timeline {
/// The ordering of the returned vec corresponds to the ordering of `pages`.
pub(crate) async fn get_rel_page_at_lsn_batched(
&self,
pages: impl ExactSizeIterator<Item = (&RelTag, &BlockNumber, RequestContext)>,
effective_lsn: Lsn,
pages: impl ExactSizeIterator<Item = (&RelTag, &BlockNumber, Lsn, RequestContext)>,
io_concurrency: IoConcurrency,
ctx: &RequestContext,
) -> Vec<Result<Bytes, PageReconstructError>> {
@@ -265,11 +259,13 @@ impl Timeline {
let mut result = Vec::with_capacity(pages.len());
let result_slots = result.spare_capacity_mut();

let mut keys_slots: BTreeMap<Key, smallvec::SmallVec<[(usize, RequestContext); 1]>> =
BTreeMap::default();
let mut keys_slots: HashMap<Key, smallvec::SmallVec<[(usize, RequestContext); 1]>> =
HashMap::with_capacity(pages.len());

let mut perf_instrument = false;
for (response_slot_idx, (tag, blknum, ctx)) in pages.enumerate() {
let mut req_keyspaces: HashMap<Lsn, KeySpaceRandomAccum> =
HashMap::with_capacity(pages.len());

for (response_slot_idx, (tag, blknum, lsn, ctx)) in pages.enumerate() {
if tag.relnode == 0 {
result_slots[response_slot_idx].write(Err(PageReconstructError::Other(
RelationError::InvalidRelnode.into(),
@@ -280,14 +276,14 @@ impl Timeline {
}

let nblocks = match self
.get_rel_size(*tag, Version::Lsn(effective_lsn), &ctx)
.get_rel_size(*tag, Version::Lsn(lsn), &ctx)
.maybe_perf_instrument(&ctx, |crnt_perf_span| {
info_span!(
target: PERF_TRACE_TARGET,
parent: crnt_perf_span,
"GET_REL_SIZE",
reltag=%tag,
lsn=%effective_lsn,
lsn=%lsn,
)
})
.await
@@ -303,7 +299,7 @@ impl Timeline {
if *blknum >= nblocks {
debug!(
"read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page",
tag, blknum, effective_lsn, nblocks
tag, blknum, lsn, nblocks
);
result_slots[response_slot_idx].write(Ok(ZERO_PAGE.clone()));
slots_filled += 1;
@@ -312,46 +308,29 @@ impl Timeline {

let key = rel_block_to_key(*tag, *blknum);

if ctx.has_perf_span() {
perf_instrument = true;
}

let key_slots = keys_slots.entry(key).or_default();
key_slots.push((response_slot_idx, ctx));

let acc = req_keyspaces.entry(lsn).or_default();
acc.add_key(key);
}

let keyspace = {
// add_key requires monotonicity
let mut acc = KeySpaceAccum::new();
for key in keys_slots
.keys()
// in fact it requires strong monotonicity
.dedup()
{
acc.add_key(*key);
}
acc.to_keyspace()
};

let ctx = match perf_instrument {
true => RequestContextBuilder::from(ctx)
.root_perf_span(|| {
info_span!(
target: PERF_TRACE_TARGET,
"GET_VECTORED",
tenant_id = %self.tenant_shard_id.tenant_id,
timeline_id = %self.timeline_id,
lsn = %effective_lsn,
shard = %self.tenant_shard_id.shard_slug(),
)
})
.attached_child(),
false => ctx.attached_child(),
};
let query: Vec<(Lsn, KeySpace)> = req_keyspaces
.into_iter()
.map(|(lsn, acc)| (lsn, acc.to_keyspace()))
.collect();

let query = VersionedKeySpaceQuery::scattered(query);
let res = self
.get_vectored(keyspace, effective_lsn, io_concurrency, &ctx)
.maybe_perf_instrument(&ctx, |current_perf_span| current_perf_span.clone())
.get_vectored(query, io_concurrency, ctx)
.maybe_perf_instrument(ctx, |current_perf_span| {
info_span!(
target: PERF_TRACE_TARGET,
parent: current_perf_span,
"GET_BATCH",
batch_size = %page_count,
)
})
.await;

match res {
@@ -381,12 +360,12 @@ impl Timeline {
// There is no standardized way to express that the batched span followed from N request spans.
// So, abuse the system and mark the request contexts as follows_from the batch span, so we get
// some linkage in our trace viewer. It allows us to answer: which GET_VECTORED did this GET_PAGE wait for.
req_ctx.perf_follows_from(&ctx);
req_ctx.perf_follows_from(ctx);
slots_filled += 1;
}

result_slots[first_slot].write(res);
first_req_ctx.perf_follows_from(&ctx);
first_req_ctx.perf_follows_from(ctx);
slots_filled += 1;
}
}
@@ -425,7 +404,7 @@ impl Timeline {
}
};

req_ctx.perf_follows_from(&ctx);
req_ctx.perf_follows_from(ctx);
result_slots[*slot].write(err);
}
@@ -664,8 +643,9 @@ impl Timeline {

let mut segment = BytesMut::with_capacity(n_blocks as usize * BLCKSZ as usize);
for batch in batches.parts {
let query = VersionedKeySpaceQuery::uniform(batch, lsn);
let blocks = self
.get_vectored(batch, lsn, io_concurrency.clone(), ctx)
.get_vectored(query, io_concurrency.clone(), ctx)
.await?;

for (_key, block) in blocks {
@@ -691,7 +671,7 @@ impl Timeline {
Ok(buf.get_u32_le())
}

/// Get size of an SLRU segment
/// Does the slru segment exist?
pub(crate) async fn get_slru_segment_exists(
&self,
kind: SlruKind,
@@ -844,9 +824,9 @@ impl Timeline {
.await
}

/// Obtain the possible timestamp range for the given lsn.
/// Obtain the timestamp for the given lsn.
///
/// If the lsn has no timestamps, returns None. returns `(min, max, median)` if it has timestamps.
/// If the lsn has no timestamps (e.g. no commits), returns None.
pub(crate) async fn get_timestamp_for_lsn(
&self,
probe_lsn: Lsn,
@@ -902,8 +882,9 @@ impl Timeline {
);

for batch in batches.parts.into_iter().rev() {
let query = VersionedKeySpaceQuery::uniform(batch, probe_lsn);
let blocks = self
.get_vectored(batch, probe_lsn, io_concurrency.clone(), ctx)
.get_vectored(query, io_concurrency.clone(), ctx)
.await?;

for (_key, clog_page) in blocks.into_iter().rev() {
@@ -1478,8 +1459,8 @@ impl DatadirModification<'_> {
}

/// Set the current lsn
pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> {
ensure!(
pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> Result<(), WalIngestError> {
ensure_walingest!(
lsn >= self.lsn,
"setting an older lsn {} than {} is not allowed",
lsn,
@@ -1578,7 +1559,7 @@ impl DatadirModification<'_> {
&mut self,
rel: RelTag,
ctx: &RequestContext,
) -> Result<u32, PageReconstructError> {
) -> Result<u32, WalIngestError> {
// Get current size and put rel creation if rel doesn't exist
//
// NOTE: we check the cache first even though get_rel_exists and get_rel_size would
@@ -1593,14 +1574,13 @@ impl DatadirModification<'_> {
.await?
{
// create it with 0 size initially, the logic below will extend it
self.put_rel_creation(rel, 0, ctx)
.await
.context("Relation Error")?;
self.put_rel_creation(rel, 0, ctx).await?;
Ok(0)
} else {
self.tline
Ok(self
.tline
.get_rel_size(rel, Version::Modified(self), ctx)
.await
.await?)
}
}

@@ -1637,11 +1617,14 @@ impl DatadirModification<'_> {
// TODO(vlad): remove this argument and replace the shard check with is_key_local
shard: &ShardIdentity,
ctx: &RequestContext,
) -> anyhow::Result<()> {
) -> Result<(), WalIngestError> {
let mut gaps_at_lsns = Vec::default();

for meta in batch.metadata.iter() {
let (rel, blkno) = Key::from_compact(meta.key()).to_rel_block()?;
let key = Key::from_compact(meta.key());
let (rel, blkno) = key
.to_rel_block()
.map_err(|_| WalIngestErrorKind::InvalidKey(key, meta.lsn()))?;
let new_nblocks = blkno + 1;

let old_nblocks = self.create_relation_if_required(rel, ctx).await?;
@@ -1683,8 +1666,8 @@ impl DatadirModification<'_> {
rel: RelTag,
blknum: BlockNumber,
rec: NeonWalRecord,
) -> anyhow::Result<()> {
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
) -> Result<(), WalIngestError> {
ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);
self.put(rel_block_to_key(rel, blknum), Value::WalRecord(rec));
Ok(())
}
@@ -1696,7 +1679,7 @@ impl DatadirModification<'_> {
segno: u32,
blknum: BlockNumber,
rec: NeonWalRecord,
) -> anyhow::Result<()> {
) -> Result<(), WalIngestError> {
if !self.tline.tenant_shard_id.is_shard_zero() {
return Ok(());
}
@@ -1714,14 +1697,11 @@ impl DatadirModification<'_> {
rel: RelTag,
blknum: BlockNumber,
img: Bytes,
) -> anyhow::Result<()> {
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
) -> Result<(), WalIngestError> {
ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);
let key = rel_block_to_key(rel, blknum);
if !key.is_valid_key_on_write_path() {
anyhow::bail!(
"the request contains data not supported by pageserver at {}",
key
);
Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?;
}
self.put(rel_block_to_key(rel, blknum), Value::Image(img));
Ok(())
@@ -1733,15 +1713,12 @@ impl DatadirModification<'_> {
segno: u32,
blknum: BlockNumber,
img: Bytes,
) -> anyhow::Result<()> {
) -> Result<(), WalIngestError> {
assert!(self.tline.tenant_shard_id.is_shard_zero());

let key = slru_block_to_key(kind, segno, blknum);
if !key.is_valid_key_on_write_path() {
anyhow::bail!(
"the request contains data not supported by pageserver at {}",
key
);
Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?;
}
self.put(key, Value::Image(img));
Ok(())
@@ -1751,15 +1728,11 @@ impl DatadirModification<'_> {
&mut self,
rel: RelTag,
blknum: BlockNumber,
) -> anyhow::Result<()> {
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
) -> Result<(), WalIngestError> {
ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);
let key = rel_block_to_key(rel, blknum);
if !key.is_valid_key_on_write_path() {
anyhow::bail!(
"the request contains data not supported by pageserver: {} @ {}",
key,
self.lsn
);
Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?;
}

let batch = self
@@ -1776,15 +1749,11 @@ impl DatadirModification<'_> {
kind: SlruKind,
segno: u32,
blknum: BlockNumber,
) -> anyhow::Result<()> {
) -> Result<(), WalIngestError> {
assert!(self.tline.tenant_shard_id.is_shard_zero());
let key = slru_block_to_key(kind, segno, blknum);
if !key.is_valid_key_on_write_path() {
anyhow::bail!(
"the request contains data not supported by pageserver: {} @ {}",
key,
self.lsn
);
Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?;
}

let batch = self
@@ -1832,8 +1801,10 @@ impl DatadirModification<'_> {
dbnode: Oid,
img: Bytes,
ctx: &RequestContext,
) -> anyhow::Result<()> {
let v2_enabled = self.maybe_enable_rel_size_v2()?;
) -> Result<(), WalIngestError> {
let v2_enabled = self
.maybe_enable_rel_size_v2()
.map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?;

// Add it to the directory (if it doesn't exist already)
let buf = self.get(DBDIR_KEY, ctx).await?;
@@ -1874,13 +1845,13 @@ impl DatadirModification<'_> {
xid: u64,
img: Bytes,
ctx: &RequestContext,
) -> anyhow::Result<()> {
) -> Result<(), WalIngestError> {
// Add it to the directory entry
let dirbuf = self.get(TWOPHASEDIR_KEY, ctx).await?;
let newdirbuf = if self.tline.pg_version >= 17 {
let mut dir = TwoPhaseDirectoryV17::des(&dirbuf)?;
if !dir.xids.insert(xid) {
anyhow::bail!("twophase file for xid {} already exists", xid);
Err(WalIngestErrorKind::FileAlreadyExists(xid))?;
}
self.pending_directory_entries.push((
DirectoryKind::TwoPhase,
@@ -1891,7 +1862,7 @@ impl DatadirModification<'_> {
let xid = xid as u32;
let mut dir = TwoPhaseDirectory::des(&dirbuf)?;
if !dir.xids.insert(xid) {
anyhow::bail!("twophase file for xid {} already exists", xid);
Err(WalIngestErrorKind::FileAlreadyExists(xid.into()))?;
}
self.pending_directory_entries.push((
DirectoryKind::TwoPhase,
@@ -1909,22 +1880,22 @@ impl DatadirModification<'_> {
&mut self,
origin_id: RepOriginId,
origin_lsn: Lsn,
) -> anyhow::Result<()> {
) -> Result<(), WalIngestError> {
let key = repl_origin_key(origin_id);
self.put(key, Value::Image(origin_lsn.ser().unwrap().into()));
Ok(())
}

pub async fn drop_replorigin(&mut self, origin_id: RepOriginId) -> anyhow::Result<()> {
pub async fn drop_replorigin(&mut self, origin_id: RepOriginId) -> Result<(), WalIngestError> {
self.set_replorigin(origin_id, Lsn::INVALID).await
}

pub fn put_control_file(&mut self, img: Bytes) -> anyhow::Result<()> {
pub fn put_control_file(&mut self, img: Bytes) -> Result<(), WalIngestError> {
self.put(CONTROLFILE_KEY, Value::Image(img));
Ok(())
}

pub fn put_checkpoint(&mut self, img: Bytes) -> anyhow::Result<()> {
pub fn put_checkpoint(&mut self, img: Bytes) -> Result<(), WalIngestError> {
self.put(CHECKPOINT_KEY, Value::Image(img));
Ok(())
}
@@ -1934,7 +1905,7 @@ impl DatadirModification<'_> {
spcnode: Oid,
dbnode: Oid,
ctx: &RequestContext,
) -> anyhow::Result<()> {
) -> Result<(), WalIngestError> {
let total_blocks = self
.tline
.get_db_size(spcnode, dbnode, Version::Modified(self), ctx)
@@ -1973,20 +1944,21 @@ impl DatadirModification<'_> {
rel: RelTag,
nblocks: BlockNumber,
ctx: &RequestContext,
) -> Result<(), RelationError> {
) -> Result<(), WalIngestError> {
if rel.relnode == 0 {
return Err(RelationError::InvalidRelnode);
Err(WalIngestErrorKind::LogicalError(anyhow::anyhow!(
"invalid relnode"
)))?;
}
// It's possible that this is the first rel for this db in this
// tablespace. Create the reldir entry for it if so.
let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await.context("read db")?)
.context("deserialize db")?;
let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await?)?;

let dbdir_exists =
if let hash_map::Entry::Vacant(e) = dbdir.dbdirs.entry((rel.spcnode, rel.dbnode)) {
// Didn't exist. Update dbdir
e.insert(false);
let buf = DbDirectory::ser(&dbdir).context("serialize db")?;
let buf = DbDirectory::ser(&dbdir)?;
self.pending_directory_entries.push((
DirectoryKind::Db,
MetricsUpdate::Set(dbdir.dbdirs.len() as u64),
@@ -2003,27 +1975,25 @@ impl DatadirModification<'_> {
RelDirectory::default()
} else {
// reldir already exists, fetch it
RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?)
.context("deserialize db")?
RelDirectory::des(&self.get(rel_dir_key, ctx).await?)?
};

let v2_enabled = self.maybe_enable_rel_size_v2()?;
let v2_enabled = self
.maybe_enable_rel_size_v2()
.map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?;

if v2_enabled {
if rel_dir.rels.contains(&(rel.relnode, rel.forknum)) {
return Err(RelationError::AlreadyExists);
Err(WalIngestErrorKind::RelationAlreadyExists(rel))?;
}
let sparse_rel_dir_key =
rel_tag_sparse_key(rel.spcnode, rel.dbnode, rel.relnode, rel.forknum);
// check if the rel_dir_key exists in v2
let val = self
.sparse_get(sparse_rel_dir_key, ctx)
.await
.map_err(|e| RelationError::Other(e.into()))?;
let val = self.sparse_get(sparse_rel_dir_key, ctx).await?;
let val = RelDirExists::decode_option(val)
.map_err(|_| RelationError::Other(anyhow::anyhow!("invalid reldir key")))?;
.map_err(|_| WalIngestErrorKind::InvalidRelDirKey(sparse_rel_dir_key))?;
if val == RelDirExists::Exists {
return Err(RelationError::AlreadyExists);
Err(WalIngestErrorKind::RelationAlreadyExists(rel))?;
}
self.put(
sparse_rel_dir_key,
@@ -2039,9 +2009,7 @@ impl DatadirModification<'_> {
// will be key not found errors if we don't create an empty one for rel_size_v2.
self.put(
rel_dir_key,
Value::Image(Bytes::from(
RelDirectory::ser(&RelDirectory::default()).context("serialize")?,
)),
Value::Image(Bytes::from(RelDirectory::ser(&RelDirectory::default())?)),
);
}
self.pending_directory_entries
@@ -2049,7 +2017,7 @@ impl DatadirModification<'_> {
} else {
// Add the new relation to the rel directory entry, and write it back
if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
return Err(RelationError::AlreadyExists);
Err(WalIngestErrorKind::RelationAlreadyExists(rel))?;
}
if !dbdir_exists {
self.pending_directory_entries
@@ -2059,9 +2027,7 @@ impl DatadirModification<'_> {
.push((DirectoryKind::Rel, MetricsUpdate::Add(1)));
self.put(
rel_dir_key,
Value::Image(Bytes::from(
RelDirectory::ser(&rel_dir).context("serialize")?,
)),
Value::Image(Bytes::from(RelDirectory::ser(&rel_dir)?)),
);
}

@@ -2086,8 +2052,8 @@ impl DatadirModification<'_> {
rel: RelTag,
nblocks: BlockNumber,
ctx: &RequestContext,
) -> anyhow::Result<()> {
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
) -> Result<(), WalIngestError> {
ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);
if self
.tline
.get_rel_exists(rel, Version::Modified(self), ctx)
@@ -2117,8 +2083,8 @@ impl DatadirModification<'_> {
rel: RelTag,
nblocks: BlockNumber,
ctx: &RequestContext,
) -> anyhow::Result<()> {
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
) -> Result<(), WalIngestError> {
ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);

// Put size
let size_key = rel_size_to_key(rel);
@@ -2142,8 +2108,10 @@ impl DatadirModification<'_> {
&mut self,
drop_relations: HashMap<(u32, u32), Vec<RelTag>>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
let v2_enabled = self.maybe_enable_rel_size_v2()?;
) -> Result<(), WalIngestError> {
let v2_enabled = self
.maybe_enable_rel_size_v2()
.map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?;
for ((spc_node, db_node), rel_tags) in drop_relations {
let dir_key = rel_dir_to_key(spc_node, db_node);
let buf = self.get(dir_key, ctx).await?;
@@ -2163,7 +2131,7 @@ impl DatadirModification<'_> {
let key =
rel_tag_sparse_key(spc_node, db_node, rel_tag.relnode, rel_tag.forknum);
let val = RelDirExists::decode_option(self.sparse_get(key, ctx).await?)
.map_err(|_| RelationError::Other(anyhow::anyhow!("invalid reldir key")))?;
.map_err(|_| WalIngestErrorKind::InvalidKey(key, self.lsn))?;
if val == RelDirExists::Exists {
self.pending_directory_entries
.push((DirectoryKind::RelV2, MetricsUpdate::Sub(1)));
@@ -2206,7 +2174,7 @@ impl DatadirModification<'_> {
segno: u32,
nblocks: BlockNumber,
ctx: &RequestContext,
) -> anyhow::Result<()> {
) -> Result<(), WalIngestError> {
assert!(self.tline.tenant_shard_id.is_shard_zero());

// Add it to the directory entry
@@ -2215,7 +2183,7 @@ impl DatadirModification<'_> {
let mut dir = SlruSegmentDirectory::des(&buf)?;

if !dir.segments.insert(segno) {
anyhow::bail!("slru segment {kind:?}/{segno} already exists");
Err(WalIngestErrorKind::SlruAlreadyExists(kind, segno))?;
}
self.pending_directory_entries.push((
DirectoryKind::SlruSegment(kind),
@@ -2242,7 +2210,7 @@ impl DatadirModification<'_> {
kind: SlruKind,
segno: u32,
nblocks: BlockNumber,
) -> anyhow::Result<()> {
) -> Result<(), WalIngestError> {
assert!(self.tline.tenant_shard_id.is_shard_zero());

// Put size
@@ -2258,7 +2226,7 @@ impl DatadirModification<'_> {
kind: SlruKind,
segno: u32,
ctx: &RequestContext,
) -> anyhow::Result<()> {
) -> Result<(), WalIngestError> {
// Remove it from the directory entry
let dir_key = slru_dir_to_key(kind);
let buf = self.get(dir_key, ctx).await?;
@@ -2283,7 +2251,7 @@ impl DatadirModification<'_> {
}

/// Drop a relmapper file (pg_filenode.map)
pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> anyhow::Result<()> {
pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> Result<(), WalIngestError> {
// TODO
Ok(())
}
@@ -2293,7 +2261,7 @@ impl DatadirModification<'_> {
&mut self,
xid: u64,
ctx: &RequestContext,
) -> anyhow::Result<()> {
) -> Result<(), WalIngestError> {
// Remove it from the directory entry
let buf = self.get(TWOPHASEDIR_KEY, ctx).await?;
let newdirbuf = if self.tline.pg_version >= 17 {
@@ -2308,7 +2276,8 @@ impl DatadirModification<'_> {
));
Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?)
} else {
let xid: u32 = u32::try_from(xid)?;
let xid: u32 = u32::try_from(xid)
.map_err(|e| WalIngestErrorKind::LogicalError(anyhow::Error::from(e)))?;
let mut dir = TwoPhaseDirectory::des(&buf)?;

if !dir.xids.remove(&xid) {
@@ -2333,7 +2302,7 @@ impl DatadirModification<'_> {
path: &str,
content: &[u8],
ctx: &RequestContext,
) -> anyhow::Result<()> {
) -> Result<(), WalIngestError> {
let key = aux_file::encode_aux_file_key(path);
// retrieve the key from the engine
let old_val = match self.get(key, ctx).await {
@@ -2342,7 +2311,7 @@ impl DatadirModification<'_> {
Err(e) => return Err(e.into()),
};
let files: Vec<(&str, &[u8])> = if let Some(ref old_val) = old_val {
aux_file::decode_file_value(old_val)?
aux_file::decode_file_value(old_val).map_err(WalIngestErrorKind::EncodeAuxFileError)?
|
||||
} else {
|
||||
Vec::new()
|
||||
};
|
||||
@@ -2387,7 +2356,8 @@ impl DatadirModification<'_> {
|
||||
}
|
||||
(None, true) => warn!("removing non-existing aux file: {}", path),
|
||||
}
|
||||
let new_val = aux_file::encode_file_value(&new_files)?;
|
||||
let new_val = aux_file::encode_file_value(&new_files)
|
||||
.map_err(WalIngestErrorKind::EncodeAuxFileError)?;
|
||||
self.put(key, Value::Image(new_val.into()));
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -45,6 +45,7 @@ use remote_timeline_client::manifest::{
|
||||
};
|
||||
use remote_timeline_client::{
|
||||
FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD, UploadQueueNotReadyError,
|
||||
download_tenant_manifest,
|
||||
};
|
||||
use secondary::heatmap::{HeatMapTenant, HeatMapTimeline};
|
||||
use storage_broker::BrokerClientChannel;
|
||||
@@ -99,7 +100,7 @@ use crate::tenant::timeline::delete::DeleteTimelineFlow;
|
||||
use crate::tenant::timeline::uninit::cleanup_timeline_directory;
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use crate::walingest::WalLagCooldown;
|
||||
use crate::walredo::PostgresRedoManager;
|
||||
use crate::walredo::{PostgresRedoManager, RedoAttemptType};
|
||||
use crate::{InitializationOrder, TEMP_FILE_SUFFIX, import_datadir, span, task_mgr, walredo};
|
||||
|
||||
static INIT_DB_SEMAPHORE: Lazy<Semaphore> = Lazy::new(|| Semaphore::new(8));
|
||||
@@ -226,7 +227,8 @@ struct TimelinePreload {
|
||||
}
|
||||
|
||||
pub(crate) struct TenantPreload {
|
||||
tenant_manifest: TenantManifest,
|
||||
/// The tenant manifest from remote storage, or None if no manifest was found.
|
||||
tenant_manifest: Option<TenantManifest>,
|
||||
/// Map from timeline ID to a possible timeline preload. It is None iff the timeline is offloaded according to the manifest.
|
||||
timelines: HashMap<TimelineId, Option<TimelinePreload>>,
|
||||
}
|
||||
@@ -282,12 +284,15 @@ pub struct Tenant {
|
||||
/// **Lock order**: if acquiring all (or a subset), acquire them in order `timelines`, `timelines_offloaded`, `timelines_creating`
|
||||
timelines_offloaded: Mutex<HashMap<TimelineId, Arc<OffloadedTimeline>>>,
|
||||
|
||||
/// Serialize writes of the tenant manifest to remote storage. If there are concurrent operations
|
||||
/// affecting the manifest, such as timeline deletion and timeline offload, they must wait for
|
||||
/// each other (this could be optimized to coalesce writes if necessary).
|
||||
/// The last tenant manifest known to be in remote storage. None if the manifest has not yet
|
||||
/// been either downloaded or uploaded. Always Some after tenant attach.
|
||||
///
|
||||
/// The contents of the Mutex are the last manifest we successfully uploaded
|
||||
tenant_manifest_upload: tokio::sync::Mutex<Option<TenantManifest>>,
|
||||
/// Initially populated during tenant attach, updated via `maybe_upload_tenant_manifest`.
|
||||
///
|
||||
/// Do not modify this directly. It is used to check whether a new manifest needs to be
|
||||
/// uploaded. The manifest is constructed in `build_tenant_manifest`, and uploaded via
|
||||
/// `maybe_upload_tenant_manifest`.
|
||||
remote_tenant_manifest: tokio::sync::Mutex<Option<TenantManifest>>,
|
||||
|
||||
// This mutex prevents creation of new timelines during GC.
|
||||
// Adding yet another mutex (in addition to `timelines`) is needed because holding
|
||||
@@ -468,15 +473,16 @@ impl WalRedoManager {
|
||||
base_img: Option<(Lsn, bytes::Bytes)>,
|
||||
records: Vec<(Lsn, pageserver_api::record::NeonWalRecord)>,
|
||||
pg_version: u32,
|
||||
redo_attempt_type: RedoAttemptType,
|
||||
) -> Result<bytes::Bytes, walredo::Error> {
|
||||
match self {
|
||||
Self::Prod(_, mgr) => {
|
||||
mgr.request_redo(key, lsn, base_img, records, pg_version)
|
||||
mgr.request_redo(key, lsn, base_img, records, pg_version, redo_attempt_type)
|
||||
.await
|
||||
}
|
||||
#[cfg(test)]
|
||||
Self::Test(mgr) => {
|
||||
mgr.request_redo(key, lsn, base_img, records, pg_version)
|
||||
mgr.request_redo(key, lsn, base_img, records, pg_version, redo_attempt_type)
|
||||
.await
|
||||
}
|
||||
}
|
||||
@@ -915,6 +921,7 @@ enum StartCreatingTimelineResult {
|
||||
Idempotent(Arc<Timeline>),
|
||||
}
|
||||
|
||||
#[allow(clippy::large_enum_variant, reason = "TODO")]
|
||||
enum TimelineInitAndSyncResult {
|
||||
ReadyToActivate(Arc<Timeline>),
|
||||
NeedsSpawnImportPgdata(TimelineInitAndSyncNeedsSpawnImportPgdata),
|
||||
@@ -1001,6 +1008,7 @@ enum CreateTimelineCause {
|
||||
Delete,
|
||||
}
|
||||
|
||||
#[allow(clippy::large_enum_variant, reason = "TODO")]
|
||||
enum LoadTimelineCause {
|
||||
Attach,
|
||||
Unoffload,
|
||||
@@ -1354,36 +1362,41 @@ impl Tenant {
|
||||
}
|
||||
}
|
||||
|
||||
// Ideally we should use Tenant::set_broken_no_wait, but it is not supposed to be used when tenant is in loading state.
|
||||
enum BrokenVerbosity {
|
||||
Error,
|
||||
Info
|
||||
}
|
||||
let make_broken =
|
||||
|t: &Tenant, err: anyhow::Error, verbosity: BrokenVerbosity| {
|
||||
match verbosity {
|
||||
BrokenVerbosity::Info => {
|
||||
info!("attach cancelled, setting tenant state to Broken: {err}");
|
||||
},
|
||||
BrokenVerbosity::Error => {
|
||||
error!("attach failed, setting tenant state to Broken: {err:?}");
|
||||
}
|
||||
fn make_broken_or_stopping(t: &Tenant, err: anyhow::Error) {
|
||||
t.state.send_modify(|state| match state {
|
||||
// TODO: the old code alluded to DeleteTenantFlow sometimes setting
|
||||
// TenantState::Stopping before we get here, but this may be outdated.
|
||||
// Let's find out with a testing assertion. If this doesn't fire, and the
|
||||
// logs don't show this happening in production, remove the Stopping cases.
|
||||
TenantState::Stopping{..} if cfg!(any(test, feature = "testing")) => {
|
||||
panic!("unexpected TenantState::Stopping during attach")
|
||||
}
|
||||
t.state.send_modify(|state| {
|
||||
// The Stopping case is for when we have passed control on to DeleteTenantFlow:
|
||||
// if it errors, we will call make_broken when tenant is already in Stopping.
|
||||
assert!(
|
||||
matches!(*state, TenantState::Attaching | TenantState::Stopping { .. }),
|
||||
"the attach task owns the tenant state until activation is complete"
|
||||
);
|
||||
|
||||
*state = TenantState::broken_from_reason(err.to_string());
|
||||
});
|
||||
};
|
||||
// If the tenant is cancelled, assume the error was caused by cancellation.
|
||||
TenantState::Attaching if t.cancel.is_cancelled() => {
|
||||
info!("attach cancelled, setting tenant state to Stopping: {err}");
|
||||
// NB: progress None tells `set_stopping` that attach has cancelled.
|
||||
*state = TenantState::Stopping { progress: None };
|
||||
}
|
||||
// According to the old code, DeleteTenantFlow may already have set this to
|
||||
// Stopping. Retain its progress.
|
||||
// TODO: there is no DeleteTenantFlow. Is this still needed? See above.
|
||||
TenantState::Stopping { progress } if t.cancel.is_cancelled() => {
|
||||
assert!(progress.is_some(), "concurrent attach cancellation");
|
||||
info!("attach cancelled, already Stopping: {err}");
|
||||
}
|
||||
// Mark the tenant as broken.
|
||||
TenantState::Attaching | TenantState::Stopping { .. } => {
|
||||
error!("attach failed, setting tenant state to Broken (was {state}): {err:?}");
|
||||
*state = TenantState::broken_from_reason(err.to_string())
|
||||
}
|
||||
// The attach task owns the tenant state until activated.
|
||||
state => panic!("invalid tenant state {state} during attach: {err:?}"),
|
||||
});
|
||||
}
|
||||
|
||||
// TODO: should also be rejecting tenant conf changes that violate this check.
|
||||
if let Err(e) = crate::tenant::storage_layer::inmemory_layer::IndexEntry::validate_checkpoint_distance(tenant_clone.get_checkpoint_distance()) {
|
||||
make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error);
|
||||
make_broken_or_stopping(&tenant_clone, anyhow::anyhow!(e));
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
@@ -1435,10 +1448,8 @@ impl Tenant {
|
||||
// stayed in Activating for such a long time that shutdown found it in
|
||||
// that state.
|
||||
tracing::info!(state=%tenant_clone.current_state(), "Tenant shut down before activation");
|
||||
// Make the tenant broken so that set_stopping will not hang waiting for it to leave
|
||||
// the Attaching state. This is an over-reaction (nothing really broke, the tenant is
|
||||
// just shutting down), but ensures progress.
|
||||
make_broken(&tenant_clone, anyhow::anyhow!("Shut down while Attaching"), BrokenVerbosity::Info);
|
||||
// Set the tenant to Stopping to signal `set_stopping` that we're done.
|
||||
make_broken_or_stopping(&tenant_clone, anyhow::anyhow!("Shut down while Attaching"));
|
||||
return Ok(());
|
||||
},
|
||||
)
|
||||
@@ -1457,7 +1468,7 @@ impl Tenant {
|
||||
match res {
|
||||
Ok(p) => Some(p),
|
||||
Err(e) => {
|
||||
make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error);
|
||||
make_broken_or_stopping(&tenant_clone, anyhow::anyhow!(e));
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
@@ -1483,9 +1494,7 @@ impl Tenant {
|
||||
info!("attach finished, activating");
|
||||
tenant_clone.activate(broker_client, None, &ctx);
|
||||
}
|
||||
Err(e) => {
|
||||
make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error);
|
||||
}
|
||||
Err(e) => make_broken_or_stopping(&tenant_clone, anyhow::anyhow!(e)),
|
||||
}
|
||||
|
||||
// If we are doing an opportunistic warmup attachment at startup, initialize
|
||||
@@ -1525,28 +1534,27 @@ impl Tenant {
|
||||
cancel.clone(),
|
||||
)
|
||||
.await?;
|
||||
let (offloaded_add, tenant_manifest) =
|
||||
match remote_timeline_client::download_tenant_manifest(
|
||||
remote_storage,
|
||||
&self.tenant_shard_id,
|
||||
self.generation,
|
||||
&cancel,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok((tenant_manifest, _generation, _manifest_mtime)) => (
|
||||
format!("{} offloaded", tenant_manifest.offloaded_timelines.len()),
|
||||
tenant_manifest,
|
||||
),
|
||||
Err(DownloadError::NotFound) => {
|
||||
("no manifest".to_string(), TenantManifest::empty())
|
||||
}
|
||||
Err(e) => Err(e)?,
|
||||
};
|
||||
|
||||
let tenant_manifest = match download_tenant_manifest(
|
||||
remote_storage,
|
||||
&self.tenant_shard_id,
|
||||
self.generation,
|
||||
&cancel,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok((tenant_manifest, _, _)) => Some(tenant_manifest),
|
||||
Err(DownloadError::NotFound) => None,
|
||||
Err(err) => return Err(err.into()),
|
||||
};
|
||||
|
||||
info!(
|
||||
"found {} timelines, and {offloaded_add}",
|
||||
remote_timeline_ids.len()
|
||||
"found {} timelines ({} offloaded timelines)",
|
||||
remote_timeline_ids.len(),
|
||||
tenant_manifest
|
||||
.as_ref()
|
||||
.map(|m| m.offloaded_timelines.len())
|
||||
.unwrap_or(0)
|
||||
);
|
||||
|
||||
for k in other_keys {
|
||||
@@ -1555,11 +1563,13 @@ impl Tenant {
|
||||
|
||||
// Avoid downloading IndexPart of offloaded timelines.
|
||||
let mut offloaded_with_prefix = HashSet::new();
|
||||
for offloaded in tenant_manifest.offloaded_timelines.iter() {
|
||||
if remote_timeline_ids.remove(&offloaded.timeline_id) {
|
||||
offloaded_with_prefix.insert(offloaded.timeline_id);
|
||||
} else {
|
||||
// We'll take care later of timelines in the manifest without a prefix
|
||||
if let Some(tenant_manifest) = &tenant_manifest {
|
||||
for offloaded in tenant_manifest.offloaded_timelines.iter() {
|
||||
if remote_timeline_ids.remove(&offloaded.timeline_id) {
|
||||
offloaded_with_prefix.insert(offloaded.timeline_id);
|
||||
} else {
|
||||
// We'll take care later of timelines in the manifest without a prefix
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1633,12 +1643,14 @@ impl Tenant {
|
||||
|
||||
let mut offloaded_timeline_ids = HashSet::new();
|
||||
let mut offloaded_timelines_list = Vec::new();
|
||||
for timeline_manifest in preload.tenant_manifest.offloaded_timelines.iter() {
|
||||
let timeline_id = timeline_manifest.timeline_id;
|
||||
let offloaded_timeline =
|
||||
OffloadedTimeline::from_manifest(self.tenant_shard_id, timeline_manifest);
|
||||
offloaded_timelines_list.push((timeline_id, Arc::new(offloaded_timeline)));
|
||||
offloaded_timeline_ids.insert(timeline_id);
|
||||
if let Some(tenant_manifest) = &preload.tenant_manifest {
|
||||
for timeline_manifest in tenant_manifest.offloaded_timelines.iter() {
|
||||
let timeline_id = timeline_manifest.timeline_id;
|
||||
let offloaded_timeline =
|
||||
OffloadedTimeline::from_manifest(self.tenant_shard_id, timeline_manifest);
|
||||
offloaded_timelines_list.push((timeline_id, Arc::new(offloaded_timeline)));
|
||||
offloaded_timeline_ids.insert(timeline_id);
|
||||
}
|
||||
}
|
||||
// Complete deletions for offloaded timeline id's from manifest.
|
||||
// The manifest will be uploaded later in this function.
|
||||
@@ -1796,15 +1808,21 @@ impl Tenant {
|
||||
.context("resume_deletion")
|
||||
.map_err(LoadLocalTimelineError::ResumeDeletion)?;
|
||||
}
|
||||
let needs_manifest_upload =
|
||||
offloaded_timelines_list.len() != preload.tenant_manifest.offloaded_timelines.len();
|
||||
{
|
||||
let mut offloaded_timelines_accessor = self.timelines_offloaded.lock().unwrap();
|
||||
offloaded_timelines_accessor.extend(offloaded_timelines_list.into_iter());
|
||||
}
|
||||
if needs_manifest_upload {
|
||||
self.store_tenant_manifest().await?;
|
||||
|
||||
// Stash the preloaded tenant manifest, and upload a new manifest if changed.
|
||||
//
|
||||
// NB: this must happen after the tenant is fully populated above. In particular the
|
||||
// offloaded timelines, which are included in the manifest.
|
||||
{
|
||||
let mut guard = self.remote_tenant_manifest.lock().await;
|
||||
assert!(guard.is_none(), "tenant manifest set before preload"); // first populated here
|
||||
*guard = preload.tenant_manifest;
|
||||
}
|
||||
self.maybe_upload_tenant_manifest().await?;
|
||||
|
||||
// The local filesystem contents are a cache of what's in the remote IndexPart;
|
||||
// IndexPart is the source of truth.
|
||||
@@ -2218,7 +2236,7 @@ impl Tenant {
|
||||
};
|
||||
|
||||
// Upload new list of offloaded timelines to S3
|
||||
self.store_tenant_manifest().await?;
|
||||
self.maybe_upload_tenant_manifest().await?;
|
||||
|
||||
// Activate the timeline (if it makes sense)
|
||||
if !(timeline.is_broken() || timeline.is_stopping()) {
|
||||
@@ -3429,7 +3447,7 @@ impl Tenant {
|
||||
shutdown_mode
|
||||
};
|
||||
|
||||
match self.set_stopping(shutdown_progress, false, false).await {
|
||||
match self.set_stopping(shutdown_progress).await {
|
||||
Ok(()) => {}
|
||||
Err(SetStoppingError::Broken) => {
|
||||
// assume that this is acceptable
|
||||
@@ -3509,25 +3527,13 @@ impl Tenant {
|
||||
/// This function waits for the tenant to become active if it isn't already, before transitioning it into Stopping state.
|
||||
///
|
||||
/// This function is not cancel-safe!
|
||||
///
|
||||
/// `allow_transition_from_loading` is needed for the special case of loading task deleting the tenant.
|
||||
/// `allow_transition_from_attaching` is needed for the special case of attaching deleted tenant.
|
||||
async fn set_stopping(
|
||||
&self,
|
||||
progress: completion::Barrier,
|
||||
_allow_transition_from_loading: bool,
|
||||
allow_transition_from_attaching: bool,
|
||||
) -> Result<(), SetStoppingError> {
|
||||
async fn set_stopping(&self, progress: completion::Barrier) -> Result<(), SetStoppingError> {
|
||||
let mut rx = self.state.subscribe();
|
||||
|
||||
// cannot stop before we're done activating, so wait out until we're done activating
|
||||
rx.wait_for(|state| match state {
|
||||
TenantState::Attaching if allow_transition_from_attaching => true,
|
||||
TenantState::Activating(_) | TenantState::Attaching => {
|
||||
info!(
|
||||
"waiting for {} to turn Active|Broken|Stopping",
|
||||
<&'static str>::from(state)
|
||||
);
|
||||
info!("waiting for {state} to turn Active|Broken|Stopping");
|
||||
false
|
||||
}
|
||||
TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping { .. } => true,
|
||||
@@ -3538,25 +3544,24 @@ impl Tenant {
|
||||
// we now know we're done activating, let's see whether this task is the winner to transition into Stopping
|
||||
let mut err = None;
|
||||
let stopping = self.state.send_if_modified(|current_state| match current_state {
|
||||
TenantState::Activating(_) => {
|
||||
unreachable!("1we ensured above that we're done with activation, and, there is no re-activation")
|
||||
}
|
||||
TenantState::Attaching => {
|
||||
if !allow_transition_from_attaching {
|
||||
unreachable!("2we ensured above that we're done with activation, and, there is no re-activation")
|
||||
};
|
||||
*current_state = TenantState::Stopping { progress };
|
||||
true
|
||||
TenantState::Activating(_) | TenantState::Attaching => {
|
||||
unreachable!("we ensured above that we're done with activation, and, there is no re-activation")
|
||||
}
|
||||
TenantState::Active => {
|
||||
// FIXME: due to time-of-check vs time-of-use issues, it can happen that new timelines
|
||||
// are created after the transition to Stopping. That's harmless, as the Timelines
|
||||
// won't be accessible to anyone afterwards, because the Tenant is in Stopping state.
|
||||
*current_state = TenantState::Stopping { progress };
|
||||
*current_state = TenantState::Stopping { progress: Some(progress) };
|
||||
// Continue stopping outside the closure. We need to grab timelines.lock()
|
||||
// and we plan to turn it into a tokio::sync::Mutex in a future patch.
|
||||
true
|
||||
}
|
||||
TenantState::Stopping { progress: None } => {
|
||||
// An attach was cancelled, and the attach transitioned the tenant from Attaching to
|
||||
// Stopping(None) to let us know it exited. Register our progress and continue.
|
||||
*current_state = TenantState::Stopping { progress: Some(progress) };
|
||||
true
|
||||
}
|
||||
TenantState::Broken { reason, .. } => {
|
||||
info!(
|
||||
"Cannot set tenant to Stopping state, it is in Broken state due to: {reason}"
|
||||
@@ -3564,7 +3569,7 @@ impl Tenant {
|
||||
err = Some(SetStoppingError::Broken);
|
||||
false
|
||||
}
|
||||
TenantState::Stopping { progress } => {
|
||||
TenantState::Stopping { progress: Some(progress) } => {
|
||||
info!("Tenant is already in Stopping state");
|
||||
err = Some(SetStoppingError::AlreadyStopping(progress.clone()));
|
||||
false
|
||||
@@ -4065,18 +4070,20 @@ impl Tenant {
|
||||
|
||||
/// Generate an up-to-date TenantManifest based on the state of this Tenant.
|
||||
fn build_tenant_manifest(&self) -> TenantManifest {
|
||||
let timelines_offloaded = self.timelines_offloaded.lock().unwrap();
|
||||
|
||||
let mut timeline_manifests = timelines_offloaded
|
||||
.iter()
|
||||
.map(|(_timeline_id, offloaded)| offloaded.manifest())
|
||||
.collect::<Vec<_>>();
|
||||
// Sort the manifests so that our output is deterministic
|
||||
timeline_manifests.sort_by_key(|timeline_manifest| timeline_manifest.timeline_id);
|
||||
// Collect the offloaded timelines, and sort them for deterministic output.
|
||||
let offloaded_timelines = self
|
||||
.timelines_offloaded
|
||||
.lock()
|
||||
.unwrap()
|
||||
.values()
|
||||
.map(|tli| tli.manifest())
|
||||
.sorted_by_key(|m| m.timeline_id)
|
||||
.collect_vec();
|
||||
|
||||
TenantManifest {
|
||||
version: LATEST_TENANT_MANIFEST_VERSION,
|
||||
offloaded_timelines: timeline_manifests,
|
||||
stripe_size: Some(self.get_shard_stripe_size()),
|
||||
offloaded_timelines,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4299,7 +4306,7 @@ impl Tenant {
|
||||
timelines: Mutex::new(HashMap::new()),
|
||||
timelines_creating: Mutex::new(HashSet::new()),
|
||||
timelines_offloaded: Mutex::new(HashMap::new()),
|
||||
tenant_manifest_upload: Default::default(),
|
||||
remote_tenant_manifest: Default::default(),
|
||||
gc_cs: tokio::sync::Mutex::new(()),
|
||||
walredo_mgr,
|
||||
remote_storage,
|
||||
@@ -4395,10 +4402,7 @@ impl Tenant {
|
||||
.to_string();
|
||||
|
||||
fail::fail_point!("tenant-config-before-write", |_| {
|
||||
Err(std::io::Error::new(
|
||||
std::io::ErrorKind::Other,
|
||||
"tenant-config-before-write",
|
||||
))
|
||||
Err(std::io::Error::other("tenant-config-before-write"))
|
||||
});
|
||||
|
||||
// Convert the config to a toml file.
|
||||
@@ -5532,27 +5536,35 @@ impl Tenant {
|
||||
.unwrap_or(0)
|
||||
}
|
||||
|
||||
/// Serialize and write the latest TenantManifest to remote storage.
|
||||
pub(crate) async fn store_tenant_manifest(&self) -> Result<(), TenantManifestError> {
|
||||
// Only one manifest write may be done at at time, and the contents of the manifest
|
||||
// must be loaded while holding this lock. This makes it safe to call this function
|
||||
// from anywhere without worrying about colliding updates.
|
||||
/// Builds a new tenant manifest, and uploads it if it differs from the last-known tenant
|
||||
/// manifest in `Self::remote_tenant_manifest`.
|
||||
///
|
||||
/// TODO: instead of requiring callers to remember to call `maybe_upload_tenant_manifest` after
|
||||
/// changing any `Tenant` state that's included in the manifest, consider making the manifest
|
||||
/// the authoritative source of data with an API that automatically uploads on changes. Revisit
|
||||
/// this when the manifest is more widely used and we have a better idea of the data model.
|
||||
pub(crate) async fn maybe_upload_tenant_manifest(&self) -> Result<(), TenantManifestError> {
|
||||
// Multiple tasks may call this function concurrently after mutating the Tenant runtime
|
||||
// state, affecting the manifest generated by `build_tenant_manifest`. We use an async mutex
|
||||
// to serialize these callers. `eq_ignoring_version` acts as a slightly inefficient but
|
||||
// simple coalescing mechanism.
|
||||
let mut guard = tokio::select! {
|
||||
g = self.tenant_manifest_upload.lock() => {
|
||||
g
|
||||
},
|
||||
_ = self.cancel.cancelled() => {
|
||||
return Err(TenantManifestError::Cancelled);
|
||||
}
|
||||
guard = self.remote_tenant_manifest.lock() => guard,
|
||||
_ = self.cancel.cancelled() => return Err(TenantManifestError::Cancelled),
|
||||
};
|
||||
|
||||
// Build a new manifest.
|
||||
let manifest = self.build_tenant_manifest();
|
||||
if Some(&manifest) == (*guard).as_ref() {
|
||||
// Optimisation: skip uploads that don't change anything.
|
||||
return Ok(());
|
||||
|
||||
// Check if the manifest has changed. We ignore the version number here, to avoid
|
||||
// uploading every manifest on version number bumps.
|
||||
if let Some(old) = guard.as_ref() {
|
||||
if manifest.eq_ignoring_version(old) {
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
|
||||
// Remote storage does no retries internally, so wrap it
|
||||
// Upload the manifest. Remote storage does no retries internally, so retry here.
|
||||
match backoff::retry(
|
||||
|| async {
|
||||
upload_tenant_manifest(
|
||||
@@ -5564,7 +5576,7 @@ impl Tenant {
|
||||
)
|
||||
.await
|
||||
},
|
||||
|_e| self.cancel.is_cancelled(),
|
||||
|_| self.cancel.is_cancelled(),
|
||||
FAILED_UPLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"uploading tenant manifest",
|
||||
@@ -5868,6 +5880,7 @@ pub(crate) mod harness {
|
||||
base_img: Option<(Lsn, Bytes)>,
|
||||
records: Vec<(Lsn, NeonWalRecord)>,
|
||||
_pg_version: u32,
|
||||
_redo_attempt_type: RedoAttemptType,
|
||||
) -> Result<Bytes, walredo::Error> {
|
||||
let records_neon = records.iter().all(|r| apply_neon::can_apply_in_neon(&r.1));
|
||||
if records_neon {
|
||||
@@ -5935,7 +5948,7 @@ mod tests {
|
||||
use timeline::InMemoryLayerTestDesc;
|
||||
#[cfg(feature = "testing")]
|
||||
use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn};
|
||||
use timeline::{CompactOptions, DeltaLayerTestDesc};
|
||||
use timeline::{CompactOptions, DeltaLayerTestDesc, VersionedKeySpaceQuery};
|
||||
use utils::id::TenantId;
|
||||
|
||||
use super::*;
|
||||
@@ -6773,10 +6786,11 @@ mod tests {
|
||||
for read in reads {
|
||||
info!("Doing vectored read on {:?}", read);
|
||||
|
||||
let query = VersionedKeySpaceQuery::uniform(read.clone(), reads_lsn);
|
||||
|
||||
let vectored_res = tline
|
||||
.get_vectored_impl(
|
||||
read.clone(),
|
||||
reads_lsn,
|
||||
query,
|
||||
&mut ValuesReconstructState::new(io_concurrency.clone()),
|
||||
&ctx,
|
||||
)
|
||||
@@ -6855,10 +6869,11 @@ mod tests {
|
||||
};
|
||||
let read_lsn = child_timeline.get_last_record_lsn();
|
||||
|
||||
let query = VersionedKeySpaceQuery::uniform(aux_keyspace.clone(), read_lsn);
|
||||
|
||||
let vectored_res = child_timeline
|
||||
.get_vectored_impl(
|
||||
aux_keyspace.clone(),
|
||||
read_lsn,
|
||||
query,
|
||||
&mut ValuesReconstructState::new(io_concurrency.clone()),
|
||||
&ctx,
|
||||
)
|
||||
@@ -7004,10 +7019,12 @@ mod tests {
|
||||
let read = KeySpace {
|
||||
ranges: vec![key_near_gap..gap_at_key.next(), key_near_end..current_key],
|
||||
};
|
||||
|
||||
let query = VersionedKeySpaceQuery::uniform(read.clone(), current_lsn);
|
||||
|
||||
let results = child_timeline
|
||||
.get_vectored_impl(
|
||||
read.clone(),
|
||||
current_lsn,
|
||||
query,
|
||||
&mut ValuesReconstructState::new(io_concurrency.clone()),
|
||||
&ctx,
|
||||
)
|
||||
@@ -7138,12 +7155,16 @@ mod tests {
|
||||
}
|
||||
|
||||
for query_lsn in query_lsns {
|
||||
let query = VersionedKeySpaceQuery::uniform(
|
||||
KeySpace {
|
||||
ranges: vec![child_gap_at_key..child_gap_at_key.next()],
|
||||
},
|
||||
query_lsn,
|
||||
);
|
||||
|
||||
let results = child_timeline
|
||||
.get_vectored_impl(
|
||||
KeySpace {
|
||||
ranges: vec![child_gap_at_key..child_gap_at_key.next()],
|
||||
},
|
||||
query_lsn,
|
||||
query,
|
||||
&mut ValuesReconstructState::new(io_concurrency.clone()),
|
||||
&ctx,
|
||||
)
|
||||
@@ -7642,10 +7663,11 @@ mod tests {
|
||||
}
|
||||
|
||||
let mut cnt = 0;
|
||||
let query = VersionedKeySpaceQuery::uniform(keyspace.clone(), lsn);
|
||||
|
||||
for (key, value) in tline
|
||||
.get_vectored_impl(
|
||||
keyspace.clone(),
|
||||
lsn,
|
||||
query,
|
||||
&mut ValuesReconstructState::new(io_concurrency.clone()),
|
||||
&ctx,
|
||||
)
|
||||
@@ -7852,8 +7874,9 @@ mod tests {
|
||||
io_concurrency: IoConcurrency,
|
||||
) -> anyhow::Result<(BTreeMap<Key, Result<Bytes, PageReconstructError>>, usize)> {
|
||||
let mut reconstruct_state = ValuesReconstructState::new(io_concurrency);
|
||||
let query = VersionedKeySpaceQuery::uniform(keyspace.clone(), lsn);
|
||||
let res = tline
|
||||
.get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx)
|
||||
.get_vectored_impl(query, &mut reconstruct_state, ctx)
|
||||
.await?;
|
||||
Ok((res, reconstruct_state.get_delta_layers_visited() as usize))
|
||||
}
|
||||
@@ -8150,13 +8173,10 @@ mod tests {
|
||||
|
||||
// test vectored scan on parent timeline
|
||||
let mut reconstruct_state = ValuesReconstructState::new(io_concurrency.clone());
|
||||
let query =
|
||||
VersionedKeySpaceQuery::uniform(KeySpace::single(Key::metadata_key_range()), lsn);
|
||||
let res = tline
|
||||
.get_vectored_impl(
|
||||
KeySpace::single(Key::metadata_key_range()),
|
||||
lsn,
|
||||
&mut reconstruct_state,
|
||||
&ctx,
|
||||
)
|
||||
.get_vectored_impl(query, &mut reconstruct_state, &ctx)
|
||||
.await?;
|
||||
|
||||
assert_eq!(
|
||||
@@ -8176,13 +8196,10 @@ mod tests {
|
||||
|
||||
// test vectored scan on child timeline
|
||||
let mut reconstruct_state = ValuesReconstructState::new(io_concurrency.clone());
|
||||
let query =
|
||||
VersionedKeySpaceQuery::uniform(KeySpace::single(Key::metadata_key_range()), lsn);
|
||||
let res = child
|
||||
.get_vectored_impl(
|
||||
KeySpace::single(Key::metadata_key_range()),
|
||||
lsn,
|
||||
&mut reconstruct_state,
|
||||
&ctx,
|
||||
)
|
||||
.get_vectored_impl(query, &mut reconstruct_state, &ctx)
|
||||
.await?;
|
||||
|
||||
assert_eq!(
|
||||
@@ -8216,13 +8233,9 @@ mod tests {
|
||||
let io_concurrency =
|
||||
IoConcurrency::spawn_from_conf(tline.conf, tline.gate.enter().unwrap());
|
||||
let mut reconstruct_state = ValuesReconstructState::new(io_concurrency);
|
||||
let query = VersionedKeySpaceQuery::uniform(KeySpace::single(key..key.next()), lsn);
|
||||
let mut res = tline
|
||||
.get_vectored_impl(
|
||||
KeySpace::single(key..key.next()),
|
||||
lsn,
|
||||
&mut reconstruct_state,
|
||||
ctx,
|
||||
)
|
||||
.get_vectored_impl(query, &mut reconstruct_state, ctx)
|
||||
.await?;
|
||||
Ok(res.pop_last().map(|(k, v)| {
|
||||
assert_eq!(k, key);
|
||||
@@ -8722,6 +8735,21 @@ mod tests {
|
||||
Lsn(0x20),
|
||||
Value::WalRecord(NeonWalRecord::wal_init("i")),
|
||||
),
|
||||
(
|
||||
get_key(4),
|
||||
Lsn(0x30),
|
||||
Value::WalRecord(NeonWalRecord::wal_append_conditional("j", "i")),
|
||||
),
|
||||
(
|
||||
get_key(5),
|
||||
Lsn(0x20),
|
||||
Value::WalRecord(NeonWalRecord::wal_init("1")),
|
||||
),
|
||||
(
|
||||
get_key(5),
|
||||
Lsn(0x30),
|
||||
Value::WalRecord(NeonWalRecord::wal_append_conditional("j", "2")),
|
||||
),
|
||||
];
|
||||
let image1 = vec![(get_key(1), "0x10".into())];
|
||||
|
||||
@@ -8752,8 +8780,18 @@ mod tests {
|
||||
|
||||
// Need to remove the limit of "Neon WAL redo requires base image".
|
||||
|
||||
// assert_eq!(tline.get(get_key(3), Lsn(0x50), &ctx).await?, Bytes::new());
|
||||
// assert_eq!(tline.get(get_key(4), Lsn(0x50), &ctx).await?, Bytes::new());
|
||||
assert_eq!(
|
||||
tline.get(get_key(3), Lsn(0x50), &ctx).await?,
|
||||
Bytes::from_static(b"c")
|
||||
);
|
||||
assert_eq!(
|
||||
tline.get(get_key(4), Lsn(0x50), &ctx).await?,
|
||||
Bytes::from_static(b"ij")
|
||||
);
|
||||
|
||||
// Manual testing required: currently, read errors will panic the process in debug mode. So we
|
||||
// cannot enable this assertion in the unit test.
|
||||
// assert!(tline.get(get_key(5), Lsn(0x50), &ctx).await.is_err());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -9219,6 +9257,7 @@ mod tests {
|
||||
&[Lsn(0x20), Lsn(0x40), Lsn(0x50)],
|
||||
3,
|
||||
None,
|
||||
true,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
@@ -9343,7 +9382,15 @@ mod tests {
|
||||
),
|
||||
];
|
||||
let res = tline
|
||||
.generate_key_retention(key, &history, Lsn(0x60), &[Lsn(0x40), Lsn(0x50)], 3, None)
|
||||
.generate_key_retention(
|
||||
key,
|
||||
&history,
|
||||
Lsn(0x60),
|
||||
&[Lsn(0x40), Lsn(0x50)],
|
||||
3,
|
||||
None,
|
||||
true,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let expected_res = KeyHistoryRetention {
|
||||
@@ -9422,6 +9469,7 @@ mod tests {
|
||||
&[],
|
||||
3,
|
||||
Some((key, Lsn(0x10), Bytes::copy_from_slice(b"0x10"))),
|
||||
true,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
@@ -9470,6 +9518,7 @@ mod tests {
|
||||
&[Lsn(0x30)],
|
||||
3,
|
||||
Some((key, Lsn(0x10), Bytes::copy_from_slice(b"0x10"))),
|
||||
true,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
@@ -10320,14 +10369,13 @@ mod tests {
|
||||
)
|
||||
.await?;
|
||||
|
||||
let keyspace = KeySpace::single(get_key(0)..get_key(10));
|
||||
let query = VersionedKeySpaceQuery::uniform(
|
||||
KeySpace::single(get_key(0)..get_key(10)),
|
||||
delta_layer_end_lsn,
|
||||
);
|
||||
|
||||
let results = tline
|
||||
.get_vectored(
|
||||
keyspace,
|
||||
delta_layer_end_lsn,
|
||||
IoConcurrency::sequential(),
|
||||
&ctx,
|
||||
)
|
||||
.get_vectored(query, IoConcurrency::sequential(), &ctx)
|
||||
.await
|
||||
.expect("No vectored errors");
|
||||
for (key, res) in results {
|
||||
@@ -10475,9 +10523,13 @@ mod tests {
|
||||
)
|
||||
.await?;
|
||||
|
||||
let keyspace = KeySpace::single(get_key(0)..get_key(10));
|
||||
let query = VersionedKeySpaceQuery::uniform(
|
||||
KeySpace::single(get_key(0)..get_key(10)),
|
||||
last_record_lsn,
|
||||
);
|
||||
|
||||
let results = tline
|
||||
.get_vectored(keyspace, last_record_lsn, IoConcurrency::sequential(), &ctx)
|
||||
.get_vectored(query, IoConcurrency::sequential(), &ctx)
|
||||
.await
|
||||
.expect("No vectored errors");
|
||||
for (key, res) in results {
|
||||
@@ -11533,6 +11585,99 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
#[tokio::test]
|
||||
async fn test_bottom_most_compation_redo_failure() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_bottom_most_compation_redo_failure").await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
fn get_key(id: u32) -> Key {
|
||||
// using aux key here b/c they are guaranteed to be inside `collect_keyspace`.
|
||||
let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap();
|
||||
key.field6 = id;
|
||||
key
|
||||
}
|
||||
|
||||
let img_layer = (0..10)
|
||||
.map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
|
||||
.collect_vec();
|
||||
|
||||
let delta1 = vec![
|
||||
(
|
||||
get_key(1),
|
||||
Lsn(0x20),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
|
||||
),
|
||||
(
|
||||
get_key(1),
|
||||
Lsn(0x24),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x24")),
|
||||
),
|
||||
(
|
||||
get_key(1),
|
||||
Lsn(0x28),
|
||||
// This record will fail to redo
|
||||
Value::WalRecord(NeonWalRecord::wal_append_conditional("@0x28", "???")),
|
||||
),
|
||||
];
|
||||
|
||||
let tline = tenant
|
||||
.create_test_timeline_with_layers(
|
||||
TIMELINE_ID,
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
vec![], // in-memory layers
|
||||
vec![DeltaLayerTestDesc::new_with_inferred_key_range(
|
||||
Lsn(0x20)..Lsn(0x30),
|
||||
delta1,
|
||||
)], // delta layers
|
||||
vec![(Lsn(0x10), img_layer)], // image layers
|
||||
Lsn(0x50),
|
||||
)
|
||||
.await?;
|
||||
{
|
||||
tline
|
||||
.applied_gc_cutoff_lsn
|
||||
.lock_for_write()
|
||||
.store_and_unlock(Lsn(0x30))
|
||||
.wait()
|
||||
.await;
|
||||
// Update GC info
|
||||
let mut guard = tline.gc_info.write().unwrap();
|
||||
*guard = GcInfo {
|
||||
retain_lsns: vec![],
|
||||
cutoffs: GcCutoffs {
|
||||
time: Lsn(0x30),
|
||||
space: Lsn(0x30),
|
||||
},
|
||||
leases: Default::default(),
|
||||
within_ancestor_pitr: false,
|
||||
};
|
||||
}
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
// Compaction will fail, but should not fire any critical error.
|
||||
// Gc-compaction currently cannot figure out what keys are not in the keyspace during the compaction
|
||||
// process. It will always try to redo the logs it reads and if it doesn't work, fail the entire
|
||||
// compaction job. Tracked in <https://github.com/neondatabase/neon/issues/10395>.
|
||||
let res = tline
|
||||
.compact_with_gc(
|
||||
&cancel,
|
||||
CompactOptions {
|
||||
compact_key_range: None,
|
||||
compact_lsn_range: None,
|
||||
..Default::default()
|
||||
},
|
||||
&ctx,
|
||||
)
|
||||
.await;
|
||||
assert!(res.is_err());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
#[tokio::test]
|
||||
async fn test_synthetic_size_calculation_with_invisible_branches() -> anyhow::Result<()> {
|
||||
|
||||
@@ -15,7 +15,7 @@
//! len >= 128: 1CCCXXXX XXXXXXXX XXXXXXXX XXXXXXXX
//!
use std::cmp::min;
use std::io::{Error, ErrorKind};
use std::io::Error;

use async_compression::Level;
use bytes::{BufMut, BytesMut};
@@ -331,10 +331,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
return (
(
io_buf.slice_len(),
Err(Error::new(
ErrorKind::Other,
format!("blob too large ({len} bytes)"),
)),
Err(Error::other(format!("blob too large ({len} bytes)"))),
),
srcbuf,
);

@@ -216,12 +216,8 @@ impl<'a> FileBlockReader<'a> {
match cache
.read_immutable_buf(self.file_id, blknum, ctx)
.await
.map_err(|e| {
std::io::Error::new(
std::io::ErrorKind::Other,
format!("Failed to read immutable buf: {e:#}"),
)
})? {
.map_err(|e| std::io::Error::other(format!("Failed to read immutable buf: {e:#}")))?
{
ReadBufResult::Found(guard) => Ok(guard.into()),
ReadBufResult::NotFound(write_guard) => {
// Read the page from disk into the buffer

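Note (added commentary, not part of the diff): the two hunks above swap `std::io::Error::new(std::io::ErrorKind::Other, ...)` for the shorter `std::io::Error::other(...)`, stabilized in Rust 1.74. A minimal standalone sketch of the equivalence, using only the standard library:

use std::io::{Error, ErrorKind};

fn main() {
    // Error::other(e) is shorthand for Error::new(ErrorKind::Other, e).
    let long_form = Error::new(ErrorKind::Other, "blob too large (42 bytes)");
    let short_form = Error::other("blob too large (42 bytes)");
    assert_eq!(long_form.kind(), short_form.kind()); // both ErrorKind::Other
    assert_eq!(long_form.to_string(), short_form.to_string());
}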
@@ -1,21 +1,33 @@
use chrono::NaiveDateTime;
use pageserver_api::shard::ShardStripeSize;
use serde::{Deserialize, Serialize};
use utils::id::TimelineId;
use utils::lsn::Lsn;

/// Tenant-shard scoped manifest
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq)]
/// Tenant shard manifest, stored in remote storage. Contains offloaded timelines and other tenant
/// shard-wide information that must be persisted in remote storage.
///
/// The manifest is always updated on tenant attach, and as needed.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
pub struct TenantManifest {
/// Debugging aid describing the version of this manifest.
/// Can also be used for distinguishing breaking changes later on.
/// The manifest version. Incremented on manifest format changes, even non-breaking ones.
/// Manifests must generally always be backwards and forwards compatible for one release, to
/// allow release rollbacks.
pub version: usize,

/// This tenant's stripe size. This is only advisory, and used to recover tenant data from
/// remote storage. The authoritative source is the storage controller. If None, assume the
/// original default value of 32768 blocks (256 MB).
#[serde(skip_serializing_if = "Option::is_none")]
pub stripe_size: Option<ShardStripeSize>,

/// The list of offloaded timelines together with enough information
/// to not have to actually load them.
///
/// Note: the timelines mentioned in this list might be deleted, i.e.
/// we don't hold an invariant that the references aren't dangling.
/// Existence of index-part.json is the actual indicator of timeline existence.
#[serde(default)]
pub offloaded_timelines: Vec<OffloadedTimelineManifest>,
}

@@ -24,7 +36,7 @@ pub struct TenantManifest {
/// Very similar to [`pageserver_api::models::OffloadedTimelineInfo`],
/// but the two data structures serve different needs, this is for a persistent disk format
/// that must be backwards compatible, while the other is only for informative purposes.
#[derive(Clone, Serialize, Deserialize, Copy, PartialEq, Eq)]
#[derive(Clone, Debug, Serialize, Deserialize, Copy, PartialEq, Eq)]
pub struct OffloadedTimelineManifest {
pub timeline_id: TimelineId,
/// Whether the timeline has a parent it has been branched off from or not
@@ -35,20 +47,166 @@ pub struct OffloadedTimelineManifest {
pub archived_at: NaiveDateTime,
}

pub const LATEST_TENANT_MANIFEST_VERSION: usize = 1;
/// The newest manifest version. This should be incremented on changes, even non-breaking ones. We
/// do not use deny_unknown_fields, so new fields are not breaking.
///
/// 1: initial version
/// 2: +stripe_size
///
/// When adding new versions, also add a parse_vX test case below.
pub const LATEST_TENANT_MANIFEST_VERSION: usize = 2;

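Note (added commentary, not part of the diff): the versioning comment above relies on serde's default behaviour: unknown fields are ignored on deserialize (no `deny_unknown_fields`), and optional fields default to `None` when absent. A pared-down sketch with a hypothetical `ManifestLite` type (illustration only, not the real `TenantManifest`):

use serde::{Deserialize, Serialize};

#[derive(Serialize, Deserialize, Debug, PartialEq)]
struct ManifestLite {
    version: usize,
    // New in "v2"; omitted from output when None, defaults to None when missing.
    #[serde(skip_serializing_if = "Option::is_none")]
    stripe_size: Option<u32>,
}

fn main() -> Result<(), serde_json::Error> {
    // Forwards compatibility: an old reader without `stripe_size` in its struct
    // would simply skip the unknown field; here we show the inverse direction.
    let v1: ManifestLite = serde_json::from_str(r#"{"version": 1}"#)?;
    assert_eq!(v1.stripe_size, None);

    let v2: ManifestLite = serde_json::from_str(r#"{"version": 2, "stripe_size": 32768}"#)?;
    assert_eq!(v2.stripe_size, Some(32768));
    Ok(())
}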
impl TenantManifest {
pub(crate) fn empty() -> Self {
Self {
version: LATEST_TENANT_MANIFEST_VERSION,
offloaded_timelines: vec![],
/// Returns true if the manifests are equal, ignoring the version number. This avoids
/// re-uploading all manifests just because the version number is bumped.
pub fn eq_ignoring_version(&self, other: &Self) -> bool {
// Fast path: if the version is equal, just compare directly.
if self.version == other.version {
return self == other;
}
}
pub fn from_json_bytes(bytes: &[u8]) -> Result<Self, serde_json::Error> {
serde_json::from_slice::<Self>(bytes)

// We could alternatively just clone and modify the version here.
let Self {
version: _, // ignore version
stripe_size,
offloaded_timelines,
} = self;

stripe_size == &other.stripe_size && offloaded_timelines == &other.offloaded_timelines
}

pub(crate) fn to_json_bytes(&self) -> serde_json::Result<Vec<u8>> {
/// Decodes a manifest from JSON.
pub fn from_json_bytes(bytes: &[u8]) -> Result<Self, serde_json::Error> {
serde_json::from_slice(bytes)
}

/// Encodes a manifest as JSON.
pub fn to_json_bytes(&self) -> serde_json::Result<Vec<u8>> {
serde_json::to_vec(self)
}
}

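Note (added commentary, not part of the diff): a small illustrative test of the intent of `eq_ignoring_version`, assuming only the fields shown above — a manifest that differs solely in `version` should not count as changed, so a version-number bump alone does not force a re-upload:

#[test]
fn version_bump_alone_is_not_a_change() {
    let old = TenantManifest {
        version: 1,
        stripe_size: None,
        offloaded_timelines: vec![],
    };
    let new = TenantManifest { version: 2, ..old.clone() };
    assert!(old.eq_ignoring_version(&new));
    assert_ne!(old, new); // plain equality still notices the version bump
}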
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::str::FromStr;
|
||||
|
||||
use utils::id::TimelineId;
|
||||
|
||||
use super::*;
|
||||
|
||||
/// Empty manifests should be parsed. Version is required.
|
||||
#[test]
|
||||
fn parse_empty() -> anyhow::Result<()> {
|
||||
let json = r#"{
|
||||
"version": 0
|
||||
}"#;
|
||||
let expected = TenantManifest {
|
||||
version: 0,
|
||||
stripe_size: None,
|
||||
offloaded_timelines: Vec::new(),
|
||||
};
|
||||
assert_eq!(expected, TenantManifest::from_json_bytes(json.as_bytes())?);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Unknown fields should be ignored, for forwards compatibility.
|
||||
#[test]
|
||||
fn parse_unknown_fields() -> anyhow::Result<()> {
|
||||
let json = r#"{
|
||||
"version": 1,
|
||||
"foo": "bar"
|
||||
}"#;
|
||||
let expected = TenantManifest {
|
||||
version: 1,
|
||||
stripe_size: None,
|
||||
offloaded_timelines: Vec::new(),
|
||||
};
|
||||
assert_eq!(expected, TenantManifest::from_json_bytes(json.as_bytes())?);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// v1 manifests should be parsed, for backwards compatibility.
|
||||
#[test]
|
||||
fn parse_v1() -> anyhow::Result<()> {
|
||||
let json = r#"{
|
||||
"version": 1,
|
||||
"offloaded_timelines": [
|
||||
{
|
||||
"timeline_id": "5c4df612fd159e63c1b7853fe94d97da",
|
||||
"archived_at": "2025-03-07T11:07:11.373105434"
|
||||
},
|
||||
{
|
||||
"timeline_id": "f3def5823ad7080d2ea538d8e12163fa",
|
||||
"ancestor_timeline_id": "5c4df612fd159e63c1b7853fe94d97da",
|
||||
"ancestor_retain_lsn": "0/1F79038",
|
||||
"archived_at": "2025-03-05T11:10:22.257901390"
|
||||
}
|
||||
]
|
||||
}"#;
|
||||
let expected = TenantManifest {
|
||||
version: 1,
|
||||
stripe_size: None,
|
||||
offloaded_timelines: vec![
|
||||
OffloadedTimelineManifest {
|
||||
timeline_id: TimelineId::from_str("5c4df612fd159e63c1b7853fe94d97da")?,
|
||||
ancestor_timeline_id: None,
|
||||
ancestor_retain_lsn: None,
|
||||
archived_at: NaiveDateTime::from_str("2025-03-07T11:07:11.373105434")?,
|
||||
},
|
||||
OffloadedTimelineManifest {
|
||||
timeline_id: TimelineId::from_str("f3def5823ad7080d2ea538d8e12163fa")?,
|
||||
ancestor_timeline_id: Some(TimelineId::from_str(
|
||||
"5c4df612fd159e63c1b7853fe94d97da",
|
||||
)?),
|
||||
ancestor_retain_lsn: Some(Lsn::from_str("0/1F79038")?),
|
||||
archived_at: NaiveDateTime::from_str("2025-03-05T11:10:22.257901390")?,
|
||||
},
|
||||
],
|
||||
};
|
||||
assert_eq!(expected, TenantManifest::from_json_bytes(json.as_bytes())?);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// v2 manifests should be parsed, for backwards compatibility.
|
||||
#[test]
|
||||
fn parse_v2() -> anyhow::Result<()> {
|
||||
let json = r#"{
|
||||
"version": 2,
|
||||
"stripe_size": 32768,
|
||||
"offloaded_timelines": [
|
||||
{
|
||||
"timeline_id": "5c4df612fd159e63c1b7853fe94d97da",
|
||||
"archived_at": "2025-03-07T11:07:11.373105434"
|
||||
},
|
||||
{
|
||||
"timeline_id": "f3def5823ad7080d2ea538d8e12163fa",
|
||||
"ancestor_timeline_id": "5c4df612fd159e63c1b7853fe94d97da",
|
||||
"ancestor_retain_lsn": "0/1F79038",
|
||||
"archived_at": "2025-03-05T11:10:22.257901390"
|
||||
}
|
||||
]
|
||||
}"#;
|
||||
let expected = TenantManifest {
|
||||
version: 2,
|
||||
stripe_size: Some(ShardStripeSize(32768)),
|
||||
offloaded_timelines: vec![
|
||||
OffloadedTimelineManifest {
|
||||
timeline_id: TimelineId::from_str("5c4df612fd159e63c1b7853fe94d97da")?,
|
||||
ancestor_timeline_id: None,
|
||||
ancestor_retain_lsn: None,
|
||||
archived_at: NaiveDateTime::from_str("2025-03-07T11:07:11.373105434")?,
|
||||
},
|
||||
OffloadedTimelineManifest {
|
||||
timeline_id: TimelineId::from_str("f3def5823ad7080d2ea538d8e12163fa")?,
|
||||
ancestor_timeline_id: Some(TimelineId::from_str(
|
||||
"5c4df612fd159e63c1b7853fe94d97da",
|
||||
)?),
|
||||
ancestor_retain_lsn: Some(Lsn::from_str("0/1F79038")?),
|
||||
archived_at: NaiveDateTime::from_str("2025-03-05T11:10:22.257901390")?,
|
||||
},
|
||||
],
|
||||
};
|
||||
assert_eq!(expected, TenantManifest::from_json_bytes(json.as_bytes())?);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -61,6 +61,7 @@ pub(crate) async fn upload_index_part(
|
||||
.await
|
||||
.with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'"))
|
||||
}
|
||||
|
||||
/// Serializes and uploads the given tenant manifest data to the remote storage.
|
||||
pub(crate) async fn upload_tenant_manifest(
|
||||
storage: &GenericRemoteStorage,
|
||||
@@ -76,16 +77,14 @@ pub(crate) async fn upload_tenant_manifest(
|
||||
});
|
||||
pausable_failpoint!("before-upload-manifest-pausable");
|
||||
|
||||
let serialized = tenant_manifest.to_json_bytes()?;
|
||||
let serialized = Bytes::from(serialized);
|
||||
|
||||
let tenant_manifest_site = serialized.len();
|
||||
|
||||
let serialized = Bytes::from(tenant_manifest.to_json_bytes()?);
|
||||
let tenant_manifest_size = serialized.len();
|
||||
let remote_path = remote_tenant_manifest_path(tenant_shard_id, generation);
|
||||
|
||||
storage
|
||||
.upload_storage_object(
|
||||
futures::stream::once(futures::future::ready(Ok(serialized))),
|
||||
tenant_manifest_site,
|
||||
tenant_manifest_size,
|
||||
&remote_path,
|
||||
cancel,
|
||||
)
|
||||
|
||||
@@ -715,13 +715,34 @@ pub(crate) enum LayerId {
}

/// Uniquely identify a layer visit by the layer
/// and LSN floor (or start LSN) of the reads.
/// The layer itself is not enough since we may
/// have different LSN lower bounds for delta layer reads.
/// and LSN range of the reads. Note that the end of the range is exclusive.
///
/// The layer itself is not enough since we may have different LSN lower
/// bounds for delta layer reads. Scenarios where this can happen are:
///
/// 1. Layer overlaps: imagine an image layer inside an in-memory layer
/// and a query that only partially hits the image layer. Part of the query
/// needs to read the whole in-memory layer and the other part needs to read
/// only up to the image layer. Hence, they'll have different LSN floor values
/// for the read.
///
/// 2. Scattered reads: the read path supports starting at different LSNs. Imagine
/// the start LSN for one range is inside a layer and the start LSN for another range
/// is above the layer (includes all of it). Both ranges need to read the layer all the
/// way to the end but starting at different points. Hence, they'll have different LSN
/// ceil values.
///
/// The implication is that we might visit the same layer multiple times
/// in order to read different LSN ranges from it. In practice, this isn't very concerning
/// because:
/// 1. Layer overlaps are rare and generally not intended
/// 2. Scattered reads will stabilise after the first few layers provided their starting LSNs
/// are grouped tightly enough (likely the case).
#[derive(Debug, PartialEq, Eq, Clone, Hash)]
struct LayerToVisitId {
layer_id: LayerId,
lsn_floor: Lsn,
lsn_ceil: Lsn,
}

#[derive(Debug, PartialEq, Eq, Hash)]
@@ -805,6 +826,7 @@ impl LayerFringe {
let layer_to_visit_id = LayerToVisitId {
layer_id: layer.id(),
lsn_floor: lsn_range.start,
lsn_ceil: lsn_range.end,
};

let entry = self.visit_reads.entry(layer_to_visit_id.clone());

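Note (added commentary, not part of the diff): the doc comment above explains why a visit is keyed by layer plus LSN range. A simplified, self-contained sketch of that dedup pattern with stand-in types (`u64` for `Lsn`/`LayerId`, strings for the reads); the real code uses `LayerToVisitId` as the key of `visit_reads`:

use std::collections::HashMap;

#[derive(PartialEq, Eq, Hash, Clone)]
struct VisitKey {
    layer_id: u64,
    lsn_floor: u64,
    lsn_ceil: u64, // exclusive
}

fn main() {
    let mut visit_reads: HashMap<VisitKey, Vec<&str>> = HashMap::new();
    // Two reads of the same layer over the same LSN range coalesce into one visit...
    let k = VisitKey { layer_id: 7, lsn_floor: 10, lsn_ceil: 20 };
    visit_reads.entry(k.clone()).or_default().push("keyspace A");
    visit_reads.entry(k).or_default().push("keyspace B");
    // ...while a different LSN floor on the same layer is a separate visit.
    let k2 = VisitKey { layer_id: 7, lsn_floor: 15, lsn_ceil: 20 };
    visit_reads.entry(k2).or_default().push("keyspace C");
    assert_eq!(visit_reads.len(), 2);
}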
@@ -366,7 +366,7 @@ impl SplitDeltaLayerWriter {
|
||||
)
|
||||
.await?;
|
||||
let (start_key, prev_delta_writer) =
|
||||
std::mem::replace(&mut self.inner, Some((key, next_delta_writer))).unwrap();
|
||||
self.inner.replace((key, next_delta_writer)).unwrap();
|
||||
self.batches.add_unfinished_delta_writer(
|
||||
prev_delta_writer,
|
||||
start_key..key,
|
||||
|
||||
@@ -1192,7 +1192,7 @@ mod test {
|
||||
|
||||
// This key range contains several 0x8000 page stripes, only one of which belongs to shard zero
|
||||
let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
|
||||
let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap();
|
||||
let input_end = Key::from_hex("000000067f00000001000000ae0000002000").unwrap();
|
||||
let range = input_start..input_end;
|
||||
|
||||
// Build an image layer to filter
|
||||
@@ -1235,7 +1235,7 @@ mod test {
|
||||
let shard_identity = ShardIdentity::new(
|
||||
ShardNumber(shard_number),
|
||||
shard_count,
|
||||
ShardStripeSize(0x8000),
|
||||
ShardStripeSize(0x800),
|
||||
)
|
||||
.unwrap();
|
||||
let harness = TenantHarness::create_custom(
|
||||
@@ -1287,12 +1287,12 @@ mod test {
|
||||
|
||||
// This exact size and those below will need updating as/when the layer encoding changes, but
|
||||
// should be deterministic for a given version of the format, as we used no randomness generating the input.
|
||||
assert_eq!(original_size, 1597440);
|
||||
assert_eq!(original_size, 122880);
|
||||
|
||||
match shard_number {
|
||||
0 => {
|
||||
// We should have written out just one stripe for our shard identity
|
||||
assert_eq!(wrote_keys, 0x8000);
|
||||
assert_eq!(wrote_keys, 0x800);
|
||||
let replacement = replacement.unwrap();
|
||||
|
||||
// We should have dropped some of the data
|
||||
@@ -1300,7 +1300,7 @@ mod test {
|
||||
assert!(replacement.metadata().file_size > 0);
|
||||
|
||||
// Assert that we dropped ~3/4 of the data.
|
||||
assert_eq!(replacement.metadata().file_size, 417792);
|
||||
assert_eq!(replacement.metadata().file_size, 49152);
|
||||
}
|
||||
1 => {
|
||||
// Shard 1 has no keys in our input range
|
||||
@@ -1309,19 +1309,19 @@ mod test {
|
||||
}
|
||||
2 => {
|
||||
// Shard 2 has one stripes in the input range
|
||||
assert_eq!(wrote_keys, 0x8000);
|
||||
assert_eq!(wrote_keys, 0x800);
|
||||
let replacement = replacement.unwrap();
|
||||
assert!(replacement.metadata().file_size < original_size);
|
||||
assert!(replacement.metadata().file_size > 0);
|
||||
assert_eq!(replacement.metadata().file_size, 417792);
|
||||
assert_eq!(replacement.metadata().file_size, 49152);
|
||||
}
|
||||
3 => {
|
||||
// Shard 3 has two stripes in the input range
|
||||
assert_eq!(wrote_keys, 0x10000);
|
||||
assert_eq!(wrote_keys, 0x1000);
|
||||
let replacement = replacement.unwrap();
|
||||
assert!(replacement.metadata().file_size < original_size);
|
||||
assert!(replacement.metadata().file_size > 0);
|
||||
assert_eq!(replacement.metadata().file_size, 811008);
|
||||
assert_eq!(replacement.metadata().file_size, 73728);
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
|
||||
@@ -766,7 +766,7 @@ mod tests {
|
||||
rand::Rng::fill(&mut rand::thread_rng(), &mut dst_slice[len..]); // to discover bugs
|
||||
Ok((dst, len))
|
||||
}
|
||||
Err(e) => Err(std::io::Error::new(std::io::ErrorKind::Other, e)),
|
||||
Err(e) => Err(std::io::Error::other(e)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -59,6 +59,7 @@ impl LayerIterRef<'_> {
|
||||
/// 1. Unified iterator for image and delta layers.
|
||||
/// 2. `Ord` for use in [`MergeIterator::heap`] (for the k-merge).
|
||||
/// 3. Lazy creation of the real delta/image iterator.
|
||||
#[allow(clippy::large_enum_variant, reason = "TODO")]
|
||||
pub(crate) enum IteratorWrapper<'a> {
|
||||
NotLoaded {
|
||||
ctx: &'a RequestContext,
|
||||
|
||||
@@ -268,7 +268,12 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
error_run += 1;
let backoff =
exponential_backoff_duration(error_run, BASE_BACKOFF_SECS, MAX_BACKOFF_SECS);
log_compaction_error(&err, Some((error_run, backoff)), cancel.is_cancelled());
log_compaction_error(
&err,
Some((error_run, backoff)),
cancel.is_cancelled(),
false,
);
continue;
}
}
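Note (added commentary, not part of the diff): the hunk above feeds `error_run` into `exponential_backoff_duration(error_run, BASE_BACKOFF_SECS, MAX_BACKOFF_SECS)`. That helper lives in Neon's utils crate; as an assumption-labelled sketch, a capped exponential backoff of this shape can be written as:

use std::time::Duration;

// Illustrative only; the real helper may differ (e.g. it could add jitter).
fn backoff_duration(error_run: u32, base_secs: f64, max_secs: f64) -> Duration {
    let exp = error_run.saturating_sub(1).min(62) as i32;
    Duration::from_secs_f64((base_secs * 2f64.powi(exp)).min(max_secs))
}

fn main() {
    // With base 2s and cap 300s: 2s, 4s, 8s, ..., capped at 300s.
    for run in 1..=10 {
        println!("error_run={run} -> {:?}", backoff_duration(run, 2.0, 300.0));
    }
}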
@@ -285,6 +290,7 @@ pub(crate) fn log_compaction_error(
err: &CompactionError,
retry_info: Option<(u32, Duration)>,
task_cancelled: bool,
degrade_to_warning: bool,
) {
use CompactionError::*;

@@ -333,6 +339,7 @@
}
} else {
match level {
Level::ERROR if degrade_to_warning => warn!("Compaction failed and discarded: {err:#}"),
Level::ERROR => error!("Compaction failed: {err:#}"),
Level::INFO => info!("Compaction failed: {err:#}"),
level => unimplemented!("unexpected level {level:?}"),

@@ -24,6 +24,7 @@ use std::sync::{Arc, Mutex, OnceLock, RwLock, Weak};
|
||||
use std::time::{Duration, Instant, SystemTime};
|
||||
|
||||
use crate::PERF_TRACE_TARGET;
|
||||
use crate::walredo::RedoAttemptType;
|
||||
use anyhow::{Context, Result, anyhow, bail, ensure};
|
||||
use arc_swap::{ArcSwap, ArcSwapOption};
|
||||
use bytes::Bytes;
|
||||
@@ -115,7 +116,7 @@ use crate::pgdatadir_mapping::{
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::config::AttachmentMode;
|
||||
use crate::tenant::gc_result::GcResult;
|
||||
use crate::tenant::layer_map::{LayerMap, SearchResult};
|
||||
use crate::tenant::layer_map::LayerMap;
|
||||
use crate::tenant::metadata::TimelineMetadata;
|
||||
use crate::tenant::storage_layer::delta_layer::DeltaEntry;
|
||||
use crate::tenant::storage_layer::inmemory_layer::IndexEntry;
|
||||
@@ -584,7 +585,7 @@ pub(crate) enum PageReconstructError {
|
||||
WalRedo(anyhow::Error),
|
||||
|
||||
#[error("{0}")]
|
||||
MissingKey(MissingKeyError),
|
||||
MissingKey(Box<MissingKeyError>),
|
||||
}
|
||||
|
||||
impl From<anyhow::Error> for PageReconstructError {
|
||||
@@ -689,16 +690,23 @@ impl std::fmt::Display for ReadPath {
|
||||
|
||||
#[derive(thiserror::Error)]
|
||||
pub struct MissingKeyError {
|
||||
key: Key,
|
||||
keyspace: KeySpace,
|
||||
shard: ShardNumber,
|
||||
cont_lsn: Lsn,
|
||||
request_lsn: Lsn,
|
||||
query: Option<VersionedKeySpaceQuery>,
|
||||
// This is the largest request LSN from the get page request batch
|
||||
original_hwm_lsn: Lsn,
|
||||
ancestor_lsn: Option<Lsn>,
|
||||
/// Debug information about the read path if there's an error
|
||||
read_path: Option<ReadPath>,
|
||||
backtrace: Option<std::backtrace::Backtrace>,
|
||||
}
|
||||
|
||||
impl MissingKeyError {
|
||||
fn enrich(&mut self, query: VersionedKeySpaceQuery) {
|
||||
self.query = Some(query);
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for MissingKeyError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", self)
|
||||
@@ -709,14 +717,18 @@ impl std::fmt::Display for MissingKeyError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"could not find data for key {} (shard {:?}) at LSN {}, request LSN {}",
|
||||
self.key, self.shard, self.cont_lsn, self.request_lsn
|
||||
"could not find data for key {} (shard {:?}), original HWM LSN {}",
|
||||
self.keyspace, self.shard, self.original_hwm_lsn
|
||||
)?;
|
||||
|
||||
if let Some(ref ancestor_lsn) = self.ancestor_lsn {
|
||||
write!(f, ", ancestor {}", ancestor_lsn)?;
|
||||
}
|
||||
|
||||
if let Some(ref query) = self.query {
|
||||
write!(f, ", query {}", query)?;
|
||||
}
|
||||
|
||||
if let Some(ref read_path) = self.read_path {
|
||||
write!(f, "\n{}", read_path)?;
|
||||
}
|
||||
@@ -816,7 +828,7 @@ pub(crate) enum GetVectoredError {
|
||||
InvalidLsn(Lsn),
|
||||
|
||||
#[error("requested key not found: {0}")]
|
||||
MissingKey(MissingKeyError),
|
||||
MissingKey(Box<MissingKeyError>),
|
||||
|
||||
#[error("ancestry walk")]
|
||||
GetReadyAncestorError(#[source] GetReadyAncestorError),
|
||||
@@ -927,7 +939,7 @@ impl std::fmt::Debug for Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
#[derive(thiserror::Error, Debug, Clone)]
|
||||
pub(crate) enum WaitLsnError {
|
||||
// Called on a timeline which is shutting down
|
||||
#[error("Shutdown")]
|
||||
@@ -1039,6 +1051,7 @@ pub(crate) enum ShutdownMode {
|
||||
Hard,
|
||||
}
|
||||
|
||||
#[allow(clippy::large_enum_variant, reason = "TODO")]
|
||||
enum ImageLayerCreationOutcome {
|
||||
/// We generated an image layer
|
||||
Generated {
|
||||
@@ -1126,14 +1139,12 @@ impl Timeline {
|
||||
// page_service.
|
||||
debug_assert!(!self.shard_identity.is_key_disposable(&key));
|
||||
|
||||
let keyspace = KeySpace {
|
||||
ranges: vec![key..key.next()],
|
||||
};
|
||||
|
||||
let mut reconstruct_state = ValuesReconstructState::new(IoConcurrency::sequential());
|
||||
|
||||
let query = VersionedKeySpaceQuery::uniform(KeySpace::single(key..key.next()), lsn);
|
||||
|
||||
let vectored_res = self
|
||||
.get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx)
|
||||
.get_vectored_impl(query, &mut reconstruct_state, ctx)
|
||||
.await;
|
||||
|
||||
let key_value = vectored_res?.pop_first();
|
||||
@@ -1151,15 +1162,17 @@ impl Timeline {
|
||||
value
|
||||
}
|
||||
}
|
||||
None => Err(PageReconstructError::MissingKey(MissingKeyError {
|
||||
key,
|
||||
shard: self.shard_identity.get_shard_number(&key),
|
||||
cont_lsn: Lsn(0),
|
||||
request_lsn: lsn,
|
||||
ancestor_lsn: None,
|
||||
backtrace: None,
|
||||
read_path: None,
|
||||
})),
|
||||
None => Err(PageReconstructError::MissingKey(Box::new(
|
||||
MissingKeyError {
|
||||
keyspace: KeySpace::single(key..key.next()),
|
||||
shard: self.shard_identity.get_shard_number(&key),
|
||||
original_hwm_lsn: lsn,
|
||||
ancestor_lsn: None,
|
||||
backtrace: None,
|
||||
read_path: None,
|
||||
query: None,
|
||||
},
|
||||
))),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1172,21 +1185,18 @@ impl Timeline {
|
||||
/// which actually vectorizes the read path.
|
||||
pub(crate) async fn get_vectored(
|
||||
&self,
|
||||
keyspace: KeySpace,
|
||||
lsn: Lsn,
|
||||
query: VersionedKeySpaceQuery,
|
||||
io_concurrency: super::storage_layer::IoConcurrency,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
|
||||
if !lsn.is_valid() {
|
||||
return Err(GetVectoredError::InvalidLsn(lsn));
|
||||
}
|
||||
let total_keyspace = query.total_keyspace();
|
||||
|
||||
let key_count = keyspace.total_raw_size().try_into().unwrap();
|
||||
let key_count = total_keyspace.total_raw_size().try_into().unwrap();
|
||||
if key_count > Timeline::MAX_GET_VECTORED_KEYS {
|
||||
return Err(GetVectoredError::Oversized(key_count));
|
||||
}
|
||||
|
||||
for range in &keyspace.ranges {
|
||||
for range in &total_keyspace.ranges {
|
||||
let mut key = range.start;
|
||||
while key != range.end {
|
||||
assert!(!self.shard_identity.is_key_disposable(&key));
|
||||
@@ -1195,9 +1205,8 @@ impl Timeline {
|
||||
}
|
||||
|
||||
trace!(
|
||||
"get vectored request for {:?}@{} from task kind {:?}",
|
||||
keyspace,
|
||||
lsn,
|
||||
"get vectored query {} from task kind {:?}",
|
||||
query,
|
||||
ctx.task_kind(),
|
||||
);
|
||||
|
||||
@@ -1206,12 +1215,7 @@ impl Timeline {
|
||||
.map(|metric| (metric, Instant::now()));
|
||||
|
||||
let res = self
|
||||
.get_vectored_impl(
|
||||
keyspace.clone(),
|
||||
lsn,
|
||||
&mut ValuesReconstructState::new(io_concurrency),
|
||||
ctx,
|
||||
)
|
||||
.get_vectored_impl(query, &mut ValuesReconstructState::new(io_concurrency), ctx)
|
||||
.await;
|
||||
|
||||
if let Some((metric, start)) = start {
|
||||
@@ -1262,13 +1266,10 @@ impl Timeline {
|
||||
.for_task_kind(ctx.task_kind())
|
||||
.map(ScanLatencyOngoingRecording::start_recording);
|
||||
|
||||
let query = VersionedKeySpaceQuery::uniform(keyspace, lsn);
|
||||
|
||||
let vectored_res = self
|
||||
.get_vectored_impl(
|
||||
keyspace.clone(),
|
||||
lsn,
|
||||
&mut ValuesReconstructState::new(io_concurrency),
|
||||
ctx,
|
||||
)
|
||||
.get_vectored_impl(query, &mut ValuesReconstructState::new(io_concurrency), ctx)
|
||||
.await;
|
||||
|
||||
if let Some(recording) = start {
|
||||
@@ -1280,18 +1281,27 @@ impl Timeline {
|
||||
|
||||
pub(super) async fn get_vectored_impl(
|
||||
&self,
|
||||
keyspace: KeySpace,
|
||||
lsn: Lsn,
|
||||
query: VersionedKeySpaceQuery,
|
||||
reconstruct_state: &mut ValuesReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
|
||||
let read_path = if self.conf.enable_read_path_debugging || ctx.read_path_debug() {
|
||||
Some(ReadPath::new(keyspace.clone(), lsn))
|
||||
Some(ReadPath::new(
|
||||
query.total_keyspace(),
|
||||
query.high_watermark_lsn()?,
|
||||
))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
reconstruct_state.read_path = read_path;
|
||||
|
||||
let redo_attempt_type = if ctx.task_kind() == TaskKind::Compaction {
|
||||
RedoAttemptType::LegacyCompaction
|
||||
} else {
|
||||
RedoAttemptType::ReadPage
|
||||
};
|
||||
|
||||
let traversal_res: Result<(), _> = {
|
||||
let ctx = RequestContextBuilder::from(ctx)
|
||||
.perf_span(|crnt_perf_span| {
|
||||
@@ -1303,7 +1313,7 @@ impl Timeline {
|
||||
})
|
||||
.attached_child();
|
||||
|
||||
self.get_vectored_reconstruct_data(keyspace.clone(), lsn, reconstruct_state, &ctx)
|
||||
self.get_vectored_reconstruct_data(query.clone(), reconstruct_state, &ctx)
|
||||
.maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone())
|
||||
.await
|
||||
};
|
||||
@@ -1316,6 +1326,13 @@ impl Timeline {
|
||||
.map(|state| state.collect_pending_ios())
|
||||
.collect::<FuturesUnordered<_>>();
|
||||
while collect_futs.next().await.is_some() {}
|
||||
|
||||
// Enrich the missing key error with the original query.
|
||||
if let GetVectoredError::MissingKey(mut missing_err) = err {
|
||||
missing_err.enrich(query.clone());
|
||||
return Err(GetVectoredError::MissingKey(missing_err));
|
||||
}
|
||||
|
||||
return Err(err);
|
||||
};
|
||||
|
||||
@@ -1333,6 +1350,8 @@ impl Timeline {
|
||||
|
||||
let futs = FuturesUnordered::new();
|
||||
for (key, state) in std::mem::take(&mut reconstruct_state.keys) {
|
||||
let req_lsn_for_key = query.map_key_to_lsn(&key);
|
||||
|
||||
futs.push({
|
||||
let walredo_self = self.myself.upgrade().expect("&self method holds the arc");
|
||||
let ctx = RequestContextBuilder::from(&ctx)
|
||||
@@ -1379,7 +1398,7 @@ impl Timeline {
|
||||
|
||||
let walredo_deltas = converted.num_deltas();
|
||||
let walredo_res = walredo_self
|
||||
.reconstruct_value(key, lsn, converted)
|
||||
.reconstruct_value(key, req_lsn_for_key, converted, redo_attempt_type)
|
||||
.maybe_perf_instrument(&ctx, |crnt_perf_span| {
|
||||
info_span!(
|
||||
target: PERF_TRACE_TARGET,
|
||||
@@ -1406,15 +1425,18 @@ impl Timeline {
|
||||
// to avoid infinite results.
|
||||
if !results.is_empty() {
|
||||
if layers_visited >= Self::LAYERS_VISITED_WARN_THRESHOLD {
|
||||
let total_keyspace = query.total_keyspace();
|
||||
let max_request_lsn = query.high_watermark_lsn().expect("Validated previously");
|
||||
|
||||
static LOG_PACER: Lazy<Mutex<RateLimit>> =
|
||||
Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(60))));
|
||||
LOG_PACER.lock().unwrap().call(|| {
|
||||
let num_keys = keyspace.total_raw_size();
|
||||
let num_keys = total_keyspace.total_raw_size();
|
||||
let num_pages = results.len();
|
||||
tracing::info!(
|
||||
shard_id = %self.tenant_shard_id.shard_slug(),
|
||||
lsn = %lsn,
|
||||
"Vectored read for {keyspace} visited {layers_visited} layers. Returned {num_pages}/{num_keys} pages.",
|
||||
lsn = %max_request_lsn,
|
||||
"Vectored read for {total_keyspace} visited {layers_visited} layers. Returned {num_pages}/{num_keys} pages.",
|
||||
);
|
||||
});
|
||||
}
|
||||
@@ -1940,7 +1962,7 @@ impl Timeline {
|
||||
)
|
||||
.await;
|
||||
if let Err(err) = &res {
|
||||
log_compaction_error(err, None, cancel.is_cancelled());
|
||||
log_compaction_error(err, None, cancel.is_cancelled(), false);
|
||||
}
|
||||
res
|
||||
}
|
||||
@@ -2715,6 +2737,10 @@ impl Timeline {
|
||||
.tenant_conf
|
||||
.gc_compaction_enabled
|
||||
.unwrap_or(self.conf.default_tenant_conf.gc_compaction_enabled);
|
||||
let gc_compaction_verification = tenant_conf
|
||||
.tenant_conf
|
||||
.gc_compaction_verification
|
||||
.unwrap_or(self.conf.default_tenant_conf.gc_compaction_verification);
|
||||
let gc_compaction_initial_threshold_kb = tenant_conf
|
||||
.tenant_conf
|
||||
.gc_compaction_initial_threshold_kb
|
||||
@@ -2729,6 +2755,7 @@ impl Timeline {
|
||||
.unwrap_or(self.conf.default_tenant_conf.gc_compaction_ratio_percent);
|
||||
GcCompactionCombinedSettings {
|
||||
gc_compaction_enabled,
|
||||
gc_compaction_verification,
|
||||
gc_compaction_initial_threshold_kb,
|
||||
gc_compaction_ratio_percent,
|
||||
}
|
||||
@@ -3927,6 +3954,154 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
/// Type representing a query in the ([`Lsn`], [`Key`]) space.
/// In other words, a set of segments in a 2D space.
///
/// This representation has the advantage of avoiding hash map
/// allocations for uniform queries.
pub(crate) enum VersionedKeySpaceQuery {
    /// Variant for queries at a single [`Lsn`]
    Uniform { keyspace: KeySpace, lsn: Lsn },
    /// Variant for queries at multiple [`Lsn`]s
    Scattered {
        keyspaces_at_lsn: Vec<(Lsn, KeySpace)>,
    },
}

impl VersionedKeySpaceQuery {
    pub(crate) fn uniform(keyspace: KeySpace, lsn: Lsn) -> Self {
        Self::Uniform { keyspace, lsn }
    }

    pub(crate) fn scattered(keyspaces_at_lsn: Vec<(Lsn, KeySpace)>) -> Self {
        Self::Scattered { keyspaces_at_lsn }
    }

    /// Returns the most recent (largest) LSN included in the query.
    /// If any of the LSNs included in the query are invalid, returns
    /// an error instead.
    fn high_watermark_lsn(&self) -> Result<Lsn, GetVectoredError> {
        match self {
            Self::Uniform { lsn, .. } => {
                if !lsn.is_valid() {
                    return Err(GetVectoredError::InvalidLsn(*lsn));
                }

                Ok(*lsn)
            }
            Self::Scattered { keyspaces_at_lsn } => {
                let mut max_lsn = None;
                for (lsn, _keyspace) in keyspaces_at_lsn.iter() {
                    if !lsn.is_valid() {
                        return Err(GetVectoredError::InvalidLsn(*lsn));
                    }
                    max_lsn = std::cmp::max(max_lsn, Some(lsn));
                }

                if let Some(computed) = max_lsn {
                    Ok(*computed)
                } else {
                    Err(GetVectoredError::Other(anyhow!("empty input")))
                }
            }
        }
    }

    /// Returns the total keyspace being queried: the result of projecting
    /// everything in the key dimensions onto the key axis.
    fn total_keyspace(&self) -> KeySpace {
        match self {
            Self::Uniform { keyspace, .. } => keyspace.clone(),
            Self::Scattered { keyspaces_at_lsn } => keyspaces_at_lsn
                .iter()
                .map(|(_lsn, keyspace)| keyspace)
                .fold(KeySpace::default(), |mut acc, v| {
                    acc.merge(v);
                    acc
                }),
        }
    }

    /// Returns the LSN for a specific key.
    ///
    /// Invariant: requested key must be part of [`Self::total_keyspace`]
    fn map_key_to_lsn(&self, key: &Key) -> Lsn {
        match self {
            Self::Uniform { lsn, .. } => *lsn,
            Self::Scattered { keyspaces_at_lsn } => {
                keyspaces_at_lsn
                    .iter()
                    .find(|(_lsn, keyspace)| keyspace.contains(key))
                    .expect("Returned key was requested")
                    .0
            }
        }
    }

    /// Remove any parts of the query (segments) which overlap with the provided
    /// key space (also segments).
    fn remove_overlapping_with(&mut self, to_remove: &KeySpace) -> KeySpace {
        match self {
            Self::Uniform { keyspace, .. } => keyspace.remove_overlapping_with(to_remove),
            Self::Scattered { keyspaces_at_lsn } => {
                let mut removed_accum = KeySpaceRandomAccum::new();
                keyspaces_at_lsn.iter_mut().for_each(|(_lsn, keyspace)| {
                    let removed = keyspace.remove_overlapping_with(to_remove);
                    removed_accum.add_keyspace(removed);
                });

                removed_accum.to_keyspace()
            }
        }
    }

    fn is_empty(&self) -> bool {
        match self {
            Self::Uniform { keyspace, .. } => keyspace.is_empty(),
            Self::Scattered { keyspaces_at_lsn } => keyspaces_at_lsn
                .iter()
                .all(|(_lsn, keyspace)| keyspace.is_empty()),
        }
    }

    /// "Lower" the query on the LSN dimension
    fn lower(&mut self, to: Lsn) {
        match self {
            Self::Uniform { lsn, .. } => {
                // If the originally requested LSN is smaller than the starting
                // LSN of the ancestor we are descending into, we need to respect that.
                // Hence the min.
                *lsn = std::cmp::min(*lsn, to);
            }
            Self::Scattered { keyspaces_at_lsn } => {
                keyspaces_at_lsn.iter_mut().for_each(|(lsn, _keyspace)| {
                    *lsn = std::cmp::min(*lsn, to);
                });
            }
        }
    }
}

impl std::fmt::Display for VersionedKeySpaceQuery {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "[")?;

        match self {
            VersionedKeySpaceQuery::Uniform { keyspace, lsn } => {
                write!(f, "{keyspace} @ {lsn}")?;
            }
            VersionedKeySpaceQuery::Scattered { keyspaces_at_lsn } => {
                for (lsn, keyspace) in keyspaces_at_lsn.iter() {
                    write!(f, "{keyspace} @ {lsn},")?;
                }
            }
        }

        write!(f, "]")
    }
}
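As a rough usage sketch of the new query type (not part of the diff; `key_a`, `key_b` and the chosen `Lsn` values are placeholders, and the rendered `Display` output is approximate):

// Hypothetical illustration only -- uses constructors that appear elsewhere in this diff.
let uniform = VersionedKeySpaceQuery::uniform(KeySpace::single(key_a..key_a.next()), Lsn(0x40));
// A scattered query carries one LSN per keyspace fragment, e.g. when a getpage
// batch mixes request LSNs:
let scattered = VersionedKeySpaceQuery::scattered(vec![
    (Lsn(0x40), KeySpace::single(key_a..key_a.next())),
    (Lsn(0x80), KeySpace::single(key_b..key_b.next())),
]);
// high_watermark_lsn() is the largest LSN across fragments (0x80 here),
// total_keyspace() is the union of the per-fragment keyspaces, and Display
// renders roughly "[<keyspace> @ <lsn>,<keyspace> @ <lsn>,]".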
impl Timeline {
|
||||
#[allow(clippy::doc_lazy_continuation)]
|
||||
/// Get the data needed to reconstruct all keys in the provided keyspace
|
||||
@@ -3941,16 +4116,15 @@ impl Timeline {
|
||||
/// 2.4. If the fringe is empty, go back to 1
|
||||
async fn get_vectored_reconstruct_data(
|
||||
&self,
|
||||
mut keyspace: KeySpace,
|
||||
request_lsn: Lsn,
|
||||
mut query: VersionedKeySpaceQuery,
|
||||
reconstruct_state: &mut ValuesReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), GetVectoredError> {
|
||||
let original_hwm_lsn = query.high_watermark_lsn().unwrap();
|
||||
|
||||
let mut timeline_owned: Arc<Timeline>;
|
||||
let mut timeline = self;
|
||||
|
||||
let mut cont_lsn = Lsn(request_lsn.0 + 1);
|
||||
|
||||
let missing_keyspace = loop {
|
||||
if self.cancel.is_cancelled() {
|
||||
return Err(GetVectoredError::Cancelled);
|
||||
@@ -3967,15 +4141,14 @@ impl Timeline {
|
||||
parent: crnt_perf_span,
|
||||
"PLAN_IO_TIMELINE",
|
||||
timeline = %timeline.timeline_id,
|
||||
lsn = %cont_lsn,
|
||||
high_watermark_lsn = %query.high_watermark_lsn().unwrap(),
|
||||
)
|
||||
})
|
||||
.attached_child();
|
||||
|
||||
Self::get_vectored_reconstruct_data_timeline(
|
||||
timeline,
|
||||
keyspace.clone(),
|
||||
cont_lsn,
|
||||
&query,
|
||||
reconstruct_state,
|
||||
&self.cancel,
|
||||
&ctx,
|
||||
@@ -3984,23 +4157,23 @@ impl Timeline {
|
||||
.await?
|
||||
};
|
||||
|
||||
keyspace.remove_overlapping_with(&completed);
|
||||
query.remove_overlapping_with(&completed);
|
||||
|
||||
// Do not descend into the ancestor timeline for aux files.
|
||||
// We don't return a blanket [`GetVectoredError::MissingKey`] to avoid
|
||||
// stalling compaction.
|
||||
keyspace.remove_overlapping_with(&KeySpace {
|
||||
query.remove_overlapping_with(&KeySpace {
|
||||
ranges: vec![NON_INHERITED_RANGE, Key::sparse_non_inherited_keyspace()],
|
||||
});
|
||||
|
||||
// Keyspace is fully retrieved
|
||||
if keyspace.is_empty() {
|
||||
if query.is_empty() {
|
||||
break None;
|
||||
}
|
||||
|
||||
let Some(ancestor_timeline) = timeline.ancestor_timeline.as_ref() else {
|
||||
// Not fully retrieved but no ancestor timeline.
|
||||
break Some(keyspace);
|
||||
break Some(query.total_keyspace());
|
||||
};
|
||||
|
||||
// Now we see if there are keys covered by the image layer but do not exist in the
|
||||
@@ -4011,7 +4184,7 @@ impl Timeline {
|
||||
// keys from `keyspace`, we expect there to be no overlap between it and the image covered key
|
||||
// space. If that's not the case, we had at least one key encounter a gap in the image layer
|
||||
// and stop the search as a result of that.
|
||||
let mut removed = keyspace.remove_overlapping_with(&image_covered_keyspace);
|
||||
let mut removed = query.remove_overlapping_with(&image_covered_keyspace);
|
||||
// Do not fire missing key error and end early for sparse keys. Note that we have already removed
|
||||
// non-inherited keyspaces before, so we can safely do a full `SPARSE_RANGE` remove instead of
|
||||
// figuring out what is the inherited key range and do a fine-grained pruning.
|
||||
@@ -4021,11 +4194,11 @@ impl Timeline {
|
||||
if !removed.is_empty() {
|
||||
break Some(removed);
|
||||
}
|
||||
// If we reached this point, `remove_overlapping_with` should not have made any change to the
|
||||
// keyspace.
|
||||
|
||||
// Take the min to avoid reconstructing a page with data newer than request Lsn.
|
||||
cont_lsn = std::cmp::min(Lsn(request_lsn.0 + 1), Lsn(timeline.ancestor_lsn.0 + 1));
|
||||
// Each key range in the original query is at some point in the LSN space.
|
||||
// When descending into the ancestor, lower all ranges in the LSN space
|
||||
// such that new changes on the parent timeline are not visible.
|
||||
query.lower(timeline.ancestor_lsn);
|
||||
|
||||
let ctx = RequestContextBuilder::from(ctx)
|
||||
.perf_span(|crnt_perf_span| {
|
||||
@@ -4034,7 +4207,6 @@ impl Timeline {
|
||||
parent: crnt_perf_span,
|
||||
"GET_ANCESTOR",
|
||||
timeline = %timeline.timeline_id,
|
||||
lsn = %cont_lsn,
|
||||
ancestor = %ancestor_timeline.timeline_id,
|
||||
ancestor_lsn = %timeline.ancestor_lsn
|
||||
)
|
||||
@@ -4064,22 +4236,47 @@ impl Timeline {
|
||||
};
|
||||
|
||||
if let Some(missing_keyspace) = missing_keyspace {
|
||||
return Err(GetVectoredError::MissingKey(MissingKeyError {
|
||||
key: missing_keyspace.start().unwrap(), /* better if we can store the full keyspace */
|
||||
shard: self
|
||||
.shard_identity
|
||||
.get_shard_number(&missing_keyspace.start().unwrap()),
|
||||
cont_lsn,
|
||||
request_lsn,
|
||||
return Err(GetVectoredError::MissingKey(Box::new(MissingKeyError {
|
||||
keyspace: missing_keyspace, /* better if we can store the full keyspace */
|
||||
shard: self.shard_identity.number,
|
||||
original_hwm_lsn,
|
||||
ancestor_lsn: Some(timeline.ancestor_lsn),
|
||||
backtrace: None,
|
||||
read_path: std::mem::take(&mut reconstruct_state.read_path),
|
||||
}));
|
||||
query: None,
|
||||
})));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn get_vectored_init_fringe(
|
||||
&self,
|
||||
query: &VersionedKeySpaceQuery,
|
||||
) -> Result<LayerFringe, GetVectoredError> {
|
||||
let mut fringe = LayerFringe::new();
|
||||
let guard = self.layers.read().await;
|
||||
|
||||
match query {
|
||||
VersionedKeySpaceQuery::Uniform { keyspace, lsn } => {
|
||||
// LSNs requested by the compute or determined by the pageserver
|
||||
// are inclusive. Queries to the layer map use exclusive LSNs.
|
||||
// Hence, bump the value before the query - same in the other
|
||||
// match arm.
|
||||
let cont_lsn = Lsn(lsn.0 + 1);
|
||||
guard.update_search_fringe(keyspace, cont_lsn, &mut fringe)?;
|
||||
}
|
||||
VersionedKeySpaceQuery::Scattered { keyspaces_at_lsn } => {
|
||||
for (lsn, keyspace) in keyspaces_at_lsn.iter() {
|
||||
let cont_lsn_for_keyspace = Lsn(lsn.0 + 1);
|
||||
guard.update_search_fringe(keyspace, cont_lsn_for_keyspace, &mut fringe)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(fringe)
|
||||
}
|
||||
|
||||
/// Collect the reconstruct data for a keyspace from the specified timeline.
|
||||
///
|
||||
/// Maintain a fringe [`LayerFringe`] which tracks all the layers that intersect
|
||||
@@ -4098,18 +4295,11 @@ impl Timeline {
|
||||
/// decides how to deal with these two keyspaces.
|
||||
async fn get_vectored_reconstruct_data_timeline(
|
||||
timeline: &Timeline,
|
||||
keyspace: KeySpace,
|
||||
mut cont_lsn: Lsn,
|
||||
query: &VersionedKeySpaceQuery,
|
||||
reconstruct_state: &mut ValuesReconstructState,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<TimelineVisitOutcome, GetVectoredError> {
|
||||
let mut unmapped_keyspace = keyspace.clone();
|
||||
let mut fringe = LayerFringe::new();
|
||||
|
||||
let mut completed_keyspace = KeySpace::default();
|
||||
let mut image_covered_keyspace = KeySpaceRandomAccum::new();
|
||||
|
||||
// Prevent GC from progressing while visiting the current timeline.
|
||||
// If we are GC-ing because a new image layer was added while traversing
|
||||
// the timeline, then it will remove layers that are required for fulfilling
|
||||
@@ -4120,11 +4310,37 @@ impl Timeline {
|
||||
// See `compaction::compact_with_gc` for why we need this.
|
||||
let _guard = timeline.gc_compaction_layer_update_lock.read().await;
|
||||
|
||||
loop {
|
||||
// Initialize the fringe
|
||||
let mut fringe = timeline.get_vectored_init_fringe(query).await?;
|
||||
|
||||
let mut completed_keyspace = KeySpace::default();
|
||||
let mut image_covered_keyspace = KeySpaceRandomAccum::new();
|
||||
|
||||
while let Some((layer_to_read, keyspace_to_read, lsn_range)) = fringe.next_layer() {
|
||||
if cancel.is_cancelled() {
|
||||
return Err(GetVectoredError::Cancelled);
|
||||
}
|
||||
|
||||
if let Some(ref mut read_path) = reconstruct_state.read_path {
|
||||
read_path.record_layer_visit(&layer_to_read, &keyspace_to_read, &lsn_range);
|
||||
}
|
||||
|
||||
// Visit the layer and plan IOs for it
|
||||
let next_cont_lsn = lsn_range.start;
|
||||
layer_to_read
|
||||
.get_values_reconstruct_data(
|
||||
keyspace_to_read.clone(),
|
||||
lsn_range,
|
||||
reconstruct_state,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let mut unmapped_keyspace = keyspace_to_read;
|
||||
let cont_lsn = next_cont_lsn;
|
||||
|
||||
reconstruct_state.on_layer_visited(&layer_to_read);
|
||||
|
||||
let (keys_done_last_step, keys_with_image_coverage) =
|
||||
reconstruct_state.consume_done_keys();
|
||||
unmapped_keyspace.remove_overlapping_with(&keys_done_last_step);
|
||||
@@ -4135,31 +4351,15 @@ impl Timeline {
|
||||
image_covered_keyspace.add_range(keys_with_image_coverage);
|
||||
}
|
||||
|
||||
// Query the layer map for the next layers to read.
|
||||
//
|
||||
// Do not descend any further if the last layer we visited
|
||||
// completed all keys in the keyspace it inspected. This is not
|
||||
// required for correctness, but avoids visiting extra layers
|
||||
// which turns out to be a perf bottleneck in some cases.
|
||||
if !unmapped_keyspace.is_empty() {
|
||||
let guard = timeline.layers.read().await;
|
||||
let layers = guard.layer_map()?;
|
||||
|
||||
for range in unmapped_keyspace.ranges.iter() {
|
||||
let results = layers.range_search(range.clone(), cont_lsn);
|
||||
|
||||
results
|
||||
.found
|
||||
.into_iter()
|
||||
.map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| {
|
||||
(
|
||||
guard.upgrade(layer),
|
||||
keyspace_accum.to_keyspace(),
|
||||
lsn_floor..cont_lsn,
|
||||
)
|
||||
})
|
||||
.for_each(|(layer, keyspace, lsn_range)| {
|
||||
fringe.update(layer, keyspace, lsn_range)
|
||||
});
|
||||
}
|
||||
guard.update_search_fringe(&unmapped_keyspace, cont_lsn, &mut fringe)?;
|
||||
|
||||
// It's safe to drop the layer map lock after planning the next round of reads.
|
||||
// The fringe keeps readable handles for the layers which are safe to read even
|
||||
@@ -4173,28 +4373,6 @@ impl Timeline {
|
||||
// at two different time points.
|
||||
drop(guard);
|
||||
}
|
||||
|
||||
if let Some((layer_to_read, keyspace_to_read, lsn_range)) = fringe.next_layer() {
|
||||
if let Some(ref mut read_path) = reconstruct_state.read_path {
|
||||
read_path.record_layer_visit(&layer_to_read, &keyspace_to_read, &lsn_range);
|
||||
}
|
||||
let next_cont_lsn = lsn_range.start;
|
||||
layer_to_read
|
||||
.get_values_reconstruct_data(
|
||||
keyspace_to_read.clone(),
|
||||
lsn_range,
|
||||
reconstruct_state,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
unmapped_keyspace = keyspace_to_read;
|
||||
cont_lsn = next_cont_lsn;
|
||||
|
||||
reconstruct_state.on_layer_visited(&layer_to_read);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(TimelineVisitOutcome {
|
||||
@@ -4994,13 +5172,11 @@ impl Timeline {
|
||||
if key_request_accum.raw_size() >= Timeline::MAX_GET_VECTORED_KEYS
|
||||
|| (last_key_in_range && key_request_accum.raw_size() > 0)
|
||||
{
|
||||
let query =
|
||||
VersionedKeySpaceQuery::uniform(key_request_accum.consume_keyspace(), lsn);
|
||||
|
||||
let results = self
|
||||
.get_vectored(
|
||||
key_request_accum.consume_keyspace(),
|
||||
lsn,
|
||||
io_concurrency.clone(),
|
||||
ctx,
|
||||
)
|
||||
.get_vectored(query, io_concurrency.clone(), ctx)
|
||||
.await?;
|
||||
|
||||
if self.cancel.is_cancelled() {
|
||||
@@ -5089,7 +5265,11 @@ impl Timeline {
|
||||
// Directly use `get_vectored_impl` to skip the max_vectored_read_key limit check. Note that the keyspace should
|
||||
// not contain too many keys, otherwise this takes a lot of memory.
|
||||
let data = self
|
||||
.get_vectored_impl(partition.clone(), lsn, &mut reconstruct_state, ctx)
|
||||
.get_vectored_impl(
|
||||
VersionedKeySpaceQuery::uniform(partition.clone(), lsn),
|
||||
&mut reconstruct_state,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
let (data, total_kb_retrieved, total_keys_retrieved) = {
|
||||
let mut new_data = BTreeMap::new();
|
||||
@@ -6357,10 +6537,17 @@ impl Timeline {
|
||||
key: Key,
|
||||
request_lsn: Lsn,
|
||||
mut data: ValueReconstructState,
|
||||
redo_attempt_type: RedoAttemptType,
|
||||
) -> Result<Bytes, PageReconstructError> {
|
||||
// Perform WAL redo if needed
|
||||
data.records.reverse();
|
||||
|
||||
let fire_critical_error = match redo_attempt_type {
|
||||
RedoAttemptType::ReadPage => true,
|
||||
RedoAttemptType::LegacyCompaction => true,
|
||||
RedoAttemptType::GcCompaction => false,
|
||||
};
|
||||
|
||||
// If we have a page image, and no WAL, we're all set
|
||||
if data.records.is_empty() {
|
||||
if let Some((img_lsn, img)) = &data.img {
|
||||
@@ -6407,13 +6594,22 @@ impl Timeline {
|
||||
.as_ref()
|
||||
.context("timeline has no walredo manager")
|
||||
.map_err(PageReconstructError::WalRedo)?
|
||||
.request_redo(key, request_lsn, data.img, data.records, self.pg_version)
|
||||
.request_redo(
|
||||
key,
|
||||
request_lsn,
|
||||
data.img,
|
||||
data.records,
|
||||
self.pg_version,
|
||||
redo_attempt_type,
|
||||
)
|
||||
.await;
|
||||
let img = match res {
|
||||
Ok(img) => img,
|
||||
Err(walredo::Error::Cancelled) => return Err(PageReconstructError::Cancelled),
|
||||
Err(walredo::Error::Other(err)) => {
|
||||
critical!("walredo failure during page reconstruction: {err:?}");
|
||||
if fire_critical_error {
|
||||
critical!("walredo failure during page reconstruction: {err:?}");
|
||||
}
|
||||
return Err(PageReconstructError::WalRedo(
|
||||
err.context("reconstruct a page image"),
|
||||
));
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
use std::collections::{BinaryHeap, HashMap, HashSet, VecDeque};
|
||||
use std::ops::{Deref, Range};
|
||||
use std::sync::Arc;
|
||||
use std::time::Instant;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use super::layer_manager::LayerManager;
|
||||
use super::{
|
||||
@@ -16,6 +16,8 @@ use super::{
|
||||
Timeline,
|
||||
};
|
||||
|
||||
use crate::tenant::timeline::DeltaEntry;
|
||||
use crate::walredo::RedoAttemptType;
|
||||
use anyhow::{Context, anyhow};
|
||||
use bytes::Bytes;
|
||||
use enumset::EnumSet;
|
||||
@@ -78,6 +80,7 @@ impl std::fmt::Display for GcCompactionJobId {
|
||||
|
||||
pub struct GcCompactionCombinedSettings {
|
||||
pub gc_compaction_enabled: bool,
|
||||
pub gc_compaction_verification: bool,
|
||||
pub gc_compaction_initial_threshold_kb: u64,
|
||||
pub gc_compaction_ratio_percent: u64,
|
||||
}
|
||||
@@ -223,6 +226,7 @@ impl GcCompactionQueue {
|
||||
gc_compaction_enabled,
|
||||
gc_compaction_initial_threshold_kb,
|
||||
gc_compaction_ratio_percent,
|
||||
..
|
||||
} = timeline.get_gc_compaction_settings();
|
||||
if !gc_compaction_enabled {
|
||||
return Ok(());
|
||||
@@ -315,6 +319,9 @@ impl GcCompactionQueue {
|
||||
flags: {
|
||||
let mut flags = EnumSet::new();
|
||||
flags |= CompactFlags::EnhancedGcBottomMostCompaction;
|
||||
if timeline.get_compaction_l0_first() {
|
||||
flags |= CompactFlags::YieldForL0;
|
||||
}
|
||||
flags
|
||||
},
|
||||
sub_compaction: true,
|
||||
@@ -448,7 +455,7 @@ impl GcCompactionQueue {
|
||||
) -> Result<CompactionOutcome, CompactionError> {
|
||||
let res = self.iteration_inner(cancel, ctx, gc_block, timeline).await;
|
||||
if let Err(err) = &res {
|
||||
log_compaction_error(err, None, cancel.is_cancelled());
|
||||
log_compaction_error(err, None, cancel.is_cancelled(), true);
|
||||
}
|
||||
match res {
|
||||
Ok(res) => Ok(res),
|
||||
@@ -783,6 +790,114 @@ impl KeyHistoryRetention {
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Verify if every key in the retention is readable by replaying the logs.
|
||||
async fn verify(
|
||||
&self,
|
||||
key: Key,
|
||||
base_img_from_ancestor: &Option<(Key, Lsn, Bytes)>,
|
||||
full_history: &[(Key, Lsn, Value)],
|
||||
tline: &Arc<Timeline>,
|
||||
) -> anyhow::Result<()> {
|
||||
// Usually the min_lsn should be the first record but we do a full iteration to be safe.
|
||||
let Some(min_lsn) = full_history.iter().map(|(_, lsn, _)| *lsn).min() else {
|
||||
// This should never happen b/c if we don't have any history of a key, we won't even do `generate_key_retention`.
|
||||
return Ok(());
|
||||
};
|
||||
let Some(max_lsn) = full_history.iter().map(|(_, lsn, _)| *lsn).max() else {
|
||||
// This should never happen b/c if we don't have any history of a key, we won't even do `generate_key_retention`.
|
||||
return Ok(());
|
||||
};
|
||||
let mut base_img = base_img_from_ancestor
|
||||
.as_ref()
|
||||
.map(|(_, lsn, img)| (*lsn, img));
|
||||
let mut history = Vec::new();
|
||||
|
||||
async fn collect_and_verify(
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
base_img: &Option<(Lsn, &Bytes)>,
|
||||
history: &[(Lsn, &NeonWalRecord)],
|
||||
tline: &Arc<Timeline>,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut records = history
|
||||
.iter()
|
||||
.map(|(lsn, val)| (*lsn, (*val).clone()))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
// WAL redo requires records in the reverse LSN order
|
||||
records.reverse();
|
||||
let data = ValueReconstructState {
|
||||
img: base_img.as_ref().map(|(lsn, img)| (*lsn, (*img).clone())),
|
||||
records,
|
||||
};
|
||||
|
||||
tline
|
||||
.reconstruct_value(key, lsn, data, RedoAttemptType::GcCompaction)
|
||||
.await
|
||||
.with_context(|| format!("verification failed for key {} at lsn {}", key, lsn))?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
for (retain_lsn, KeyLogAtLsn(logs)) in &self.below_horizon {
|
||||
for (lsn, val) in logs {
|
||||
match val {
|
||||
Value::Image(img) => {
|
||||
base_img = Some((*lsn, img));
|
||||
history.clear();
|
||||
}
|
||||
Value::WalRecord(rec) if val.will_init() => {
|
||||
base_img = None;
|
||||
history.clear();
|
||||
history.push((*lsn, rec));
|
||||
}
|
||||
Value::WalRecord(rec) => {
|
||||
history.push((*lsn, rec));
|
||||
}
|
||||
}
|
||||
}
|
||||
if *retain_lsn >= min_lsn {
|
||||
// Only verify after the key appears in the full history for the first time.
|
||||
|
||||
if base_img.is_none() && history.is_empty() {
|
||||
anyhow::bail!(
|
||||
"verificatoin failed: key {} has no history at {}",
|
||||
key,
|
||||
retain_lsn
|
||||
);
|
||||
};
|
||||
// We don't modify history: in theory, we could replace the history with a single
|
||||
// image as in `generate_key_retention` to make redos at later LSNs faster. But we
|
||||
// want to verify everything as if they are read from the real layer map.
|
||||
collect_and_verify(key, *retain_lsn, &base_img, &history, tline).await?;
|
||||
}
|
||||
}
|
||||
|
||||
for (lsn, val) in &self.above_horizon.0 {
|
||||
match val {
|
||||
Value::Image(img) => {
|
||||
// Above the GC horizon, we verify every time we see an image.
|
||||
collect_and_verify(key, *lsn, &base_img, &history, tline).await?;
|
||||
base_img = Some((*lsn, img));
|
||||
history.clear();
|
||||
}
|
||||
Value::WalRecord(rec) if val.will_init() => {
|
||||
// Above the GC horizon, we verify every time we see an init record.
|
||||
collect_and_verify(key, *lsn, &base_img, &history, tline).await?;
|
||||
base_img = None;
|
||||
history.clear();
|
||||
history.push((*lsn, rec));
|
||||
}
|
||||
Value::WalRecord(rec) => {
|
||||
history.push((*lsn, rec));
|
||||
}
|
||||
}
|
||||
}
|
||||
// Ensure the latest record is readable.
|
||||
collect_and_verify(key, max_lsn, &base_img, &history, tline).await?;
|
||||
Ok(())
|
||||
}
|
||||
}
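Concretely, for a single key the verification above behaves roughly as follows (LSNs invented, image and record payloads elided):

// history of key K, oldest first:
//   (K, 0x10, Value::Image(img))      -- full image
//   (K, 0x20, Value::WalRecord(d1))   -- delta
//   (K, 0x30, Value::WalRecord(d2))   -- delta
// retain_lsns below the horizon: [0x20, 0x30]
//
// verify() walks the retention batches and, at each retain_lsn at or after the
// first LSN seen for the key, calls reconstruct_value with the accumulated base
// image plus deltas (reversed into the order WAL redo expects):
//   at 0x20: redo(img@0x10, [d1])       -> must succeed
//   at 0x30: redo(img@0x10, [d1, d2])   -> must succeed
// and finally checks that the most recent record (max_lsn = 0x30) is readable.
// A key with neither a base image nor any records at a retain_lsn fails verification.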
|
||||
|
||||
#[derive(Debug, Serialize, Default)]
|
||||
@@ -819,15 +934,16 @@ pub struct CompactionStatistics {
|
||||
time_acquire_lock_secs: f64,
|
||||
time_analyze_secs: f64,
|
||||
time_download_layer_secs: f64,
|
||||
time_to_first_kv_pair_secs: f64,
|
||||
time_main_loop_secs: f64,
|
||||
time_final_phase_secs: f64,
|
||||
time_total_secs: f64,
|
||||
|
||||
// Summary
|
||||
/// Ratio of the key-value size before/after gc-compaction.
|
||||
uncompressed_size_ratio: f64,
|
||||
/// Ratio of the physical size before/after gc-compaction.
|
||||
physical_size_ratio: f64,
|
||||
/// Ratio of the key-value size after/before gc-compaction.
|
||||
uncompressed_retention_ratio: f64,
|
||||
/// Ratio of the physical size after/before gc-compaction.
|
||||
compressed_retention_ratio: f64,
|
||||
}
|
||||
|
||||
impl CompactionStatistics {
|
||||
@@ -896,15 +1012,15 @@ impl CompactionStatistics {
|
||||
fn finalize(&mut self) {
|
||||
let original_key_value_size = self.image_keys_visited.size + self.wal_keys_visited.size;
|
||||
let produced_key_value_size = self.image_produced.size + self.wal_produced.size;
|
||||
self.uncompressed_size_ratio =
|
||||
original_key_value_size as f64 / (produced_key_value_size as f64 + 1.0); // avoid div by 0
|
||||
self.uncompressed_retention_ratio =
|
||||
produced_key_value_size as f64 / (original_key_value_size as f64 + 1.0); // avoid div by 0
|
||||
let original_physical_size = self.image_layer_visited.size + self.delta_layer_visited.size;
|
||||
let produced_physical_size = self.image_layer_produced.size
|
||||
+ self.delta_layer_produced.size
|
||||
+ self.image_layer_discarded.size
|
||||
+ self.delta_layer_discarded.size; // Also include the discarded layers to make the ratio accurate
|
||||
self.physical_size_ratio =
|
||||
original_physical_size as f64 / (produced_physical_size as f64 + 1.0); // avoid div by 0
|
||||
self.compressed_retention_ratio =
|
||||
produced_physical_size as f64 / (original_physical_size as f64 + 1.0); // avoid div by 0
|
||||
}
|
||||
}
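A small, self-contained worked example of the ratios computed by `finalize` above (numbers invented for illustration):

fn main() {
    // Visiting 4 GiB of key-value data and producing 1 GiB keeps ~25% of the data.
    let original_key_value_size = 4_294_967_296u64; // bytes visited
    let produced_key_value_size = 1_073_741_824u64; // bytes produced
    // The +1.0 mirrors the division-by-zero guard in the real code.
    let size_ratio = original_key_value_size as f64 / (produced_key_value_size as f64 + 1.0);
    let retention_ratio = produced_key_value_size as f64 / (original_key_value_size as f64 + 1.0);
    println!("size_ratio ~= {size_ratio:.2}, retention_ratio ~= {retention_ratio:.2}");
    // Prints roughly "size_ratio ~= 4.00, retention_ratio ~= 0.25".
}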
|
||||
|
||||
@@ -1113,7 +1229,17 @@ impl Timeline {
|
||||
// being potentially much longer.
|
||||
let rewrite_max = partition_count;
|
||||
|
||||
self.compact_shard_ancestors(rewrite_max, ctx).await?;
|
||||
let outcome = self
|
||||
.compact_shard_ancestors(
|
||||
rewrite_max,
|
||||
options.flags.contains(CompactFlags::YieldForL0),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
match outcome {
|
||||
CompactionOutcome::Pending | CompactionOutcome::YieldForL0 => return Ok(outcome),
|
||||
CompactionOutcome::Done | CompactionOutcome::Skipped => {}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(CompactionOutcome::Done)
|
||||
@@ -1130,8 +1256,10 @@ impl Timeline {
|
||||
async fn compact_shard_ancestors(
|
||||
self: &Arc<Self>,
|
||||
rewrite_max: usize,
|
||||
yield_for_l0: bool,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), CompactionError> {
|
||||
) -> Result<CompactionOutcome, CompactionError> {
|
||||
let mut outcome = CompactionOutcome::Done;
|
||||
let mut drop_layers = Vec::new();
|
||||
let mut layers_to_rewrite: Vec<Layer> = Vec::new();
|
||||
|
||||
@@ -1142,12 +1270,7 @@ impl Timeline {
|
||||
// Holding this read guard also blocks [`Self::gc_timeline`] from entering while we
|
||||
// are rewriting layers.
|
||||
let latest_gc_cutoff = self.get_applied_gc_cutoff_lsn();
|
||||
|
||||
tracing::info!(
|
||||
"starting shard ancestor compaction, latest_gc_cutoff: {}, pitr cutoff {}",
|
||||
*latest_gc_cutoff,
|
||||
self.gc_info.read().unwrap().cutoffs.time
|
||||
);
|
||||
let pitr_cutoff = self.gc_info.read().unwrap().cutoffs.time;
|
||||
|
||||
let layers = self.layers.read().await;
|
||||
for layer_desc in layers.layer_map()?.iter_historic_layers() {
|
||||
@@ -1165,8 +1288,8 @@ impl Timeline {
|
||||
// This ancestral layer only covers keys that belong to other shards.
|
||||
// We include the full metadata in the log: if we had some critical bug that caused
|
||||
// us to incorrectly drop layers, this would simplify manually debugging + reinstating those layers.
|
||||
info!(%layer, old_metadata=?layer.metadata(),
|
||||
"dropping layer after shard split, contains no keys for this shard.",
|
||||
debug!(%layer, old_metadata=?layer.metadata(),
|
||||
"dropping layer after shard split, contains no keys for this shard",
|
||||
);
|
||||
|
||||
if cfg!(debug_assertions) {
|
||||
@@ -1228,19 +1351,35 @@ impl Timeline {
|
||||
}
|
||||
|
||||
if layers_to_rewrite.len() >= rewrite_max {
|
||||
tracing::info!(%layer, "Will rewrite layer on a future compaction, already rewrote {}",
|
||||
debug!(%layer, "Will rewrite layer on a future compaction, already rewrote {}",
|
||||
layers_to_rewrite.len()
|
||||
);
|
||||
continue;
|
||||
outcome = CompactionOutcome::Pending;
|
||||
break;
|
||||
}
|
||||
|
||||
// Fall through: all our conditions for doing a rewrite passed.
|
||||
layers_to_rewrite.push(layer);
|
||||
}
|
||||
|
||||
// Drop read lock on layer map before we start doing time-consuming I/O
|
||||
// Drop read lock on layer map before we start doing time-consuming I/O.
|
||||
drop(layers);
|
||||
|
||||
// Drop out early if there's nothing to do.
|
||||
if layers_to_rewrite.is_empty() && drop_layers.is_empty() {
|
||||
return Ok(CompactionOutcome::Done);
|
||||
}
|
||||
|
||||
info!(
|
||||
"starting shard ancestor compaction, rewriting {} layers and dropping {} layers \
|
||||
(latest_gc_cutoff={} pitr_cutoff={})",
|
||||
layers_to_rewrite.len(),
|
||||
drop_layers.len(),
|
||||
*latest_gc_cutoff,
|
||||
pitr_cutoff,
|
||||
);
|
||||
let started = Instant::now();
|
||||
|
||||
let mut replace_image_layers = Vec::new();
|
||||
|
||||
for layer in layers_to_rewrite {
|
||||
@@ -1248,7 +1387,7 @@ impl Timeline {
|
||||
return Err(CompactionError::ShuttingDown);
|
||||
}
|
||||
|
||||
tracing::info!(layer=%layer, "Rewriting layer after shard split...");
|
||||
info!(layer=%layer, "rewriting layer after shard split");
|
||||
let mut image_layer_writer = ImageLayerWriter::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
@@ -1286,7 +1425,7 @@ impl Timeline {
|
||||
.map_err(CompactionError::Other)?;
|
||||
let new_layer = Layer::finish_creating(self.conf, self, desc, &path)
|
||||
.map_err(CompactionError::Other)?;
|
||||
tracing::info!(layer=%new_layer, "Rewrote layer, {} -> {} bytes",
|
||||
info!(layer=%new_layer, "rewrote layer, {} -> {} bytes",
|
||||
layer.metadata().file_size,
|
||||
new_layer.metadata().file_size);
|
||||
|
||||
@@ -1296,6 +1435,26 @@ impl Timeline {
|
||||
// the layer has no data for us with the ShardedRange check above, but
|
||||
drop_layers.push(layer);
|
||||
}
|
||||
|
||||
// Yield for L0 compaction if necessary, but make sure we update the layer map below
|
||||
// with the work we've already done.
|
||||
if yield_for_l0
|
||||
&& self
|
||||
.l0_compaction_trigger
|
||||
.notified()
|
||||
.now_or_never()
|
||||
.is_some()
|
||||
{
|
||||
info!("shard ancestor compaction yielding for L0 compaction");
|
||||
outcome = CompactionOutcome::YieldForL0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
for layer in &drop_layers {
|
||||
info!(%layer, old_metadata=?layer.metadata(),
|
||||
"dropping layer after shard split (no keys for this shard)",
|
||||
);
|
||||
}
|
||||
|
||||
// At this point, we have replaced local layer files with their rewritten form, but not yet uploaded
|
||||
@@ -1313,17 +1472,36 @@ impl Timeline {
|
||||
// necessary for correctness, but it simplifies testing, and avoids proceeding with another
|
||||
// Timeline's compaction while this timeline's uploads may be generating lots of disk I/O
|
||||
// load.
|
||||
match self.remote_client.wait_completion().await {
|
||||
Ok(()) => (),
|
||||
Err(WaitCompletionError::NotInitialized(ni)) => return Err(CompactionError::from(ni)),
|
||||
Err(WaitCompletionError::UploadQueueShutDownOrStopped) => {
|
||||
return Err(CompactionError::ShuttingDown);
|
||||
if outcome != CompactionOutcome::YieldForL0 {
|
||||
info!("shard ancestor compaction waiting for uploads");
|
||||
tokio::select! {
|
||||
result = self.remote_client.wait_completion() => match result {
|
||||
Ok(()) => {},
|
||||
Err(WaitCompletionError::NotInitialized(ni)) => return Err(CompactionError::from(ni)),
|
||||
Err(WaitCompletionError::UploadQueueShutDownOrStopped) => {
|
||||
return Err(CompactionError::ShuttingDown);
|
||||
}
|
||||
},
|
||||
// Don't wait if there's L0 compaction to do. We don't need to update the outcome
|
||||
// here, because we've already done the actual work.
|
||||
_ = self.l0_compaction_trigger.notified(), if yield_for_l0 => {},
|
||||
}
|
||||
}
|
||||
|
||||
info!(
|
||||
"shard ancestor compaction done in {:.3}s{}",
|
||||
started.elapsed().as_secs_f64(),
|
||||
match outcome {
|
||||
CompactionOutcome::Pending =>
|
||||
format!(", with pending work (rewrite_max={rewrite_max})"),
|
||||
CompactionOutcome::YieldForL0 => String::from(", yielding for L0 compaction"),
|
||||
CompactionOutcome::Skipped | CompactionOutcome::Done => String::new(),
|
||||
}
|
||||
);
|
||||
|
||||
fail::fail_point!("compact-shard-ancestors-persistent");
|
||||
|
||||
Ok(())
|
||||
Ok(outcome)
|
||||
}
|
||||
|
||||
/// Update the LayerVisibilityHint of layers covered by image layers, based on whether there is
|
||||
@@ -2142,6 +2320,7 @@ impl Timeline {
|
||||
/// ```
|
||||
///
|
||||
/// Note that `accumulated_values` must be sorted by LSN and should belong to a single key.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub(crate) async fn generate_key_retention(
|
||||
self: &Arc<Timeline>,
|
||||
key: Key,
|
||||
@@ -2150,6 +2329,7 @@ impl Timeline {
|
||||
retain_lsn_below_horizon: &[Lsn],
|
||||
delta_threshold_cnt: usize,
|
||||
base_img_from_ancestor: Option<(Key, Lsn, Bytes)>,
|
||||
verification: bool,
|
||||
) -> anyhow::Result<KeyHistoryRetention> {
|
||||
// Pre-checks for the invariants
|
||||
|
||||
@@ -2236,8 +2416,8 @@ impl Timeline {
|
||||
"should have at least below + above horizon batches"
|
||||
);
|
||||
let mut replay_history: Vec<(Key, Lsn, Value)> = Vec::new();
|
||||
if let Some((key, lsn, img)) = base_img_from_ancestor {
|
||||
replay_history.push((key, lsn, Value::Image(img)));
|
||||
if let Some((key, lsn, ref img)) = base_img_from_ancestor {
|
||||
replay_history.push((key, lsn, Value::Image(img.clone())));
|
||||
}
|
||||
|
||||
/// Generate debug information for the replay history
|
||||
@@ -2351,22 +2531,15 @@ impl Timeline {
|
||||
// Whether to reconstruct the image. In debug mode, we will generate an image
|
||||
// at every retain_lsn to ensure data is not corrupted, but we won't put the
|
||||
// image into the final layer.
|
||||
let generate_image = produce_image || debug_mode;
|
||||
if produce_image {
|
||||
let img_and_lsn = if produce_image {
|
||||
records_since_last_image = 0;
|
||||
}
|
||||
let img_and_lsn = if generate_image {
|
||||
let replay_history_for_debug = if debug_mode {
|
||||
Some(replay_history.clone())
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let replay_history_for_debug_ref = replay_history_for_debug.as_deref();
|
||||
let history = if produce_image {
|
||||
std::mem::take(&mut replay_history)
|
||||
} else {
|
||||
replay_history.clone()
|
||||
};
|
||||
let history = std::mem::take(&mut replay_history);
|
||||
let mut img = None;
|
||||
let mut records = Vec::with_capacity(history.len());
|
||||
if let (_, lsn, Value::Image(val)) = history.first().as_ref().unwrap() {
|
||||
@@ -2401,6 +2574,7 @@ impl Timeline {
|
||||
records.push((lsn, rec));
|
||||
}
|
||||
}
|
||||
// WAL redo requires records in the reverse LSN order
|
||||
records.reverse();
|
||||
let state = ValueReconstructState { img, records };
|
||||
// last batch does not generate image so i is always in range, unless we force generate
|
||||
@@ -2410,7 +2584,9 @@ impl Timeline {
|
||||
} else {
|
||||
lsn_split_points[i]
|
||||
};
|
||||
let img = self.reconstruct_value(key, request_lsn, state).await?;
|
||||
let img = self
|
||||
.reconstruct_value(key, request_lsn, state, RedoAttemptType::GcCompaction)
|
||||
.await?;
|
||||
Some((request_lsn, img))
|
||||
} else {
|
||||
None
|
||||
@@ -2431,10 +2607,16 @@ impl Timeline {
|
||||
assert_eq!(retention.len(), lsn_split_points.len() + 1);
|
||||
for (idx, logs) in retention.into_iter().enumerate() {
|
||||
if idx == lsn_split_points.len() {
|
||||
return Ok(KeyHistoryRetention {
|
||||
let retention = KeyHistoryRetention {
|
||||
below_horizon: result,
|
||||
above_horizon: KeyLogAtLsn(logs),
|
||||
});
|
||||
};
|
||||
if verification {
|
||||
retention
|
||||
.verify(key, &base_img_from_ancestor, full_history, self)
|
||||
.await?;
|
||||
}
|
||||
return Ok(retention);
|
||||
} else {
|
||||
result.push((lsn_split_points[idx], KeyLogAtLsn(logs)));
|
||||
}
|
||||
@@ -2901,6 +3083,9 @@ impl Timeline {
|
||||
}
|
||||
(false, res)
|
||||
};
|
||||
|
||||
let verification = self.get_gc_compaction_settings().gc_compaction_verification;
|
||||
|
||||
info!(
|
||||
"picked {} layers for compaction ({} layers need rewriting) with max_layer_lsn={} min_layer_lsn={} gc_cutoff={} lowest_retain_lsn={}, key_range={}..{}, has_data_below={}",
|
||||
job_desc.selected_layers.len(),
|
||||
@@ -3030,7 +3215,7 @@ impl Timeline {
|
||||
.map_err(CompactionError::Other)?;
|
||||
|
||||
let time_download_layer = timer.elapsed();
|
||||
let timer = Instant::now();
|
||||
let mut timer = Instant::now();
|
||||
|
||||
// Step 2: Produce images+deltas.
|
||||
let mut accumulated_values = Vec::new();
|
||||
@@ -3105,8 +3290,7 @@ impl Timeline {
|
||||
// Actually, we can decide not to write to the image layer at all at this point because
|
||||
// the key and LSN range are determined. However, to keep things simple here, we still
|
||||
// create this writer, and discard the writer in the end.
|
||||
|
||||
let mut keys_processed = 0;
|
||||
let mut time_to_first_kv_pair = None;
|
||||
|
||||
while let Some(((key, lsn, val), desc)) = merge_iter
|
||||
.next_with_trace()
|
||||
@@ -3114,13 +3298,16 @@ impl Timeline {
|
||||
.context("failed to get next key-value pair")
|
||||
.map_err(CompactionError::Other)?
|
||||
{
|
||||
if time_to_first_kv_pair.is_none() {
|
||||
time_to_first_kv_pair = Some(timer.elapsed());
|
||||
timer = Instant::now();
|
||||
}
|
||||
|
||||
if cancel.is_cancelled() {
|
||||
return Err(CompactionError::ShuttingDown);
|
||||
}
|
||||
|
||||
keys_processed += 1;
|
||||
let should_yield = yield_for_l0
|
||||
&& keys_processed % 1000 == 0
|
||||
&& self
|
||||
.l0_compaction_trigger
|
||||
.notified()
|
||||
@@ -3215,6 +3402,7 @@ impl Timeline {
|
||||
.await
|
||||
.context("failed to get ancestor image")
|
||||
.map_err(CompactionError::Other)?,
|
||||
verification,
|
||||
)
|
||||
.await
|
||||
.context("failed to generate key retention")
|
||||
@@ -3255,6 +3443,7 @@ impl Timeline {
|
||||
.await
|
||||
.context("failed to get ancestor image")
|
||||
.map_err(CompactionError::Other)?,
|
||||
verification,
|
||||
)
|
||||
.await
|
||||
.context("failed to generate key retention")
|
||||
@@ -3451,6 +3640,9 @@ impl Timeline {
|
||||
let time_final_phase = timer.elapsed();
|
||||
|
||||
stat.time_final_phase_secs = time_final_phase.as_secs_f64();
|
||||
stat.time_to_first_kv_pair_secs = time_to_first_kv_pair
|
||||
.unwrap_or(Duration::ZERO)
|
||||
.as_secs_f64();
|
||||
stat.time_main_loop_secs = time_main_loop.as_secs_f64();
|
||||
stat.time_acquire_lock_secs = time_acquire_lock.as_secs_f64();
|
||||
stat.time_download_layer_secs = time_download_layer.as_secs_f64();
|
||||
@@ -3911,8 +4103,6 @@ impl CompactionLayer<Key> for OwnArc<DeltaLayer> {
|
||||
}
|
||||
}
|
||||
|
||||
use crate::tenant::timeline::DeltaEntry;
|
||||
|
||||
impl CompactionLayer<Key> for ResidentDeltaLayer {
|
||||
fn key_range(&self) -> &Range<Key> {
|
||||
&self.0.layer_desc().key_range
|
||||
|
||||
@@ -410,10 +410,13 @@ impl DeleteTimelineFlow {
|
||||
// So indeed, the tenant manifest might refer to an offloaded timeline which has already been deleted.
|
||||
// However, we handle this case in tenant loading code so the next time we attach, the issue is
|
||||
// resolved.
|
||||
tenant.store_tenant_manifest().await.map_err(|e| match e {
|
||||
TenantManifestError::Cancelled => DeleteTimelineError::Cancelled,
|
||||
_ => DeleteTimelineError::Other(e.into()),
|
||||
})?;
|
||||
tenant
|
||||
.maybe_upload_tenant_manifest()
|
||||
.await
|
||||
.map_err(|err| match err {
|
||||
TenantManifestError::Cancelled => DeleteTimelineError::Cancelled,
|
||||
err => DeleteTimelineError::Other(err.into()),
|
||||
})?;
|
||||
|
||||
*guard = Self::Finished;
|
||||
|
||||
|
||||
@@ -30,6 +30,7 @@ use crate::tenant::storage_layer::{
|
||||
AsLayerDesc as _, DeltaLayerWriter, ImageLayerWriter, IoConcurrency, Layer, ResidentLayer,
|
||||
ValuesReconstructState,
|
||||
};
|
||||
use crate::tenant::timeline::VersionedKeySpaceQuery;
|
||||
use crate::virtual_file::{MaybeFatalIo, VirtualFile};
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
@@ -212,13 +213,9 @@ async fn generate_tombstone_image_layer(
|
||||
}
|
||||
}
|
||||
|
||||
let query = VersionedKeySpaceQuery::uniform(KeySpace::single(key_range.clone()), image_lsn);
|
||||
let data = ancestor
|
||||
.get_vectored_impl(
|
||||
KeySpace::single(key_range.clone()),
|
||||
image_lsn,
|
||||
&mut reconstruct_state,
|
||||
ctx,
|
||||
)
|
||||
.get_vectored_impl(query, &mut reconstruct_state, ctx)
|
||||
.await
|
||||
.context("failed to retrieve aux keys")
|
||||
.map_err(|e| Error::launder(e, Error::Prepare))?;
|
||||
|
||||
@@ -3,17 +3,18 @@ use std::sync::Arc;
|
||||
|
||||
use anyhow::{Context, bail, ensure};
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::trace;
|
||||
use utils::id::TimelineId;
|
||||
use utils::lsn::{AtomicLsn, Lsn};
|
||||
|
||||
use super::{ReadableLayer, TimelineWriterState};
|
||||
use super::{LayerFringe, ReadableLayer, TimelineWriterState};
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::RequestContext;
|
||||
use crate::metrics::TimelineMetrics;
|
||||
use crate::tenant::layer_map::{BatchedUpdates, LayerMap};
|
||||
use crate::tenant::layer_map::{BatchedUpdates, LayerMap, SearchResult};
|
||||
use crate::tenant::storage_layer::{
|
||||
AsLayerDesc, InMemoryLayer, Layer, LayerVisibilityHint, PersistentLayerDesc,
|
||||
PersistentLayerKey, ReadableLayerWeak, ResidentLayer,
|
||||
@@ -38,7 +39,7 @@ impl Default for LayerManager {
|
||||
}
|
||||
|
||||
impl LayerManager {
|
||||
pub(crate) fn upgrade(&self, weak: ReadableLayerWeak) -> ReadableLayer {
|
||||
fn upgrade(&self, weak: ReadableLayerWeak) -> ReadableLayer {
|
||||
match weak {
|
||||
ReadableLayerWeak::PersistentLayer(desc) => {
|
||||
ReadableLayer::PersistentLayer(self.get_from_desc(&desc))
|
||||
@@ -147,6 +148,36 @@ impl LayerManager {
|
||||
self.layers().keys().cloned().collect_vec()
|
||||
}
|
||||
|
||||
    /// Update the [`LayerFringe`] of a read request
    ///
    /// Take a key space at a given LSN and query the layer map below each range
    /// of the key space to find the next layers to visit.
    pub(crate) fn update_search_fringe(
        &self,
        keyspace: &KeySpace,
        cont_lsn: Lsn,
        fringe: &mut LayerFringe,
    ) -> Result<(), Shutdown> {
        let map = self.layer_map()?;

        for range in keyspace.ranges.iter() {
            let results = map.range_search(range.clone(), cont_lsn);
            results
                .found
                .into_iter()
                .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| {
                    (
                        self.upgrade(layer),
                        keyspace_accum.to_keyspace(),
                        lsn_floor..cont_lsn,
                    )
                })
                .for_each(|(layer, keyspace, lsn_range)| fringe.update(layer, keyspace, lsn_range));
        }

        Ok(())
    }
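A simplified sketch of how the read path drives this helper (names taken from the hunks above; error handling, lock lifetimes and perf spans omitted):

// Seed the fringe, then drain it layer by layer.
let mut fringe = LayerFringe::new();
let guard = timeline.layers.read().await;
// LSNs in the query are inclusive while layer-map searches are exclusive,
// hence the +1 bump done by the callers shown earlier in this diff.
guard.update_search_fringe(&keyspace, Lsn(lsn.0 + 1), &mut fringe)?;
while let Some((layer_to_read, keyspace_to_read, lsn_range)) = fringe.next_layer() {
    // ...plan IOs for this layer, then re-query the layer map for whatever part
    // of `keyspace_to_read` is still unmapped at `lsn_range.start`...
}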
|
||||
|
||||
fn layers(&self) -> &HashMap<PersistentLayerKey, Layer> {
|
||||
use LayerManager::*;
|
||||
match self {
|
||||
|
||||
@@ -111,7 +111,7 @@ pub(crate) async fn offload_timeline(
|
||||
// at the next restart attach it again.
|
||||
// For that to happen, we'd need to make the manifest reflect our *intended* state,
|
||||
// not our actual state of offloaded timelines.
|
||||
tenant.store_tenant_manifest().await?;
|
||||
tenant.maybe_upload_tenant_manifest().await?;
|
||||
|
||||
tracing::info!("Timeline offload complete (remaining arc refcount: {remaining_refcount})");
|
||||
|
||||
|
||||
@@ -580,6 +580,7 @@ impl ConnectionManagerState {
);
Ok(())
}
+ WalReceiverError::Cancelled => Ok(()),
WalReceiverError::Other(e) => {
// give out an error to have task_mgr give it a really verbose logging
if cancellation.is_cancelled() {
@@ -73,6 +73,7 @@ pub(super) enum WalReceiverError {
/// Generic error
Other(anyhow::Error),
ClosedGate,
+ Cancelled,
}

impl From<tokio_postgres::Error> for WalReceiverError {

@@ -200,6 +201,9 @@ pub(super) async fn handle_walreceiver_connection(
// with a similar error.
},
WalReceiverError::SuccessfulCompletion(_) => {}
+ WalReceiverError::Cancelled => {
+ debug!("Connection cancelled")
+ }
WalReceiverError::ClosedGate => {
// doesn't happen at runtime
}

@@ -273,7 +277,12 @@ pub(super) async fn handle_walreceiver_connection(

let mut waldecoder = WalStreamDecoder::new(startpoint, timeline.pg_version);

- let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx).await?;
+ let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx)
+ .await
+ .map_err(|e| match e.kind {
+ crate::walingest::WalIngestErrorKind::Cancelled => WalReceiverError::Cancelled,
+ _ => WalReceiverError::Other(e.into()),
+ })?;

let shard = vec![*timeline.get_shard_identity()];

@@ -445,7 +454,7 @@ pub(super) async fn handle_walreceiver_connection(
.inspect_err(|err| {
// TODO: we can't differentiate cancellation errors with
// anyhow::Error, so just ignore it if we're cancelled.
- if !cancellation.is_cancelled() {
+ if !cancellation.is_cancelled() && !timeline.is_stopping() {
critical!("{err:?}")
}
})?;

@@ -577,7 +586,7 @@ pub(super) async fn handle_walreceiver_connection(
.inspect_err(|err| {
// TODO: we can't differentiate cancellation errors with
// anyhow::Error, so just ignore it if we're cancelled.
- if !cancellation.is_cancelled() {
+ if !cancellation.is_cancelled() && !timeline.is_stopping() {
critical!("{err:?}")
}
})?;
@@ -302,6 +302,7 @@ pub struct UploadQueueStoppedDeletable {
pub(super) deleted_at: SetDeletedFlagProgress,
}

+ #[allow(clippy::large_enum_variant, reason = "TODO")]
pub enum UploadQueueStopped {
Deletable(UploadQueueStoppedDeletable),
Uninitialized,
@@ -21,13 +21,13 @@
//! redo Postgres process, but some records it can handle directly with
//! bespoken Rust code.

use std::backtrace::Backtrace;
use std::collections::HashMap;
use std::sync::{Arc, OnceLock};
use std::time::{Duration, Instant, SystemTime};

use anyhow::{Result, bail};
use bytes::{Buf, Bytes};
- use pageserver_api::key::rel_block_to_key;
+ use pageserver_api::key::{Key, rel_block_to_key};
use pageserver_api::record::NeonWalRecord;
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
use pageserver_api::shard::ShardIdentity;

@@ -38,7 +38,7 @@ use postgres_ffi::{
fsm_logical_to_physical, pg_constants,
};
use tracing::*;
- use utils::bin_ser::SerializeError;
+ use utils::bin_ser::{DeserializeError, SerializeError};
use utils::lsn::Lsn;
use utils::rate_limit::RateLimit;
use utils::{critical, failpoint_support};

@@ -104,12 +104,101 @@ struct WarnIngestLag {
timestamp_invalid_msg_ratelimit: RateLimit,
}

+ pub struct WalIngestError {
+ pub backtrace: std::backtrace::Backtrace,
+ pub kind: WalIngestErrorKind,
+ }
+
+ #[derive(thiserror::Error, Debug)]
+ pub enum WalIngestErrorKind {
+ #[error(transparent)]
+ #[allow(private_interfaces)]
+ PageReconstructError(#[from] PageReconstructError),
+ #[error(transparent)]
+ DeserializationFailure(#[from] DeserializeError),
+ #[error(transparent)]
+ SerializationFailure(#[from] SerializeError),
+ #[error("the request contains data not supported by pageserver: {0} @ {1}")]
+ InvalidKey(Key, Lsn),
+ #[error("twophase file for xid {0} already exists")]
+ FileAlreadyExists(u64),
+ #[error("slru segment {0:?}/{1} already exists")]
+ SlruAlreadyExists(SlruKind, u32),
+ #[error("relation already exists")]
+ RelationAlreadyExists(RelTag),
+ #[error("invalid reldir key {0}")]
+ InvalidRelDirKey(Key),
+
+ #[error(transparent)]
+ LogicalError(anyhow::Error),
+ #[error(transparent)]
+ EncodeAuxFileError(anyhow::Error),
+ #[error(transparent)]
+ MaybeRelSizeV2Error(anyhow::Error),
+
+ #[error("timeline shutting down")]
+ Cancelled,
+ }
+
+ impl<T> From<T> for WalIngestError
+ where
+ WalIngestErrorKind: From<T>,
+ {
+ fn from(value: T) -> Self {
+ WalIngestError {
+ backtrace: Backtrace::capture(),
+ kind: WalIngestErrorKind::from(value),
+ }
+ }
+ }
+
+ impl std::error::Error for WalIngestError {
+ fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
+ self.kind.source()
+ }
+ }
+
+ impl core::fmt::Display for WalIngestError {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ self.kind.fmt(f)
+ }
+ }
+
+ impl core::fmt::Debug for WalIngestError {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ if f.alternate() {
+ f.debug_map()
+ .key(&"backtrace")
+ .value(&self.backtrace)
+ .key(&"kind")
+ .value(&self.kind)
+ .finish()
+ } else {
+ writeln!(f, "Error: {:?}", self.kind)?;
+ if self.backtrace.status() == std::backtrace::BacktraceStatus::Captured {
+ writeln!(f, "Stack backtrace: {:?}", self.backtrace)?;
+ }
+ Ok(())
+ }
+ }
+ }
+
+ #[macro_export]
+ macro_rules! ensure_walingest {
+ ($($t:tt)*) => {
+ _ = || -> Result<(), anyhow::Error> {
+ anyhow::ensure!($($t)*);
+ Ok(())
+ }().map_err(WalIngestErrorKind::LogicalError)?;
+ };
+ }

impl WalIngest {
pub async fn new(
timeline: &Timeline,
startpoint: Lsn,
ctx: &RequestContext,
- ) -> anyhow::Result<WalIngest> {
+ ) -> Result<WalIngest, WalIngestError> {
// Fetch the latest checkpoint into memory, so that we can compare with it
// quickly in `ingest_record` and update it when it changes.
let checkpoint_bytes = timeline.get_checkpoint(startpoint, ctx).await?;

@@ -145,7 +234,7 @@ impl WalIngest {
interpreted: InterpretedWalRecord,
modification: &mut DatadirModification<'_>,
ctx: &RequestContext,
- ) -> anyhow::Result<bool> {
+ ) -> Result<bool, WalIngestError> {
WAL_INGEST.records_received.inc();
let prev_len = modification.len();

@@ -288,7 +377,7 @@ impl WalIngest {
}

/// This is the same as AdjustToFullTransactionId(xid) in PostgreSQL
- fn adjust_to_full_transaction_id(&self, xid: TransactionId) -> Result<u64> {
+ fn adjust_to_full_transaction_id(&self, xid: TransactionId) -> Result<u64, WalIngestError> {
let next_full_xid =
enum_pgversion_dispatch!(&self.checkpoint, CheckPoint, cp, { cp.nextXid.value });

@@ -298,9 +387,9 @@ impl WalIngest {
if xid > next_xid {
// Wraparound occurred, must be from a prev epoch.
if epoch == 0 {
- bail!(
+ Err(WalIngestErrorKind::LogicalError(anyhow::anyhow!(
"apparent XID wraparound with prepared transaction XID {xid}, nextXid is {next_full_xid}"
- );
+ )))?;
}
epoch -= 1;
}

@@ -313,7 +402,7 @@ impl WalIngest {
clear_vm_bits: ClearVmBits,
modification: &mut DatadirModification<'_>,
ctx: &RequestContext,
- ) -> anyhow::Result<()> {
+ ) -> Result<(), WalIngestError> {
let ClearVmBits {
new_heap_blkno,
old_heap_blkno,

@@ -402,7 +491,7 @@ impl WalIngest {
create: DbaseCreate,
modification: &mut DatadirModification<'_>,
ctx: &RequestContext,
- ) -> anyhow::Result<()> {
+ ) -> Result<(), WalIngestError> {
let DbaseCreate {
db_id,
tablespace_id,

@@ -505,7 +594,7 @@ impl WalIngest {
dbase_drop: DbaseDrop,
modification: &mut DatadirModification<'_>,
ctx: &RequestContext,
- ) -> anyhow::Result<()> {
+ ) -> Result<(), WalIngestError> {
let DbaseDrop {
db_id,
tablespace_ids,

@@ -523,7 +612,7 @@ impl WalIngest {
create: SmgrCreate,
modification: &mut DatadirModification<'_>,
ctx: &RequestContext,
- ) -> anyhow::Result<()> {
+ ) -> Result<(), WalIngestError> {
let SmgrCreate { rel } = create;
self.put_rel_creation(modification, rel, ctx).await?;
Ok(())

@@ -537,7 +626,7 @@ impl WalIngest {
truncate: XlSmgrTruncate,
modification: &mut DatadirModification<'_>,
ctx: &RequestContext,
- ) -> anyhow::Result<()> {
+ ) -> Result<(), WalIngestError> {
let XlSmgrTruncate {
blkno,
rnode,

@@ -689,7 +778,7 @@ impl WalIngest {
record: XactRecord,
modification: &mut DatadirModification<'_>,
ctx: &RequestContext,
- ) -> anyhow::Result<()> {
+ ) -> Result<(), WalIngestError> {
let (xact_common, is_commit, is_prepared) = match record {
XactRecord::Prepare(XactPrepare { xl_xid, data }) => {
let xid: u64 = if modification.tline.pg_version >= 17 {

@@ -813,7 +902,7 @@ impl WalIngest {
truncate: ClogTruncate,
modification: &mut DatadirModification<'_>,
ctx: &RequestContext,
- ) -> anyhow::Result<()> {
+ ) -> Result<(), WalIngestError> {
let ClogTruncate {
pageno,
oldest_xid,

@@ -889,7 +978,7 @@ impl WalIngest {
zero_page: ClogZeroPage,
modification: &mut DatadirModification<'_>,
ctx: &RequestContext,
- ) -> anyhow::Result<()> {
+ ) -> Result<(), WalIngestError> {
let ClogZeroPage { segno, rpageno } = zero_page;

self.put_slru_page_image(

@@ -907,7 +996,7 @@ impl WalIngest {
&mut self,
modification: &mut DatadirModification,
xlrec: &XlMultiXactCreate,
- ) -> Result<()> {
+ ) -> Result<(), WalIngestError> {
// Create WAL record for updating the multixact-offsets page
let pageno = xlrec.mid / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;

@@ -1010,7 +1099,7 @@ impl WalIngest {
modification: &mut DatadirModification<'_>,
xlrec: &XlMultiXactTruncate,
ctx: &RequestContext,
- ) -> Result<()> {
+ ) -> Result<(), WalIngestError> {
let (maxsegment, startsegment, endsegment) =
enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, {
cp.oldestMulti = xlrec.end_trunc_off;

@@ -1058,7 +1147,7 @@ impl WalIngest {
zero_page: MultiXactZeroPage,
modification: &mut DatadirModification<'_>,
ctx: &RequestContext,
- ) -> Result<()> {
+ ) -> Result<(), WalIngestError> {
let MultiXactZeroPage {
slru_kind,
segno,

@@ -1080,7 +1169,7 @@ impl WalIngest {
update: RelmapUpdate,
modification: &mut DatadirModification<'_>,
ctx: &RequestContext,
- ) -> Result<()> {
+ ) -> Result<(), WalIngestError> {
let RelmapUpdate { update, buf } = update;

modification

@@ -1093,7 +1182,7 @@ impl WalIngest {
raw_record: RawXlogRecord,
modification: &mut DatadirModification<'_>,
ctx: &RequestContext,
- ) -> Result<()> {
+ ) -> Result<(), WalIngestError> {
let RawXlogRecord { info, lsn, mut buf } = raw_record;
let pg_version = modification.tline.pg_version;

@@ -1235,12 +1324,12 @@ impl WalIngest {
put: PutLogicalMessage,
modification: &mut DatadirModification<'_>,
ctx: &RequestContext,
- ) -> Result<()> {
+ ) -> Result<(), WalIngestError> {
let PutLogicalMessage { path, buf } = put;
modification.put_file(path.as_str(), &buf, ctx).await
}

- fn ingest_standby_record(&mut self, record: StandbyRecord) -> Result<()> {
+ fn ingest_standby_record(&mut self, record: StandbyRecord) -> Result<(), WalIngestError> {
match record {
StandbyRecord::RunningXacts(running_xacts) => {
enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, {

@@ -1258,7 +1347,7 @@ impl WalIngest {
&mut self,
record: ReploriginRecord,
modification: &mut DatadirModification<'_>,
- ) -> Result<()> {
+ ) -> Result<(), WalIngestError> {
match record {
ReploriginRecord::Set(set) => {
modification

@@ -1278,7 +1367,7 @@ impl WalIngest {
modification: &mut DatadirModification<'_>,
rel: RelTag,
ctx: &RequestContext,
- ) -> Result<()> {
+ ) -> Result<(), WalIngestError> {
modification.put_rel_creation(rel, 0, ctx).await?;
Ok(())
}

@@ -1291,7 +1380,7 @@ impl WalIngest {
blknum: BlockNumber,
img: Bytes,
ctx: &RequestContext,
- ) -> Result<(), PageReconstructError> {
+ ) -> Result<(), WalIngestError> {
self.handle_rel_extend(modification, rel, blknum, ctx)
.await?;
modification.put_rel_page_image(rel, blknum, img)?;

@@ -1305,7 +1394,7 @@ impl WalIngest {
blknum: BlockNumber,
rec: NeonWalRecord,
ctx: &RequestContext,
- ) -> Result<()> {
+ ) -> Result<(), WalIngestError> {
self.handle_rel_extend(modification, rel, blknum, ctx)
.await?;
modification.put_rel_wal_record(rel, blknum, rec)?;

@@ -1318,7 +1407,7 @@ impl WalIngest {
rel: RelTag,
nblocks: BlockNumber,
ctx: &RequestContext,
- ) -> anyhow::Result<()> {
+ ) -> Result<(), WalIngestError> {
modification.put_rel_truncation(rel, nblocks, ctx).await?;
Ok(())
}

@@ -1329,7 +1418,7 @@ impl WalIngest {
rel: RelTag,
blknum: BlockNumber,
ctx: &RequestContext,
- ) -> Result<(), PageReconstructError> {
+ ) -> Result<(), WalIngestError> {
let new_nblocks = blknum + 1;
// Check if the relation exists. We implicitly create relations on first
// record.

@@ -1423,7 +1512,7 @@ impl WalIngest {
blknum: BlockNumber,
img: Bytes,
ctx: &RequestContext,
- ) -> Result<()> {
+ ) -> Result<(), WalIngestError> {
if !self.shard.is_shard_zero() {
return Ok(());
}

@@ -1441,7 +1530,7 @@ impl WalIngest {
segno: u32,
blknum: BlockNumber,
ctx: &RequestContext,
- ) -> anyhow::Result<()> {
+ ) -> Result<(), WalIngestError> {
// we don't use a cache for this like we do for relations. SLRUS are explcitly
// extended with ZEROPAGE records, not with commit records, so it happens
// a lot less frequently.

@@ -1509,6 +1598,7 @@ async fn get_relsize(
#[allow(clippy::bool_assert_comparison)]
#[cfg(test)]
mod tests {
use anyhow::Result;
use postgres_ffi::RELSEG_SIZE;

use super::*;

@@ -1530,7 +1620,7 @@ mod tests {
}

#[tokio::test]
- async fn test_zeroed_checkpoint_decodes_correctly() -> Result<()> {
+ async fn test_zeroed_checkpoint_decodes_correctly() -> Result<(), anyhow::Error> {
for i in 14..=16 {
dispatch_pgversion!(i, {
pgv::CheckPoint::decode(&pgv::ZERO_CHECKPOINT)?;
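The `WalIngestError` wrapper above pairs an error kind with a captured backtrace and relies on a blanket `From` impl so existing `?` call sites keep working. Below is a compact, self-contained sketch of the same pattern, using illustrative `DemoError`/`DemoErrorKind` types and assuming the `thiserror` crate (which the diff itself uses); it is not the pageserver API.

// Self-contained sketch of the error-wrapping pattern: a kind enum plus a wrapper
// that captures a backtrace on conversion. DemoError/DemoErrorKind are illustrative.
use std::backtrace::Backtrace;

#[allow(dead_code)]
#[derive(thiserror::Error, Debug)]
enum DemoErrorKind {
    #[error("io failure")]
    Io(#[from] std::io::Error),
    #[error("logical error: {0}")]
    Logical(String),
}

#[derive(Debug)]
struct DemoError {
    backtrace: Backtrace,
    kind: DemoErrorKind,
}

// Blanket conversion: anything convertible into the kind also converts into the
// wrapper, capturing the backtrace at the conversion point (mirrors WalIngestError).
impl<T> From<T> for DemoError
where
    DemoErrorKind: From<T>,
{
    fn from(value: T) -> Self {
        DemoError {
            backtrace: Backtrace::capture(),
            kind: DemoErrorKind::from(value),
        }
    }
}

fn read_config() -> Result<String, DemoError> {
    // `?` converts std::io::Error -> DemoErrorKind -> DemoError automatically.
    Ok(std::fs::read_to_string("/nonexistent/path")?)
}

fn main() {
    if let Err(e) = read_config() {
        println!("kind: {:?}", e.kind);
        println!("backtrace status: {:?}", e.backtrace.status());
    }
}

The blanket impl is legal here because both types are local to the crate, so it cannot overlap with the standard reflexive `From<T> for T` impl.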
@@ -136,6 +136,16 @@ macro_rules! bail {
}
}

+ #[derive(Debug, Clone, Copy)]
+ pub enum RedoAttemptType {
+ /// Used for the read path. Will fire critical errors and retry twice if failure.
+ ReadPage,
+ // Used for legacy compaction (only used in image compaction). Will fire critical errors and retry once if failure.
+ LegacyCompaction,
+ // Used for gc compaction. Will not fire critical errors and not retry.
+ GcCompaction,
+ }

///
/// Public interface of WAL redo manager
///

@@ -156,11 +166,18 @@ impl PostgresRedoManager {
base_img: Option<(Lsn, Bytes)>,
records: Vec<(Lsn, NeonWalRecord)>,
pg_version: u32,
+ redo_attempt_type: RedoAttemptType,
) -> Result<Bytes, Error> {
if records.is_empty() {
bail!("invalid WAL redo request with no records");
}

+ let max_retry_attempts = match redo_attempt_type {
+ RedoAttemptType::ReadPage => 2,
+ RedoAttemptType::LegacyCompaction => 1,
+ RedoAttemptType::GcCompaction => 0,
+ };

let base_img_lsn = base_img.as_ref().map(|p| p.0).unwrap_or(Lsn::INVALID);
let mut img = base_img.map(|p| p.1);
let mut batch_neon = apply_neon::can_apply_in_neon(&records[0].1);

@@ -180,6 +197,7 @@ impl PostgresRedoManager {
&records[batch_start..i],
self.conf.wal_redo_timeout,
pg_version,
+ max_retry_attempts,
)
.await
};

@@ -201,6 +219,7 @@ impl PostgresRedoManager {
&records[batch_start..],
self.conf.wal_redo_timeout,
pg_version,
+ max_retry_attempts,
)
.await
}

@@ -424,11 +443,11 @@ impl PostgresRedoManager {
records: &[(Lsn, NeonWalRecord)],
wal_redo_timeout: Duration,
pg_version: u32,
+ max_retry_attempts: u32,
) -> Result<Bytes, Error> {
*(self.last_redo_at.lock().unwrap()) = Some(Instant::now());

let (rel, blknum) = key.to_rel_block().context("invalid record")?;
- const MAX_RETRY_ATTEMPTS: u32 = 1;
let mut n_attempts = 0u32;
loop {
let base_img = &base_img;

@@ -486,7 +505,7 @@ impl PostgresRedoManager {
info!(n_attempts, "retried walredo succeeded");
}
n_attempts += 1;
- if n_attempts > MAX_RETRY_ATTEMPTS || result.is_ok() {
+ if n_attempts > max_retry_attempts || result.is_ok() {
return result;
}
}

@@ -560,6 +579,7 @@ mod tests {

use super::PostgresRedoManager;
use crate::config::PageServerConf;
+ use crate::walredo::RedoAttemptType;

#[tokio::test]
async fn test_ping() {

@@ -593,6 +613,7 @@ mod tests {
None,
short_records(),
14,
+ RedoAttemptType::ReadPage,
)
.instrument(h.span())
.await

@@ -621,6 +642,7 @@ mod tests {
None,
short_records(),
14,
+ RedoAttemptType::ReadPage,
)
.instrument(h.span())
.await

@@ -642,6 +664,7 @@ mod tests {
None,
short_records(),
16, /* 16 currently produces stderr output on startup, which adds a nice extra edge */
+ RedoAttemptType::ReadPage,
)
.instrument(h.span())
.await
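The retry budget introduced above is a per-caller constant: two retries for page reads, one for legacy compaction, none for gc compaction. Below is a self-contained Rust sketch of that policy, with an illustrative `Mode` enum and a deliberately flaky operation standing in for the real walredo call.

// Sketch of the retry policy: the caller picks a mode, the mode maps to a retry
// budget, and the loop retries until success or the budget is spent.
// Mode, max_retries, and flaky_redo are illustrative names, not the walredo API.
#[allow(dead_code)]
#[derive(Debug, Clone, Copy)]
enum Mode {
    ReadPage,         // retry twice, as in the diff
    LegacyCompaction, // retry once
    GcCompaction,     // never retry
}

fn max_retries(mode: Mode) -> u32 {
    match mode {
        Mode::ReadPage => 2,
        Mode::LegacyCompaction => 1,
        Mode::GcCompaction => 0,
    }
}

fn flaky_redo(attempt: u32) -> Result<&'static str, &'static str> {
    // Pretend the first attempt always fails, to exercise the retry path.
    if attempt == 0 { Err("walredo process crashed") } else { Ok("page image") }
}

fn apply_with_retries(mode: Mode) -> Result<&'static str, &'static str> {
    let budget = max_retries(mode);
    let mut n_attempts = 0u32;
    loop {
        let result = flaky_redo(n_attempts);
        n_attempts += 1;
        // Mirrors `if n_attempts > max_retry_attempts || result.is_ok() { return result; }`.
        if n_attempts > budget || result.is_ok() {
            return result;
        }
    }
}

fn main() {
    println!("{:?}", apply_with_retries(Mode::ReadPage));     // Ok after a retry
    println!("{:?}", apply_with_retries(Mode::GcCompaction)); // Err, no retry allowed
}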
@@ -276,6 +276,7 @@ pub(crate) fn apply_in_neon(
append,
clear,
will_init,
+ only_if,
} => {
use bytes::BufMut;
if *will_init {

@@ -288,6 +289,13 @@ pub(crate) fn apply_in_neon(
if *clear {
page.clear();
}
+ if let Some(only_if) = only_if {
+ if page != only_if.as_bytes() {
+ return Err(anyhow::anyhow!(
+ "the current image does not match the expected image, cannot append"
+ ));
+ }
+ }
page.put_slice(append.as_bytes());
}
}
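The new `only_if` field above turns the append record into a compare-and-append: the record is applied only when the current page image equals the expected image. Below is a self-contained Rust sketch of that check with an illustrative `apply_append_record` helper; the `will_init`/`clear` handling here is simplified relative to the real apply_neon code.

// Guarded append: refuse to apply the record if the current image does not
// match the expected one. All names here are illustrative.
fn apply_append_record(
    page: &mut Vec<u8>,
    append: &[u8],
    will_init: bool,
    clear: bool,
    only_if: Option<&[u8]>,
) -> Result<(), String> {
    if will_init && clear {
        page.clear();
    }
    if let Some(expected) = only_if {
        if page.as_slice() != expected {
            return Err("the current image does not match the expected image, cannot append".into());
        }
    }
    page.extend_from_slice(append);
    Ok(())
}

fn main() {
    let mut page = b"v1".to_vec();
    // Matches the expected image, so the append is applied.
    assert!(apply_append_record(&mut page, b"+aux", false, false, Some(b"v1")).is_ok());
    // Stale expectation: the append is refused and the page stays unchanged.
    assert!(apply_append_record(&mut page, b"+aux", false, false, Some(b"v1")).is_err());
    println!("page = {:?}", String::from_utf8_lossy(&page));
}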
@@ -4,6 +4,7 @@
MODULE_big = neon
OBJS = \
$(WIN32RES) \
+ communicator.o \
extension_server.o \
file_cache.o \
hll.o \
pgxn/neon/communicator.c (new file, 2504 lines; file diff suppressed because it is too large)

pgxn/neon/communicator.h (new file, 48 lines):
@@ -0,0 +1,48 @@
/*-------------------------------------------------------------------------
*
* communicator.h
* internal interface for communicating with remote pageservers
*
*
* Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*-------------------------------------------------------------------------
*/
#ifndef COMMUNICATOR_h
#define COMMUNICATOR_h

#include "neon_pgversioncompat.h"

#include "storage/buf_internals.h"

#include "pagestore_client.h"

/* initialization at postmaster startup */
extern void pg_init_communicator(void);

/* initialization at backend startup */
extern void communicator_init(void);

extern bool communicator_exists(NRelFileInfo rinfo, ForkNumber forkNum,
neon_request_lsns *request_lsns);
extern BlockNumber communicator_nblocks(NRelFileInfo rinfo, ForkNumber forknum,
neon_request_lsns *request_lsns);
extern int64 communicator_dbsize(Oid dbNode, neon_request_lsns *request_lsns);
extern void communicator_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum,
BlockNumber base_blockno, neon_request_lsns *request_lsns,
void **buffers, BlockNumber nblocks, const bits8 *mask);
extern int communicator_prefetch_lookupv(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blocknum,
neon_request_lsns *lsns,
BlockNumber nblocks, void **buffers, bits8 *mask);
extern void communicator_prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns,
BlockNumber nblocks, const bits8 *mask);
extern int communicator_read_slru_segment(SlruKind kind, int64 segno,
neon_request_lsns *request_lsns,
void *buffer);

extern void communicator_reconfigure_timeout_if_needed(void);
extern void communicator_prefetch_pump_state(bool IsHandlingInterrupts);

#endif
@@ -21,7 +21,6 @@
#include "access/xlog.h"
#include "funcapi.h"
#include "miscadmin.h"
- #include "pagestore_client.h"
#include "common/hashfn.h"
#include "pgstat.h"
#include "port/pg_iovec.h"

@@ -43,6 +42,7 @@

#include "hll.h"
#include "bitmap.h"
+ #include "file_cache.h"
#include "neon.h"
#include "neon_lwlsncache.h"
#include "neon_perf_counters.h"

@@ -1563,8 +1563,12 @@ local_cache_pages(PG_FUNCTION_ARGS)
hash_seq_init(&status, lfc_hash);
while ((entry = hash_seq_search(&status)) != NULL)
{
- for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
- n_pages += GET_STATE(entry, i) == AVAILABLE;
+ /* Skip hole tags */
+ if (NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key)) != 0)
+ {
+ for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
+ n_pages += GET_STATE(entry, i) == AVAILABLE;
+ }
}
}
}

@@ -1592,16 +1596,19 @@ local_cache_pages(PG_FUNCTION_ARGS)
{
for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
{
- if (GET_STATE(entry, i) == AVAILABLE)
+ if (NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key)) != 0)
{
- fctx->record[n].pageoffs = entry->offset * BLOCKS_PER_CHUNK + i;
- fctx->record[n].relfilenode = NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key));
- fctx->record[n].reltablespace = NInfoGetSpcOid(BufTagGetNRelFileInfo(entry->key));
- fctx->record[n].reldatabase = NInfoGetDbOid(BufTagGetNRelFileInfo(entry->key));
- fctx->record[n].forknum = entry->key.forkNum;
- fctx->record[n].blocknum = entry->key.blockNum + i;
- fctx->record[n].accesscount = entry->access_count;
- n += 1;
+ if (GET_STATE(entry, i) == AVAILABLE)
+ {
+ fctx->record[n].pageoffs = entry->offset * BLOCKS_PER_CHUNK + i;
+ fctx->record[n].relfilenode = NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key));
+ fctx->record[n].reltablespace = NInfoGetSpcOid(BufTagGetNRelFileInfo(entry->key));
+ fctx->record[n].reldatabase = NInfoGetDbOid(BufTagGetNRelFileInfo(entry->key));
+ fctx->record[n].forknum = entry->key.forkNum;
+ fctx->record[n].blocknum = entry->key.blockNum + i;
+ fctx->record[n].accesscount = entry->access_count;
+ n += 1;
+ }
}
}
}
pgxn/neon/file_cache.h (new file, 52 lines):
@@ -0,0 +1,52 @@
/*-------------------------------------------------------------------------
*
* file_cache.h
* Local File Cache definitions
*
* Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*-------------------------------------------------------------------------
*/
#ifndef FILE_CACHE_h
#define FILE_CACHE_h

#include "neon_pgversioncompat.h"

/* GUCs */
extern bool lfc_store_prefetch_result;

/* functions for local file cache */
extern void lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum,
BlockNumber blkno, const void *const *buffers,
BlockNumber nblocks);
/* returns number of blocks read, with one bit set in *read for each */
extern int lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum,
BlockNumber blkno, void **buffers,
BlockNumber nblocks, bits8 *mask);

extern bool lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum,
BlockNumber blkno);
extern int lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum,
BlockNumber blkno, int nblocks, bits8 *bitmap);
extern void lfc_init(void);
extern bool lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
const void* buffer, XLogRecPtr lsn);

static inline bool
lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
void *buffer)
{
bits8 rv = 0;
return lfc_readv_select(rinfo, forkNum, blkno, &buffer, 1, &rv) == 1;
}

static inline void
lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
const void *buffer)
{
return lfc_writev(rinfo, forkNum, blkno, &buffer, 1);
}

#endif /* FILE_CACHE_H */
Some files were not shown because too many files have changed in this diff.