mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-20 06:30:43 +00:00
Compare commits
3 Commits
problame/d
...
sk/proxy-w
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
dec7e33e5d | ||
|
|
0ad08eb138 | ||
|
|
238f9ad3c5 |
@@ -5,6 +5,7 @@
|
||||
!Cargo.lock
|
||||
!Makefile
|
||||
|
||||
!tmp-proxy-test/
|
||||
!.cargo/
|
||||
!.config/
|
||||
!control_plane/
|
||||
|
||||
18
.github/ansible/deploy.yaml
vendored
18
.github/ansible/deploy.yaml
vendored
@@ -91,15 +91,6 @@
|
||||
tags:
|
||||
- pageserver
|
||||
|
||||
# used in `pageserver.service` template
|
||||
- name: learn current availability_zone
|
||||
shell:
|
||||
cmd: "curl -s http://169.254.169.254/latest/meta-data/placement/availability-zone"
|
||||
register: ec2_availability_zone
|
||||
|
||||
- set_fact:
|
||||
ec2_availability_zone={{ ec2_availability_zone.stdout }}
|
||||
|
||||
- name: upload systemd service definition
|
||||
ansible.builtin.template:
|
||||
src: systemd/pageserver.service
|
||||
@@ -162,15 +153,6 @@
|
||||
tags:
|
||||
- safekeeper
|
||||
|
||||
# used in `safekeeper.service` template
|
||||
- name: learn current availability_zone
|
||||
shell:
|
||||
cmd: "curl -s http://169.254.169.254/latest/meta-data/placement/availability-zone"
|
||||
register: ec2_availability_zone
|
||||
|
||||
- set_fact:
|
||||
ec2_availability_zone={{ ec2_availability_zone.stdout }}
|
||||
|
||||
# in the future safekeepers should discover pageservers byself
|
||||
# but currently use first pageserver that was discovered
|
||||
- name: set first pageserver var for safekeepers
|
||||
|
||||
2
.github/ansible/prod.eu-central-1.hosts.yaml
vendored
2
.github/ansible/prod.eu-central-1.hosts.yaml
vendored
@@ -27,8 +27,6 @@ storage:
|
||||
ansible_host: i-0cd8d316ecbb715be
|
||||
pageserver-1.eu-central-1.aws.neon.tech:
|
||||
ansible_host: i-090044ed3d383fef0
|
||||
pageserver-2.eu-central-1.aws.neon.tech:
|
||||
ansible_host: i-033584edf3f4b6742
|
||||
|
||||
safekeepers:
|
||||
hosts:
|
||||
|
||||
8
.github/ansible/staging.eu-west-1.hosts.yaml
vendored
8
.github/ansible/staging.eu-west-1.hosts.yaml
vendored
@@ -8,14 +8,6 @@ storage:
|
||||
pg_distrib_dir: /usr/local
|
||||
metric_collection_endpoint: http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events
|
||||
metric_collection_interval: 10min
|
||||
disk_usage_based_eviction:
|
||||
max_usage_pct: 80
|
||||
# TODO: learn typical resident-size growth rate [GiB/minute] and configure
|
||||
# min_avail_bytes such that we have X minutes of headroom.
|
||||
min_avail_bytes: 0
|
||||
# We assume that the worst-case growth rate is small enough that we can
|
||||
# catch above-threshold conditions by checking every 10s.
|
||||
period: "10s"
|
||||
tenant_config:
|
||||
eviction_policy:
|
||||
kind: "LayerAccessThreshold"
|
||||
|
||||
8
.github/ansible/staging.us-east-2.hosts.yaml
vendored
8
.github/ansible/staging.us-east-2.hosts.yaml
vendored
@@ -8,14 +8,6 @@ storage:
|
||||
pg_distrib_dir: /usr/local
|
||||
metric_collection_endpoint: http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events
|
||||
metric_collection_interval: 10min
|
||||
disk_usage_based_eviction:
|
||||
max_usage_pct: 80
|
||||
# TODO: learn typical resident-size growth rate [GiB/minute] and configure
|
||||
# min_avail_bytes such that we have X minutes of headroom.
|
||||
min_avail_bytes: 0
|
||||
# We assume that the worst-case growth rate is small enough that we can
|
||||
# catch above-threshold conditions by checking every 10s.
|
||||
period: "10s"
|
||||
tenant_config:
|
||||
eviction_policy:
|
||||
kind: "LayerAccessThreshold"
|
||||
|
||||
2
.github/ansible/systemd/pageserver.service
vendored
2
.github/ansible/systemd/pageserver.service
vendored
@@ -6,7 +6,7 @@ After=network.target auditd.service
|
||||
Type=simple
|
||||
User=pageserver
|
||||
Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/pageserver LD_LIBRARY_PATH=/usr/local/v14/lib SENTRY_DSN={{ SENTRY_URL_PAGESERVER }} SENTRY_ENVIRONMENT={{ sentry_environment }}
|
||||
ExecStart=/usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -c "broker_endpoint='{{ broker_endpoint }}'" -c "availability_zone='{{ ec2_availability_zone }}'" -D /storage/pageserver/data
|
||||
ExecStart=/usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -c "broker_endpoint='{{ broker_endpoint }}'" -D /storage/pageserver/data
|
||||
ExecReload=/bin/kill -HUP $MAINPID
|
||||
KillMode=mixed
|
||||
KillSignal=SIGINT
|
||||
|
||||
2
.github/ansible/systemd/safekeeper.service
vendored
2
.github/ansible/systemd/safekeeper.service
vendored
@@ -6,7 +6,7 @@ After=network.target auditd.service
|
||||
Type=simple
|
||||
User=safekeeper
|
||||
Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/v14/lib SENTRY_DSN={{ SENTRY_URL_SAFEKEEPER }} SENTRY_ENVIRONMENT={{ sentry_environment }}
|
||||
ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}{{ hostname_suffix }}:6500 --listen-http {{ inventory_hostname }}{{ hostname_suffix }}:7676 -D /storage/safekeeper/data --broker-endpoint={{ broker_endpoint }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="{{ safekeeper_s3_prefix }}"}' --availability-zone={{ ec2_availability_zone }}
|
||||
ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}{{ hostname_suffix }}:6500 --listen-http {{ inventory_hostname }}{{ hostname_suffix }}:7676 -D /storage/safekeeper/data --broker-endpoint={{ broker_endpoint }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="{{ safekeeper_s3_prefix }}"}'
|
||||
ExecReload=/bin/kill -HUP $MAINPID
|
||||
KillMode=mixed
|
||||
KillSignal=SIGINT
|
||||
|
||||
2
.github/workflows/build_and_test.yml
vendored
2
.github/workflows/build_and_test.yml
vendored
@@ -552,7 +552,7 @@ jobs:
|
||||
neon-image-depot:
|
||||
# For testing this will run side-by-side for a few merges.
|
||||
# This action is not really optimized yet, but gets the job done
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
needs: [ tag ]
|
||||
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
|
||||
permissions:
|
||||
|
||||
1
.github/workflows/release.yml
vendored
1
.github/workflows/release.yml
vendored
@@ -31,3 +31,4 @@ jobs:
|
||||
head: releases/${{ steps.date.outputs.date }}
|
||||
base: release
|
||||
title: Release ${{ steps.date.outputs.date }}
|
||||
team_reviewers: release
|
||||
|
||||
2
Cargo.lock
generated
2
Cargo.lock
generated
@@ -2474,7 +2474,6 @@ dependencies = [
|
||||
"strum",
|
||||
"strum_macros",
|
||||
"svg_fmt",
|
||||
"sync_wrapper",
|
||||
"tempfile",
|
||||
"tenant_size_model",
|
||||
"thiserror",
|
||||
@@ -4535,7 +4534,6 @@ dependencies = [
|
||||
"once_cell",
|
||||
"pin-project-lite",
|
||||
"rand",
|
||||
"regex",
|
||||
"routerify",
|
||||
"sentry",
|
||||
"serde",
|
||||
|
||||
26
Dockerfile
26
Dockerfile
@@ -44,7 +44,7 @@ COPY --chown=nonroot . .
|
||||
# Show build caching stats to check if it was used in the end.
|
||||
# Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats.
|
||||
RUN set -e \
|
||||
&& mold -run cargo build --bin pageserver --bin pageserver_binutils --bin draw_timeline_dir --bin safekeeper --bin storage_broker --bin proxy --locked --release \
|
||||
&& mold -run cargo build --bin proxy --locked --release \
|
||||
&& cachepot -s
|
||||
|
||||
# Build final image
|
||||
@@ -63,11 +63,11 @@ RUN set -e \
|
||||
&& useradd -d /data neon \
|
||||
&& chown -R neon:neon /data
|
||||
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver_binutils /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/draw_timeline_dir /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker /usr/local/bin
|
||||
# COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver /usr/local/bin
|
||||
# COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver_binutils /usr/local/bin
|
||||
# COPY --from=build --chown=neon:neon /home/nonroot/target/release/draw_timeline_dir /usr/local/bin
|
||||
# COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin
|
||||
# COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin
|
||||
|
||||
COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
|
||||
@@ -76,13 +76,13 @@ COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/
|
||||
|
||||
# By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config.
|
||||
# Now, when `docker run ... pageserver` is run, it can start without errors, yet will have some default dummy values.
|
||||
RUN mkdir -p /data/.neon/ && chown -R neon:neon /data/.neon/ \
|
||||
&& /usr/local/bin/pageserver -D /data/.neon/ --init \
|
||||
-c "id=1234" \
|
||||
-c "broker_endpoint='http://storage_broker:50051'" \
|
||||
-c "pg_distrib_dir='/usr/local/'" \
|
||||
-c "listen_pg_addr='0.0.0.0:6400'" \
|
||||
-c "listen_http_addr='0.0.0.0:9898'"
|
||||
# RUN mkdir -p /data/.neon/ && chown -R neon:neon /data/.neon/ \
|
||||
# && /usr/local/bin/pageserver -D /data/.neon/ --init \
|
||||
# -c "id=1234" \
|
||||
# -c "broker_endpoint='http://storage_broker:50051'" \
|
||||
# -c "pg_distrib_dir='/usr/local/'" \
|
||||
# -c "listen_pg_addr='0.0.0.0:6400'" \
|
||||
# -c "listen_http_addr='0.0.0.0:9898'"
|
||||
|
||||
VOLUME ["/data"]
|
||||
USER neon
|
||||
|
||||
@@ -2,69 +2,26 @@
|
||||
|
||||
ARG SRC_IMAGE
|
||||
ARG VM_INFORMANT_VERSION=v0.1.14
|
||||
# on libcgroup update, make sure to check bootstrap.sh for changes
|
||||
ARG LIBCGROUP_VERSION=v2.0.3
|
||||
|
||||
# Pull VM informant, to copy from later
|
||||
# Pull VM informant and set up inittab
|
||||
FROM neondatabase/vm-informant:$VM_INFORMANT_VERSION as informant
|
||||
|
||||
# Build cgroup-tools
|
||||
#
|
||||
# At time of writing (2023-03-14), debian bullseye has a version of cgroup-tools (technically
|
||||
# libcgroup) that doesn't support cgroup v2 (version 0.41-11). Unfortunately, the vm-informant
|
||||
# requires cgroup v2, so we'll build cgroup-tools ourselves.
|
||||
FROM debian:bullseye-slim as libcgroup-builder
|
||||
ARG LIBCGROUP_VERSION
|
||||
|
||||
RUN set -exu \
|
||||
&& apt update \
|
||||
&& apt install --no-install-recommends -y \
|
||||
git \
|
||||
ca-certificates \
|
||||
automake \
|
||||
cmake \
|
||||
make \
|
||||
gcc \
|
||||
byacc \
|
||||
flex \
|
||||
libtool \
|
||||
libpam0g-dev \
|
||||
&& git clone --depth 1 -b $LIBCGROUP_VERSION https://github.com/libcgroup/libcgroup \
|
||||
&& INSTALL_DIR="/libcgroup-install" \
|
||||
&& mkdir -p "$INSTALL_DIR/bin" "$INSTALL_DIR/include" \
|
||||
&& cd libcgroup \
|
||||
# extracted from bootstrap.sh, with modified flags:
|
||||
&& (test -d m4 || mkdir m4) \
|
||||
&& autoreconf -fi \
|
||||
&& rm -rf autom4te.cache \
|
||||
&& CFLAGS="-O3" ./configure --prefix="$INSTALL_DIR" --sysconfdir=/etc --localstatedir=/var --enable-opaque-hierarchy="name=systemd" \
|
||||
# actually build the thing...
|
||||
&& make install
|
||||
|
||||
# Combine, starting from non-VM compute node image.
|
||||
FROM $SRC_IMAGE as base
|
||||
|
||||
# Temporarily set user back to root so we can run adduser, set inittab
|
||||
USER root
|
||||
RUN adduser vm-informant --disabled-password --no-create-home
|
||||
|
||||
RUN set -e \
|
||||
&& rm -f /etc/inittab \
|
||||
&& touch /etc/inittab
|
||||
|
||||
RUN set -e \
|
||||
&& echo "::sysinit:cgconfigparser -l /etc/cgconfig.conf -s 1664" >> /etc/inittab \
|
||||
&& CONNSTR="dbname=neondb user=cloud_admin sslmode=disable" \
|
||||
&& ARGS="--auto-restart --cgroup=neon-postgres --pgconnstr=\"$CONNSTR\"" \
|
||||
&& ARGS="--auto-restart --pgconnstr=\"$CONNSTR\"" \
|
||||
&& echo "::respawn:su vm-informant -c '/usr/local/bin/vm-informant $ARGS'" >> /etc/inittab
|
||||
|
||||
# Combine, starting from non-VM compute node image.
|
||||
FROM $SRC_IMAGE as base
|
||||
|
||||
# Temporarily set user back to root so we can run adduser
|
||||
USER root
|
||||
RUN adduser vm-informant --disabled-password --no-create-home
|
||||
USER postgres
|
||||
|
||||
ADD vm-cgconfig.conf /etc/cgconfig.conf
|
||||
COPY --from=informant /etc/inittab /etc/inittab
|
||||
COPY --from=informant /usr/bin/vm-informant /usr/local/bin/vm-informant
|
||||
|
||||
COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/
|
||||
COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/
|
||||
COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/
|
||||
|
||||
ENTRYPOINT ["/usr/sbin/cgexec", "-g", "*:neon-postgres", "/usr/local/bin/compute_ctl"]
|
||||
|
||||
@@ -46,14 +46,11 @@ postgresql-libs cmake postgresql protobuf
|
||||
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
|
||||
```
|
||||
|
||||
#### Installing dependencies on macOS (12.3.1)
|
||||
#### Installing dependencies on OSX (12.3.1)
|
||||
1. Install XCode and dependencies
|
||||
```
|
||||
xcode-select --install
|
||||
brew install protobuf openssl flex bison
|
||||
|
||||
# add openssl to PATH, required for ed25519 keys generation in neon_local
|
||||
echo 'export PATH="$(brew --prefix openssl)/bin:$PATH"' >> ~/.zshrc
|
||||
```
|
||||
|
||||
2. [Install Rust](https://www.rust-lang.org/tools/install)
|
||||
|
||||
@@ -133,7 +133,6 @@ fn main() -> Result<()> {
|
||||
.settings
|
||||
.find("neon.pageserver_connstring")
|
||||
.expect("pageserver connstr should be provided");
|
||||
let storage_auth_token = spec.storage_auth_token.clone();
|
||||
let tenant = spec
|
||||
.cluster
|
||||
.settings
|
||||
@@ -154,7 +153,6 @@ fn main() -> Result<()> {
|
||||
tenant,
|
||||
timeline,
|
||||
pageserver_connstr,
|
||||
storage_auth_token,
|
||||
metrics: ComputeMetrics::default(),
|
||||
state: RwLock::new(ComputeState::new()),
|
||||
};
|
||||
|
||||
@@ -18,7 +18,6 @@ use std::fs;
|
||||
use std::os::unix::fs::PermissionsExt;
|
||||
use std::path::Path;
|
||||
use std::process::{Command, Stdio};
|
||||
use std::str::FromStr;
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
use std::sync::RwLock;
|
||||
|
||||
@@ -45,7 +44,6 @@ pub struct ComputeNode {
|
||||
pub tenant: String,
|
||||
pub timeline: String,
|
||||
pub pageserver_connstr: String,
|
||||
pub storage_auth_token: Option<String>,
|
||||
pub metrics: ComputeMetrics,
|
||||
/// Volatile part of the `ComputeNode` so should be used under `RwLock`
|
||||
/// to allow HTTP API server to serve status requests, while configuration
|
||||
@@ -128,18 +126,7 @@ impl ComputeNode {
|
||||
fn get_basebackup(&self, lsn: &str) -> Result<()> {
|
||||
let start_time = Utc::now();
|
||||
|
||||
let mut config = postgres::Config::from_str(&self.pageserver_connstr)?;
|
||||
|
||||
// Use the storage auth token from the config file, if given.
|
||||
// Note: this overrides any password set in the connection string.
|
||||
if let Some(storage_auth_token) = &self.storage_auth_token {
|
||||
info!("Got storage auth token from spec file");
|
||||
config.password(storage_auth_token);
|
||||
} else {
|
||||
info!("Storage auth token not set");
|
||||
}
|
||||
|
||||
let mut client = config.connect(NoTls)?;
|
||||
let mut client = Client::connect(&self.pageserver_connstr, NoTls)?;
|
||||
let basebackup_cmd = match lsn {
|
||||
"0/0" => format!("basebackup {} {}", &self.tenant, &self.timeline), // First start of the compute
|
||||
_ => format!("basebackup {} {} {}", &self.tenant, &self.timeline, lsn),
|
||||
@@ -176,11 +163,6 @@ impl ComputeNode {
|
||||
let sync_handle = Command::new(&self.pgbin)
|
||||
.args(["--sync-safekeepers"])
|
||||
.env("PGDATA", &self.pgdata) // we cannot use -D in this mode
|
||||
.envs(if let Some(storage_auth_token) = &self.storage_auth_token {
|
||||
vec![("NEON_AUTH_TOKEN", storage_auth_token)]
|
||||
} else {
|
||||
vec![]
|
||||
})
|
||||
.stdout(Stdio::piped())
|
||||
.spawn()
|
||||
.expect("postgres --sync-safekeepers failed to start");
|
||||
@@ -258,11 +240,6 @@ impl ComputeNode {
|
||||
// Run postgres as a child process.
|
||||
let mut pg = Command::new(&self.pgbin)
|
||||
.args(["-D", &self.pgdata])
|
||||
.envs(if let Some(storage_auth_token) = &self.storage_auth_token {
|
||||
vec![("NEON_AUTH_TOKEN", storage_auth_token)]
|
||||
} else {
|
||||
vec![]
|
||||
})
|
||||
.spawn()
|
||||
.expect("cannot start postgres process");
|
||||
|
||||
|
||||
@@ -24,8 +24,6 @@ pub struct ComputeSpec {
|
||||
pub cluster: Cluster,
|
||||
pub delta_operations: Option<Vec<DeltaOp>>,
|
||||
|
||||
pub storage_auth_token: Option<String>,
|
||||
|
||||
pub startup_tracing_context: Option<HashMap<String, String>>,
|
||||
}
|
||||
|
||||
|
||||
@@ -2,8 +2,7 @@
|
||||
[pageserver]
|
||||
listen_pg_addr = '127.0.0.1:64000'
|
||||
listen_http_addr = '127.0.0.1:9898'
|
||||
pg_auth_type = 'Trust'
|
||||
http_auth_type = 'Trust'
|
||||
auth_type = 'Trust'
|
||||
|
||||
[[safekeepers]]
|
||||
id = 1
|
||||
|
||||
@@ -3,8 +3,7 @@
|
||||
[pageserver]
|
||||
listen_pg_addr = '127.0.0.1:64000'
|
||||
listen_http_addr = '127.0.0.1:9898'
|
||||
pg_auth_type = 'Trust'
|
||||
http_auth_type = 'Trust'
|
||||
auth_type = 'Trust'
|
||||
|
||||
[[safekeepers]]
|
||||
id = 1
|
||||
|
||||
@@ -53,15 +53,14 @@ listen_addr = '{DEFAULT_BROKER_ADDR}'
|
||||
id = {DEFAULT_PAGESERVER_ID}
|
||||
listen_pg_addr = '{DEFAULT_PAGESERVER_PG_ADDR}'
|
||||
listen_http_addr = '{DEFAULT_PAGESERVER_HTTP_ADDR}'
|
||||
pg_auth_type = '{trust_auth}'
|
||||
http_auth_type = '{trust_auth}'
|
||||
auth_type = '{pageserver_auth_type}'
|
||||
|
||||
[[safekeepers]]
|
||||
id = {DEFAULT_SAFEKEEPER_ID}
|
||||
pg_port = {DEFAULT_SAFEKEEPER_PG_PORT}
|
||||
http_port = {DEFAULT_SAFEKEEPER_HTTP_PORT}
|
||||
"#,
|
||||
trust_auth = AuthType::Trust,
|
||||
pageserver_auth_type = AuthType::Trust,
|
||||
)
|
||||
}
|
||||
|
||||
@@ -628,7 +627,7 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
|
||||
let node = cplane.nodes.get(&(tenant_id, node_name.to_string()));
|
||||
|
||||
let auth_token = if matches!(env.pageserver.pg_auth_type, AuthType::NeonJWT) {
|
||||
let auth_token = if matches!(env.pageserver.auth_type, AuthType::NeonJWT) {
|
||||
let claims = Claims::new(Some(tenant_id), Scope::Tenant);
|
||||
|
||||
Some(env.generate_auth_token(&claims)?)
|
||||
|
||||
@@ -11,6 +11,7 @@ use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use postgres_backend::AuthType;
|
||||
use utils::{
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
@@ -87,6 +88,7 @@ impl ComputeControlPlane {
|
||||
address: SocketAddr::new("127.0.0.1".parse().unwrap(), port),
|
||||
env: self.env.clone(),
|
||||
pageserver: Arc::clone(&self.pageserver),
|
||||
is_test: false,
|
||||
timeline_id,
|
||||
lsn,
|
||||
tenant_id,
|
||||
@@ -95,7 +97,7 @@ impl ComputeControlPlane {
|
||||
});
|
||||
|
||||
node.create_pgdata()?;
|
||||
node.setup_pg_conf()?;
|
||||
node.setup_pg_conf(self.env.pageserver.auth_type)?;
|
||||
|
||||
self.nodes
|
||||
.insert((tenant_id, node.name.clone()), Arc::clone(&node));
|
||||
@@ -112,6 +114,7 @@ pub struct PostgresNode {
|
||||
name: String,
|
||||
pub env: LocalEnv,
|
||||
pageserver: Arc<PageServerNode>,
|
||||
is_test: bool,
|
||||
pub timeline_id: TimelineId,
|
||||
pub lsn: Option<Lsn>, // if it's a read-only node. None for primary
|
||||
pub tenant_id: TenantId,
|
||||
@@ -169,6 +172,7 @@ impl PostgresNode {
|
||||
name,
|
||||
env: env.clone(),
|
||||
pageserver: Arc::clone(pageserver),
|
||||
is_test: false,
|
||||
timeline_id,
|
||||
lsn: recovery_target_lsn,
|
||||
tenant_id,
|
||||
@@ -274,7 +278,7 @@ impl PostgresNode {
|
||||
|
||||
// Write postgresql.conf with default configuration
|
||||
// and PG_VERSION file to the data directory of a new node.
|
||||
fn setup_pg_conf(&self) -> Result<()> {
|
||||
fn setup_pg_conf(&self, auth_type: AuthType) -> Result<()> {
|
||||
let mut conf = PostgresConf::new();
|
||||
conf.append("max_wal_senders", "10");
|
||||
conf.append("wal_log_hints", "off");
|
||||
@@ -298,12 +302,29 @@ impl PostgresNode {
|
||||
let config = &self.pageserver.pg_connection_config;
|
||||
let (host, port) = (config.host(), config.port());
|
||||
|
||||
// NOTE: avoid spaces in connection string, because it is less error prone if we forward it somewhere.
|
||||
format!("postgresql://no_user@{host}:{port}")
|
||||
// Set up authentication
|
||||
//
|
||||
// $NEON_AUTH_TOKEN will be replaced with value from environment
|
||||
// variable during compute pg startup. It is done this way because
|
||||
// otherwise user will be able to retrieve the value using SHOW
|
||||
// command or pg_settings
|
||||
let password = if let AuthType::NeonJWT = auth_type {
|
||||
"$NEON_AUTH_TOKEN"
|
||||
} else {
|
||||
""
|
||||
};
|
||||
// NOTE avoiding spaces in connection string, because it is less error prone if we forward it somewhere.
|
||||
// Also note that not all parameters are supported here. Because in compute we substitute $NEON_AUTH_TOKEN
|
||||
// We parse this string and build it back with token from env var, and for simplicity rebuild
|
||||
// uses only needed variables namely host, port, user, password.
|
||||
format!("postgresql://no_user:{password}@{host}:{port}")
|
||||
};
|
||||
conf.append("shared_preload_libraries", "neon");
|
||||
conf.append_line("");
|
||||
conf.append("neon.pageserver_connstring", &pageserver_connstr);
|
||||
if let AuthType::NeonJWT = auth_type {
|
||||
conf.append("neon.safekeeper_token_env", "$NEON_AUTH_TOKEN");
|
||||
}
|
||||
conf.append("neon.tenant_id", &self.tenant_id.to_string());
|
||||
conf.append("neon.timeline_id", &self.timeline_id.to_string());
|
||||
if let Some(lsn) = self.lsn {
|
||||
@@ -426,8 +447,6 @@ impl PostgresNode {
|
||||
"DYLD_LIBRARY_PATH",
|
||||
self.env.pg_lib_dir(self.pg_version)?.to_str().unwrap(),
|
||||
);
|
||||
|
||||
// Pass authentication token used for the connections to pageserver and safekeepers
|
||||
if let Some(token) = auth_token {
|
||||
cmd.env("NEON_AUTH_TOKEN", token);
|
||||
}
|
||||
@@ -477,6 +496,10 @@ impl PostgresNode {
|
||||
self.pg_ctl(&["start"], auth_token)
|
||||
}
|
||||
|
||||
pub fn restart(&self, auth_token: &Option<String>) -> Result<()> {
|
||||
self.pg_ctl(&["restart"], auth_token)
|
||||
}
|
||||
|
||||
pub fn stop(&self, destroy: bool) -> Result<()> {
|
||||
// If we are going to destroy data directory,
|
||||
// use immediate shutdown mode, otherwise,
|
||||
@@ -507,4 +530,26 @@ impl PostgresNode {
|
||||
"postgres"
|
||||
)
|
||||
}
|
||||
|
||||
// XXX: cache that in control plane
|
||||
pub fn whoami(&self) -> String {
|
||||
let output = Command::new("whoami")
|
||||
.output()
|
||||
.expect("failed to execute whoami");
|
||||
|
||||
assert!(output.status.success(), "whoami failed");
|
||||
|
||||
String::from_utf8(output.stdout).unwrap().trim().to_string()
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for PostgresNode {
|
||||
// destructor to clean up state after test is done
|
||||
// XXX: we may detect failed test by setting some flag in catch_unwind()
|
||||
// and checking it here. But let just clean datadirs on start.
|
||||
fn drop(&mut self) {
|
||||
if self.is_test {
|
||||
let _ = self.stop(true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -18,7 +18,7 @@ use std::net::SocketAddr;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::process::{Command, Stdio};
|
||||
use utils::{
|
||||
auth::{encode_from_key_file, Claims},
|
||||
auth::{encode_from_key_file, Claims, Scope},
|
||||
id::{NodeId, TenantId, TenantTimelineId, TimelineId},
|
||||
};
|
||||
|
||||
@@ -110,14 +110,15 @@ impl NeonBroker {
|
||||
pub struct PageServerConf {
|
||||
// node id
|
||||
pub id: NodeId,
|
||||
|
||||
// Pageserver connection settings
|
||||
pub listen_pg_addr: String,
|
||||
pub listen_http_addr: String,
|
||||
|
||||
// auth type used for the PG and HTTP ports
|
||||
pub pg_auth_type: AuthType,
|
||||
pub http_auth_type: AuthType,
|
||||
// used to determine which auth type is used
|
||||
pub auth_type: AuthType,
|
||||
|
||||
// jwt auth token used for communication with pageserver
|
||||
pub auth_token: String,
|
||||
}
|
||||
|
||||
impl Default for PageServerConf {
|
||||
@@ -126,8 +127,8 @@ impl Default for PageServerConf {
|
||||
id: NodeId(0),
|
||||
listen_pg_addr: String::new(),
|
||||
listen_http_addr: String::new(),
|
||||
pg_auth_type: AuthType::Trust,
|
||||
http_auth_type: AuthType::Trust,
|
||||
auth_type: AuthType::Trust,
|
||||
auth_token: String::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -400,33 +401,48 @@ impl LocalEnv {
|
||||
|
||||
fs::create_dir(base_path)?;
|
||||
|
||||
// Generate keypair for JWT.
|
||||
//
|
||||
// The keypair is only needed if authentication is enabled in any of the
|
||||
// components. For convenience, we generate the keypair even if authentication
|
||||
// is not enabled, so that you can easily enable it after the initialization
|
||||
// step. However, if the key generation fails, we treat it as non-fatal if
|
||||
// authentication was not enabled.
|
||||
// generate keys for jwt
|
||||
// openssl genrsa -out private_key.pem 2048
|
||||
let private_key_path;
|
||||
if self.private_key_path == PathBuf::new() {
|
||||
match generate_auth_keys(
|
||||
base_path.join("auth_private_key.pem").as_path(),
|
||||
base_path.join("auth_public_key.pem").as_path(),
|
||||
) {
|
||||
Ok(()) => {
|
||||
self.private_key_path = PathBuf::from("auth_private_key.pem");
|
||||
}
|
||||
Err(e) => {
|
||||
if !self.auth_keys_needed() {
|
||||
eprintln!("Could not generate keypair for JWT authentication: {e}");
|
||||
eprintln!("Continuing anyway because authentication was not enabled");
|
||||
self.private_key_path = PathBuf::from("auth_private_key.pem");
|
||||
} else {
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
private_key_path = base_path.join("auth_private_key.pem");
|
||||
let keygen_output = Command::new("openssl")
|
||||
.arg("genrsa")
|
||||
.args(["-out", private_key_path.to_str().unwrap()])
|
||||
.arg("2048")
|
||||
.stdout(Stdio::null())
|
||||
.output()
|
||||
.context("failed to generate auth private key")?;
|
||||
if !keygen_output.status.success() {
|
||||
bail!(
|
||||
"openssl failed: '{}'",
|
||||
String::from_utf8_lossy(&keygen_output.stderr)
|
||||
);
|
||||
}
|
||||
self.private_key_path = PathBuf::from("auth_private_key.pem");
|
||||
|
||||
let public_key_path = base_path.join("auth_public_key.pem");
|
||||
// openssl rsa -in private_key.pem -pubout -outform PEM -out public_key.pem
|
||||
let keygen_output = Command::new("openssl")
|
||||
.arg("rsa")
|
||||
.args(["-in", private_key_path.to_str().unwrap()])
|
||||
.arg("-pubout")
|
||||
.args(["-outform", "PEM"])
|
||||
.args(["-out", public_key_path.to_str().unwrap()])
|
||||
.stdout(Stdio::null())
|
||||
.output()
|
||||
.context("failed to generate auth private key")?;
|
||||
if !keygen_output.status.success() {
|
||||
bail!(
|
||||
"openssl failed: '{}'",
|
||||
String::from_utf8_lossy(&keygen_output.stderr)
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
self.pageserver.auth_token =
|
||||
self.generate_auth_token(&Claims::new(None, Scope::PageServerApi))?;
|
||||
|
||||
fs::create_dir_all(self.pg_data_dirs_path())?;
|
||||
|
||||
for safekeeper in &self.safekeepers {
|
||||
@@ -435,12 +451,6 @@ impl LocalEnv {
|
||||
|
||||
self.persist_config(base_path)
|
||||
}
|
||||
|
||||
fn auth_keys_needed(&self) -> bool {
|
||||
self.pageserver.pg_auth_type == AuthType::NeonJWT
|
||||
|| self.pageserver.http_auth_type == AuthType::NeonJWT
|
||||
|| self.safekeepers.iter().any(|sk| sk.auth_enabled)
|
||||
}
|
||||
}
|
||||
|
||||
fn base_path() -> PathBuf {
|
||||
@@ -450,43 +460,6 @@ fn base_path() -> PathBuf {
|
||||
}
|
||||
}
|
||||
|
||||
/// Generate a public/private key pair for JWT authentication
|
||||
fn generate_auth_keys(private_key_path: &Path, public_key_path: &Path) -> anyhow::Result<()> {
|
||||
// Generate the key pair
|
||||
//
|
||||
// openssl genpkey -algorithm ed25519 -out auth_private_key.pem
|
||||
let keygen_output = Command::new("openssl")
|
||||
.arg("genpkey")
|
||||
.args(["-algorithm", "ed25519"])
|
||||
.args(["-out", private_key_path.to_str().unwrap()])
|
||||
.stdout(Stdio::null())
|
||||
.output()
|
||||
.context("failed to generate auth private key")?;
|
||||
if !keygen_output.status.success() {
|
||||
bail!(
|
||||
"openssl failed: '{}'",
|
||||
String::from_utf8_lossy(&keygen_output.stderr)
|
||||
);
|
||||
}
|
||||
// Extract the public key from the private key file
|
||||
//
|
||||
// openssl pkey -in auth_private_key.pem -pubout -out auth_public_key.pem
|
||||
let keygen_output = Command::new("openssl")
|
||||
.arg("pkey")
|
||||
.args(["-in", private_key_path.to_str().unwrap()])
|
||||
.arg("-pubout")
|
||||
.args(["-out", public_key_path.to_str().unwrap()])
|
||||
.output()
|
||||
.context("failed to extract public key from private key")?;
|
||||
if !keygen_output.status.success() {
|
||||
bail!(
|
||||
"openssl failed: '{}'",
|
||||
String::from_utf8_lossy(&keygen_output.stderr)
|
||||
);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
@@ -82,8 +82,15 @@ impl PageServerNode {
|
||||
let (host, port) = parse_host_port(&env.pageserver.listen_pg_addr)
|
||||
.expect("Unable to parse listen_pg_addr");
|
||||
let port = port.unwrap_or(5432);
|
||||
let password = if env.pageserver.auth_type == AuthType::NeonJWT {
|
||||
Some(env.pageserver.auth_token.clone())
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
Self {
|
||||
pg_connection_config: PgConnectionConfig::new_host_port(host, port),
|
||||
pg_connection_config: PgConnectionConfig::new_host_port(host, port)
|
||||
.set_password(password),
|
||||
env: env.clone(),
|
||||
http_client: Client::new(),
|
||||
http_base_url: format!("http://{}/v1", env.pageserver.listen_http_addr),
|
||||
@@ -99,32 +106,25 @@ impl PageServerNode {
|
||||
self.env.pg_distrib_dir_raw().display()
|
||||
);
|
||||
|
||||
let http_auth_type_param =
|
||||
format!("http_auth_type='{}'", self.env.pageserver.http_auth_type);
|
||||
let authg_type_param = format!("auth_type='{}'", self.env.pageserver.auth_type);
|
||||
let listen_http_addr_param = format!(
|
||||
"listen_http_addr='{}'",
|
||||
self.env.pageserver.listen_http_addr
|
||||
);
|
||||
|
||||
let pg_auth_type_param = format!("pg_auth_type='{}'", self.env.pageserver.pg_auth_type);
|
||||
let listen_pg_addr_param =
|
||||
format!("listen_pg_addr='{}'", self.env.pageserver.listen_pg_addr);
|
||||
|
||||
let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url());
|
||||
|
||||
let mut overrides = vec![
|
||||
id,
|
||||
pg_distrib_dir_param,
|
||||
http_auth_type_param,
|
||||
pg_auth_type_param,
|
||||
authg_type_param,
|
||||
listen_http_addr_param,
|
||||
listen_pg_addr_param,
|
||||
broker_endpoint_param,
|
||||
];
|
||||
|
||||
if self.env.pageserver.http_auth_type != AuthType::Trust
|
||||
|| self.env.pageserver.pg_auth_type != AuthType::Trust
|
||||
{
|
||||
if self.env.pageserver.auth_type != AuthType::Trust {
|
||||
overrides.push("auth_validation_public_key_path='auth_public_key.pem'".to_owned());
|
||||
}
|
||||
overrides
|
||||
@@ -247,10 +247,7 @@ impl PageServerNode {
|
||||
}
|
||||
|
||||
fn pageserver_env_variables(&self) -> anyhow::Result<Vec<(String, String)>> {
|
||||
// FIXME: why is this tied to pageserver's auth type? Whether or not the safekeeper
|
||||
// needs a token, and how to generate that token, seems independent to whether
|
||||
// the pageserver requires a token in incoming requests.
|
||||
Ok(if self.env.pageserver.http_auth_type != AuthType::Trust {
|
||||
Ok(if self.env.pageserver.auth_type != AuthType::Trust {
|
||||
// Generate a token to connect from the pageserver to a safekeeper
|
||||
let token = self
|
||||
.env
|
||||
@@ -273,30 +270,27 @@ impl PageServerNode {
|
||||
background_process::stop_process(immediate, "pageserver", &self.pid_file())
|
||||
}
|
||||
|
||||
pub fn page_server_psql_client(&self) -> anyhow::Result<postgres::Client> {
|
||||
let mut config = self.pg_connection_config.clone();
|
||||
if self.env.pageserver.pg_auth_type == AuthType::NeonJWT {
|
||||
let token = self
|
||||
.env
|
||||
.generate_auth_token(&Claims::new(None, Scope::PageServerApi))?;
|
||||
config = config.set_password(Some(token));
|
||||
}
|
||||
Ok(config.connect_no_tls()?)
|
||||
pub fn page_server_psql(&self, sql: &str) -> Vec<postgres::SimpleQueryMessage> {
|
||||
let mut client = self.pg_connection_config.connect_no_tls().unwrap();
|
||||
|
||||
println!("Pageserver query: '{sql}'");
|
||||
client.simple_query(sql).unwrap()
|
||||
}
|
||||
|
||||
fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> anyhow::Result<RequestBuilder> {
|
||||
pub fn page_server_psql_client(&self) -> result::Result<postgres::Client, postgres::Error> {
|
||||
self.pg_connection_config.connect_no_tls()
|
||||
}
|
||||
|
||||
fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> RequestBuilder {
|
||||
let mut builder = self.http_client.request(method, url);
|
||||
if self.env.pageserver.http_auth_type == AuthType::NeonJWT {
|
||||
let token = self
|
||||
.env
|
||||
.generate_auth_token(&Claims::new(None, Scope::PageServerApi))?;
|
||||
builder = builder.bearer_auth(token)
|
||||
if self.env.pageserver.auth_type == AuthType::NeonJWT {
|
||||
builder = builder.bearer_auth(&self.env.pageserver.auth_token)
|
||||
}
|
||||
Ok(builder)
|
||||
builder
|
||||
}
|
||||
|
||||
pub fn check_status(&self) -> Result<()> {
|
||||
self.http_request(Method::GET, format!("{}/status", self.http_base_url))?
|
||||
self.http_request(Method::GET, format!("{}/status", self.http_base_url))
|
||||
.send()?
|
||||
.error_from_body()?;
|
||||
Ok(())
|
||||
@@ -304,7 +298,7 @@ impl PageServerNode {
|
||||
|
||||
pub fn tenant_list(&self) -> Result<Vec<TenantInfo>> {
|
||||
Ok(self
|
||||
.http_request(Method::GET, format!("{}/tenant", self.http_base_url))?
|
||||
.http_request(Method::GET, format!("{}/tenant", self.http_base_url))
|
||||
.send()?
|
||||
.error_from_body()?
|
||||
.json()?)
|
||||
@@ -358,21 +352,11 @@ impl PageServerNode {
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'trace_read_requests' as bool")?,
|
||||
eviction_policy: settings
|
||||
.get("eviction_policy")
|
||||
.map(|x| serde_json::from_str(x))
|
||||
.transpose()
|
||||
.context("Failed to parse 'eviction_policy' json")?,
|
||||
min_resident_size_override: settings
|
||||
.remove("min_resident_size_override")
|
||||
.map(|x| x.parse::<u64>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'min_resident_size_override' as integer")?,
|
||||
};
|
||||
if !settings.is_empty() {
|
||||
bail!("Unrecognized tenant settings: {settings:?}")
|
||||
}
|
||||
self.http_request(Method::POST, format!("{}/tenant", self.http_base_url))?
|
||||
self.http_request(Method::POST, format!("{}/tenant", self.http_base_url))
|
||||
.json(&request)
|
||||
.send()?
|
||||
.error_from_body()?
|
||||
@@ -389,7 +373,7 @@ impl PageServerNode {
|
||||
}
|
||||
|
||||
pub fn tenant_config(&self, tenant_id: TenantId, settings: HashMap<&str, &str>) -> Result<()> {
|
||||
self.http_request(Method::PUT, format!("{}/tenant/config", self.http_base_url))?
|
||||
self.http_request(Method::PUT, format!("{}/tenant/config", self.http_base_url))
|
||||
.json(&TenantConfigRequest {
|
||||
tenant_id,
|
||||
checkpoint_distance: settings
|
||||
@@ -440,11 +424,6 @@ impl PageServerNode {
|
||||
.map(|x| serde_json::from_str(x))
|
||||
.transpose()
|
||||
.context("Failed to parse 'eviction_policy' json")?,
|
||||
min_resident_size_override: settings
|
||||
.get("min_resident_size_override")
|
||||
.map(|x| x.parse::<u64>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'min_resident_size_override' as an integer")?,
|
||||
})
|
||||
.send()?
|
||||
.error_from_body()?;
|
||||
@@ -457,7 +436,7 @@ impl PageServerNode {
|
||||
.http_request(
|
||||
Method::GET,
|
||||
format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id),
|
||||
)?
|
||||
)
|
||||
.send()?
|
||||
.error_from_body()?
|
||||
.json()?;
|
||||
@@ -476,7 +455,7 @@ impl PageServerNode {
|
||||
self.http_request(
|
||||
Method::POST,
|
||||
format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id),
|
||||
)?
|
||||
)
|
||||
.json(&TimelineCreateRequest {
|
||||
new_timeline_id,
|
||||
ancestor_start_lsn,
|
||||
@@ -513,7 +492,7 @@ impl PageServerNode {
|
||||
pg_wal: Option<(Lsn, PathBuf)>,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut client = self.page_server_psql_client()?;
|
||||
let mut client = self.pg_connection_config.connect_no_tls().unwrap();
|
||||
|
||||
// Init base reader
|
||||
let (start_lsn, base_tarfile_path) = base;
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
use std::io::Write;
|
||||
use std::path::PathBuf;
|
||||
use std::process::Child;
|
||||
use std::sync::Arc;
|
||||
use std::{io, result};
|
||||
|
||||
use anyhow::Context;
|
||||
@@ -10,6 +11,7 @@ use reqwest::{IntoUrl, Method};
|
||||
use thiserror::Error;
|
||||
use utils::{http::error::HttpErrorBody, id::NodeId};
|
||||
|
||||
use crate::pageserver::PageServerNode;
|
||||
use crate::{
|
||||
background_process,
|
||||
local_env::{LocalEnv, SafekeeperConf},
|
||||
@@ -63,10 +65,14 @@ pub struct SafekeeperNode {
|
||||
pub env: LocalEnv,
|
||||
pub http_client: Client,
|
||||
pub http_base_url: String,
|
||||
|
||||
pub pageserver: Arc<PageServerNode>,
|
||||
}
|
||||
|
||||
impl SafekeeperNode {
|
||||
pub fn from_env(env: &LocalEnv, conf: &SafekeeperConf) -> SafekeeperNode {
|
||||
let pageserver = Arc::new(PageServerNode::from_env(env));
|
||||
|
||||
SafekeeperNode {
|
||||
id: conf.id,
|
||||
conf: conf.clone(),
|
||||
@@ -74,6 +80,7 @@ impl SafekeeperNode {
|
||||
env: env.clone(),
|
||||
http_client: Client::new(),
|
||||
http_base_url: format!("http://127.0.0.1:{}/v1", conf.http_port),
|
||||
pageserver,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -108,10 +115,6 @@ impl SafekeeperNode {
|
||||
let datadir = self.datadir_path();
|
||||
|
||||
let id_string = id.to_string();
|
||||
// TODO: add availability_zone to the config.
|
||||
// Right now we just specify any value here and use it to check metrics in tests.
|
||||
let availability_zone = format!("sk-{}", id_string);
|
||||
|
||||
let mut args = vec![
|
||||
"-D",
|
||||
datadir.to_str().with_context(|| {
|
||||
@@ -123,8 +126,6 @@ impl SafekeeperNode {
|
||||
&listen_pg,
|
||||
"--listen-http",
|
||||
&listen_http,
|
||||
"--availability-zone",
|
||||
&availability_zone,
|
||||
];
|
||||
if !self.conf.sync {
|
||||
args.push("--no-sync");
|
||||
|
||||
@@ -160,7 +160,6 @@ services:
|
||||
build:
|
||||
context: ./compute_wrapper/
|
||||
args:
|
||||
- REPOSITORY=${REPOSITORY:-neondatabase}
|
||||
- COMPUTE_IMAGE=compute-node-v${PG_VERSION:-14}
|
||||
- TAG=${TAG:-latest}
|
||||
- http_proxy=$http_proxy
|
||||
|
||||
@@ -29,22 +29,15 @@ These components should not have access to the private key and may only get toke
|
||||
The key pair is generated once for an installation of compute/pageserver/safekeeper, e.g. by `neon_local init`.
|
||||
There is currently no way to rotate the key without bringing down all components.
|
||||
|
||||
### Best practices
|
||||
|
||||
See [RFC 8725: JSON Web Token Best Current Practices](https://www.rfc-editor.org/rfc/rfc8725)
|
||||
|
||||
|
||||
### Token format
|
||||
|
||||
The JWT tokens in Neon use "EdDSA" as the algorithm (defined in [RFC8037](https://www.rfc-editor.org/rfc/rfc8037)).
|
||||
|
||||
Example:
|
||||
The JWT tokens in Neon use RSA as the algorithm. Example:
|
||||
|
||||
Header:
|
||||
|
||||
```
|
||||
{
|
||||
"alg": "EdDSA",
|
||||
"alg": "RS512", # RS256, RS384, or RS512
|
||||
"typ": "JWT"
|
||||
}
|
||||
```
|
||||
@@ -75,8 +68,8 @@ Currently also used for connection from any pageserver to any safekeeper.
|
||||
CLI generates a key pair during call to `neon_local init` with the following commands:
|
||||
|
||||
```bash
|
||||
openssl genpkey -algorithm ed25519 -out auth_private_key.pem
|
||||
openssl pkey -in auth_private_key.pem -pubout -out auth_public_key.pem
|
||||
openssl genrsa -out auth_private_key.pem 2048
|
||||
openssl rsa -in auth_private_key.pem -pubout -outform PEM -out auth_public_key.pem
|
||||
```
|
||||
|
||||
Configuration files for all components point to `public_key.pem` for JWT validation.
|
||||
@@ -106,22 +99,20 @@ Their authentication is just plain PostgreSQL authentication and out of scope fo
|
||||
There is no administrative API except those provided by PostgreSQL.
|
||||
|
||||
#### Outgoing connections
|
||||
Compute connects to Pageserver for getting pages. The connection string is
|
||||
configured by the `neon.pageserver_connstring` PostgreSQL GUC,
|
||||
e.g. `postgresql://no_user@localhost:15028`. If the `$NEON_AUTH_TOKEN`
|
||||
environment variable is set, it is used as the password for the connection. (The
|
||||
pageserver uses JWT tokens for authentication, so the password is really a
|
||||
token.)
|
||||
Compute connects to Pageserver for getting pages.
|
||||
The connection string is configured by the `neon.pageserver_connstring` PostgreSQL GUC, e.g. `postgresql://no_user:$NEON_AUTH_TOKEN@localhost:15028`.
|
||||
The environment variable inside the connection string is substituted with
|
||||
the JWT token.
|
||||
|
||||
Compute connects to Safekeepers to write and commit data. The list of safekeeper
|
||||
addresses is given in the `neon.safekeepers` GUC. The connections to the
|
||||
safekeepers take the password from the `$NEON_AUTH_TOKEN` environment
|
||||
variable, if set.
|
||||
Compute connects to Safekeepers to write and commit data.
|
||||
The token is the same for all safekeepers.
|
||||
It's stored in an environment variable, whose name is configured
|
||||
by the `neon.safekeeper_token_env` PostgreSQL GUC.
|
||||
If the GUC is unset, no token is passed.
|
||||
|
||||
The `compute_ctl` binary that runs before the PostgreSQL server, and launches
|
||||
PostgreSQL, also makes a connection to the pageserver. It uses it to fetch the
|
||||
initial "base backup" dump, to initialize the PostgreSQL data directory. It also
|
||||
uses `$NEON_AUTH_TOKEN` as the password for the connection.
|
||||
Note that both tokens can be (and typically are) the same;
|
||||
the scope is the tenant and the token is usually passed through the
|
||||
`$NEON_AUTH_TOKEN` environment variable.
|
||||
|
||||
### Pageserver
|
||||
#### Overview
|
||||
@@ -146,12 +137,10 @@ Each compute should present a token valid for the timeline's tenant.
|
||||
Pageserver also has HTTP API: some parts are per-tenant,
|
||||
some parts are server-wide, these are different scopes.
|
||||
|
||||
Authentication can be enabled separately for the HTTP mgmt API, and
|
||||
for the libpq connections from compute. The `http_auth_type` and
|
||||
`pg_auth_type` configuration variables in Pageserver's config may
|
||||
have one of these values:
|
||||
The `auth_type` configuration variable in Pageserver's config may have
|
||||
either of three values:
|
||||
|
||||
* `Trust` removes all authentication.
|
||||
* `Trust` removes all authentication. The outdated `MD5` value does likewise
|
||||
* `NeonJWT` enables JWT validation.
|
||||
Tokens are validated using the public key which lies in a PEM file
|
||||
specified in the `auth_validation_public_key_path` config.
|
||||
|
||||
@@ -37,9 +37,9 @@ You can specify version of neon cluster using following environment values.
|
||||
- PG_VERSION: postgres version for compute (default is 14)
|
||||
- TAG: the tag version of [docker image](https://registry.hub.docker.com/r/neondatabase/neon/tags) (default is latest), which is tagged in [CI test](/.github/workflows/build_and_test.yml)
|
||||
```
|
||||
$ cd docker-compose/
|
||||
$ cd docker-compose/docker-compose.yml
|
||||
$ docker-compose down # remove the conainers if exists
|
||||
$ PG_VERSION=15 TAG=2937 docker-compose up --build -d # You can specify the postgres and image version
|
||||
$ PG_VERSION=15 TAG=2221 docker-compose up --build -d # You can specify the postgres and image version
|
||||
Creating network "dockercompose_default" with the default driver
|
||||
Creating docker-compose_storage_broker_1 ... done
|
||||
(...omit...)
|
||||
|
||||
@@ -1,269 +0,0 @@
|
||||
# Deleting pageserver part of tenants data from s3
|
||||
|
||||
Created on 08.03.23
|
||||
|
||||
## Motivation
|
||||
|
||||
Currently we dont delete pageserver part of the data from s3 when project is deleted. (The same is true for safekeepers, but this outside of the scope of this RFC).
|
||||
|
||||
This RFC aims to spin a discussion to come to a robust deletion solution that wont put us in into a corner for features like postponed deletion (when we keep data for user to be able to restore a project if it was deleted by accident)
|
||||
|
||||
## Summary
|
||||
|
||||
TLDR; There are two options, one based on control plane issuing actual delete requests to s3 and the other one that keeps s3 stuff bound to pageserver. Each one has its pros and cons.
|
||||
|
||||
The decision is to stick with pageserver centric approach. For motivation see [Decision](#decision).
|
||||
|
||||
## Components
|
||||
|
||||
pageserver, control-plane
|
||||
|
||||
## Requirements
|
||||
|
||||
Deletion should successfully finish (eventually) without leaving dangling files in presense of:
|
||||
|
||||
- component restarts
|
||||
- component outage
|
||||
- pageserver loss
|
||||
|
||||
## Proposed implementation
|
||||
|
||||
Before the options are discussed, note that deletion can be quite long process. For deletion from s3 the obvious choice is [DeleteObjects](https://docs.aws.amazon.com/AmazonS3/latest/API/API_DeleteObjects.html) API call. It allows to batch deletion of up to 1k objects in one API call. So deletion operation linearly depends on number of layer files.
|
||||
|
||||
Another design limitation is that there is no cheap `mv` operation available for s3. `mv` from `aws s3 mv` uses `copy(src, dst) + delete(src)`. So `mv`-like operation is not feasible as a building block because it actually amplifies the problem with both duration and resulting cost of the operation.
|
||||
|
||||
The case when there are multiple pageservers handling the same tenants is largely out of scope of the RFC. We still consider case with migration from one PS to another, but do not consider case when tenant exists on multiple pageservers for extended period of time. The case with multiple pageservers can be reduced to case with one pageservers by calling detach on all pageservers except the last one, for it actual delete needs to be called.
|
||||
|
||||
For simplicity lets look into deleting tenants. Differences in deletion process between tenants and timelines are mentioned in paragraph ["Differences between tenants and timelines"](#differences-between-tenants-and-timelines)
|
||||
|
||||
### 1. Pageserver owns deletion machinery
|
||||
|
||||
#### The sequence
|
||||
|
||||
TLDR; With this approach control plane needs to call delete on a tenant and poll for progress. As much as possible is handled on pageserver. Lets see the sequence.
|
||||
|
||||
Happy path:
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
autonumber
|
||||
participant CP as Control Plane
|
||||
participant PS as Pageserver
|
||||
participant S3
|
||||
|
||||
CP->>PS: Delete tenant
|
||||
PS->>S3: Create deleted mark file at <br> /tenant/meta/deleted
|
||||
PS->>PS: Create deleted mark file locally
|
||||
PS->>CP: Accepted
|
||||
PS->>PS: delete local files other than deleted mark
|
||||
loop Delete layers for each timeline
|
||||
PS->>S3: delete(..)
|
||||
CP->>PS: Finished?
|
||||
PS->>CP: False
|
||||
end
|
||||
PS->>S3: Delete mark file
|
||||
PS->>PS: Delete local mark file
|
||||
|
||||
loop Poll for status
|
||||
CP->>PS: Finished?
|
||||
PS->>CP: True or False
|
||||
end
|
||||
```
|
||||
|
||||
Why two mark files?
|
||||
Remote one is needed for cases when pageserver is lost during deletion so other pageserver can learn the deletion from s3 during attach.
|
||||
|
||||
Why local mark file is needed?
|
||||
|
||||
If we dont have one, we have two choices, delete local data before deleting the remote part or do that after.
|
||||
|
||||
If we delete local data before remote then during restart pageserver wont pick up remote tenant at all because nothing is available locally (pageserver looks for remote conuterparts of locally available tenants).
|
||||
|
||||
If we delete local data after remote then at the end of the sequence when remote mark file is deleted if pageserver restart happens then the state is the same to situation when pageserver just missing data on remote without knowing the fact that this data is intended to be deleted. In this case the current behavior is upload everything local-only to remote.
|
||||
|
||||
Thus we need local record of tenant being deleted as well.
|
||||
|
||||
##### Handle pageserver crashes
|
||||
|
||||
Lets explore sequences with various crash points.
|
||||
|
||||
Pageserver crashes before `deleted` mark file is persisted in s3:
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
autonumber
|
||||
participant CP as Control Plane
|
||||
participant PS as Pageserver
|
||||
participant S3
|
||||
|
||||
CP->>PS: Delete tenant
|
||||
note over PS: Crash point 1.
|
||||
CP->>PS: Retry delete request
|
||||
|
||||
PS->>S3: Create deleted mark file at <br> /tenant/meta/deleted
|
||||
PS->>PS: Create deleted mark file locally
|
||||
|
||||
PS->>CP: Accepted
|
||||
|
||||
PS->>PS: delete local files other than deleted mark
|
||||
|
||||
loop Delete layers for each timeline
|
||||
PS->>S3: delete(..)
|
||||
CP->>PS: Finished?
|
||||
PS->>CP: False
|
||||
end
|
||||
PS->>S3: Delete mark file
|
||||
PS->>PS: Delete local mark file
|
||||
|
||||
CP->>PS: Finished?
|
||||
PS->>CP: True
|
||||
```
|
||||
|
||||
Pageserver crashed when deleted mark was about to be persisted in s3, before Control Plane gets a response:
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
autonumber
|
||||
participant CP as Control Plane
|
||||
participant PS as Pageserver
|
||||
participant S3
|
||||
|
||||
CP->>PS: Delete tenant
|
||||
PS->>S3: Create deleted mark file at <br> /tenant/meta/deleted
|
||||
|
||||
note over PS: Crash point 2.
|
||||
note over PS: During startup we reconcile <br> with remote and see <br> whether the remote mark exists
|
||||
alt Remote mark exists
|
||||
PS->>PS: create local mark if its missing
|
||||
PS->>PS: delete local files other than deleted mark
|
||||
loop Delete layers for each timeline
|
||||
PS->>S3: delete(..)
|
||||
end
|
||||
|
||||
note over CP: Eventually console should <br> retry delete request
|
||||
|
||||
CP->>PS: Retry delete tenant
|
||||
PS->>CP: Not modified
|
||||
else Mark is missing
|
||||
note over PS: Continue to operate the tenant as if deletion didnt happen
|
||||
|
||||
note over CP: Eventually console should <br> retry delete request
|
||||
|
||||
CP->>PS: Retry delete tenant
|
||||
PS->>S3: Create deleted mark file at <br> /tenant/meta/deleted
|
||||
PS->>CP: Delete tenant
|
||||
end
|
||||
|
||||
PS->>PS: Continue with layer file deletions
|
||||
loop Delete layers for each timeline
|
||||
PS->>S3: delete(..)
|
||||
CP->>PS: Finished?
|
||||
PS->>CP: False
|
||||
end
|
||||
|
||||
PS->>S3: Delete mark file
|
||||
PS->>PS: Delete local mark file
|
||||
|
||||
CP->>PS: Finished?
|
||||
PS->>CP: True
|
||||
```
|
||||
|
||||
Similar sequence applies when both local and remote marks were persisted but Control Plane still didnt receive a response.
|
||||
|
||||
If pageserver crashes after both mark files were deleted then it will reply to control plane status poll request with 404 which should be treated by control plane as success.
|
||||
|
||||
The same applies if pageserver crashes in the end, when remote mark is deleted but before local one gets deleted. In this case on restart pageserver moves forward with deletion of local mark and Control Plane will receive 404.
|
||||
|
||||
##### Differences between tenants and timelines
|
||||
|
||||
For timeline the sequence is the same with the following differences:
|
||||
|
||||
- remote delete mark file can be replaced with a boolean "deleted" flag in index_part.json
|
||||
- local deletion mark is not needed, because whole tenant is kept locally so situation described in motivation for local mark is impossible
|
||||
|
||||
##### Handle pageserver loss
|
||||
|
||||
If pageseserver is lost then the deleted tenant should be attached to different pageserver and delete request needs to be retried against new pageserver. Then attach logic is shared with one described for pageserver restarts (local deletion mark wont be available so needs to be created).
|
||||
|
||||
##### Restrictions for tenant that is in progress of being deleted
|
||||
|
||||
I propose to add another state to tenant/timeline - PendingDelete. This state shouldnt allow executing any operations aside from polling the deletion status.
|
||||
|
||||
#### Summary
|
||||
|
||||
Pros:
|
||||
|
||||
- Storage is not dependent on control plane. Storage can be restarted even if control plane is not working.
|
||||
- Allows for easier dogfooding, console can use Neon backed database as primary operational data store. If storage depends on control plane and control plane depends on storage we're stuck.
|
||||
- No need to share inner s3 workings with control plane. Pageserver presents api contract and S3 paths are not part of this contract.
|
||||
- No need to pass list of alive timelines to attach call. This will be solved by pageserver observing deleted flag. See
|
||||
|
||||
Cons:
|
||||
|
||||
- Logic is a tricky, needs good testing
|
||||
- Anything else?
|
||||
|
||||
### 2. Control plane owns deletion machinery
|
||||
|
||||
In this case the only action performed on pageserver is removal of local files.
|
||||
|
||||
Everything else is done by control plane. The steps are as follows:
|
||||
|
||||
1. Control plane marks tenant as "delete pending" in its database
|
||||
2. It lists the s3 for all the files and repeatedly calls delete until nothing is left behind
|
||||
3. When no files are left marks deletion as completed
|
||||
|
||||
In case of restart it selects all tenants marked as "delete pending" and continues the deletion.
|
||||
|
||||
For tenants it is simple. For timelines there are caveats.
|
||||
|
||||
Assume that the same workflow is used for timelines.
|
||||
|
||||
If a tenant gets relocated during timeline deletion the attach call with its current logic will pick up deleted timeline in its half deleted state.
|
||||
|
||||
Available options:
|
||||
|
||||
- require list of alive timelines to be passed to attach call
|
||||
- use the same schema with flag in index_part.json (again part of the caveats around pageserver restart applies). In this case nothing stops pageserver from implementing deletion inside if we already have these deletion marks.
|
||||
|
||||
With first option the following problem becomes apparent:
|
||||
|
||||
Who is the source of truth regarding timeline liveness?
|
||||
|
||||
Imagine:
|
||||
PS1 fails.
|
||||
PS2 gets assigned the tenant.
|
||||
New branch gets created
|
||||
PS1 starts up (is it possible or we just recycle it?)
|
||||
PS1 is unaware of the new branch. It can either fall back to s3 ls, or ask control plane.
|
||||
|
||||
So here comes the dependency of storage on control plane. During restart storage needs to know which timelines are valid for operation. If there is nothing on s3 that can answer that question storage neeeds to ask control plane.
|
||||
|
||||
### Summary
|
||||
|
||||
Cons:
|
||||
|
||||
- Potential thundering herd-like problem during storage restart (requests to control plane)
|
||||
- Potential increase in storage startup time (additional request to control plane)
|
||||
- Storage startup starts to depend on console
|
||||
- Erroneous attach call can attach tenant in half deleted state
|
||||
|
||||
Pros:
|
||||
|
||||
- Easier to reason about if you dont have to account for pageserver restarts
|
||||
|
||||
### Extra notes
|
||||
|
||||
There was a concern that having deletion code in pageserver is a littlebit scary, but we need to have this code somewhere. So to me it is equally scary to have that in whatever place it ends up at.
|
||||
|
||||
Delayed deletion can be done with both approaches. As discussed with Anna (@stepashka) this is only relevant for tenants (projects) not for timelines. For first approach detach can be called immediately and deletion can be done later with attach + delete. With second approach control plane needs to start the deletion whenever necessary.
|
||||
|
||||
## Decision
|
||||
|
||||
After discussion in comments I see that we settled on two options (though a bit different from ones described in rfc). First one is the same - pageserver owns as much as possible. The second option is that pageserver owns markers thing, but actual deletion happens in control plane by repeatedly calling ls + delete.
|
||||
|
||||
To my mind the only benefit of the latter approach is possible code reuse between safekeepers and pageservers. Otherwise poking around integrating s3 library into control plane, configuring shared knowledge abouth paths in s3 - are the downsides. Another downside of relying on control plane is the testing process. Control plane resides in different repository so it is quite hard to test pageserver related changes there. e2e test suite there doesnt support shutting down pageservers, which are separate docker containers there instead of just processes.
|
||||
|
||||
With pageserver owning everything we still give the retry logic to control plane but its easier to duplicate if needed compared to sharing inner s3 workings. We will have needed tests for retry logic in neon repo.
|
||||
|
||||
So the decision is to proceed with pageserver centric approach.
|
||||
@@ -115,12 +115,6 @@ pub struct TenantCreateRequest {
|
||||
pub lagging_wal_timeout: Option<String>,
|
||||
pub max_lsn_wal_lag: Option<NonZeroU64>,
|
||||
pub trace_read_requests: Option<bool>,
|
||||
// We defer the parsing of the eviction_policy field to the request handler.
|
||||
// Otherwise we'd have to move the types for eviction policy into this package.
|
||||
// We might do that once the eviction feature has stabilizied.
|
||||
// For now, this field is not even documented in the openapi_spec.yml.
|
||||
pub eviction_policy: Option<serde_json::Value>,
|
||||
pub min_resident_size_override: Option<u64>,
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
@@ -166,7 +160,6 @@ pub struct TenantConfigRequest {
|
||||
// We might do that once the eviction feature has stabilizied.
|
||||
// For now, this field is not even documented in the openapi_spec.yml.
|
||||
pub eviction_policy: Option<serde_json::Value>,
|
||||
pub min_resident_size_override: Option<u64>,
|
||||
}
|
||||
|
||||
impl TenantConfigRequest {
|
||||
@@ -187,7 +180,6 @@ impl TenantConfigRequest {
|
||||
max_lsn_wal_lag: None,
|
||||
trace_read_requests: None,
|
||||
eviction_policy: None,
|
||||
min_resident_size_override: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -349,7 +341,7 @@ pub enum InMemoryLayerInfo {
|
||||
pub enum HistoricLayerInfo {
|
||||
Delta {
|
||||
layer_file_name: String,
|
||||
layer_file_size: u64,
|
||||
layer_file_size: Option<u64>,
|
||||
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
lsn_start: Lsn,
|
||||
@@ -360,7 +352,7 @@ pub enum HistoricLayerInfo {
|
||||
},
|
||||
Image {
|
||||
layer_file_name: String,
|
||||
layer_file_size: u64,
|
||||
layer_file_size: Option<u64>,
|
||||
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
lsn_start: Lsn,
|
||||
|
||||
@@ -767,7 +767,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
|
||||
|
||||
let err_to_send_and_errcode = match &end {
|
||||
ServerInitiated(_) => Some((end.to_string(), SQLSTATE_SUCCESSFUL_COMPLETION)),
|
||||
Other(_) => Some((format!("{end:#}"), SQLSTATE_INTERNAL_ERROR)),
|
||||
Other(_) => Some((end.to_string(), SQLSTATE_INTERNAL_ERROR)),
|
||||
// Note: CopyFail in duplex copy is somewhat unexpected (at least to
|
||||
// PG walsender; evidently and per my docs reading client should
|
||||
// finish it with CopyDone). It is not a problem to recover from it
|
||||
|
||||
@@ -19,7 +19,6 @@ jsonwebtoken.workspace = true
|
||||
nix.workspace = true
|
||||
once_cell.workspace = true
|
||||
pin-project-lite.workspace = true
|
||||
regex.workspace = true
|
||||
routerify.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
|
||||
@@ -1,4 +1,7 @@
|
||||
// For details about authentication see docs/authentication.md
|
||||
//
|
||||
// TODO: use ed25519 keys
|
||||
// Relevant issue: https://github.com/Keats/jsonwebtoken/issues/162
|
||||
|
||||
use serde;
|
||||
use std::fs;
|
||||
@@ -6,15 +9,26 @@ use std::path::Path;
|
||||
|
||||
use anyhow::Result;
|
||||
use jsonwebtoken::{
|
||||
decode, encode, Algorithm, DecodingKey, EncodingKey, Header, TokenData, Validation,
|
||||
decode, encode, Algorithm, Algorithm::*, DecodingKey, EncodingKey, Header, TokenData,
|
||||
Validation,
|
||||
};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
|
||||
use crate::id::TenantId;
|
||||
|
||||
/// Algorithm to use. We require EdDSA.
|
||||
const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA;
|
||||
/// Algorithms accepted during validation.
|
||||
///
|
||||
/// Accept all RSA-based algorithms. We pass this list to jsonwebtoken::decode,
|
||||
/// which checks that the algorithm in the token is one of these.
|
||||
///
|
||||
/// XXX: It also fails the validation if there are any algorithms in this list that belong
|
||||
/// to different family than the token's algorithm. In other words, we can *not* list any
|
||||
/// non-RSA algorithms here, or the validation always fails with InvalidAlgorithm error.
|
||||
const ACCEPTED_ALGORITHMS: &[Algorithm] = &[RS256, RS384, RS512];
|
||||
|
||||
/// Algorithm to use when generating a new token in [`encode_from_key_file`]
|
||||
const ENCODE_ALGORITHM: Algorithm = Algorithm::RS256;
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)]
|
||||
#[serde(rename_all = "lowercase")]
|
||||
@@ -55,7 +69,7 @@ pub struct JwtAuth {
|
||||
impl JwtAuth {
|
||||
pub fn new(decoding_key: DecodingKey) -> Self {
|
||||
let mut validation = Validation::default();
|
||||
validation.algorithms = vec![STORAGE_TOKEN_ALGORITHM];
|
||||
validation.algorithms = ACCEPTED_ALGORITHMS.into();
|
||||
// The default 'required_spec_claims' is 'exp'. But we don't want to require
|
||||
// expiration.
|
||||
validation.required_spec_claims = [].into();
|
||||
@@ -67,7 +81,7 @@ impl JwtAuth {
|
||||
|
||||
pub fn from_key_path(key_path: &Path) -> Result<Self> {
|
||||
let public_key = fs::read(key_path)?;
|
||||
Ok(Self::new(DecodingKey::from_ed_pem(&public_key)?))
|
||||
Ok(Self::new(DecodingKey::from_rsa_pem(&public_key)?))
|
||||
}
|
||||
|
||||
pub fn decode(&self, token: &str) -> Result<TokenData<Claims>> {
|
||||
@@ -85,8 +99,8 @@ impl std::fmt::Debug for JwtAuth {
|
||||
|
||||
// this function is used only for testing purposes in CLI e g generate tokens during init
|
||||
pub fn encode_from_key_file(claims: &Claims, key_data: &[u8]) -> Result<String> {
|
||||
let key = EncodingKey::from_ed_pem(key_data)?;
|
||||
Ok(encode(&Header::new(STORAGE_TOKEN_ALGORITHM), claims, &key)?)
|
||||
let key = EncodingKey::from_rsa_pem(key_data)?;
|
||||
Ok(encode(&Header::new(ENCODE_ALGORITHM), claims, &key)?)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -94,19 +108,49 @@ mod tests {
|
||||
use super::*;
|
||||
use std::str::FromStr;
|
||||
|
||||
// Generated with:
|
||||
// generated with:
|
||||
//
|
||||
// openssl genpkey -algorithm ed25519 -out ed25519-priv.pem
|
||||
// openssl pkey -in ed25519-priv.pem -pubout -out ed25519-pub.pem
|
||||
const TEST_PUB_KEY_ED25519: &[u8] = br#"
|
||||
// openssl genpkey -algorithm rsa -out storage-auth-priv.pem
|
||||
// openssl pkey -in storage-auth-priv.pem -pubout -out storage-auth-pub.pem
|
||||
const TEST_PUB_KEY_RSA: &[u8] = br#"
|
||||
-----BEGIN PUBLIC KEY-----
|
||||
MCowBQYDK2VwAyEARYwaNBayR+eGI0iXB4s3QxE3Nl2g1iWbr6KtLWeVD/w=
|
||||
MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAy6OZ+/kQXcueVJA/KTzO
|
||||
v4ljxylc/Kcb0sXWuXg1GB8k3nDA1gK66LFYToH0aTnqrnqG32Vu6wrhwuvqsZA7
|
||||
jQvP0ZePAbWhpEqho7EpNunDPcxZ/XDy5TQlB1P58F9I3lkJXDC+DsHYLuuzwhAv
|
||||
vo2MtWRdYlVHblCVLyZtANHhUMp2HUhgjHnJh5UrLIKOl4doCBxkM3rK0wjKsNCt
|
||||
M92PCR6S9rvYzldfeAYFNppBkEQrXt2CgUqZ4KaS4LXtjTRUJxljijA4HWffhxsr
|
||||
euRu3ufq8kVqie7fum0rdZZSkONmce0V0LesQ4aE2jB+2Sn48h6jb4dLXGWdq8TV
|
||||
wQIDAQAB
|
||||
-----END PUBLIC KEY-----
|
||||
"#;
|
||||
|
||||
const TEST_PRIV_KEY_ED25519: &[u8] = br#"
|
||||
const TEST_PRIV_KEY_RSA: &[u8] = br#"
|
||||
-----BEGIN PRIVATE KEY-----
|
||||
MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
|
||||
MIIEvQIBADANBgkqhkiG9w0BAQEFAASCBKcwggSjAgEAAoIBAQDLo5n7+RBdy55U
|
||||
kD8pPM6/iWPHKVz8pxvSxda5eDUYHyTecMDWArrosVhOgfRpOequeobfZW7rCuHC
|
||||
6+qxkDuNC8/Rl48BtaGkSqGjsSk26cM9zFn9cPLlNCUHU/nwX0jeWQlcML4Owdgu
|
||||
67PCEC++jYy1ZF1iVUduUJUvJm0A0eFQynYdSGCMecmHlSssgo6Xh2gIHGQzesrT
|
||||
CMqw0K0z3Y8JHpL2u9jOV194BgU2mkGQRCte3YKBSpngppLgte2NNFQnGWOKMDgd
|
||||
Z9+HGyt65G7e5+ryRWqJ7t+6bSt1llKQ42Zx7RXQt6xDhoTaMH7ZKfjyHqNvh0tc
|
||||
ZZ2rxNXBAgMBAAECggEAVz3u4Wlx3o02dsoZlSQs+xf0PEX3RXKeU+1YMbtTG9Nz
|
||||
6yxpIQaoZrpbt76rJE2gwkFR+PEu1NmjoOuLb6j4KlQuI4AHz1auOoGSwFtM6e66
|
||||
K4aZ4x95oEJ3vqz2fkmEIWYJwYpMUmwvnuJx76kZm0xvROMLsu4QHS2+zCVtO5Tr
|
||||
hvS05IMVuZ2TdQBZw0+JaFdwXbgDjQnQGY5n9MoTWSx1a4s/FF4Eby65BbDutcpn
|
||||
Vt3jQAOmO1X2kbPeWSGuPJRzyUs7Kg8qfeglBIR3ppGP3vPYAdWX+ho00bmsVkSp
|
||||
Q8vjul6C3WiM+kjwDxotHSDgbl/xldAl7OqPh0bfAQKBgQDnycXuq14Vg8nZvyn9
|
||||
rTnvucO8RBz5P6G+FZ+44cAS2x79+85onARmMnm+9MKYLSMo8fOvsK034NDI68XM
|
||||
04QQ/vlfouvFklMTGJIurgEImTZbGCmlMYCvFyIxaEWixon8OpeI4rFe4Hmbiijh
|
||||
PxhxWg221AwvBS2sco8J/ylEkQKBgQDg6Rh2QYb/j0Wou1rJPbuy3NhHofd5Rq35
|
||||
4YV3f2lfVYcPrgRhwe3T9SVII7Dx8LfwzsX5TAlf48ESlI3Dzv40uOCDM+xdtBRI
|
||||
r96SfSm+jup6gsXU3AsdNkrRK3HoOG9Z/TkrUp213QAIlVnvIx65l4ckFMlpnPJ0
|
||||
lo1LDXZWMQKBgFArzjZ7N5OhfdO+9zszC3MLgdRAivT7OWqR+CjujIz5FYMr8Xzl
|
||||
WfAvTUTrS9Nu6VZkObFvHrrRG+YjBsuN7YQjbQXTSFGSBwH34bgbn2fl9pMTjHQC
|
||||
50uoaL9GHa/rlBaV/YvvPQJgCi/uXa1rMX0jdNLkDULGO8IF7cu7Yf7BAoGBAIUU
|
||||
J29BkpmAst0GDs/ogTlyR18LTR0rXyHt+UUd1MGeH859TwZw80JpWWf4BmkB4DTS
|
||||
hH3gKePdJY7S65ci0XNsuRupC4DeXuorde0DtkGU2tUmr9wlX0Ynq9lcdYfMbMa4
|
||||
eK1TsxG69JwfkxlWlIWITWRiEFM3lJa7xlrUWmLhAoGAFpKWF/hn4zYg3seU9gai
|
||||
EYHKSbhxA4mRb+F0/9IlCBPMCqFrL5yftUsYIh2XFKn8+QhO97Nmk8wJSK6TzQ5t
|
||||
ZaSRmgySrUUhx4nZ/MgqWCFv8VUbLM5MBzwxPKhXkSTfR4z2vLYLJwVY7Tb4kZtp
|
||||
8ismApXVGHpOCstzikV9W7k=
|
||||
-----END PRIVATE KEY-----
|
||||
"#;
|
||||
|
||||
@@ -117,7 +161,8 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
|
||||
scope: Scope::Tenant,
|
||||
};
|
||||
|
||||
// A test token containing the following payload, signed using TEST_PRIV_KEY_ED25519:
|
||||
// Here are tokens containing the following payload, signed using TEST_PRIV_KEY_RSA
|
||||
// using RS512, RS384 and RS256 algorithms:
|
||||
//
|
||||
// ```
|
||||
// {
|
||||
@@ -129,13 +174,21 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
|
||||
// }
|
||||
// ```
|
||||
//
|
||||
let encoded_eddsa = "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJleHAiOjE3MDkyMDA4NzksImlhdCI6MTY3ODQ0MjQ3OX0.U3eA8j-uU-JnhzeO3EDHRuXLwkAUFCPxtGHEgw6p7Ccc3YRbFs2tmCdbD9PZEXP-XsxSeBQi1FY0YPcT3NXADw";
|
||||
// These were encoded with the online debugger at https://jwt.io
|
||||
//
|
||||
let encoded_rs512 = "eyJhbGciOiJSUzUxMiIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJleHAiOjE3MDkyMDA4NzksImlhdCI6MTY3ODQ0MjQ3OX0.QmqfteDQmDGoxQ5EFkasbt35Lx0W0Nh63muQnYZvFq93DSh4ZbOG9Mc4yaiXZoiS5HgeKtFKv3mbWkDqjz3En06aY17hWwguBtAsGASX48lYeCPADYGlGAuaWnOnVRwe3iiOC7tvPFvwX_45S84X73sNUXyUiXv6nLdcDqVXudtNrGST_DnZDnjuUJX11w7sebtKqQQ8l9-iGHiXOl5yevpMCoB1OcTWcT6DfDtffoNuMHDC3fyhmEGG5oKAt1qBybqAIiyC9-UBAowRZXhdfxrzUl-I9jzKWvk85c5ulhVRwbPeP6TTTlPKwFzBNHg1i2U-1GONew5osQ3aoptwsA";
|
||||
|
||||
// Check it can be validated with the public key
|
||||
let auth = JwtAuth::new(DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519)?);
|
||||
let claims_from_token = auth.decode(encoded_eddsa)?.claims;
|
||||
assert_eq!(claims_from_token, expected_claims);
|
||||
let encoded_rs384 = "eyJhbGciOiJSUzM4NCIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJleHAiOjE3MDkyMDA4NzksImlhdCI6MTY3ODQ0MjQ3OX0.qqk4nkxKzOJP38c_g57_w_SfdQVmCsDT_bsLmdFj_N6LIB22gr6U6_P_5mvk3pIAsp0VCTDwPrCU908TxqjibEkwvQoJwbogHamSGHpD7eJBxGblSnA-Nr3MlEMxpFtec8QokSm6C5mH7DoBYjB2xzeOlxAmpR2GAzInKiMkU4kZ_OcqqrmVcMXY_6VnbxZWMekuw56zE1-PP_qNF1HvYOH-P08ONP8qdo5UPtBG7QBEFlCqZXJZCFihQaI4Vzil9rDuZGCm3I7xQJ8-yh1PX3BTbGo8EzqLdRyBeTpr08UTuRbp_MJDWevHpP3afvJetAItqZXIoZQrbJjcByHqKw";
|
||||
|
||||
let encoded_rs256 = "eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJleHAiOjE3MDkyMDA4NzksImlhdCI6MTY3ODQ0MjQ3OX0.dF2N9KXG8ftFKHYbd5jQtXMQqv0Ej8FISGp1b_dmqOCotXj5S1y2AWjwyB_EXHM77JXfbEoJPAPrFFBNfd8cWtkCSTvpxWoHaecGzegDFGv5ZSc5AECFV1Daahc3PI3jii9wEiGkFOiwiBNfZ5INomOAsV--XXxlqIwKbTcgSYI7lrOTfecXAbAHiMKQlQYiIBSGnytRCgafhRkyGzPAL8ismthFJ9RHfeejyskht-9GbVHURw02bUyijuHEulpf9eEY3ZiB28de6jnCdU7ftIYaUMaYWt0nZQGkzxKPSfSLZNy14DTOYLDS04DVstWQPqnCUW_ojg0wJETOOfo9Zw";
|
||||
|
||||
// Check that RS512, RS384 and RS256 tokens can all be validated
|
||||
let auth = JwtAuth::new(DecodingKey::from_rsa_pem(TEST_PUB_KEY_RSA)?);
|
||||
|
||||
for encoded in [encoded_rs512, encoded_rs384, encoded_rs256] {
|
||||
let claims_from_token = auth.decode(encoded)?.claims;
|
||||
assert_eq!(claims_from_token, expected_claims);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -146,10 +199,10 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
|
||||
scope: Scope::Tenant,
|
||||
};
|
||||
|
||||
let encoded = encode_from_key_file(&claims, TEST_PRIV_KEY_ED25519)?;
|
||||
let encoded = encode_from_key_file(&claims, TEST_PRIV_KEY_RSA)?;
|
||||
|
||||
// decode it back
|
||||
let auth = JwtAuth::new(DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519)?);
|
||||
let auth = JwtAuth::new(DecodingKey::from_rsa_pem(TEST_PUB_KEY_RSA)?);
|
||||
let decoded = auth.decode(&encoded)?;
|
||||
|
||||
assert_eq!(decoded.claims, claims);
|
||||
|
||||
@@ -23,7 +23,7 @@ pub enum IdError {
|
||||
struct Id([u8; 16]);
|
||||
|
||||
impl Id {
|
||||
pub fn get_from_buf(buf: &mut impl bytes::Buf) -> Id {
|
||||
pub fn get_from_buf(buf: &mut dyn bytes::Buf) -> Id {
|
||||
let mut arr = [0u8; 16];
|
||||
buf.copy_to_slice(&mut arr);
|
||||
Id::from(arr)
|
||||
@@ -112,7 +112,7 @@ impl fmt::Debug for Id {
|
||||
macro_rules! id_newtype {
|
||||
($t:ident) => {
|
||||
impl $t {
|
||||
pub fn get_from_buf(buf: &mut impl bytes::Buf) -> $t {
|
||||
pub fn get_from_buf(buf: &mut dyn bytes::Buf) -> $t {
|
||||
$t(Id::get_from_buf(buf))
|
||||
}
|
||||
|
||||
|
||||
@@ -51,9 +51,6 @@ pub mod history_buffer;
|
||||
|
||||
pub mod measured_stream;
|
||||
|
||||
pub mod serde_percent;
|
||||
pub mod serde_regex;
|
||||
|
||||
/// use with fail::cfg("$name", "return(2000)")
|
||||
#[macro_export]
|
||||
macro_rules! failpoint_sleep_millis_async {
|
||||
|
||||
@@ -1,83 +0,0 @@
|
||||
//! A serde::Deserialize type for percentages.
|
||||
//!
|
||||
//! See [`Percent`] for details.
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// If the value is not an integer between 0 and 100,
|
||||
/// deserialization fails with a descriptive error.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
|
||||
#[serde(transparent)]
|
||||
pub struct Percent(#[serde(deserialize_with = "deserialize_pct_0_to_100")] u8);
|
||||
|
||||
impl Percent {
|
||||
pub fn get(&self) -> u8 {
|
||||
self.0
|
||||
}
|
||||
}
|
||||
|
||||
fn deserialize_pct_0_to_100<'de, D>(deserializer: D) -> Result<u8, D::Error>
|
||||
where
|
||||
D: serde::de::Deserializer<'de>,
|
||||
{
|
||||
let v: u8 = serde::de::Deserialize::deserialize(deserializer)?;
|
||||
if v > 100 {
|
||||
return Err(serde::de::Error::custom(
|
||||
"must be an integer between 0 and 100",
|
||||
));
|
||||
}
|
||||
Ok(v)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::Percent;
|
||||
|
||||
#[derive(serde::Deserialize, serde::Serialize, Debug, PartialEq, Eq)]
|
||||
struct Foo {
|
||||
bar: Percent,
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn basics() {
|
||||
let input = r#"{ "bar": 50 }"#;
|
||||
let foo: Foo = serde_json::from_str(input).unwrap();
|
||||
assert_eq!(foo.bar.get(), 50);
|
||||
}
|
||||
#[test]
|
||||
fn null_handling() {
|
||||
let input = r#"{ "bar": null }"#;
|
||||
let res: Result<Foo, _> = serde_json::from_str(input);
|
||||
assert!(res.is_err());
|
||||
}
|
||||
#[test]
|
||||
fn zero() {
|
||||
let input = r#"{ "bar": 0 }"#;
|
||||
let foo: Foo = serde_json::from_str(input).unwrap();
|
||||
assert_eq!(foo.bar.get(), 0);
|
||||
}
|
||||
#[test]
|
||||
fn out_of_range_above() {
|
||||
let input = r#"{ "bar": 101 }"#;
|
||||
let res: Result<Foo, _> = serde_json::from_str(input);
|
||||
assert!(res.is_err());
|
||||
}
|
||||
#[test]
|
||||
fn out_of_range_below() {
|
||||
let input = r#"{ "bar": -1 }"#;
|
||||
let res: Result<Foo, _> = serde_json::from_str(input);
|
||||
assert!(res.is_err());
|
||||
}
|
||||
#[test]
|
||||
fn float() {
|
||||
let input = r#"{ "bar": 50.5 }"#;
|
||||
let res: Result<Foo, _> = serde_json::from_str(input);
|
||||
assert!(res.is_err());
|
||||
}
|
||||
#[test]
|
||||
fn string() {
|
||||
let input = r#"{ "bar": "50 %" }"#;
|
||||
let res: Result<Foo, _> = serde_json::from_str(input);
|
||||
assert!(res.is_err());
|
||||
}
|
||||
}
|
||||
@@ -1,60 +0,0 @@
|
||||
//! A `serde::{Deserialize,Serialize}` type for regexes.
|
||||
|
||||
use std::ops::Deref;
|
||||
|
||||
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
|
||||
#[serde(transparent)]
|
||||
pub struct Regex(
|
||||
#[serde(
|
||||
deserialize_with = "deserialize_regex",
|
||||
serialize_with = "serialize_regex"
|
||||
)]
|
||||
regex::Regex,
|
||||
);
|
||||
|
||||
fn deserialize_regex<'de, D>(deserializer: D) -> Result<regex::Regex, D::Error>
|
||||
where
|
||||
D: serde::de::Deserializer<'de>,
|
||||
{
|
||||
let s: String = serde::de::Deserialize::deserialize(deserializer)?;
|
||||
let re = regex::Regex::new(&s).map_err(serde::de::Error::custom)?;
|
||||
Ok(re)
|
||||
}
|
||||
|
||||
fn serialize_regex<S>(re: ®ex::Regex, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::ser::Serializer,
|
||||
{
|
||||
serializer.collect_str(re.as_str())
|
||||
}
|
||||
|
||||
impl Deref for Regex {
|
||||
type Target = regex::Regex;
|
||||
|
||||
fn deref(&self) -> ®ex::Regex {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialEq for Regex {
|
||||
fn eq(&self, other: &Regex) -> bool {
|
||||
// comparing the automatons would be quite complicated
|
||||
self.as_str() == other.as_str()
|
||||
}
|
||||
}
|
||||
|
||||
impl Eq for Regex {}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
#[test]
|
||||
fn roundtrip() {
|
||||
let input = r#""foo.*bar""#;
|
||||
let re: super::Regex = serde_json::from_str(input).unwrap();
|
||||
assert!(re.is_match("foo123bar"));
|
||||
assert!(!re.is_match("foo"));
|
||||
let output = serde_json::to_string(&re).unwrap();
|
||||
assert_eq!(output, input);
|
||||
}
|
||||
}
|
||||
@@ -48,7 +48,6 @@ serde_json = { workspace = true, features = ["raw_value"] }
|
||||
serde_with.workspace = true
|
||||
signal-hook.workspace = true
|
||||
svg_fmt.workspace = true
|
||||
sync_wrapper.workspace = true
|
||||
tokio-tar.workspace = true
|
||||
thiserror.workspace = true
|
||||
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
|
||||
|
||||
@@ -8,7 +8,6 @@ use anyhow::{anyhow, Context};
|
||||
use clap::{Arg, ArgAction, Command};
|
||||
use fail::FailScenario;
|
||||
use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp};
|
||||
use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use tracing::*;
|
||||
|
||||
@@ -271,31 +270,15 @@ fn start_pageserver(
|
||||
WALRECEIVER_RUNTIME.block_on(pageserver::broker_client::init_broker_client(conf))?;
|
||||
|
||||
// Initialize authentication for incoming connections
|
||||
let http_auth;
|
||||
let pg_auth;
|
||||
if conf.http_auth_type == AuthType::NeonJWT || conf.pg_auth_type == AuthType::NeonJWT {
|
||||
// unwrap is ok because check is performed when creating config, so path is set and file exists
|
||||
let key_path = conf.auth_validation_public_key_path.as_ref().unwrap();
|
||||
info!(
|
||||
"Loading public key for verifying JWT tokens from {:#?}",
|
||||
key_path
|
||||
);
|
||||
let auth: Arc<JwtAuth> = Arc::new(JwtAuth::from_key_path(key_path)?);
|
||||
|
||||
http_auth = match &conf.http_auth_type {
|
||||
AuthType::Trust => None,
|
||||
AuthType::NeonJWT => Some(auth.clone()),
|
||||
};
|
||||
pg_auth = match &conf.pg_auth_type {
|
||||
AuthType::Trust => None,
|
||||
AuthType::NeonJWT => Some(auth),
|
||||
};
|
||||
} else {
|
||||
http_auth = None;
|
||||
pg_auth = None;
|
||||
}
|
||||
info!("Using auth for http API: {:#?}", conf.http_auth_type);
|
||||
info!("Using auth for pg connections: {:#?}", conf.pg_auth_type);
|
||||
let auth = match &conf.auth_type {
|
||||
AuthType::Trust => None,
|
||||
AuthType::NeonJWT => {
|
||||
// unwrap is ok because check is performed when creating config, so path is set and file exists
|
||||
let key_path = conf.auth_validation_public_key_path.as_ref().unwrap();
|
||||
Some(JwtAuth::from_key_path(key_path)?.into())
|
||||
}
|
||||
};
|
||||
info!("Using auth: {:#?}", conf.auth_type);
|
||||
|
||||
match var("NEON_AUTH_TOKEN") {
|
||||
Ok(v) => {
|
||||
@@ -320,34 +303,14 @@ fn start_pageserver(
|
||||
// Scan the local 'tenants/' directory and start loading the tenants
|
||||
BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(conf, remote_storage.clone()))?;
|
||||
|
||||
// shared state between the disk-usage backed eviction background task and the http endpoint
|
||||
// that allows triggering disk-usage based eviction manually. note that the http endpoint
|
||||
// is still accessible even if background task is not configured as long as remote storage has
|
||||
// been configured.
|
||||
let disk_usage_eviction_state: Arc<disk_usage_eviction_task::State> = Arc::default();
|
||||
|
||||
if let Some(remote_storage) = &remote_storage {
|
||||
launch_disk_usage_global_eviction_task(
|
||||
conf,
|
||||
remote_storage.clone(),
|
||||
disk_usage_eviction_state.clone(),
|
||||
)?;
|
||||
}
|
||||
|
||||
// Start up the service to handle HTTP mgmt API request. We created the
|
||||
// listener earlier already.
|
||||
{
|
||||
let _rt_guard = MGMT_REQUEST_RUNTIME.enter();
|
||||
|
||||
let router = http::make_router(
|
||||
conf,
|
||||
launch_ts,
|
||||
http_auth,
|
||||
remote_storage,
|
||||
disk_usage_eviction_state,
|
||||
)?
|
||||
.build()
|
||||
.map_err(|err| anyhow!(err))?;
|
||||
let router = http::make_router(conf, launch_ts, auth.clone(), remote_storage)?
|
||||
.build()
|
||||
.map_err(|err| anyhow!(err))?;
|
||||
let service = utils::http::RouterService::new(router).unwrap();
|
||||
let server = hyper::Server::from_tcp(http_listener)?
|
||||
.serve(service)
|
||||
@@ -419,9 +382,9 @@ fn start_pageserver(
|
||||
async move {
|
||||
page_service::libpq_listener_main(
|
||||
conf,
|
||||
pg_auth,
|
||||
auth,
|
||||
pageserver_listener,
|
||||
conf.pg_auth_type,
|
||||
conf.auth_type,
|
||||
libpq_ctx,
|
||||
)
|
||||
.await
|
||||
|
||||
@@ -27,7 +27,6 @@ use utils::{
|
||||
logging::LogFormat,
|
||||
};
|
||||
|
||||
use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig;
|
||||
use crate::tenant::config::TenantConf;
|
||||
use crate::tenant::config::TenantConfOpt;
|
||||
use crate::tenant::{TENANT_ATTACHING_MARKER_FILENAME, TIMELINES_SEGMENT_NAME};
|
||||
@@ -62,7 +61,6 @@ pub mod defaults {
|
||||
pub const DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL: &str = "1 hour";
|
||||
pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option<reqwest::Url> = None;
|
||||
pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min";
|
||||
pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
|
||||
|
||||
///
|
||||
/// Default built-in configuration file.
|
||||
@@ -91,10 +89,6 @@ pub mod defaults {
|
||||
#cached_metric_collection_interval = '{DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL}'
|
||||
#synthetic_size_calculation_interval = '{DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL}'
|
||||
|
||||
#evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}'
|
||||
|
||||
#disk_usage_based_eviction = {{ max_usage_pct = .., min_avail_bytes = .., period = "10s"}}
|
||||
|
||||
# [tenant_config]
|
||||
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
|
||||
#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
|
||||
@@ -107,8 +101,6 @@ pub mod defaults {
|
||||
#image_creation_threshold = {DEFAULT_IMAGE_CREATION_THRESHOLD}
|
||||
#pitr_interval = '{DEFAULT_PITR_INTERVAL}'
|
||||
|
||||
#min_resident_size_override = .. # in bytes
|
||||
|
||||
# [remote_storage]
|
||||
|
||||
"###
|
||||
@@ -126,9 +118,6 @@ pub struct PageServerConf {
|
||||
/// Example (default): 127.0.0.1:9898
|
||||
pub listen_http_addr: String,
|
||||
|
||||
/// Current availability zone. Used for traffic metrics.
|
||||
pub availability_zone: Option<String>,
|
||||
|
||||
// Timeout when waiting for WAL receiver to catch up to an LSN given in a GetPage@LSN call.
|
||||
pub wait_lsn_timeout: Duration,
|
||||
// How long to wait for WAL redo to complete.
|
||||
@@ -149,15 +138,9 @@ pub struct PageServerConf {
|
||||
|
||||
pub pg_distrib_dir: PathBuf,
|
||||
|
||||
// Authentication
|
||||
/// authentication method for the HTTP mgmt API
|
||||
pub http_auth_type: AuthType,
|
||||
/// authentication method for libpq connections from compute
|
||||
pub pg_auth_type: AuthType,
|
||||
/// Path to a file containing public key for verifying JWT tokens.
|
||||
/// Used for both mgmt and compute auth, if enabled.
|
||||
pub auth_validation_public_key_path: Option<PathBuf>,
|
||||
pub auth_type: AuthType,
|
||||
|
||||
pub auth_validation_public_key_path: Option<PathBuf>,
|
||||
pub remote_storage_config: Option<RemoteStorageConfig>,
|
||||
|
||||
pub default_tenant_conf: TenantConf,
|
||||
@@ -178,11 +161,6 @@ pub struct PageServerConf {
|
||||
pub metric_collection_endpoint: Option<Url>,
|
||||
pub synthetic_size_calculation_interval: Duration,
|
||||
|
||||
// See the corresponding metric's help string.
|
||||
pub evictions_low_residence_duration_metric_threshold: Duration,
|
||||
|
||||
pub disk_usage_based_eviction: Option<DiskUsageEvictionTaskConfig>,
|
||||
|
||||
pub test_remote_failures: u64,
|
||||
|
||||
pub ondemand_download_behavior_treat_error_as_warn: bool,
|
||||
@@ -218,8 +196,6 @@ struct PageServerConfigBuilder {
|
||||
|
||||
listen_http_addr: BuilderValue<String>,
|
||||
|
||||
availability_zone: BuilderValue<Option<String>>,
|
||||
|
||||
wait_lsn_timeout: BuilderValue<Duration>,
|
||||
wal_redo_timeout: BuilderValue<Duration>,
|
||||
|
||||
@@ -232,8 +208,7 @@ struct PageServerConfigBuilder {
|
||||
|
||||
pg_distrib_dir: BuilderValue<PathBuf>,
|
||||
|
||||
http_auth_type: BuilderValue<AuthType>,
|
||||
pg_auth_type: BuilderValue<AuthType>,
|
||||
auth_type: BuilderValue<AuthType>,
|
||||
|
||||
//
|
||||
auth_validation_public_key_path: BuilderValue<Option<PathBuf>>,
|
||||
@@ -253,10 +228,6 @@ struct PageServerConfigBuilder {
|
||||
metric_collection_endpoint: BuilderValue<Option<Url>>,
|
||||
synthetic_size_calculation_interval: BuilderValue<Duration>,
|
||||
|
||||
evictions_low_residence_duration_metric_threshold: BuilderValue<Duration>,
|
||||
|
||||
disk_usage_based_eviction: BuilderValue<Option<DiskUsageEvictionTaskConfig>>,
|
||||
|
||||
test_remote_failures: BuilderValue<u64>,
|
||||
|
||||
ondemand_download_behavior_treat_error_as_warn: BuilderValue<bool>,
|
||||
@@ -269,7 +240,6 @@ impl Default for PageServerConfigBuilder {
|
||||
Self {
|
||||
listen_pg_addr: Set(DEFAULT_PG_LISTEN_ADDR.to_string()),
|
||||
listen_http_addr: Set(DEFAULT_HTTP_LISTEN_ADDR.to_string()),
|
||||
availability_zone: Set(None),
|
||||
wait_lsn_timeout: Set(humantime::parse_duration(DEFAULT_WAIT_LSN_TIMEOUT)
|
||||
.expect("cannot parse default wait lsn timeout")),
|
||||
wal_redo_timeout: Set(humantime::parse_duration(DEFAULT_WAL_REDO_TIMEOUT)
|
||||
@@ -281,8 +251,7 @@ impl Default for PageServerConfigBuilder {
|
||||
pg_distrib_dir: Set(env::current_dir()
|
||||
.expect("cannot access current directory")
|
||||
.join("pg_install")),
|
||||
http_auth_type: Set(AuthType::Trust),
|
||||
pg_auth_type: Set(AuthType::Trust),
|
||||
auth_type: Set(AuthType::Trust),
|
||||
auth_validation_public_key_path: Set(None),
|
||||
remote_storage_config: Set(None),
|
||||
id: NotSet,
|
||||
@@ -310,13 +279,6 @@ impl Default for PageServerConfigBuilder {
|
||||
.expect("cannot parse default synthetic size calculation interval")),
|
||||
metric_collection_endpoint: Set(DEFAULT_METRIC_COLLECTION_ENDPOINT),
|
||||
|
||||
evictions_low_residence_duration_metric_threshold: Set(humantime::parse_duration(
|
||||
DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD,
|
||||
)
|
||||
.expect("cannot parse DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD")),
|
||||
|
||||
disk_usage_based_eviction: Set(None),
|
||||
|
||||
test_remote_failures: Set(0),
|
||||
|
||||
ondemand_download_behavior_treat_error_as_warn: Set(false),
|
||||
@@ -333,10 +295,6 @@ impl PageServerConfigBuilder {
|
||||
self.listen_http_addr = BuilderValue::Set(listen_http_addr)
|
||||
}
|
||||
|
||||
pub fn availability_zone(&mut self, availability_zone: Option<String>) {
|
||||
self.availability_zone = BuilderValue::Set(availability_zone)
|
||||
}
|
||||
|
||||
pub fn wait_lsn_timeout(&mut self, wait_lsn_timeout: Duration) {
|
||||
self.wait_lsn_timeout = BuilderValue::Set(wait_lsn_timeout)
|
||||
}
|
||||
@@ -365,12 +323,8 @@ impl PageServerConfigBuilder {
|
||||
self.pg_distrib_dir = BuilderValue::Set(pg_distrib_dir)
|
||||
}
|
||||
|
||||
pub fn http_auth_type(&mut self, auth_type: AuthType) {
|
||||
self.http_auth_type = BuilderValue::Set(auth_type)
|
||||
}
|
||||
|
||||
pub fn pg_auth_type(&mut self, auth_type: AuthType) {
|
||||
self.pg_auth_type = BuilderValue::Set(auth_type)
|
||||
pub fn auth_type(&mut self, auth_type: AuthType) {
|
||||
self.auth_type = BuilderValue::Set(auth_type)
|
||||
}
|
||||
|
||||
pub fn auth_validation_public_key_path(
|
||||
@@ -432,14 +386,6 @@ impl PageServerConfigBuilder {
|
||||
self.test_remote_failures = BuilderValue::Set(fail_first);
|
||||
}
|
||||
|
||||
pub fn evictions_low_residence_duration_metric_threshold(&mut self, value: Duration) {
|
||||
self.evictions_low_residence_duration_metric_threshold = BuilderValue::Set(value);
|
||||
}
|
||||
|
||||
pub fn disk_usage_based_eviction(&mut self, value: Option<DiskUsageEvictionTaskConfig>) {
|
||||
self.disk_usage_based_eviction = BuilderValue::Set(value);
|
||||
}
|
||||
|
||||
pub fn ondemand_download_behavior_treat_error_as_warn(
|
||||
&mut self,
|
||||
ondemand_download_behavior_treat_error_as_warn: bool,
|
||||
@@ -456,9 +402,6 @@ impl PageServerConfigBuilder {
|
||||
listen_http_addr: self
|
||||
.listen_http_addr
|
||||
.ok_or(anyhow!("missing listen_http_addr"))?,
|
||||
availability_zone: self
|
||||
.availability_zone
|
||||
.ok_or(anyhow!("missing availability_zone"))?,
|
||||
wait_lsn_timeout: self
|
||||
.wait_lsn_timeout
|
||||
.ok_or(anyhow!("missing wait_lsn_timeout"))?,
|
||||
@@ -476,10 +419,7 @@ impl PageServerConfigBuilder {
|
||||
pg_distrib_dir: self
|
||||
.pg_distrib_dir
|
||||
.ok_or(anyhow!("missing pg_distrib_dir"))?,
|
||||
http_auth_type: self
|
||||
.http_auth_type
|
||||
.ok_or(anyhow!("missing http_auth_type"))?,
|
||||
pg_auth_type: self.pg_auth_type.ok_or(anyhow!("missing pg_auth_type"))?,
|
||||
auth_type: self.auth_type.ok_or(anyhow!("missing auth_type"))?,
|
||||
auth_validation_public_key_path: self
|
||||
.auth_validation_public_key_path
|
||||
.ok_or(anyhow!("missing auth_validation_public_key_path"))?,
|
||||
@@ -513,14 +453,6 @@ impl PageServerConfigBuilder {
|
||||
synthetic_size_calculation_interval: self
|
||||
.synthetic_size_calculation_interval
|
||||
.ok_or(anyhow!("missing synthetic_size_calculation_interval"))?,
|
||||
evictions_low_residence_duration_metric_threshold: self
|
||||
.evictions_low_residence_duration_metric_threshold
|
||||
.ok_or(anyhow!(
|
||||
"missing evictions_low_residence_duration_metric_threshold"
|
||||
))?,
|
||||
disk_usage_based_eviction: self
|
||||
.disk_usage_based_eviction
|
||||
.ok_or(anyhow!("missing disk_usage_based_eviction"))?,
|
||||
test_remote_failures: self
|
||||
.test_remote_failures
|
||||
.ok_or(anyhow!("missing test_remote_failuers"))?,
|
||||
@@ -667,7 +599,6 @@ impl PageServerConf {
|
||||
match key {
|
||||
"listen_pg_addr" => builder.listen_pg_addr(parse_toml_string(key, item)?),
|
||||
"listen_http_addr" => builder.listen_http_addr(parse_toml_string(key, item)?),
|
||||
"availability_zone" => builder.availability_zone(Some(parse_toml_string(key, item)?)),
|
||||
"wait_lsn_timeout" => builder.wait_lsn_timeout(parse_toml_duration(key, item)?),
|
||||
"wal_redo_timeout" => builder.wal_redo_timeout(parse_toml_duration(key, item)?),
|
||||
"initial_superuser_name" => builder.superuser(parse_toml_string(key, item)?),
|
||||
@@ -681,8 +612,7 @@ impl PageServerConf {
|
||||
"auth_validation_public_key_path" => builder.auth_validation_public_key_path(Some(
|
||||
PathBuf::from(parse_toml_string(key, item)?),
|
||||
)),
|
||||
"http_auth_type" => builder.http_auth_type(parse_toml_from_str(key, item)?),
|
||||
"pg_auth_type" => builder.pg_auth_type(parse_toml_from_str(key, item)?),
|
||||
"auth_type" => builder.auth_type(parse_toml_from_str(key, item)?),
|
||||
"remote_storage" => {
|
||||
builder.remote_storage_config(RemoteStorageConfig::from_toml(item)?)
|
||||
}
|
||||
@@ -710,13 +640,6 @@ impl PageServerConf {
|
||||
"synthetic_size_calculation_interval" =>
|
||||
builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?),
|
||||
"test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?),
|
||||
"evictions_low_residence_duration_metric_threshold" => builder.evictions_low_residence_duration_metric_threshold(parse_toml_duration(key, item)?),
|
||||
"disk_usage_based_eviction" => {
|
||||
tracing::info!("disk_usage_based_eviction: {:#?}", &item);
|
||||
builder.disk_usage_based_eviction(
|
||||
toml_edit::de::from_item(item.clone())
|
||||
.context("parse disk_usage_based_eviction")?)
|
||||
},
|
||||
"ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?),
|
||||
_ => bail!("unrecognized pageserver option '{key}'"),
|
||||
}
|
||||
@@ -724,7 +647,7 @@ impl PageServerConf {
|
||||
|
||||
let mut conf = builder.build().context("invalid config")?;
|
||||
|
||||
if conf.http_auth_type == AuthType::NeonJWT || conf.pg_auth_type == AuthType::NeonJWT {
|
||||
if conf.auth_type == AuthType::NeonJWT {
|
||||
let auth_validation_public_key_path = conf
|
||||
.auth_validation_public_key_path
|
||||
.get_or_insert_with(|| workdir.join("auth_public_key.pem"));
|
||||
@@ -821,13 +744,6 @@ impl PageServerConf {
|
||||
);
|
||||
}
|
||||
|
||||
if let Some(item) = item.get("min_resident_size_override") {
|
||||
t_conf.min_resident_size_override = Some(
|
||||
toml_edit::de::from_item(item.clone())
|
||||
.context("parse min_resident_size_override")?,
|
||||
);
|
||||
}
|
||||
|
||||
Ok(t_conf)
|
||||
}
|
||||
|
||||
@@ -847,12 +763,10 @@ impl PageServerConf {
|
||||
max_file_descriptors: defaults::DEFAULT_MAX_FILE_DESCRIPTORS,
|
||||
listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(),
|
||||
listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(),
|
||||
availability_zone: None,
|
||||
superuser: "cloud_admin".to_string(),
|
||||
workdir: repo_dir,
|
||||
pg_distrib_dir,
|
||||
http_auth_type: AuthType::Trust,
|
||||
pg_auth_type: AuthType::Trust,
|
||||
auth_type: AuthType::Trust,
|
||||
auth_validation_public_key_path: None,
|
||||
remote_storage_config: None,
|
||||
default_tenant_conf: TenantConf::default(),
|
||||
@@ -864,11 +778,6 @@ impl PageServerConf {
|
||||
cached_metric_collection_interval: Duration::from_secs(60 * 60),
|
||||
metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
|
||||
synthetic_size_calculation_interval: Duration::from_secs(60),
|
||||
evictions_low_residence_duration_metric_threshold: humantime::parse_duration(
|
||||
defaults::DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD,
|
||||
)
|
||||
.unwrap(),
|
||||
disk_usage_based_eviction: None,
|
||||
test_remote_failures: 0,
|
||||
ondemand_download_behavior_treat_error_as_warn: false,
|
||||
}
|
||||
@@ -1010,9 +919,6 @@ metric_collection_interval = '222 s'
|
||||
cached_metric_collection_interval = '22200 s'
|
||||
metric_collection_endpoint = 'http://localhost:80/metrics'
|
||||
synthetic_size_calculation_interval = '333 s'
|
||||
|
||||
evictions_low_residence_duration_metric_threshold = '444 s'
|
||||
|
||||
log_format = 'json'
|
||||
|
||||
"#;
|
||||
@@ -1038,7 +944,6 @@ log_format = 'json'
|
||||
id: NodeId(10),
|
||||
listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(),
|
||||
listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(),
|
||||
availability_zone: None,
|
||||
wait_lsn_timeout: humantime::parse_duration(defaults::DEFAULT_WAIT_LSN_TIMEOUT)?,
|
||||
wal_redo_timeout: humantime::parse_duration(defaults::DEFAULT_WAL_REDO_TIMEOUT)?,
|
||||
superuser: defaults::DEFAULT_SUPERUSER.to_string(),
|
||||
@@ -1046,8 +951,7 @@ log_format = 'json'
|
||||
max_file_descriptors: defaults::DEFAULT_MAX_FILE_DESCRIPTORS,
|
||||
workdir,
|
||||
pg_distrib_dir,
|
||||
http_auth_type: AuthType::Trust,
|
||||
pg_auth_type: AuthType::Trust,
|
||||
auth_type: AuthType::Trust,
|
||||
auth_validation_public_key_path: None,
|
||||
remote_storage_config: None,
|
||||
default_tenant_conf: TenantConf::default(),
|
||||
@@ -1067,10 +971,6 @@ log_format = 'json'
|
||||
synthetic_size_calculation_interval: humantime::parse_duration(
|
||||
defaults::DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL
|
||||
)?,
|
||||
evictions_low_residence_duration_metric_threshold: humantime::parse_duration(
|
||||
defaults::DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD
|
||||
)?,
|
||||
disk_usage_based_eviction: None,
|
||||
test_remote_failures: 0,
|
||||
ondemand_download_behavior_treat_error_as_warn: false,
|
||||
},
|
||||
@@ -1101,7 +1001,6 @@ log_format = 'json'
|
||||
id: NodeId(10),
|
||||
listen_pg_addr: "127.0.0.1:64000".to_string(),
|
||||
listen_http_addr: "127.0.0.1:9898".to_string(),
|
||||
availability_zone: None,
|
||||
wait_lsn_timeout: Duration::from_secs(111),
|
||||
wal_redo_timeout: Duration::from_secs(111),
|
||||
superuser: "zzzz".to_string(),
|
||||
@@ -1109,8 +1008,7 @@ log_format = 'json'
|
||||
max_file_descriptors: 333,
|
||||
workdir,
|
||||
pg_distrib_dir,
|
||||
http_auth_type: AuthType::Trust,
|
||||
pg_auth_type: AuthType::Trust,
|
||||
auth_type: AuthType::Trust,
|
||||
auth_validation_public_key_path: None,
|
||||
remote_storage_config: None,
|
||||
default_tenant_conf: TenantConf::default(),
|
||||
@@ -1122,8 +1020,6 @@ log_format = 'json'
|
||||
cached_metric_collection_interval: Duration::from_secs(22200),
|
||||
metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?),
|
||||
synthetic_size_calculation_interval: Duration::from_secs(333),
|
||||
evictions_low_residence_duration_metric_threshold: Duration::from_secs(444),
|
||||
disk_usage_based_eviction: None,
|
||||
test_remote_failures: 0,
|
||||
ondemand_download_behavior_treat_error_as_warn: false,
|
||||
},
|
||||
|
||||
@@ -1,679 +0,0 @@
|
||||
//! This module implements the pageserver-global disk-usage-based layer eviction task.
|
||||
//!
|
||||
//! # Mechanics
|
||||
//!
|
||||
//! Function `launch_disk_usage_global_eviction_task` starts a pageserver-global background
|
||||
//! loop that evicts layers in response to a shortage of available bytes
|
||||
//! in the $repo/tenants directory's filesystem.
|
||||
//!
|
||||
//! The loop runs periodically at a configurable `period`.
|
||||
//!
|
||||
//! Each loop iteration uses `statvfs` to determine filesystem-level space usage.
|
||||
//! It compares the returned usage data against two different types of thresholds.
|
||||
//! The iteration tries to evict layers until app-internal accounting says we should be below the thresholds.
|
||||
//! We cross-check this internal accounting with the real world by making another `statvfs` at the end of the iteration.
|
||||
//! We're good if that second statvfs shows that we're _actually_ below the configured thresholds.
|
||||
//! If we're still above one or more thresholds, we emit a warning log message, leaving it to the operator to investigate further.
|
||||
//!
|
||||
//! # Eviction Policy
|
||||
//!
|
||||
//! There are two thresholds:
|
||||
//! `max_usage_pct` is the relative available space, expressed in percent of the total filesystem space.
|
||||
//! If the actual usage is higher, the threshold is exceeded.
|
||||
//! `min_avail_bytes` is the absolute available space in bytes.
|
||||
//! If the actual usage is lower, the threshold is exceeded.
|
||||
//!
|
||||
//! The iteration evicts layers in LRU fashion, but, with a weak reservation per tenant.
|
||||
//! The reservation is to keep the most recently accessed X bytes per tenant resident.
|
||||
//! If we cannot relieve pressure by evicting layers outside of the reservation, we
|
||||
//! start evicting layers that are part of the reservation, LRU first.
|
||||
//!
|
||||
//! The value for the per-tenant reservation is referred to as `tenant_min_resident_size`
|
||||
//! throughout the code, but, no actual variable carries that name.
|
||||
//! The per-tenant default value is the `max(tenant's layer file sizes, regardless of local or remote)`.
|
||||
//! The idea is to allow at least one layer to be resident per tenant, to ensure it can make forward progress
|
||||
//! during page reconstruction.
|
||||
//! An alternative default for all tenants can be specified in the `tenant_config` section of the config.
|
||||
//! Lastly, each tenant can have an override in their respectice tenant config (`min_resident_size_override`).
|
||||
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
path::Path,
|
||||
sync::Arc,
|
||||
time::{Duration, SystemTime},
|
||||
};
|
||||
|
||||
use anyhow::Context;
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::time::Instant;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, error, info, instrument, warn, Instrument};
|
||||
use utils::serde_percent::Percent;
|
||||
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
|
||||
tenant::{self, storage_layer::PersistentLayer, Timeline},
|
||||
};
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub struct DiskUsageEvictionTaskConfig {
|
||||
pub max_usage_pct: Percent,
|
||||
pub min_avail_bytes: u64,
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub period: Duration,
|
||||
#[cfg(feature = "testing")]
|
||||
pub mock_statvfs: Option<crate::statvfs::mock::Behavior>,
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct State {
|
||||
/// Exclude http requests and background task from running at the same time.
|
||||
mutex: tokio::sync::Mutex<()>,
|
||||
}
|
||||
|
||||
pub fn launch_disk_usage_global_eviction_task(
|
||||
conf: &'static PageServerConf,
|
||||
storage: GenericRemoteStorage,
|
||||
state: Arc<State>,
|
||||
) -> anyhow::Result<()> {
|
||||
let Some(task_config) = &conf.disk_usage_based_eviction else {
|
||||
info!("disk usage based eviction task not configured");
|
||||
return Ok(());
|
||||
};
|
||||
|
||||
info!("launching disk usage based eviction task");
|
||||
|
||||
task_mgr::spawn(
|
||||
BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::DiskUsageEviction,
|
||||
None,
|
||||
None,
|
||||
"disk usage based eviction",
|
||||
false,
|
||||
async move {
|
||||
disk_usage_eviction_task(
|
||||
&state,
|
||||
task_config,
|
||||
storage,
|
||||
&conf.tenants_path(),
|
||||
task_mgr::shutdown_token(),
|
||||
)
|
||||
.await;
|
||||
info!("disk usage based eviction task finishing");
|
||||
Ok(())
|
||||
},
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[instrument(skip_all)]
|
||||
async fn disk_usage_eviction_task(
|
||||
state: &State,
|
||||
task_config: &DiskUsageEvictionTaskConfig,
|
||||
storage: GenericRemoteStorage,
|
||||
tenants_dir: &Path,
|
||||
cancel: CancellationToken,
|
||||
) {
|
||||
use crate::tenant::tasks::random_init_delay;
|
||||
{
|
||||
if random_init_delay(task_config.period, &cancel)
|
||||
.await
|
||||
.is_err()
|
||||
{
|
||||
info!("shutting down");
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
let mut iteration_no = 0;
|
||||
loop {
|
||||
iteration_no += 1;
|
||||
let start = Instant::now();
|
||||
|
||||
async {
|
||||
let res = disk_usage_eviction_task_iteration(
|
||||
state,
|
||||
task_config,
|
||||
&storage,
|
||||
tenants_dir,
|
||||
&cancel,
|
||||
)
|
||||
.await;
|
||||
|
||||
match res {
|
||||
Ok(()) => {}
|
||||
Err(e) => {
|
||||
// these stat failures are expected to be very rare
|
||||
warn!("iteration failed, unexpected error: {e:#}");
|
||||
}
|
||||
}
|
||||
}
|
||||
.instrument(tracing::info_span!("iteration", iteration_no))
|
||||
.await;
|
||||
|
||||
let sleep_until = start + task_config.period;
|
||||
tokio::select! {
|
||||
_ = tokio::time::sleep_until(sleep_until) => {},
|
||||
_ = cancel.cancelled() => {
|
||||
info!("shutting down");
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub trait Usage: Clone + Copy + std::fmt::Debug {
|
||||
fn has_pressure(&self) -> bool;
|
||||
fn add_available_bytes(&mut self, bytes: u64);
|
||||
}
|
||||
|
||||
async fn disk_usage_eviction_task_iteration(
|
||||
state: &State,
|
||||
task_config: &DiskUsageEvictionTaskConfig,
|
||||
storage: &GenericRemoteStorage,
|
||||
tenants_dir: &Path,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
let usage_pre = filesystem_level_usage::get(tenants_dir, task_config)
|
||||
.context("get filesystem-level disk usage before evictions")?;
|
||||
let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await;
|
||||
match res {
|
||||
Ok(outcome) => {
|
||||
debug!(?outcome, "disk_usage_eviction_iteration finished");
|
||||
match outcome {
|
||||
IterationOutcome::NoPressure | IterationOutcome::Cancelled => {
|
||||
// nothing to do, select statement below will handle things
|
||||
}
|
||||
IterationOutcome::Finished(outcome) => {
|
||||
// Verify with statvfs whether we made any real progress
|
||||
let after = filesystem_level_usage::get(tenants_dir, task_config)
|
||||
// It's quite unlikely to hit the error here. Keep the code simple and bail out.
|
||||
.context("get filesystem-level disk usage after evictions")?;
|
||||
|
||||
debug!(?after, "disk usage");
|
||||
|
||||
if after.has_pressure() {
|
||||
// Don't bother doing an out-of-order iteration here now.
|
||||
// In practice, the task period is set to a value in the tens-of-seconds range,
|
||||
// which will cause another iteration to happen soon enough.
|
||||
// TODO: deltas between the three different usages would be helpful,
|
||||
// consider MiB, GiB, TiB
|
||||
warn!(?outcome, ?after, "disk usage still high");
|
||||
} else {
|
||||
info!(?outcome, ?after, "disk usage pressure relieved");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
error!("disk_usage_eviction_iteration failed: {:#}", e);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
#[allow(clippy::large_enum_variant)]
|
||||
pub enum IterationOutcome<U> {
|
||||
NoPressure,
|
||||
Cancelled,
|
||||
Finished(IterationOutcomeFinished<U>),
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
pub struct IterationOutcomeFinished<U> {
|
||||
/// The actual usage observed before we started the iteration.
|
||||
before: U,
|
||||
/// The expected value for `after`, according to internal accounting, after phase 1.
|
||||
planned: PlannedUsage<U>,
|
||||
/// The outcome of phase 2, where we actually do the evictions.
|
||||
///
|
||||
/// If all layers that phase 1 planned to evict _can_ actually get evicted, this will
|
||||
/// be the same as `planned`.
|
||||
assumed: AssumedUsage<U>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
struct AssumedUsage<U> {
|
||||
/// The expected value for `after`, after phase 2.
|
||||
projected_after: U,
|
||||
/// The layers we failed to evict during phase 2.
|
||||
failed: LayerCount,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
struct PlannedUsage<U> {
|
||||
respecting_tenant_min_resident_size: U,
|
||||
fallback_to_global_lru: Option<U>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Default, Serialize)]
|
||||
struct LayerCount {
|
||||
file_sizes: u64,
|
||||
count: usize,
|
||||
}
|
||||
|
||||
pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
state: &State,
|
||||
storage: &GenericRemoteStorage,
|
||||
usage_pre: U,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<IterationOutcome<U>> {
|
||||
// use tokio's mutex to get a Sync guard (instead of std::sync::Mutex)
|
||||
let _g = state
|
||||
.mutex
|
||||
.try_lock()
|
||||
.map_err(|_| anyhow::anyhow!("iteration is already executing"))?;
|
||||
|
||||
debug!(?usage_pre, "disk usage");
|
||||
|
||||
if !usage_pre.has_pressure() {
|
||||
return Ok(IterationOutcome::NoPressure);
|
||||
}
|
||||
|
||||
warn!(
|
||||
?usage_pre,
|
||||
"running disk usage based eviction due to pressure"
|
||||
);
|
||||
|
||||
let candidates = match collect_eviction_candidates(cancel).await? {
|
||||
EvictionCandidates::Cancelled => {
|
||||
return Ok(IterationOutcome::Cancelled);
|
||||
}
|
||||
EvictionCandidates::Finished(partitioned) => partitioned,
|
||||
};
|
||||
|
||||
// Debug-log the list of candidates
|
||||
let now = SystemTime::now();
|
||||
for (i, (partition, candidate)) in candidates.iter().enumerate() {
|
||||
debug!(
|
||||
"cand {}/{}: size={}, no_access_for={}us, parition={:?}, tenant={} timeline={} layer={}",
|
||||
i + 1,
|
||||
candidates.len(),
|
||||
candidate.layer.file_size(),
|
||||
now.duration_since(candidate.last_activity_ts)
|
||||
.unwrap()
|
||||
.as_micros(),
|
||||
partition,
|
||||
candidate.layer.get_tenant_id(),
|
||||
candidate.layer.get_timeline_id(),
|
||||
candidate.layer.filename().file_name(),
|
||||
);
|
||||
}
|
||||
|
||||
// phase1: select victims to relieve pressure
|
||||
//
|
||||
// Walk through the list of candidates, until we have accumulated enough layers to get
|
||||
// us back under the pressure threshold. 'usage_planned' is updated so that it tracks
|
||||
// how much disk space would be used after evicting all the layers up to the current
|
||||
// point in the list. The layers are collected in 'batched', grouped per timeline.
|
||||
//
|
||||
// If we get far enough in the list that we start to evict layers that are below
|
||||
// the tenant's min-resident-size threshold, print a warning, and memorize the disk
|
||||
// usage at that point, in 'usage_planned_min_resident_size_respecting'.
|
||||
let mut batched: HashMap<_, Vec<Arc<dyn PersistentLayer>>> = HashMap::new();
|
||||
let mut warned = None;
|
||||
let mut usage_planned = usage_pre;
|
||||
for (i, (partition, candidate)) in candidates.into_iter().enumerate() {
|
||||
if !usage_planned.has_pressure() {
|
||||
debug!(
|
||||
no_candidates_evicted = i,
|
||||
"took enough candidates for pressure to be relieved"
|
||||
);
|
||||
break;
|
||||
}
|
||||
|
||||
if partition == MinResidentSizePartition::Below && warned.is_none() {
|
||||
warn!(?usage_pre, ?usage_planned, candidate_no=i, "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy");
|
||||
warned = Some(usage_planned);
|
||||
}
|
||||
|
||||
usage_planned.add_available_bytes(candidate.layer.file_size());
|
||||
|
||||
batched
|
||||
.entry(TimelineKey(candidate.timeline))
|
||||
.or_default()
|
||||
.push(candidate.layer);
|
||||
}
|
||||
|
||||
let usage_planned = match warned {
|
||||
Some(respecting_tenant_min_resident_size) => PlannedUsage {
|
||||
respecting_tenant_min_resident_size,
|
||||
fallback_to_global_lru: Some(usage_planned),
|
||||
},
|
||||
None => PlannedUsage {
|
||||
respecting_tenant_min_resident_size: usage_planned,
|
||||
fallback_to_global_lru: None,
|
||||
},
|
||||
};
|
||||
debug!(?usage_planned, "usage planned");
|
||||
|
||||
// phase2: evict victims batched by timeline
|
||||
|
||||
// After the loop, `usage_assumed` is the post-eviction usage,
|
||||
// according to internal accounting.
|
||||
let mut usage_assumed = usage_pre;
|
||||
let mut evictions_failed = LayerCount::default();
|
||||
for (timeline, batch) in batched {
|
||||
let tenant_id = timeline.tenant_id;
|
||||
let timeline_id = timeline.timeline_id;
|
||||
let batch_size = batch.len();
|
||||
|
||||
debug!(%timeline_id, "evicting batch for timeline");
|
||||
|
||||
async {
|
||||
let results = timeline.evict_layers(storage, &batch, cancel.clone()).await;
|
||||
|
||||
match results {
|
||||
Err(e) => {
|
||||
warn!("failed to evict batch: {:#}", e);
|
||||
}
|
||||
Ok(results) => {
|
||||
assert_eq!(results.len(), batch.len());
|
||||
for (result, layer) in results.into_iter().zip(batch.iter()) {
|
||||
match result {
|
||||
Some(Ok(true)) => {
|
||||
usage_assumed.add_available_bytes(layer.file_size());
|
||||
}
|
||||
Some(Ok(false)) => {
|
||||
// this is:
|
||||
// - Replacement::{NotFound, Unexpected}
|
||||
// - it cannot be is_remote_layer, filtered already
|
||||
evictions_failed.file_sizes += layer.file_size();
|
||||
evictions_failed.count += 1;
|
||||
}
|
||||
None => {
|
||||
assert!(cancel.is_cancelled());
|
||||
return;
|
||||
}
|
||||
Some(Err(e)) => {
|
||||
// we really shouldn't be getting this, precondition failure
|
||||
error!("failed to evict layer: {:#}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
.instrument(tracing::info_span!("evict_batch", %tenant_id, %timeline_id, batch_size))
|
||||
.await;
|
||||
|
||||
if cancel.is_cancelled() {
|
||||
return Ok(IterationOutcome::Cancelled);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(IterationOutcome::Finished(IterationOutcomeFinished {
|
||||
before: usage_pre,
|
||||
planned: usage_planned,
|
||||
assumed: AssumedUsage {
|
||||
projected_after: usage_assumed,
|
||||
failed: evictions_failed,
|
||||
},
|
||||
}))
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
struct EvictionCandidate {
|
||||
timeline: Arc<Timeline>,
|
||||
layer: Arc<dyn PersistentLayer>,
|
||||
last_activity_ts: SystemTime,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
|
||||
enum MinResidentSizePartition {
|
||||
Above,
|
||||
Below,
|
||||
}
|
||||
|
||||
enum EvictionCandidates {
|
||||
Cancelled,
|
||||
Finished(Vec<(MinResidentSizePartition, EvictionCandidate)>),
|
||||
}
|
||||
|
||||
/// Gather the eviction candidates.
|
||||
///
|
||||
/// The returned `Ok(EvictionCandidates::Finished(candidates))` is sorted in eviction
|
||||
/// order. A caller that evicts in that order, until pressure is relieved, implements
|
||||
/// the eviction policy outlined in the module comment.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// Imagine that there are two tenants, A and B, with five layers each, a-e.
|
||||
/// Each layer has size 100, and both tenant's min_resident_size is 150.
|
||||
/// The eviction order would be
|
||||
///
|
||||
/// ```text
|
||||
/// partition last_activity_ts tenant/layer
|
||||
/// Above 18:30 A/c
|
||||
/// Above 19:00 A/b
|
||||
/// Above 18:29 B/c
|
||||
/// Above 19:05 B/b
|
||||
/// Above 20:00 B/a
|
||||
/// Above 20:03 A/a
|
||||
/// Below 20:30 A/d
|
||||
/// Below 20:40 B/d
|
||||
/// Below 20:45 B/e
|
||||
/// Below 20:58 A/e
|
||||
/// ```
|
||||
///
|
||||
/// Now, if we need to evict 300 bytes to relieve pressure, we'd evict `A/c, A/b, B/c`.
|
||||
/// They are all in the `Above` partition, so, we respected each tenant's min_resident_size.
|
||||
///
|
||||
/// But, if we need to evict 900 bytes to relieve pressure, we'd evict
|
||||
/// `A/c, A/b, B/c, B/b, B/a, A/a, A/d, B/d, B/e`, reaching into the `Below` partition
|
||||
/// after exhauting the `Above` partition.
|
||||
/// So, we did not respect each tenant's min_resident_size.
|
||||
async fn collect_eviction_candidates(
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<EvictionCandidates> {
|
||||
// get a snapshot of the list of tenants
|
||||
let tenants = tenant::mgr::list_tenants()
|
||||
.await
|
||||
.context("get list of tenants")?;
|
||||
|
||||
let mut candidates = Vec::new();
|
||||
|
||||
for (tenant_id, _state) in &tenants {
|
||||
if cancel.is_cancelled() {
|
||||
return Ok(EvictionCandidates::Cancelled);
|
||||
}
|
||||
let tenant = match tenant::mgr::get_tenant(*tenant_id, true).await {
|
||||
Ok(tenant) => tenant,
|
||||
Err(e) => {
|
||||
// this can happen if tenant has lifecycle transition after we fetched it
|
||||
debug!("failed to get tenant: {e:#}");
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
// collect layers from all timelines in this tenant
|
||||
//
|
||||
// If one of the timelines becomes `!is_active()` during the iteration,
|
||||
// for example because we're shutting down, then `max_layer_size` can be too small.
|
||||
// That's OK. This code only runs under a disk pressure situation, and being
|
||||
// a little unfair to tenants during shutdown in such a situation is tolerable.
|
||||
let mut tenant_candidates = Vec::new();
|
||||
let mut max_layer_size = 0;
|
||||
for tl in tenant.list_timelines() {
|
||||
if !tl.is_active() {
|
||||
continue;
|
||||
}
|
||||
let info = tl.get_local_layers_for_disk_usage_eviction();
|
||||
debug!(tenant_id=%tl.tenant_id, timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len());
|
||||
tenant_candidates.extend(
|
||||
info.resident_layers
|
||||
.into_iter()
|
||||
.map(|layer_infos| (tl.clone(), layer_infos)),
|
||||
);
|
||||
max_layer_size = max_layer_size.max(info.max_layer_size.unwrap_or(0));
|
||||
|
||||
if cancel.is_cancelled() {
|
||||
return Ok(EvictionCandidates::Cancelled);
|
||||
}
|
||||
}
|
||||
|
||||
// `min_resident_size` defaults to maximum layer file size of the tenant.
|
||||
// This ensures that each tenant can have at least one layer resident at a given time,
|
||||
// ensuring forward progress for a single Timeline::get in that tenant.
|
||||
// It's a questionable heuristic since, usually, there are many Timeline::get
|
||||
// requests going on for a tenant, and, at least in Neon prod, the median
|
||||
// layer file size is much smaller than the compaction target size.
|
||||
// We could be better here, e.g., sum of all L0 layers + most recent L1 layer.
|
||||
// That's what's typically used by the various background loops.
|
||||
//
|
||||
// The default can be overriden with a fixed value in the tenant conf.
|
||||
// A default override can be put in the default tenant conf in the pageserver.toml.
|
||||
let min_resident_size = if let Some(s) = tenant.get_min_resident_size_override() {
|
||||
debug!(
|
||||
tenant_id=%tenant.tenant_id(),
|
||||
overriden_size=s,
|
||||
"using overridden min resident size for tenant"
|
||||
);
|
||||
s
|
||||
} else {
|
||||
debug!(
|
||||
tenant_id=%tenant.tenant_id(),
|
||||
max_layer_size,
|
||||
"using max layer size as min_resident_size for tenant",
|
||||
);
|
||||
max_layer_size
|
||||
};
|
||||
|
||||
// Sort layers most-recently-used first, then partition by
|
||||
// cumsum above/below min_resident_size.
|
||||
tenant_candidates
|
||||
.sort_unstable_by_key(|(_, layer_info)| std::cmp::Reverse(layer_info.last_activity_ts));
|
||||
let mut cumsum: i128 = 0;
|
||||
for (timeline, layer_info) in tenant_candidates.into_iter() {
|
||||
let file_size = layer_info.file_size();
|
||||
let candidate = EvictionCandidate {
|
||||
timeline,
|
||||
last_activity_ts: layer_info.last_activity_ts,
|
||||
layer: layer_info.layer,
|
||||
};
|
||||
let partition = if cumsum > min_resident_size as i128 {
|
||||
MinResidentSizePartition::Above
|
||||
} else {
|
||||
MinResidentSizePartition::Below
|
||||
};
|
||||
candidates.push((partition, candidate));
|
||||
cumsum += i128::from(file_size);
|
||||
}
|
||||
}
|
||||
|
||||
debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
|
||||
"as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first");
|
||||
candidates
|
||||
.sort_unstable_by_key(|(partition, candidate)| (*partition, candidate.last_activity_ts));
|
||||
|
||||
Ok(EvictionCandidates::Finished(candidates))
|
||||
}
|
||||
|
||||
struct TimelineKey(Arc<Timeline>);
|
||||
|
||||
impl PartialEq for TimelineKey {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
Arc::ptr_eq(&self.0, &other.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl Eq for TimelineKey {}
|
||||
|
||||
impl std::hash::Hash for TimelineKey {
|
||||
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
|
||||
Arc::as_ptr(&self.0).hash(state);
|
||||
}
|
||||
}
|
||||
|
||||
impl std::ops::Deref for TimelineKey {
|
||||
type Target = Timeline;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
self.0.as_ref()
|
||||
}
|
||||
}
|
||||
|
||||
mod filesystem_level_usage {
|
||||
use std::path::Path;
|
||||
|
||||
use anyhow::Context;
|
||||
|
||||
use crate::statvfs::Statvfs;
|
||||
|
||||
use super::DiskUsageEvictionTaskConfig;
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct Usage<'a> {
|
||||
config: &'a DiskUsageEvictionTaskConfig,
|
||||
|
||||
/// Filesystem capacity
|
||||
total_bytes: u64,
|
||||
/// Free filesystem space
|
||||
avail_bytes: u64,
|
||||
}
|
||||
|
||||
impl super::Usage for Usage<'_> {
|
||||
fn has_pressure(&self) -> bool {
|
||||
let usage_pct =
|
||||
(100.0 * (1.0 - ((self.avail_bytes as f64) / (self.total_bytes as f64)))) as u64;
|
||||
|
||||
let pressures = [
|
||||
(
|
||||
"min_avail_bytes",
|
||||
self.avail_bytes < self.config.min_avail_bytes,
|
||||
),
|
||||
(
|
||||
"max_usage_pct",
|
||||
usage_pct > self.config.max_usage_pct.get() as u64,
|
||||
),
|
||||
];
|
||||
|
||||
pressures.into_iter().any(|(_, has_pressure)| has_pressure)
|
||||
}
|
||||
|
||||
fn add_available_bytes(&mut self, bytes: u64) {
|
||||
self.avail_bytes += bytes;
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get<'a>(
|
||||
tenants_dir: &Path,
|
||||
config: &'a DiskUsageEvictionTaskConfig,
|
||||
) -> anyhow::Result<Usage<'a>> {
|
||||
let mock_config = {
|
||||
#[cfg(feature = "testing")]
|
||||
{
|
||||
config.mock_statvfs.as_ref()
|
||||
}
|
||||
#[cfg(not(feature = "testing"))]
|
||||
{
|
||||
None
|
||||
}
|
||||
};
|
||||
|
||||
let stat = Statvfs::get(tenants_dir, mock_config)
|
||||
.context("statvfs failed, presumably directory got unlinked")?;
|
||||
|
||||
// https://unix.stackexchange.com/a/703650
|
||||
let blocksize = if stat.fragment_size() > 0 {
|
||||
stat.fragment_size()
|
||||
} else {
|
||||
stat.block_size()
|
||||
};
|
||||
|
||||
// use blocks_available (b_avail) since, pageserver runs as unprivileged user
|
||||
let avail_bytes = stat.blocks_available() * blocksize;
|
||||
let total_bytes = stat.blocks() * blocksize;
|
||||
|
||||
Ok(Usage {
|
||||
config,
|
||||
total_bytes,
|
||||
avail_bytes,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -27,31 +27,6 @@ paths:
|
||||
id:
|
||||
type: integer
|
||||
|
||||
/v1/disk_usage_eviction/run:
|
||||
put:
|
||||
description: Do an iteration of disk-usage-based eviction to evict a given amount of disk space.
|
||||
security: []
|
||||
requestBody:
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
type: object
|
||||
required:
|
||||
- evict_bytes
|
||||
properties:
|
||||
evict_bytes:
|
||||
type: integer
|
||||
responses:
|
||||
"200":
|
||||
description: |
|
||||
The run completed.
|
||||
This does not necessarily mean that we actually evicted `evict_bytes`.
|
||||
Examine the returned object for detail, or, just watch the actual effect of the call using `du` or `df`.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
type: object
|
||||
|
||||
/v1/tenant/{tenant_id}:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
@@ -208,12 +183,6 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"404":
|
||||
description: Timeline not found
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/NotFoundError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
@@ -382,13 +351,6 @@ paths:
|
||||
schema:
|
||||
type: string
|
||||
format: hex
|
||||
- name: detach_ignored
|
||||
in: query
|
||||
required: false
|
||||
schema:
|
||||
type: boolean
|
||||
description: |
|
||||
When true, allow to detach a tenant which state is ignored.
|
||||
post:
|
||||
description: |
|
||||
Remove tenant data (including all corresponding timelines) from pageserver's memory and file system.
|
||||
@@ -414,12 +376,6 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"404":
|
||||
description: Tenant not found
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/NotFoundError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
|
||||
@@ -18,11 +18,10 @@ use super::models::{
|
||||
TimelineCreateRequest, TimelineGcRequest, TimelineInfo,
|
||||
};
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::disk_usage_eviction_task;
|
||||
use crate::pgdatadir_mapping::LsnForTimestamp;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::config::TenantConfOpt;
|
||||
use crate::tenant::mgr::{TenantMapInsertError, TenantStateError};
|
||||
use crate::tenant::mgr::TenantMapInsertError;
|
||||
use crate::tenant::size::ModelInputs;
|
||||
use crate::tenant::storage_layer::LayerAccessStatsReset;
|
||||
use crate::tenant::{PageReconstructError, Timeline};
|
||||
@@ -49,7 +48,6 @@ struct State {
|
||||
auth: Option<Arc<JwtAuth>>,
|
||||
allowlist_routes: Vec<Uri>,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
|
||||
}
|
||||
|
||||
impl State {
|
||||
@@ -57,7 +55,6 @@ impl State {
|
||||
conf: &'static PageServerConf,
|
||||
auth: Option<Arc<JwtAuth>>,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
|
||||
) -> anyhow::Result<Self> {
|
||||
let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml"]
|
||||
.iter()
|
||||
@@ -68,7 +65,6 @@ impl State {
|
||||
auth,
|
||||
allowlist_routes,
|
||||
remote_storage,
|
||||
disk_usage_eviction_state,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -86,75 +82,38 @@ fn get_config(request: &Request<Body>) -> &'static PageServerConf {
|
||||
get_state(request).conf
|
||||
}
|
||||
|
||||
/// Check that the requester is authorized to operate on given tenant
|
||||
fn check_permission(request: &Request<Body>, tenant_id: Option<TenantId>) -> Result<(), ApiError> {
|
||||
check_permission_with(request, |claims| {
|
||||
crate::auth::check_permission(claims, tenant_id)
|
||||
})
|
||||
}
|
||||
|
||||
impl From<PageReconstructError> for ApiError {
|
||||
fn from(pre: PageReconstructError) -> ApiError {
|
||||
match pre {
|
||||
PageReconstructError::Other(pre) => ApiError::InternalServerError(pre),
|
||||
PageReconstructError::NeedsDownload(_, _) => {
|
||||
// This shouldn't happen, because we use a RequestContext that requests to
|
||||
// download any missing layer files on-demand.
|
||||
ApiError::InternalServerError(anyhow::anyhow!("need to download remote layer file"))
|
||||
}
|
||||
PageReconstructError::Cancelled => {
|
||||
ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
|
||||
}
|
||||
PageReconstructError::WalRedo(pre) => {
|
||||
ApiError::InternalServerError(anyhow::Error::new(pre))
|
||||
}
|
||||
fn apierror_from_prerror(err: PageReconstructError) -> ApiError {
|
||||
match err {
|
||||
PageReconstructError::Other(err) => ApiError::InternalServerError(err),
|
||||
PageReconstructError::NeedsDownload(_, _) => {
|
||||
// This shouldn't happen, because we use a RequestContext that requests to
|
||||
// download any missing layer files on-demand.
|
||||
ApiError::InternalServerError(anyhow::anyhow!("need to download remote layer file"))
|
||||
}
|
||||
PageReconstructError::Cancelled => {
|
||||
ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
|
||||
}
|
||||
PageReconstructError::WalRedo(err) => {
|
||||
ApiError::InternalServerError(anyhow::Error::new(err))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<TenantMapInsertError> for ApiError {
|
||||
fn from(tmie: TenantMapInsertError) -> ApiError {
|
||||
match tmie {
|
||||
TenantMapInsertError::StillInitializing | TenantMapInsertError::ShuttingDown => {
|
||||
ApiError::InternalServerError(anyhow::Error::new(tmie))
|
||||
}
|
||||
TenantMapInsertError::TenantAlreadyExists(id, state) => {
|
||||
ApiError::Conflict(format!("tenant {id} already exists, state: {state:?}"))
|
||||
}
|
||||
TenantMapInsertError::Closure(e) => ApiError::InternalServerError(e),
|
||||
fn apierror_from_tenant_map_insert_error(e: TenantMapInsertError) -> ApiError {
|
||||
match e {
|
||||
TenantMapInsertError::StillInitializing | TenantMapInsertError::ShuttingDown => {
|
||||
ApiError::InternalServerError(anyhow::Error::new(e))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<TenantStateError> for ApiError {
|
||||
fn from(tse: TenantStateError) -> ApiError {
|
||||
match tse {
|
||||
TenantStateError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid)),
|
||||
_ => ApiError::InternalServerError(anyhow::Error::new(tse)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<crate::tenant::DeleteTimelineError> for ApiError {
|
||||
fn from(value: crate::tenant::DeleteTimelineError) -> Self {
|
||||
use crate::tenant::DeleteTimelineError::*;
|
||||
match value {
|
||||
NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found")),
|
||||
HasChildren => ApiError::BadRequest(anyhow::anyhow!(
|
||||
"Cannot delete timeline which has child timelines"
|
||||
)),
|
||||
Other(e) => ApiError::InternalServerError(e),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<crate::tenant::mgr::DeleteTimelineError> for ApiError {
|
||||
fn from(value: crate::tenant::mgr::DeleteTimelineError) -> Self {
|
||||
use crate::tenant::mgr::DeleteTimelineError::*;
|
||||
match value {
|
||||
Tenant(t) => ApiError::from(t),
|
||||
Timeline(t) => ApiError::from(t),
|
||||
TenantMapInsertError::TenantAlreadyExists(id, state) => {
|
||||
ApiError::Conflict(format!("tenant {id} already exists, state: {state:?}"))
|
||||
}
|
||||
TenantMapInsertError::Closure(e) => ApiError::InternalServerError(e),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -212,7 +171,7 @@ fn build_timeline_info_common(
|
||||
None
|
||||
}
|
||||
};
|
||||
let current_physical_size = Some(timeline.layer_size_sum());
|
||||
let current_physical_size = Some(timeline.layer_size_sum().approximate_is_ok());
|
||||
let state = timeline.current_state();
|
||||
let remote_consistent_lsn = timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0));
|
||||
|
||||
@@ -258,7 +217,9 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Error);
|
||||
|
||||
let tenant = mgr::get_tenant(tenant_id, true).await?;
|
||||
let tenant = mgr::get_tenant(tenant_id, true)
|
||||
.await
|
||||
.map_err(ApiError::NotFound)?;
|
||||
match tenant.create_timeline(
|
||||
new_timeline_id,
|
||||
request_data.ancestor_timeline_id.map(TimelineId::from),
|
||||
@@ -288,7 +249,9 @@ async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>,
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
|
||||
let response_data = async {
|
||||
let tenant = mgr::get_tenant(tenant_id, true).await?;
|
||||
let tenant = mgr::get_tenant(tenant_id, true)
|
||||
.await
|
||||
.map_err(ApiError::NotFound)?;
|
||||
let timelines = tenant.list_timelines();
|
||||
|
||||
let mut response_data = Vec::with_capacity(timelines.len());
|
||||
@@ -304,7 +267,7 @@ async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>,
|
||||
|
||||
response_data.push(timeline_info);
|
||||
}
|
||||
Ok::<Vec<TimelineInfo>, ApiError>(response_data)
|
||||
Ok(response_data)
|
||||
}
|
||||
.instrument(info_span!("timeline_list", tenant = %tenant_id))
|
||||
.await?;
|
||||
@@ -323,7 +286,9 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
|
||||
let timeline_info = async {
|
||||
let tenant = mgr::get_tenant(tenant_id, true).await?;
|
||||
let tenant = mgr::get_tenant(tenant_id, true)
|
||||
.await
|
||||
.map_err(ApiError::NotFound)?;
|
||||
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, false)
|
||||
@@ -359,7 +324,10 @@ async fn get_lsn_by_timestamp_handler(request: Request<Body>) -> Result<Response
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
|
||||
let result = timeline.find_lsn_for_timestamp(timestamp_pg, &ctx).await?;
|
||||
let result = timeline
|
||||
.find_lsn_for_timestamp(timestamp_pg, &ctx)
|
||||
.await
|
||||
.map_err(apierror_from_prerror)?;
|
||||
|
||||
let result = match result {
|
||||
LsnForTimestamp::Present(lsn) => format!("{lsn}"),
|
||||
@@ -384,7 +352,8 @@ async fn tenant_attach_handler(request: Request<Body>) -> Result<Response<Body>,
|
||||
if let Some(remote_storage) = &state.remote_storage {
|
||||
mgr::attach_tenant(state.conf, tenant_id, remote_storage.clone(), &ctx)
|
||||
.instrument(info_span!("tenant_attach", tenant = %tenant_id))
|
||||
.await?;
|
||||
.await
|
||||
.map_err(apierror_from_tenant_map_insert_error)?;
|
||||
} else {
|
||||
return Err(ApiError::BadRequest(anyhow!(
|
||||
"attach_tenant is not possible because pageserver was configured without remote storage"
|
||||
@@ -403,7 +372,11 @@ async fn timeline_delete_handler(request: Request<Body>) -> Result<Response<Body
|
||||
|
||||
mgr::delete_timeline(tenant_id, timeline_id, &ctx)
|
||||
.instrument(info_span!("timeline_delete", tenant = %tenant_id, timeline = %timeline_id))
|
||||
.await?;
|
||||
.await
|
||||
// FIXME: Errors from `delete_timeline` can occur for a number of reasons, incuding both
|
||||
// user and internal errors. Replace this with better handling once the error type permits
|
||||
// it.
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
@@ -411,13 +384,15 @@ async fn timeline_delete_handler(request: Request<Body>) -> Result<Response<Body
|
||||
async fn tenant_detach_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
let detach_ignored: Option<bool> = parse_query_param(&request, "detach_ignored")?;
|
||||
|
||||
let state = get_state(&request);
|
||||
let conf = state.conf;
|
||||
mgr::detach_tenant(conf, tenant_id, detach_ignored.unwrap_or(false))
|
||||
mgr::detach_tenant(conf, tenant_id)
|
||||
.instrument(info_span!("tenant_detach", tenant = %tenant_id))
|
||||
.await?;
|
||||
.await
|
||||
// FIXME: Errors from `detach_tenant` can be caused by both both user and internal errors.
|
||||
// Replace this with better handling once the error type permits it.
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
@@ -431,7 +406,8 @@ async fn tenant_load_handler(request: Request<Body>) -> Result<Response<Body>, A
|
||||
let state = get_state(&request);
|
||||
mgr::load_tenant(state.conf, tenant_id, state.remote_storage.clone(), &ctx)
|
||||
.instrument(info_span!("load", tenant = %tenant_id))
|
||||
.await?;
|
||||
.await
|
||||
.map_err(apierror_from_tenant_map_insert_error)?;
|
||||
|
||||
json_response(StatusCode::ACCEPTED, ())
|
||||
}
|
||||
@@ -444,7 +420,10 @@ async fn tenant_ignore_handler(request: Request<Body>) -> Result<Response<Body>,
|
||||
let conf = state.conf;
|
||||
mgr::ignore_tenant(conf, tenant_id)
|
||||
.instrument(info_span!("ignore_tenant", tenant = %tenant_id))
|
||||
.await?;
|
||||
.await
|
||||
// FIXME: Errors from `ignore_tenant` can be caused by both both user and internal errors.
|
||||
// Replace this with better handling once the error type permits it.
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
@@ -479,7 +458,7 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
|
||||
// Calculate total physical size of all timelines
|
||||
let mut current_physical_size = 0;
|
||||
for timeline in tenant.list_timelines().iter() {
|
||||
current_physical_size += timeline.layer_size_sum();
|
||||
current_physical_size += timeline.layer_size_sum().approximate_is_ok();
|
||||
}
|
||||
|
||||
let state = tenant.current_state();
|
||||
@@ -518,7 +497,9 @@ async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, A
|
||||
let headers = request.headers();
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let tenant = mgr::get_tenant(tenant_id, true).await?;
|
||||
let tenant = mgr::get_tenant(tenant_id, true)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
// this can be long operation
|
||||
let inputs = tenant
|
||||
@@ -766,16 +747,6 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
);
|
||||
}
|
||||
|
||||
if let Some(eviction_policy) = request_data.eviction_policy {
|
||||
tenant_conf.eviction_policy = Some(
|
||||
serde_json::from_value(eviction_policy)
|
||||
.context("parse field `eviction_policy`")
|
||||
.map_err(ApiError::BadRequest)?,
|
||||
);
|
||||
}
|
||||
|
||||
tenant_conf.min_resident_size_override = request_data.min_resident_size_override;
|
||||
|
||||
let target_tenant_id = request_data
|
||||
.new_tenant_id
|
||||
.map(TenantId::from)
|
||||
@@ -791,7 +762,8 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
&ctx,
|
||||
)
|
||||
.instrument(info_span!("tenant_create", tenant = ?target_tenant_id))
|
||||
.await?;
|
||||
.await
|
||||
.map_err(apierror_from_tenant_map_insert_error)?;
|
||||
|
||||
// We created the tenant. Existing API semantics are that the tenant
|
||||
// is Active when this function returns.
|
||||
@@ -815,7 +787,9 @@ async fn get_tenant_config_handler(request: Request<Body>) -> Result<Response<Bo
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let tenant = mgr::get_tenant(tenant_id, false).await?;
|
||||
let tenant = mgr::get_tenant(tenant_id, false)
|
||||
.await
|
||||
.map_err(ApiError::NotFound)?;
|
||||
|
||||
let response = HashMap::from([
|
||||
(
|
||||
@@ -907,26 +881,13 @@ async fn update_tenant_config_handler(
|
||||
);
|
||||
}
|
||||
|
||||
tenant_conf.min_resident_size_override = request_data.min_resident_size_override;
|
||||
|
||||
let state = get_state(&request);
|
||||
mgr::set_new_tenant_config(state.conf, tenant_conf, tenant_id)
|
||||
.instrument(info_span!("tenant_config", tenant = ?tenant_id))
|
||||
.await?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
/// Testing helper to transition a tenant to [`crate::tenant::TenantState::Broken`].
|
||||
#[cfg(feature = "testing")]
|
||||
async fn handle_tenant_break(r: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&r, "tenant_id")?;
|
||||
|
||||
let tenant = crate::tenant::mgr::get_tenant(tenant_id, true)
|
||||
.await
|
||||
.map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?;
|
||||
|
||||
tenant.set_broken("broken from test");
|
||||
// FIXME: `update_tenant_config` can fail because of both user and internal errors.
|
||||
// Replace this `map_err` with better error handling once the type permits it
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
@@ -1063,7 +1024,9 @@ async fn active_timeline_of_active_tenant(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
) -> Result<Arc<Timeline>, ApiError> {
|
||||
let tenant = mgr::get_tenant(tenant_id, true).await?;
|
||||
let tenant = mgr::get_tenant(tenant_id, true)
|
||||
.await
|
||||
.map_err(ApiError::NotFound)?;
|
||||
tenant
|
||||
.get_timeline(timeline_id, true)
|
||||
.map_err(ApiError::NotFound)
|
||||
@@ -1080,89 +1043,6 @@ async fn always_panic_handler(req: Request<Body>) -> Result<Response<Body>, ApiE
|
||||
json_response(StatusCode::NO_CONTENT, ())
|
||||
}
|
||||
|
||||
async fn disk_usage_eviction_run(mut r: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permission(&r, None)?;
|
||||
|
||||
#[derive(Debug, Clone, Copy, serde::Serialize, serde::Deserialize)]
|
||||
struct Config {
|
||||
/// How many bytes to evict before reporting that pressure is relieved.
|
||||
evict_bytes: u64,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, serde::Serialize)]
|
||||
struct Usage {
|
||||
// remains unchanged after instantiation of the struct
|
||||
config: Config,
|
||||
// updated by `add_available_bytes`
|
||||
freed_bytes: u64,
|
||||
}
|
||||
|
||||
impl crate::disk_usage_eviction_task::Usage for Usage {
|
||||
fn has_pressure(&self) -> bool {
|
||||
self.config.evict_bytes > self.freed_bytes
|
||||
}
|
||||
|
||||
fn add_available_bytes(&mut self, bytes: u64) {
|
||||
self.freed_bytes += bytes;
|
||||
}
|
||||
}
|
||||
|
||||
let config = json_request::<Config>(&mut r)
|
||||
.await
|
||||
.map_err(|_| ApiError::BadRequest(anyhow::anyhow!("invalid JSON body")))?;
|
||||
|
||||
let usage = Usage {
|
||||
config,
|
||||
freed_bytes: 0,
|
||||
};
|
||||
|
||||
use crate::task_mgr::MGMT_REQUEST_RUNTIME;
|
||||
|
||||
let (tx, rx) = tokio::sync::oneshot::channel();
|
||||
|
||||
let state = get_state(&r);
|
||||
|
||||
let Some(storage) = state.remote_storage.clone() else {
|
||||
return Err(ApiError::InternalServerError(anyhow::anyhow!(
|
||||
"remote storage not configured, cannot run eviction iteration"
|
||||
)))
|
||||
};
|
||||
|
||||
let state = state.disk_usage_eviction_state.clone();
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
let child_cancel = cancel.clone();
|
||||
let _g = cancel.drop_guard();
|
||||
|
||||
crate::task_mgr::spawn(
|
||||
MGMT_REQUEST_RUNTIME.handle(),
|
||||
TaskKind::DiskUsageEviction,
|
||||
None,
|
||||
None,
|
||||
"ondemand disk usage eviction",
|
||||
false,
|
||||
async move {
|
||||
let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
|
||||
&state,
|
||||
&storage,
|
||||
usage,
|
||||
&child_cancel,
|
||||
)
|
||||
.await;
|
||||
|
||||
info!(?res, "disk_usage_eviction_task_iteration_impl finished");
|
||||
|
||||
let _ = tx.send(res);
|
||||
Ok(())
|
||||
}
|
||||
.in_current_span(),
|
||||
);
|
||||
|
||||
let response = rx.await.unwrap().map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, response)
|
||||
}
|
||||
|
||||
async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
json_response(
|
||||
StatusCode::NOT_FOUND,
|
||||
@@ -1175,7 +1055,6 @@ pub fn make_router(
|
||||
launch_ts: &'static LaunchTimestamp,
|
||||
auth: Option<Arc<JwtAuth>>,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
|
||||
) -> anyhow::Result<RouterBuilder<hyper::Body, ApiError>> {
|
||||
let spec = include_bytes!("openapi_spec.yml");
|
||||
let mut router = attach_openapi_ui(endpoint::make_router(), spec, "/swagger.yml", "/v1/doc");
|
||||
@@ -1220,8 +1099,7 @@ pub fn make_router(
|
||||
|
||||
Ok(router
|
||||
.data(Arc::new(
|
||||
State::new(conf, auth, remote_storage, disk_usage_eviction_state)
|
||||
.context("Failed to initialize router state")?,
|
||||
State::new(conf, auth, remote_storage).context("Failed to initialize router state")?,
|
||||
))
|
||||
.get("/v1/status", |r| RequestSpan(status_handler).handle(r))
|
||||
.put(
|
||||
@@ -1302,13 +1180,6 @@ pub fn make_router(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name",
|
||||
|r| RequestSpan(evict_timeline_layer_handler).handle(r),
|
||||
)
|
||||
.put("/v1/disk_usage_eviction/run", |r| {
|
||||
RequestSpan(disk_usage_eviction_run).handle(r)
|
||||
})
|
||||
.put(
|
||||
"/v1/tenant/:tenant_id/break",
|
||||
testing_api!("set tenant state to broken", handle_tenant_break),
|
||||
)
|
||||
.get("/v1/panic", |r| RequestSpan(always_panic_handler).handle(r))
|
||||
.any(handler_404))
|
||||
}
|
||||
|
||||
@@ -4,7 +4,6 @@ pub mod broker_client;
|
||||
pub mod config;
|
||||
pub mod consumption_metrics;
|
||||
pub mod context;
|
||||
pub mod disk_usage_eviction_task;
|
||||
pub mod http;
|
||||
pub mod import_datadir;
|
||||
pub mod keyspace;
|
||||
@@ -13,7 +12,6 @@ pub mod page_cache;
|
||||
pub mod page_service;
|
||||
pub mod pgdatadir_mapping;
|
||||
pub mod repository;
|
||||
pub(crate) mod statvfs;
|
||||
pub mod task_mgr;
|
||||
pub mod tenant;
|
||||
pub mod trace;
|
||||
|
||||
@@ -9,18 +9,22 @@ use once_cell::sync::Lazy;
|
||||
use pageserver_api::models::state;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
/// Prometheus histogram buckets (in seconds) for operations in the critical
|
||||
/// path. In other words, operations that directly affect that latency of user
|
||||
/// queries.
|
||||
///
|
||||
/// The buckets capture the majority of latencies in the microsecond and
|
||||
/// millisecond range but also extend far enough up to distinguish "bad" from
|
||||
/// "really bad".
|
||||
const CRITICAL_OP_BUCKETS: &[f64] = &[
|
||||
0.000_001, 0.000_010, 0.000_100, // 1 us, 10 us, 100 us
|
||||
0.001_000, 0.010_000, 0.100_000, // 1 ms, 10 ms, 100 ms
|
||||
1.0, 10.0, 100.0, // 1 s, 10 s, 100 s
|
||||
];
|
||||
/// Prometheus histogram buckets (in seconds) that capture the majority of
|
||||
/// latencies in the microsecond range but also extend far enough up to distinguish
|
||||
/// "bad" from "really bad".
|
||||
fn get_buckets_for_critical_operations() -> Vec<f64> {
|
||||
let buckets_per_digit = 5;
|
||||
let min_exponent = -6;
|
||||
let max_exponent = 2;
|
||||
|
||||
let mut buckets = vec![];
|
||||
// Compute 10^(exp / buckets_per_digit) instead of 10^(1/buckets_per_digit)^exp
|
||||
// because it's more numerically stable and doesn't result in numbers like 9.999999
|
||||
for exp in (min_exponent * buckets_per_digit)..=(max_exponent * buckets_per_digit) {
|
||||
buckets.push(10_f64.powf(exp as f64 / buckets_per_digit as f64))
|
||||
}
|
||||
buckets
|
||||
}
|
||||
|
||||
// Metrics collected on operations on the storage repository.
|
||||
const STORAGE_TIME_OPERATIONS: &[&str] = &[
|
||||
@@ -51,15 +55,12 @@ pub static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
// Buckets for background operations like compaction, GC, size calculation
|
||||
const STORAGE_OP_BUCKETS: &[f64] = &[0.010, 0.100, 1.0, 10.0, 100.0, 1000.0];
|
||||
|
||||
pub static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
"pageserver_storage_operations_seconds_global",
|
||||
"Time spent on storage operations",
|
||||
&["operation"],
|
||||
STORAGE_OP_BUCKETS.into(),
|
||||
get_buckets_for_critical_operations(),
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
@@ -70,7 +71,7 @@ static RECONSTRUCT_TIME: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
"pageserver_getpage_reconstruct_seconds",
|
||||
"Time spent in reconstruct_value",
|
||||
&["tenant_id", "timeline_id"],
|
||||
CRITICAL_OP_BUCKETS.into(),
|
||||
get_buckets_for_critical_operations(),
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
@@ -89,7 +90,7 @@ static WAIT_LSN_TIME: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
"pageserver_wait_lsn_seconds",
|
||||
"Time spent waiting for WAL to arrive",
|
||||
&["tenant_id", "timeline_id"],
|
||||
CRITICAL_OP_BUCKETS.into(),
|
||||
get_buckets_for_critical_operations(),
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
@@ -194,101 +195,15 @@ static PERSISTENT_BYTES_WRITTEN: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static EVICTIONS: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_evictions",
|
||||
"Number of layers evicted from the pageserver",
|
||||
&["tenant_id", "timeline_id"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static EVICTIONS_WITH_LOW_RESIDENCE_DURATION: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_evictions_with_low_residence_duration",
|
||||
"If a layer is evicted that was resident for less than `low_threshold`, it is counted to this counter. \
|
||||
Residence duration is determined using the `residence_duration_data_source`.",
|
||||
&["tenant_id", "timeline_id", "residence_duration_data_source", "low_threshold_secs"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
/// Each [`Timeline`]'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
|
||||
#[derive(Debug)]
|
||||
pub struct EvictionsWithLowResidenceDuration {
|
||||
data_source: &'static str,
|
||||
threshold: Duration,
|
||||
counter: Option<IntCounter>,
|
||||
}
|
||||
|
||||
pub struct EvictionsWithLowResidenceDurationBuilder {
|
||||
data_source: &'static str,
|
||||
threshold: Duration,
|
||||
}
|
||||
|
||||
impl EvictionsWithLowResidenceDurationBuilder {
|
||||
pub fn new(data_source: &'static str, threshold: Duration) -> Self {
|
||||
Self {
|
||||
data_source,
|
||||
threshold,
|
||||
}
|
||||
}
|
||||
|
||||
fn build(&self, tenant_id: &str, timeline_id: &str) -> EvictionsWithLowResidenceDuration {
|
||||
let counter = EVICTIONS_WITH_LOW_RESIDENCE_DURATION
|
||||
.get_metric_with_label_values(&[
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
self.data_source,
|
||||
&EvictionsWithLowResidenceDuration::threshold_label_value(self.threshold),
|
||||
])
|
||||
.unwrap();
|
||||
EvictionsWithLowResidenceDuration {
|
||||
data_source: self.data_source,
|
||||
threshold: self.threshold,
|
||||
counter: Some(counter),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl EvictionsWithLowResidenceDuration {
|
||||
fn threshold_label_value(threshold: Duration) -> String {
|
||||
format!("{}", threshold.as_secs())
|
||||
}
|
||||
|
||||
pub fn observe(&self, observed_value: Duration) {
|
||||
if self.threshold < observed_value {
|
||||
self.counter
|
||||
.as_ref()
|
||||
.expect("nobody calls this function after `remove_from_vec`")
|
||||
.inc();
|
||||
}
|
||||
}
|
||||
|
||||
// This could be a `Drop` impl, but, we need the `tenant_id` and `timeline_id`.
|
||||
fn remove(&mut self, tenant_id: &str, timeline_id: &str) {
|
||||
let Some(_counter) = self.counter.take() else {
|
||||
return;
|
||||
};
|
||||
EVICTIONS_WITH_LOW_RESIDENCE_DURATION
|
||||
.remove_label_values(&[
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
self.data_source,
|
||||
&Self::threshold_label_value(self.threshold),
|
||||
])
|
||||
.expect("we own the metric, no-one else should remove it");
|
||||
}
|
||||
}
|
||||
|
||||
// Metrics collected on disk IO operations
|
||||
//
|
||||
// Roughly logarithmic scale.
|
||||
const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
|
||||
0.000030, // 30 usec
|
||||
0.001000, // 1000 usec
|
||||
0.030, // 30 ms
|
||||
1.000, // 1000 ms
|
||||
0.000001, // 1 usec
|
||||
0.00001, // 10 usec
|
||||
0.0001, // 100 usec
|
||||
0.001, // 1 msec
|
||||
0.01, // 10 msec
|
||||
0.1, // 100 msec
|
||||
1.0, // 1 sec
|
||||
];
|
||||
|
||||
const STORAGE_IO_TIME_OPERATIONS: &[&str] = &[
|
||||
@@ -323,12 +238,20 @@ const SMGR_QUERY_TIME_OPERATIONS: &[&str] = &[
|
||||
"get_db_size",
|
||||
];
|
||||
|
||||
const SMGR_QUERY_TIME_BUCKETS: &[f64] = &[
|
||||
0.00001, // 1/100000 s
|
||||
0.0001, 0.00015, 0.0002, 0.00025, 0.0003, 0.00035, 0.0005, 0.00075, // 1/10000 s
|
||||
0.001, 0.0025, 0.005, 0.0075, // 1/1000 s
|
||||
0.01, 0.0125, 0.015, 0.025, 0.05, // 1/100 s
|
||||
0.1, // 1/10 s
|
||||
];
|
||||
|
||||
pub static SMGR_QUERY_TIME: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
"pageserver_smgr_query_seconds",
|
||||
"Time spent on smgr query handling",
|
||||
&["smgr_query_type", "tenant_id", "timeline_id"],
|
||||
CRITICAL_OP_BUCKETS.into(),
|
||||
SMGR_QUERY_TIME_BUCKETS.into()
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
@@ -586,6 +509,7 @@ pub struct TimelineMetrics {
|
||||
pub flush_time_histo: StorageTimeMetrics,
|
||||
pub compact_time_histo: StorageTimeMetrics,
|
||||
pub create_images_time_histo: StorageTimeMetrics,
|
||||
pub init_logical_size_histo: StorageTimeMetrics,
|
||||
pub logical_size_histo: StorageTimeMetrics,
|
||||
pub load_layer_map_histo: StorageTimeMetrics,
|
||||
pub garbage_collect_histo: StorageTimeMetrics,
|
||||
@@ -596,16 +520,10 @@ pub struct TimelineMetrics {
|
||||
pub current_logical_size_gauge: UIntGauge,
|
||||
pub num_persistent_files_created: IntCounter,
|
||||
pub persistent_bytes_written: IntCounter,
|
||||
pub evictions: IntCounter,
|
||||
pub evictions_with_low_residence_duration: EvictionsWithLowResidenceDuration,
|
||||
}
|
||||
|
||||
impl TimelineMetrics {
|
||||
pub fn new(
|
||||
tenant_id: &TenantId,
|
||||
timeline_id: &TimelineId,
|
||||
evictions_with_low_residence_duration_builder: EvictionsWithLowResidenceDurationBuilder,
|
||||
) -> Self {
|
||||
pub fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self {
|
||||
let tenant_id = tenant_id.to_string();
|
||||
let timeline_id = timeline_id.to_string();
|
||||
let reconstruct_time_histo = RECONSTRUCT_TIME
|
||||
@@ -618,6 +536,8 @@ impl TimelineMetrics {
|
||||
let compact_time_histo = StorageTimeMetrics::new("compact", &tenant_id, &timeline_id);
|
||||
let create_images_time_histo =
|
||||
StorageTimeMetrics::new("create images", &tenant_id, &timeline_id);
|
||||
let init_logical_size_histo =
|
||||
StorageTimeMetrics::new("init logical size", &tenant_id, &timeline_id);
|
||||
let logical_size_histo = StorageTimeMetrics::new("logical size", &tenant_id, &timeline_id);
|
||||
let load_layer_map_histo =
|
||||
StorageTimeMetrics::new("load layer map", &tenant_id, &timeline_id);
|
||||
@@ -640,11 +560,6 @@ impl TimelineMetrics {
|
||||
let persistent_bytes_written = PERSISTENT_BYTES_WRITTEN
|
||||
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
|
||||
.unwrap();
|
||||
let evictions = EVICTIONS
|
||||
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
|
||||
.unwrap();
|
||||
let evictions_with_low_residence_duration =
|
||||
evictions_with_low_residence_duration_builder.build(&tenant_id, &timeline_id);
|
||||
|
||||
TimelineMetrics {
|
||||
tenant_id,
|
||||
@@ -654,6 +569,7 @@ impl TimelineMetrics {
|
||||
flush_time_histo,
|
||||
compact_time_histo,
|
||||
create_images_time_histo,
|
||||
init_logical_size_histo,
|
||||
logical_size_histo,
|
||||
garbage_collect_histo,
|
||||
load_layer_map_histo,
|
||||
@@ -663,8 +579,6 @@ impl TimelineMetrics {
|
||||
current_logical_size_gauge,
|
||||
num_persistent_files_created,
|
||||
persistent_bytes_written,
|
||||
evictions,
|
||||
evictions_with_low_residence_duration,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -681,9 +595,7 @@ impl Drop for TimelineMetrics {
|
||||
let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = EVICTIONS.remove_label_values(&[tenant_id, timeline_id]);
|
||||
self.evictions_with_low_residence_duration
|
||||
.remove(tenant_id, timeline_id);
|
||||
|
||||
for op in STORAGE_TIME_OPERATIONS {
|
||||
let _ =
|
||||
STORAGE_TIME_SUM_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]);
|
||||
@@ -718,7 +630,7 @@ use std::collections::HashMap;
|
||||
use std::pin::Pin;
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::task::{Context, Poll};
|
||||
use std::time::{Duration, Instant};
|
||||
use std::time::Instant;
|
||||
|
||||
pub struct RemoteTimelineClientMetrics {
|
||||
tenant_id: String,
|
||||
|
||||
@@ -1107,10 +1107,7 @@ async fn get_active_tenant_with_timeout(
|
||||
tenant_id: TenantId,
|
||||
_ctx: &RequestContext, /* require get a context to support cancellation in the future */
|
||||
) -> Result<Arc<Tenant>, GetActiveTenantError> {
|
||||
let tenant = match mgr::get_tenant(tenant_id, false).await {
|
||||
Ok(tenant) => tenant,
|
||||
Err(e) => return Err(GetActiveTenantError::Other(e.into())),
|
||||
};
|
||||
let tenant = mgr::get_tenant(tenant_id, false).await?;
|
||||
let wait_time = Duration::from_secs(30);
|
||||
match tokio::time::timeout(wait_time, tenant.wait_to_become_active()).await {
|
||||
Ok(Ok(())) => Ok(tenant),
|
||||
|
||||
@@ -1,150 +0,0 @@
|
||||
//! Wrapper around nix::sys::statvfs::Statvfs that allows for mocking.
|
||||
|
||||
use std::path::Path;
|
||||
|
||||
pub enum Statvfs {
|
||||
Real(nix::sys::statvfs::Statvfs),
|
||||
Mock(mock::Statvfs),
|
||||
}
|
||||
|
||||
// NB: on macOS, the block count type of struct statvfs is u32.
|
||||
// The workaround seems to be to use the non-standard statfs64 call.
|
||||
// Sincce it should only be a problem on > 2TiB disks, let's ignore
|
||||
// the problem for now and upcast to u64.
|
||||
impl Statvfs {
|
||||
pub fn get(tenants_dir: &Path, mocked: Option<&mock::Behavior>) -> nix::Result<Self> {
|
||||
if let Some(mocked) = mocked {
|
||||
Ok(Statvfs::Mock(mock::get(tenants_dir, mocked)?))
|
||||
} else {
|
||||
Ok(Statvfs::Real(nix::sys::statvfs::statvfs(tenants_dir)?))
|
||||
}
|
||||
}
|
||||
|
||||
// NB: allow() because the block count type is u32 on macOS.
|
||||
#[allow(clippy::useless_conversion)]
|
||||
pub fn blocks(&self) -> u64 {
|
||||
match self {
|
||||
Statvfs::Real(stat) => u64::try_from(stat.blocks()).unwrap(),
|
||||
Statvfs::Mock(stat) => stat.blocks,
|
||||
}
|
||||
}
|
||||
|
||||
// NB: allow() because the block count type is u32 on macOS.
|
||||
#[allow(clippy::useless_conversion)]
|
||||
pub fn blocks_available(&self) -> u64 {
|
||||
match self {
|
||||
Statvfs::Real(stat) => u64::try_from(stat.blocks_available()).unwrap(),
|
||||
Statvfs::Mock(stat) => stat.blocks_available,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn fragment_size(&self) -> u64 {
|
||||
match self {
|
||||
Statvfs::Real(stat) => stat.fragment_size(),
|
||||
Statvfs::Mock(stat) => stat.fragment_size,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn block_size(&self) -> u64 {
|
||||
match self {
|
||||
Statvfs::Real(stat) => stat.block_size(),
|
||||
Statvfs::Mock(stat) => stat.block_size,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub mod mock {
|
||||
use anyhow::Context;
|
||||
use regex::Regex;
|
||||
use std::path::Path;
|
||||
use tracing::log::info;
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
#[serde(tag = "type")]
|
||||
pub enum Behavior {
|
||||
Success {
|
||||
blocksize: u64,
|
||||
total_blocks: u64,
|
||||
name_filter: Option<utils::serde_regex::Regex>,
|
||||
},
|
||||
Failure {
|
||||
mocked_error: MockedError,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
#[allow(clippy::upper_case_acronyms)]
|
||||
pub enum MockedError {
|
||||
EIO,
|
||||
}
|
||||
|
||||
impl From<MockedError> for nix::Error {
|
||||
fn from(e: MockedError) -> Self {
|
||||
match e {
|
||||
MockedError::EIO => nix::Error::EIO,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get(tenants_dir: &Path, behavior: &Behavior) -> nix::Result<Statvfs> {
|
||||
info!("running mocked statvfs");
|
||||
|
||||
match behavior {
|
||||
Behavior::Success {
|
||||
blocksize,
|
||||
total_blocks,
|
||||
ref name_filter,
|
||||
} => {
|
||||
let used_bytes = walk_dir_disk_usage(tenants_dir, name_filter.as_deref()).unwrap();
|
||||
|
||||
// round it up to the nearest block multiple
|
||||
let used_blocks = (used_bytes + (blocksize - 1)) / blocksize;
|
||||
|
||||
if used_blocks > *total_blocks {
|
||||
panic!(
|
||||
"mocking error: used_blocks > total_blocks: {used_blocks} > {total_blocks}"
|
||||
);
|
||||
}
|
||||
|
||||
let avail_blocks = total_blocks - used_blocks;
|
||||
|
||||
Ok(Statvfs {
|
||||
blocks: *total_blocks,
|
||||
blocks_available: avail_blocks,
|
||||
fragment_size: *blocksize,
|
||||
block_size: *blocksize,
|
||||
})
|
||||
}
|
||||
Behavior::Failure { mocked_error } => Err((*mocked_error).into()),
|
||||
}
|
||||
}
|
||||
|
||||
fn walk_dir_disk_usage(path: &Path, name_filter: Option<&Regex>) -> anyhow::Result<u64> {
|
||||
let mut total = 0;
|
||||
for entry in walkdir::WalkDir::new(path) {
|
||||
let entry = entry?;
|
||||
if !entry.file_type().is_file() {
|
||||
continue;
|
||||
}
|
||||
if !name_filter
|
||||
.as_ref()
|
||||
.map(|filter| filter.is_match(entry.file_name().to_str().unwrap()))
|
||||
.unwrap_or(true)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
total += entry
|
||||
.metadata()
|
||||
.with_context(|| format!("get metadata of {:?}", entry.path()))?
|
||||
.len();
|
||||
}
|
||||
Ok(total)
|
||||
}
|
||||
|
||||
pub struct Statvfs {
|
||||
pub blocks: u64,
|
||||
pub blocks_available: u64,
|
||||
pub fragment_size: u64,
|
||||
pub block_size: u64,
|
||||
}
|
||||
}
|
||||
@@ -234,9 +234,6 @@ pub enum TaskKind {
|
||||
// Eviction. One per timeline.
|
||||
Eviction,
|
||||
|
||||
/// See [`crate::disk_usage_eviction_task`].
|
||||
DiskUsageEviction,
|
||||
|
||||
// Initial logical size calculation
|
||||
InitialLogicalSizeCalculation,
|
||||
|
||||
@@ -484,25 +481,13 @@ pub async fn shutdown_tasks(
|
||||
for task in victim_tasks {
|
||||
let join_handle = {
|
||||
let mut task_mut = task.mutable.lock().unwrap();
|
||||
task_mut.join_handle.take()
|
||||
info!("waiting for {} to shut down", task.name);
|
||||
let join_handle = task_mut.join_handle.take();
|
||||
drop(task_mut);
|
||||
join_handle
|
||||
};
|
||||
if let Some(mut join_handle) = join_handle {
|
||||
let completed = tokio::select! {
|
||||
_ = &mut join_handle => { true },
|
||||
_ = tokio::time::sleep(std::time::Duration::from_secs(1)) => {
|
||||
// allow some time to elapse before logging to cut down the number of log
|
||||
// lines.
|
||||
info!("waiting for {} to shut down", task.name);
|
||||
false
|
||||
}
|
||||
};
|
||||
if !completed {
|
||||
// we never handled this return value, but:
|
||||
// - we don't deschedule which would lead to is_cancelled
|
||||
// - panics are already logged (is_panicked)
|
||||
// - task errors are already logged in the wrapper
|
||||
let _ = join_handle.await;
|
||||
}
|
||||
if let Some(join_handle) = join_handle {
|
||||
let _ = join_handle.await;
|
||||
} else {
|
||||
// Possibly one of:
|
||||
// * The task had not even fully started yet.
|
||||
|
||||
@@ -94,7 +94,7 @@ mod timeline;
|
||||
|
||||
pub mod size;
|
||||
|
||||
pub use timeline::{LocalLayerInfoForDiskUsageEviction, PageReconstructError, Timeline};
|
||||
pub use timeline::{PageReconstructError, Timeline};
|
||||
|
||||
// re-export this function so that page_cache.rs can use it.
|
||||
pub use crate::tenant::ephemeral_file::writeback as writeback_ephemeral_file;
|
||||
@@ -431,16 +431,6 @@ remote:
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum DeleteTimelineError {
|
||||
#[error("NotFound")]
|
||||
NotFound,
|
||||
#[error("HasChildren")]
|
||||
HasChildren,
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
struct RemoteStartupData {
|
||||
index_part: IndexPart,
|
||||
remote_metadata: TimelineMetadata,
|
||||
@@ -488,7 +478,7 @@ impl Tenant {
|
||||
|
||||
let dummy_timeline = self.create_timeline_data(
|
||||
timeline_id,
|
||||
up_to_date_metadata,
|
||||
up_to_date_metadata.clone(),
|
||||
ancestor.clone(),
|
||||
remote_client,
|
||||
)?;
|
||||
@@ -513,7 +503,7 @@ impl Tenant {
|
||||
let broken_timeline = self
|
||||
.create_timeline_data(
|
||||
timeline_id,
|
||||
up_to_date_metadata,
|
||||
up_to_date_metadata.clone(),
|
||||
ancestor.clone(),
|
||||
None,
|
||||
)
|
||||
@@ -1152,7 +1142,7 @@ impl Tenant {
|
||||
);
|
||||
self.prepare_timeline(
|
||||
new_timeline_id,
|
||||
&new_metadata,
|
||||
new_metadata,
|
||||
timeline_uninit_mark,
|
||||
true,
|
||||
None,
|
||||
@@ -1317,7 +1307,7 @@ impl Tenant {
|
||||
&self,
|
||||
timeline_id: TimelineId,
|
||||
_ctx: &RequestContext,
|
||||
) -> Result<(), DeleteTimelineError> {
|
||||
) -> anyhow::Result<()> {
|
||||
// Transition the timeline into TimelineState::Stopping.
|
||||
// This should prevent new operations from starting.
|
||||
let timeline = {
|
||||
@@ -1329,13 +1319,13 @@ impl Tenant {
|
||||
.iter()
|
||||
.any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id));
|
||||
|
||||
if children_exist {
|
||||
return Err(DeleteTimelineError::HasChildren);
|
||||
}
|
||||
|
||||
anyhow::ensure!(
|
||||
!children_exist,
|
||||
"Cannot delete timeline which has child timelines"
|
||||
);
|
||||
let timeline_entry = match timelines.entry(timeline_id) {
|
||||
Entry::Occupied(e) => e,
|
||||
Entry::Vacant(_) => return Err(DeleteTimelineError::NotFound),
|
||||
Entry::Vacant(_) => bail!("timeline not found"),
|
||||
};
|
||||
|
||||
let timeline = Arc::clone(timeline_entry.get());
|
||||
@@ -1703,13 +1693,6 @@ impl Tenant {
|
||||
.unwrap_or(self.conf.default_tenant_conf.trace_read_requests)
|
||||
}
|
||||
|
||||
pub fn get_min_resident_size_override(&self) -> Option<u64> {
|
||||
let tenant_conf = self.tenant_conf.read().unwrap();
|
||||
tenant_conf
|
||||
.min_resident_size_override
|
||||
.or(self.conf.default_tenant_conf.min_resident_size_override)
|
||||
}
|
||||
|
||||
pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) {
|
||||
*self.tenant_conf.write().unwrap() = new_tenant_conf;
|
||||
}
|
||||
@@ -1717,7 +1700,7 @@ impl Tenant {
|
||||
fn create_timeline_data(
|
||||
&self,
|
||||
new_timeline_id: TimelineId,
|
||||
new_metadata: &TimelineMetadata,
|
||||
new_metadata: TimelineMetadata,
|
||||
ancestor: Option<Arc<Timeline>>,
|
||||
remote_client: Option<RemoteTimelineClient>,
|
||||
) -> anyhow::Result<Arc<Timeline>> {
|
||||
@@ -2177,25 +2160,13 @@ impl Tenant {
|
||||
let new_timeline = self
|
||||
.prepare_timeline(
|
||||
dst_id,
|
||||
&metadata,
|
||||
metadata,
|
||||
timeline_uninit_mark,
|
||||
false,
|
||||
Some(Arc::clone(src_timeline)),
|
||||
)?
|
||||
.initialize_with_lock(&mut timelines, true, true)?;
|
||||
drop(timelines);
|
||||
|
||||
// Root timeline gets its layers during creation and uploads them along with the metadata.
|
||||
// A branch timeline though, when created, can get no writes for some time, hence won't get any layers created.
|
||||
// We still need to upload its metadata eagerly: if other nodes `attach` the tenant and miss this timeline, their GC
|
||||
// could get incorrect information and remove more layers, than needed.
|
||||
// See also https://github.com/neondatabase/neon/issues/3865
|
||||
if let Some(remote_client) = new_timeline.remote_client.as_ref() {
|
||||
remote_client
|
||||
.schedule_index_upload_for_metadata_update(&metadata)
|
||||
.context("branch initial metadata upload")?;
|
||||
}
|
||||
|
||||
info!("branched timeline {dst_id} from {src_id} at {start_lsn}");
|
||||
|
||||
Ok(new_timeline)
|
||||
@@ -2258,7 +2229,7 @@ impl Tenant {
|
||||
pg_version,
|
||||
);
|
||||
let raw_timeline =
|
||||
self.prepare_timeline(timeline_id, &new_metadata, timeline_uninit_mark, true, None)?;
|
||||
self.prepare_timeline(timeline_id, new_metadata, timeline_uninit_mark, true, None)?;
|
||||
|
||||
let tenant_id = raw_timeline.owning_tenant.tenant_id;
|
||||
let unfinished_timeline = raw_timeline.raw_timeline()?;
|
||||
@@ -2312,7 +2283,7 @@ impl Tenant {
|
||||
fn prepare_timeline(
|
||||
&self,
|
||||
new_timeline_id: TimelineId,
|
||||
new_metadata: &TimelineMetadata,
|
||||
new_metadata: TimelineMetadata,
|
||||
uninit_mark: TimelineUninitMark,
|
||||
init_layers: bool,
|
||||
ancestor: Option<Arc<Timeline>>,
|
||||
@@ -2326,7 +2297,7 @@ impl Tenant {
|
||||
tenant_id,
|
||||
new_timeline_id,
|
||||
);
|
||||
remote_client.init_upload_queue_for_empty_remote(new_metadata)?;
|
||||
remote_client.init_upload_queue_for_empty_remote(&new_metadata)?;
|
||||
Some(remote_client)
|
||||
} else {
|
||||
None
|
||||
@@ -2365,12 +2336,17 @@ impl Tenant {
|
||||
&self,
|
||||
timeline_path: &Path,
|
||||
new_timeline_id: TimelineId,
|
||||
new_metadata: &TimelineMetadata,
|
||||
new_metadata: TimelineMetadata,
|
||||
ancestor: Option<Arc<Timeline>>,
|
||||
remote_client: Option<RemoteTimelineClient>,
|
||||
) -> anyhow::Result<Arc<Timeline>> {
|
||||
let timeline_data = self
|
||||
.create_timeline_data(new_timeline_id, new_metadata, ancestor, remote_client)
|
||||
.create_timeline_data(
|
||||
new_timeline_id,
|
||||
new_metadata.clone(),
|
||||
ancestor,
|
||||
remote_client,
|
||||
)
|
||||
.context("Failed to create timeline data structure")?;
|
||||
crashsafe::create_dir_all(timeline_path).context("Failed to create timeline directory")?;
|
||||
|
||||
@@ -2382,7 +2358,7 @@ impl Tenant {
|
||||
self.conf,
|
||||
new_timeline_id,
|
||||
self.tenant_id,
|
||||
new_metadata,
|
||||
&new_metadata,
|
||||
true,
|
||||
)
|
||||
.context("Failed to create timeline metadata")?;
|
||||
@@ -2786,7 +2762,6 @@ pub mod harness {
|
||||
max_lsn_wal_lag: Some(tenant_conf.max_lsn_wal_lag),
|
||||
trace_read_requests: Some(tenant_conf.trace_read_requests),
|
||||
eviction_policy: Some(tenant_conf.eviction_policy),
|
||||
min_resident_size_override: tenant_conf.min_resident_size_override,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -92,7 +92,6 @@ pub struct TenantConf {
|
||||
pub max_lsn_wal_lag: NonZeroU64,
|
||||
pub trace_read_requests: bool,
|
||||
pub eviction_policy: EvictionPolicy,
|
||||
pub min_resident_size_override: Option<u64>,
|
||||
}
|
||||
|
||||
/// Same as TenantConf, but this struct preserves the information about
|
||||
@@ -160,10 +159,6 @@ pub struct TenantConfOpt {
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
#[serde(default)]
|
||||
pub eviction_policy: Option<EvictionPolicy>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
#[serde(default)]
|
||||
pub min_resident_size_override: Option<u64>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
@@ -225,9 +220,48 @@ impl TenantConfOpt {
|
||||
.trace_read_requests
|
||||
.unwrap_or(global_conf.trace_read_requests),
|
||||
eviction_policy: self.eviction_policy.unwrap_or(global_conf.eviction_policy),
|
||||
min_resident_size_override: self
|
||||
.min_resident_size_override
|
||||
.or(global_conf.min_resident_size_override),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn update(&mut self, other: &TenantConfOpt) {
|
||||
if let Some(checkpoint_distance) = other.checkpoint_distance {
|
||||
self.checkpoint_distance = Some(checkpoint_distance);
|
||||
}
|
||||
if let Some(checkpoint_timeout) = other.checkpoint_timeout {
|
||||
self.checkpoint_timeout = Some(checkpoint_timeout);
|
||||
}
|
||||
if let Some(compaction_target_size) = other.compaction_target_size {
|
||||
self.compaction_target_size = Some(compaction_target_size);
|
||||
}
|
||||
if let Some(compaction_period) = other.compaction_period {
|
||||
self.compaction_period = Some(compaction_period);
|
||||
}
|
||||
if let Some(compaction_threshold) = other.compaction_threshold {
|
||||
self.compaction_threshold = Some(compaction_threshold);
|
||||
}
|
||||
if let Some(gc_horizon) = other.gc_horizon {
|
||||
self.gc_horizon = Some(gc_horizon);
|
||||
}
|
||||
if let Some(gc_period) = other.gc_period {
|
||||
self.gc_period = Some(gc_period);
|
||||
}
|
||||
if let Some(image_creation_threshold) = other.image_creation_threshold {
|
||||
self.image_creation_threshold = Some(image_creation_threshold);
|
||||
}
|
||||
if let Some(pitr_interval) = other.pitr_interval {
|
||||
self.pitr_interval = Some(pitr_interval);
|
||||
}
|
||||
if let Some(walreceiver_connect_timeout) = other.walreceiver_connect_timeout {
|
||||
self.walreceiver_connect_timeout = Some(walreceiver_connect_timeout);
|
||||
}
|
||||
if let Some(lagging_wal_timeout) = other.lagging_wal_timeout {
|
||||
self.lagging_wal_timeout = Some(lagging_wal_timeout);
|
||||
}
|
||||
if let Some(max_lsn_wal_lag) = other.max_lsn_wal_lag {
|
||||
self.max_lsn_wal_lag = Some(max_lsn_wal_lag);
|
||||
}
|
||||
if let Some(trace_read_requests) = other.trace_read_requests {
|
||||
self.trace_read_requests = Some(trace_read_requests);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -259,7 +293,6 @@ impl Default for TenantConf {
|
||||
.expect("cannot parse default max walreceiver Lsn wal lag"),
|
||||
trace_read_requests: false,
|
||||
eviction_policy: EvictionPolicy::NoEviction,
|
||||
min_resident_size_override: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -289,7 +289,7 @@ pub async fn set_new_tenant_config(
|
||||
conf: &'static PageServerConf,
|
||||
new_tenant_conf: TenantConfOpt,
|
||||
tenant_id: TenantId,
|
||||
) -> Result<(), TenantStateError> {
|
||||
) -> anyhow::Result<()> {
|
||||
info!("configuring tenant {tenant_id}");
|
||||
let tenant = get_tenant(tenant_id, true).await?;
|
||||
|
||||
@@ -306,84 +306,50 @@ pub async fn set_new_tenant_config(
|
||||
|
||||
/// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query.
|
||||
/// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants.
|
||||
pub async fn get_tenant(
|
||||
tenant_id: TenantId,
|
||||
active_only: bool,
|
||||
) -> Result<Arc<Tenant>, TenantStateError> {
|
||||
pub async fn get_tenant(tenant_id: TenantId, active_only: bool) -> anyhow::Result<Arc<Tenant>> {
|
||||
let m = TENANTS.read().await;
|
||||
let tenant = m
|
||||
.get(&tenant_id)
|
||||
.ok_or(TenantStateError::NotFound(tenant_id))?;
|
||||
.with_context(|| format!("Tenant {tenant_id} not found in the local state"))?;
|
||||
if active_only && !tenant.is_active() {
|
||||
Err(TenantStateError::NotActive(tenant_id))
|
||||
anyhow::bail!(
|
||||
"Tenant {tenant_id} is not active. Current state: {:?}",
|
||||
tenant.current_state()
|
||||
)
|
||||
} else {
|
||||
Ok(Arc::clone(tenant))
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum DeleteTimelineError {
|
||||
#[error("Tenant {0}")]
|
||||
Tenant(#[from] TenantStateError),
|
||||
|
||||
#[error("Timeline {0}")]
|
||||
Timeline(#[from] crate::tenant::DeleteTimelineError),
|
||||
}
|
||||
|
||||
pub async fn delete_timeline(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), DeleteTimelineError> {
|
||||
let tenant = get_tenant(tenant_id, true).await?;
|
||||
tenant.delete_timeline(timeline_id, ctx).await?;
|
||||
Ok(())
|
||||
}
|
||||
) -> anyhow::Result<()> {
|
||||
match get_tenant(tenant_id, true).await {
|
||||
Ok(tenant) => {
|
||||
tenant.delete_timeline(timeline_id, ctx).await?;
|
||||
}
|
||||
Err(e) => anyhow::bail!("Cannot access tenant {tenant_id} in local tenant state: {e:?}"),
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum TenantStateError {
|
||||
#[error("Tenant {0} not found")]
|
||||
NotFound(TenantId),
|
||||
#[error("Tenant {0} is stopping")]
|
||||
IsStopping(TenantId),
|
||||
#[error("Tenant {0} is not active")]
|
||||
NotActive(TenantId),
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn detach_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
detach_ignored: bool,
|
||||
) -> Result<(), TenantStateError> {
|
||||
let local_files_cleanup_operation = |tenant_id_to_clean| async move {
|
||||
let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean);
|
||||
) -> anyhow::Result<()> {
|
||||
remove_tenant_from_memory(tenant_id, async {
|
||||
let local_tenant_directory = conf.tenant_path(&tenant_id);
|
||||
fs::remove_dir_all(&local_tenant_directory)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("local tenant directory {local_tenant_directory:?} removal")
|
||||
format!("Failed to remove local tenant directory {local_tenant_directory:?}")
|
||||
})?;
|
||||
Ok(())
|
||||
};
|
||||
|
||||
let removal_result =
|
||||
remove_tenant_from_memory(tenant_id, local_files_cleanup_operation(tenant_id)).await;
|
||||
|
||||
// Ignored tenants are not present in memory and will bail the removal from memory operation.
|
||||
// Before returning the error, check for ignored tenant removal case — we only need to clean its local files then.
|
||||
if detach_ignored && matches!(removal_result, Err(TenantStateError::NotFound(_))) {
|
||||
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(tenant_id);
|
||||
if tenant_ignore_mark.exists() {
|
||||
info!("Detaching an ignored tenant");
|
||||
local_files_cleanup_operation(tenant_id)
|
||||
.await
|
||||
.with_context(|| format!("Ignored tenant {tenant_id} local files cleanup"))?;
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
|
||||
removal_result
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
pub async fn load_tenant(
|
||||
@@ -413,7 +379,7 @@ pub async fn load_tenant(
|
||||
pub async fn ignore_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
) -> Result<(), TenantStateError> {
|
||||
) -> anyhow::Result<()> {
|
||||
remove_tenant_from_memory(tenant_id, async {
|
||||
let ignore_mark_file = conf.tenant_ignore_mark_file_path(tenant_id);
|
||||
fs::File::create(&ignore_mark_file)
|
||||
@@ -523,7 +489,7 @@ where
|
||||
async fn remove_tenant_from_memory<V, F>(
|
||||
tenant_id: TenantId,
|
||||
tenant_cleanup: F,
|
||||
) -> Result<V, TenantStateError>
|
||||
) -> anyhow::Result<V>
|
||||
where
|
||||
F: std::future::Future<Output = anyhow::Result<V>>,
|
||||
{
|
||||
@@ -539,9 +505,11 @@ where
|
||||
| TenantState::Loading
|
||||
| TenantState::Broken
|
||||
| TenantState::Active => tenant.set_stopping(),
|
||||
TenantState::Stopping => return Err(TenantStateError::IsStopping(tenant_id)),
|
||||
TenantState::Stopping => {
|
||||
anyhow::bail!("Tenant {tenant_id} is stopping already")
|
||||
}
|
||||
},
|
||||
None => return Err(TenantStateError::NotFound(tenant_id)),
|
||||
None => anyhow::bail!("Tenant not found for id {tenant_id}"),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -564,15 +532,10 @@ where
|
||||
Err(e) => {
|
||||
let tenants_accessor = TENANTS.read().await;
|
||||
match tenants_accessor.get(&tenant_id) {
|
||||
Some(tenant) => {
|
||||
tenant.set_broken(&e.to_string());
|
||||
}
|
||||
None => {
|
||||
warn!("Tenant {tenant_id} got removed from memory");
|
||||
return Err(TenantStateError::NotFound(tenant_id));
|
||||
}
|
||||
Some(tenant) => tenant.set_broken(&e.to_string()),
|
||||
None => warn!("Tenant {tenant_id} got removed from memory"),
|
||||
}
|
||||
Err(TenantStateError::Other(e))
|
||||
Err(e)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -592,7 +555,7 @@ pub async fn immediate_gc(
|
||||
let tenant = guard
|
||||
.get(&tenant_id)
|
||||
.map(Arc::clone)
|
||||
.with_context(|| format!("tenant {tenant_id}"))
|
||||
.with_context(|| format!("Tenant {tenant_id} not found"))
|
||||
.map_err(ApiError::NotFound)?;
|
||||
|
||||
let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
|
||||
@@ -642,7 +605,7 @@ pub async fn immediate_compact(
|
||||
let tenant = guard
|
||||
.get(&tenant_id)
|
||||
.map(Arc::clone)
|
||||
.with_context(|| format!("tenant {tenant_id}"))
|
||||
.with_context(|| format!("Tenant {tenant_id} not found"))
|
||||
.map_err(ApiError::NotFound)?;
|
||||
|
||||
let timeline = tenant
|
||||
|
||||
@@ -210,6 +210,7 @@ pub use download::{is_temp_download_file, list_remote_timelines};
|
||||
use std::sync::atomic::{AtomicU32, Ordering};
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
use anyhow::ensure;
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage};
|
||||
use std::ops::DerefMut;
|
||||
use tokio::runtime::Runtime;
|
||||
@@ -346,7 +347,7 @@ impl RemoteTimelineClient {
|
||||
.layer_metadata
|
||||
.values()
|
||||
// If we don't have the file size for the layer, don't account for it in the metric.
|
||||
.map(|ilmd| ilmd.file_size)
|
||||
.map(|ilmd| ilmd.file_size.unwrap_or(0))
|
||||
.sum()
|
||||
} else {
|
||||
0
|
||||
@@ -419,6 +420,34 @@ impl RemoteTimelineClient {
|
||||
.await?
|
||||
};
|
||||
|
||||
// Update the metadata for given layer file. The remote index file
|
||||
// might be missing some information for the file; this allows us
|
||||
// to fill in the missing details.
|
||||
if layer_metadata.file_size().is_none() {
|
||||
let new_metadata = LayerFileMetadata::new(downloaded_size);
|
||||
let mut guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut()?;
|
||||
if let Some(upgraded) = upload_queue.latest_files.get_mut(layer_file_name) {
|
||||
if upgraded.merge(&new_metadata) {
|
||||
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
|
||||
}
|
||||
// If we don't do an index file upload inbetween here and restart,
|
||||
// the value will go back down after pageserver restart, since we will
|
||||
// have lost this data point.
|
||||
// But, we upload index part fairly frequently, and restart pageserver rarely.
|
||||
// So, by accounting eagerly, we present a most-of-the-time-more-accurate value sooner.
|
||||
self.metrics
|
||||
.remote_physical_size_gauge()
|
||||
.add(downloaded_size);
|
||||
} else {
|
||||
// The file should exist, since we just downloaded it.
|
||||
warn!(
|
||||
"downloaded file {:?} not found in local copy of the index file",
|
||||
layer_file_name
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
REMOTE_ONDEMAND_DOWNLOADED_LAYERS.inc();
|
||||
REMOTE_ONDEMAND_DOWNLOADED_BYTES.inc_by(downloaded_size);
|
||||
|
||||
@@ -521,6 +550,13 @@ impl RemoteTimelineClient {
|
||||
let mut guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut()?;
|
||||
|
||||
// The file size can be missing for files that were created before we tracked that
|
||||
// in the metadata, but it should be present for any new files we create.
|
||||
ensure!(
|
||||
layer_metadata.file_size().is_some(),
|
||||
"file size not initialized in metadata"
|
||||
);
|
||||
|
||||
upload_queue
|
||||
.latest_files
|
||||
.insert(layer_file_name.clone(), layer_metadata.clone());
|
||||
|
||||
@@ -21,7 +21,7 @@ use remote_storage::{DownloadError, GenericRemoteStorage};
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use super::index::{IndexPart, LayerFileMetadata};
|
||||
use super::index::{IndexPart, IndexPartUnclean, LayerFileMetadata};
|
||||
use super::{FAILED_DOWNLOAD_RETRIES, FAILED_DOWNLOAD_WARN_THRESHOLD};
|
||||
|
||||
async fn fsync_path(path: impl AsRef<std::path::Path>) -> Result<(), std::io::Error> {
|
||||
@@ -113,11 +113,16 @@ pub async fn download_layer_file<'a>(
|
||||
})
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let expected = layer_metadata.file_size();
|
||||
if expected != bytes_amount {
|
||||
return Err(DownloadError::Other(anyhow!(
|
||||
"According to layer file metadata should have downloaded {expected} bytes but downloaded {bytes_amount} bytes into file {temp_file_path:?}",
|
||||
)));
|
||||
match layer_metadata.file_size() {
|
||||
Some(expected) if expected != bytes_amount => {
|
||||
return Err(DownloadError::Other(anyhow!(
|
||||
"According to layer file metadata should have downloaded {expected} bytes but downloaded {bytes_amount} bytes into file '{}'",
|
||||
temp_file_path.display()
|
||||
)));
|
||||
}
|
||||
Some(_) | None => {
|
||||
// matches, or upgrading from an earlier IndexPart version
|
||||
}
|
||||
}
|
||||
|
||||
// not using sync_data because it can lose file size update
|
||||
@@ -256,12 +261,14 @@ pub(super) async fn download_index_part(
|
||||
)
|
||||
.await?;
|
||||
|
||||
let index_part: IndexPart = serde_json::from_slice(&index_part_bytes)
|
||||
let index_part: IndexPartUnclean = serde_json::from_slice(&index_part_bytes)
|
||||
.with_context(|| {
|
||||
format!("Failed to deserialize index part file into file {index_part_path:?}")
|
||||
})
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let index_part = index_part.remove_unclean_layer_file_names();
|
||||
|
||||
Ok(index_part)
|
||||
}
|
||||
|
||||
|
||||
@@ -6,6 +6,7 @@ use std::collections::{HashMap, HashSet};
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
use tracing::warn;
|
||||
|
||||
use crate::tenant::metadata::TimelineMetadata;
|
||||
use crate::tenant::storage_layer::LayerFileName;
|
||||
@@ -19,7 +20,7 @@ use utils::lsn::Lsn;
|
||||
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
|
||||
#[cfg_attr(test, derive(Default))]
|
||||
pub struct LayerFileMetadata {
|
||||
file_size: u64,
|
||||
file_size: Option<u64>,
|
||||
}
|
||||
|
||||
impl From<&'_ IndexLayerMetadata> for LayerFileMetadata {
|
||||
@@ -32,16 +33,36 @@ impl From<&'_ IndexLayerMetadata> for LayerFileMetadata {
|
||||
|
||||
impl LayerFileMetadata {
|
||||
pub fn new(file_size: u64) -> Self {
|
||||
LayerFileMetadata { file_size }
|
||||
LayerFileMetadata {
|
||||
file_size: Some(file_size),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn file_size(&self) -> u64 {
|
||||
/// This is used to initialize the metadata for remote layers, for which
|
||||
/// the metadata was missing from the index part file.
|
||||
pub const MISSING: Self = LayerFileMetadata { file_size: None };
|
||||
|
||||
pub fn file_size(&self) -> Option<u64> {
|
||||
self.file_size
|
||||
}
|
||||
|
||||
/// Metadata has holes due to version upgrades. This method is called to upgrade self with the
|
||||
/// other value.
|
||||
///
|
||||
/// This is called on the possibly outdated version. Returns true if any changes
|
||||
/// were made.
|
||||
pub fn merge(&mut self, other: &Self) -> bool {
|
||||
let mut changed = false;
|
||||
|
||||
if self.file_size != other.file_size {
|
||||
self.file_size = other.file_size.or(self.file_size);
|
||||
changed = true;
|
||||
}
|
||||
|
||||
changed
|
||||
}
|
||||
}
|
||||
|
||||
// TODO seems like another part of the remote storage file format
|
||||
// compatibility issue, see https://github.com/neondatabase/neon/issues/3072
|
||||
/// In-memory representation of an `index_part.json` file
|
||||
///
|
||||
/// Contains the data about all files in the timeline, present remotely and its metadata.
|
||||
@@ -50,7 +71,10 @@ impl LayerFileMetadata {
|
||||
/// remember to add a test case for the changed version.
|
||||
#[serde_as]
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
|
||||
pub struct IndexPart {
|
||||
pub struct IndexPartImpl<L>
|
||||
where
|
||||
L: std::hash::Hash + PartialEq + Eq,
|
||||
{
|
||||
/// Debugging aid describing the version of this type.
|
||||
#[serde(default)]
|
||||
version: usize,
|
||||
@@ -58,13 +82,14 @@ pub struct IndexPart {
|
||||
/// Layer names, which are stored on the remote storage.
|
||||
///
|
||||
/// Additional metadata can might exist in `layer_metadata`.
|
||||
pub timeline_layers: HashSet<LayerFileName>,
|
||||
pub timeline_layers: HashSet<L>,
|
||||
|
||||
/// Per layer file name metadata, which can be present for a present or missing layer file.
|
||||
///
|
||||
/// Older versions of `IndexPart` will not have this property or have only a part of metadata
|
||||
/// that latest version stores.
|
||||
pub layer_metadata: HashMap<LayerFileName, IndexLayerMetadata>,
|
||||
#[serde(default = "HashMap::default")]
|
||||
pub layer_metadata: HashMap<L, IndexLayerMetadata>,
|
||||
|
||||
// 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata.
|
||||
// It's duplicated here for convenience.
|
||||
@@ -73,6 +98,101 @@ pub struct IndexPart {
|
||||
metadata_bytes: Vec<u8>,
|
||||
}
|
||||
|
||||
// TODO seems like another part of the remote storage file format
|
||||
// compatibility issue, see https://github.com/neondatabase/neon/issues/3072
|
||||
pub type IndexPart = IndexPartImpl<LayerFileName>;
|
||||
|
||||
pub type IndexPartUnclean = IndexPartImpl<UncleanLayerFileName>;
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Hash, Clone)]
|
||||
pub enum UncleanLayerFileName {
|
||||
Clean(LayerFileName),
|
||||
BackupFile(String),
|
||||
}
|
||||
|
||||
impl<'de> serde::Deserialize<'de> for UncleanLayerFileName {
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where
|
||||
D: serde::Deserializer<'de>,
|
||||
{
|
||||
deserializer.deserialize_string(UncleanLayerFileNameVisitor)
|
||||
}
|
||||
}
|
||||
|
||||
struct UncleanLayerFileNameVisitor;
|
||||
|
||||
impl<'de> serde::de::Visitor<'de> for UncleanLayerFileNameVisitor {
|
||||
type Value = UncleanLayerFileName;
|
||||
|
||||
fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
write!(
|
||||
formatter,
|
||||
"a string that is a valid LayerFileName or '.old' backup file name"
|
||||
)
|
||||
}
|
||||
|
||||
fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
|
||||
where
|
||||
E: serde::de::Error,
|
||||
{
|
||||
let maybe_clean: Result<LayerFileName, _> = v.parse();
|
||||
match maybe_clean {
|
||||
Ok(clean) => Ok(UncleanLayerFileName::Clean(clean)),
|
||||
Err(e) => {
|
||||
if v.ends_with(".old") || v == "metadata_backup" {
|
||||
Ok(UncleanLayerFileName::BackupFile(v.to_owned()))
|
||||
} else {
|
||||
Err(E::custom(e))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl UncleanLayerFileName {
|
||||
fn into_clean(self) -> Option<LayerFileName> {
|
||||
match self {
|
||||
UncleanLayerFileName::Clean(clean) => Some(clean),
|
||||
UncleanLayerFileName::BackupFile(_) => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl IndexPartUnclean {
|
||||
pub fn remove_unclean_layer_file_names(self) -> IndexPart {
|
||||
let IndexPartUnclean {
|
||||
version,
|
||||
timeline_layers,
|
||||
layer_metadata,
|
||||
disk_consistent_lsn,
|
||||
metadata_bytes,
|
||||
} = self;
|
||||
|
||||
IndexPart {
|
||||
version,
|
||||
timeline_layers: timeline_layers
|
||||
.into_iter()
|
||||
.filter_map(|unclean_file_name| match unclean_file_name {
|
||||
UncleanLayerFileName::Clean(clean_name) => Some(clean_name),
|
||||
UncleanLayerFileName::BackupFile(backup_file_name) => {
|
||||
// For details see https://github.com/neondatabase/neon/issues/3024
|
||||
warn!(
|
||||
"got backup file on the remote storage, ignoring it {backup_file_name}"
|
||||
);
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect(),
|
||||
layer_metadata: layer_metadata
|
||||
.into_iter()
|
||||
.filter_map(|(l, m)| l.into_clean().map(|l| (l, m)))
|
||||
.collect(),
|
||||
disk_consistent_lsn,
|
||||
metadata_bytes,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl IndexPart {
|
||||
/// When adding or modifying any parts of `IndexPart`, increment the version so that it can be
|
||||
/// used to understand later versions.
|
||||
@@ -112,7 +232,7 @@ impl IndexPart {
|
||||
/// Serialized form of [`LayerFileMetadata`].
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Default)]
|
||||
pub struct IndexLayerMetadata {
|
||||
pub(super) file_size: u64,
|
||||
pub(super) file_size: Option<u64>,
|
||||
}
|
||||
|
||||
impl From<&'_ LayerFileMetadata> for IndexLayerMetadata {
|
||||
@@ -127,6 +247,27 @@ impl From<&'_ LayerFileMetadata> for IndexLayerMetadata {
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn v0_indexpart_is_parsed() {
|
||||
let example = r#"{
|
||||
"timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"],
|
||||
"disk_consistent_lsn":"0/16960E8",
|
||||
"metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
|
||||
}"#;
|
||||
|
||||
let expected = IndexPart {
|
||||
version: 0,
|
||||
timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]),
|
||||
layer_metadata: HashMap::default(),
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
metadata_bytes: [113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
|
||||
};
|
||||
|
||||
let part: IndexPartUnclean = serde_json::from_str(example).unwrap();
|
||||
let part = part.remove_unclean_layer_file_names();
|
||||
assert_eq!(part, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn v1_indexpart_is_parsed() {
|
||||
let example = r#"{
|
||||
@@ -146,19 +287,21 @@ mod tests {
|
||||
timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]),
|
||||
layer_metadata: HashMap::from([
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
||||
file_size: 25600000,
|
||||
file_size: Some(25600000),
|
||||
}),
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
|
||||
// serde_json should always parse this but this might be a double with jq for
|
||||
// example.
|
||||
file_size: 9007199254741001,
|
||||
file_size: Some(9007199254741001),
|
||||
})
|
||||
]),
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
metadata_bytes: [113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
|
||||
};
|
||||
|
||||
let part = serde_json::from_str::<IndexPart>(example).unwrap();
|
||||
let part = serde_json::from_str::<IndexPartUnclean>(example)
|
||||
.unwrap()
|
||||
.remove_unclean_layer_file_names();
|
||||
assert_eq!(part, expected);
|
||||
}
|
||||
|
||||
@@ -182,64 +325,20 @@ mod tests {
|
||||
timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]),
|
||||
layer_metadata: HashMap::from([
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
||||
file_size: 25600000,
|
||||
file_size: Some(25600000),
|
||||
}),
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
|
||||
// serde_json should always parse this but this might be a double with jq for
|
||||
// example.
|
||||
file_size: 9007199254741001,
|
||||
file_size: Some(9007199254741001),
|
||||
})
|
||||
]),
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
metadata_bytes: [112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
|
||||
};
|
||||
|
||||
let part = serde_json::from_str::<IndexPart>(example).unwrap();
|
||||
let part = serde_json::from_str::<IndexPartUnclean>(example).unwrap();
|
||||
let part = part.remove_unclean_layer_file_names();
|
||||
assert_eq!(part, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn empty_layers_are_parsed() {
|
||||
let empty_layers_json = r#"{
|
||||
"version":1,
|
||||
"timeline_layers":[],
|
||||
"layer_metadata":{},
|
||||
"disk_consistent_lsn":"0/2532648",
|
||||
"metadata_bytes":[136,151,49,208,0,70,0,4,0,0,0,0,2,83,38,72,1,0,0,0,0,2,83,38,32,1,87,198,240,135,97,119,45,125,38,29,155,161,140,141,255,210,0,0,0,0,2,83,38,72,0,0,0,0,1,73,240,192,0,0,0,0,1,73,240,192,0,0,0,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
|
||||
}"#;
|
||||
|
||||
let expected = IndexPart {
|
||||
version: 1,
|
||||
timeline_layers: HashSet::new(),
|
||||
layer_metadata: HashMap::new(),
|
||||
disk_consistent_lsn: "0/2532648".parse::<Lsn>().unwrap(),
|
||||
metadata_bytes: [
|
||||
136, 151, 49, 208, 0, 70, 0, 4, 0, 0, 0, 0, 2, 83, 38, 72, 1, 0, 0, 0, 0, 2, 83,
|
||||
38, 32, 1, 87, 198, 240, 135, 97, 119, 45, 125, 38, 29, 155, 161, 140, 141, 255,
|
||||
210, 0, 0, 0, 0, 2, 83, 38, 72, 0, 0, 0, 0, 1, 73, 240, 192, 0, 0, 0, 0, 1, 73,
|
||||
240, 192, 0, 0, 0, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0,
|
||||
]
|
||||
.to_vec(),
|
||||
};
|
||||
|
||||
let empty_layers_parsed = serde_json::from_str::<IndexPart>(empty_layers_json).unwrap();
|
||||
|
||||
assert_eq!(empty_layers_parsed, expected);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -64,9 +64,13 @@ pub(super) async fn upload_timeline_layer<'a>(
|
||||
})?
|
||||
.len();
|
||||
|
||||
let metadata_size = known_metadata.file_size();
|
||||
if metadata_size != fs_size {
|
||||
bail!("File {source_path:?} has its current FS size {fs_size} diferent from initially determined {metadata_size}");
|
||||
// FIXME: this looks bad
|
||||
if let Some(metadata_size) = known_metadata.file_size() {
|
||||
if metadata_size != fs_size {
|
||||
bail!("File {source_path:?} has its current FS size {fs_size} diferent from initially determined {metadata_size}");
|
||||
}
|
||||
} else {
|
||||
// this is a silly state we would like to avoid
|
||||
}
|
||||
|
||||
let fs_size = usize::try_from(fs_size).with_context(|| {
|
||||
|
||||
@@ -121,10 +121,10 @@ struct LayerAccessStatsInner {
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub(crate) struct LayerAccessStatFullDetails {
|
||||
pub(crate) when: SystemTime,
|
||||
pub(crate) task_kind: TaskKind,
|
||||
pub(crate) access_kind: LayerAccessKind,
|
||||
pub(super) struct LayerAccessStatFullDetails {
|
||||
pub(super) when: SystemTime,
|
||||
pub(super) task_kind: TaskKind,
|
||||
pub(super) access_kind: LayerAccessKind,
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, strum_macros::EnumString)]
|
||||
@@ -255,7 +255,7 @@ impl LayerAccessStats {
|
||||
ret
|
||||
}
|
||||
|
||||
fn most_recent_access_or_residence_event(
|
||||
pub(super) fn most_recent_access_or_residence_event(
|
||||
&self,
|
||||
) -> Either<LayerAccessStatFullDetails, LayerResidenceEvent> {
|
||||
let locked = self.0.lock().unwrap();
|
||||
@@ -268,13 +268,6 @@ impl LayerAccessStats {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn latest_activity(&self) -> SystemTime {
|
||||
match self.most_recent_access_or_residence_event() {
|
||||
Either::Left(mra) => mra.when,
|
||||
Either::Right(re) => re.timestamp,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Supertrait of the [`Layer`] trait that captures the bare minimum interface
|
||||
@@ -385,7 +378,7 @@ pub trait PersistentLayer: Layer {
|
||||
///
|
||||
/// Should not change over the lifetime of the layer object because
|
||||
/// current_physical_size is computed as the som of this value.
|
||||
fn file_size(&self) -> u64;
|
||||
fn file_size(&self) -> Option<u64>;
|
||||
|
||||
fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo;
|
||||
|
||||
|
||||
@@ -444,8 +444,8 @@ impl PersistentLayer for DeltaLayer {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn file_size(&self) -> u64 {
|
||||
self.file_size
|
||||
fn file_size(&self) -> Option<u64> {
|
||||
Some(self.file_size)
|
||||
}
|
||||
|
||||
fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
|
||||
@@ -456,7 +456,7 @@ impl PersistentLayer for DeltaLayer {
|
||||
|
||||
HistoricLayerInfo::Delta {
|
||||
layer_file_name,
|
||||
layer_file_size: self.file_size,
|
||||
layer_file_size: Some(self.file_size),
|
||||
lsn_start: lsn_range.start,
|
||||
lsn_end: lsn_range.end,
|
||||
remote: false,
|
||||
|
||||
@@ -258,15 +258,6 @@ impl serde::Serialize for LayerFileName {
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de> serde::Deserialize<'de> for LayerFileName {
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where
|
||||
D: serde::Deserializer<'de>,
|
||||
{
|
||||
deserializer.deserialize_string(LayerFileNameVisitor)
|
||||
}
|
||||
}
|
||||
|
||||
struct LayerFileNameVisitor;
|
||||
|
||||
impl<'de> serde::de::Visitor<'de> for LayerFileNameVisitor {
|
||||
|
||||
@@ -258,8 +258,8 @@ impl PersistentLayer for ImageLayer {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn file_size(&self) -> u64 {
|
||||
self.file_size
|
||||
fn file_size(&self) -> Option<u64> {
|
||||
Some(self.file_size)
|
||||
}
|
||||
|
||||
fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
|
||||
@@ -268,7 +268,7 @@ impl PersistentLayer for ImageLayer {
|
||||
|
||||
HistoricLayerInfo::Image {
|
||||
layer_file_name,
|
||||
layer_file_size: self.file_size,
|
||||
layer_file_size: Some(self.file_size),
|
||||
lsn_start: lsn_range.start,
|
||||
remote: false,
|
||||
access_stats: self.access_stats.as_api_model(reset),
|
||||
|
||||
@@ -167,7 +167,7 @@ impl PersistentLayer for RemoteLayer {
|
||||
true
|
||||
}
|
||||
|
||||
fn file_size(&self) -> u64 {
|
||||
fn file_size(&self) -> Option<u64> {
|
||||
self.layer_metadata.file_size()
|
||||
}
|
||||
|
||||
|
||||
@@ -244,12 +244,14 @@ pub(crate) async fn random_init_delay(
|
||||
) -> Result<(), Cancelled> {
|
||||
use rand::Rng;
|
||||
|
||||
if period == Duration::ZERO {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let d = {
|
||||
let mut rng = rand::thread_rng();
|
||||
|
||||
// gen_range asserts that the range cannot be empty, which it could be because period can
|
||||
// be set to zero to disable gc or compaction, so lets set it to be at least 10s.
|
||||
let period = std::cmp::max(period, Duration::from_secs(10));
|
||||
|
||||
// semi-ok default as the source of jitter
|
||||
rng.gen_range(Duration::ZERO..=period)
|
||||
};
|
||||
|
||||
|
||||
@@ -13,7 +13,6 @@ use pageserver_api::models::{
|
||||
DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest,
|
||||
DownloadRemoteLayersTaskState, LayerMapInfo, LayerResidenceStatus, TimelineState,
|
||||
};
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use tokio::sync::{oneshot, watch, Semaphore, TryAcquireError};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
@@ -72,8 +71,6 @@ use crate::ZERO_PAGE;
|
||||
use crate::{is_temporary, task_mgr};
|
||||
use walreceiver::spawn_connection_manager_task;
|
||||
|
||||
use self::eviction_task::EvictionTaskTimelineState;
|
||||
|
||||
use super::layer_map::BatchedUpdates;
|
||||
use super::remote_timeline_client::index::IndexPart;
|
||||
use super::remote_timeline_client::RemoteTimelineClient;
|
||||
@@ -219,8 +216,6 @@ pub struct Timeline {
|
||||
download_all_remote_layers_task_info: RwLock<Option<DownloadRemoteLayersTaskInfo>>,
|
||||
|
||||
state: watch::Sender<TimelineState>,
|
||||
|
||||
eviction_task_timeline_state: tokio::sync::Mutex<EvictionTaskTimelineState>,
|
||||
}
|
||||
|
||||
/// Internal structure to hold all data needed for logical size calculation.
|
||||
@@ -333,12 +328,27 @@ impl LogicalSize {
|
||||
.fetch_add(delta, AtomicOrdering::SeqCst);
|
||||
}
|
||||
|
||||
/// Make the value computed by initial logical size computation
|
||||
/// available for re-use. This doesn't contain the incremental part.
|
||||
fn initialized_size(&self, lsn: Lsn) -> Option<u64> {
|
||||
match self.initial_part_end {
|
||||
Some(v) if v == lsn => self.initial_logical_size.get().copied(),
|
||||
_ => None,
|
||||
/// Returns the initialized (already calculated) value, if any.
|
||||
fn initialized_size(&self) -> Option<u64> {
|
||||
self.initial_logical_size.get().copied()
|
||||
}
|
||||
}
|
||||
|
||||
/// Returned by [`Timeline::layer_size_sum`]
|
||||
pub enum LayerSizeSum {
|
||||
/// The result is accurate.
|
||||
Accurate(u64),
|
||||
// We don't know the layer file size of one or more layers.
|
||||
// They contribute to the sum with a value of 0.
|
||||
// Hence, the sum is a lower bound for the actualy layer file size sum.
|
||||
ApproximateLowerBound(u64),
|
||||
}
|
||||
|
||||
impl LayerSizeSum {
|
||||
pub fn approximate_is_ok(self) -> u64 {
|
||||
match self {
|
||||
LayerSizeSum::Accurate(v) => v,
|
||||
LayerSizeSum::ApproximateLowerBound(v) => v,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -540,13 +550,20 @@ impl Timeline {
|
||||
/// The sum of the file size of all historic layers in the layer map.
|
||||
/// This method makes no distinction between local and remote layers.
|
||||
/// Hence, the result **does not represent local filesystem usage**.
|
||||
pub fn layer_size_sum(&self) -> u64 {
|
||||
pub fn layer_size_sum(&self) -> LayerSizeSum {
|
||||
let layer_map = self.layers.read().unwrap();
|
||||
let mut size = 0;
|
||||
let mut no_size_cnt = 0;
|
||||
for l in layer_map.iter_historic_layers() {
|
||||
size += l.file_size();
|
||||
let (l_size, l_no_size) = l.file_size().map(|s| (s, 0)).unwrap_or((0, 1));
|
||||
size += l_size;
|
||||
no_size_cnt += l_no_size;
|
||||
}
|
||||
if no_size_cnt == 0 {
|
||||
LayerSizeSum::Accurate(size)
|
||||
} else {
|
||||
LayerSizeSum::ApproximateLowerBound(size)
|
||||
}
|
||||
size
|
||||
}
|
||||
|
||||
pub fn get_resident_physical_size(&self) -> u64 {
|
||||
@@ -815,11 +832,11 @@ impl Timeline {
|
||||
|
||||
let mut is_exact = true;
|
||||
let size = current_size.size();
|
||||
if let (CurrentLogicalSize::Approximate(_), Some(initial_part_end)) =
|
||||
if let (CurrentLogicalSize::Approximate(_), Some(init_lsn)) =
|
||||
(current_size, self.current_logical_size.initial_part_end)
|
||||
{
|
||||
is_exact = false;
|
||||
self.try_spawn_size_init_task(initial_part_end, ctx);
|
||||
self.try_spawn_size_init_task(init_lsn, ctx);
|
||||
}
|
||||
|
||||
Ok((size, is_exact))
|
||||
@@ -957,25 +974,6 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
/// Evict a batch of layers.
|
||||
///
|
||||
/// GenericRemoteStorage reference is required as a witness[^witness_article] for "remote storage is configured."
|
||||
///
|
||||
/// [^witness_article]: https://willcrichton.net/rust-api-type-patterns/witnesses.html
|
||||
pub async fn evict_layers(
|
||||
&self,
|
||||
_: &GenericRemoteStorage,
|
||||
layers_to_evict: &[Arc<dyn PersistentLayer>],
|
||||
cancel: CancellationToken,
|
||||
) -> anyhow::Result<Vec<Option<anyhow::Result<bool>>>> {
|
||||
let remote_client = self.remote_client.clone().expect(
|
||||
"GenericRemoteStorage is configured, so timeline must have RemoteTimelineClient",
|
||||
);
|
||||
|
||||
self.evict_layer_batch(&remote_client, layers_to_evict, cancel)
|
||||
.await
|
||||
}
|
||||
|
||||
/// Evict multiple layers at once, continuing through errors.
|
||||
///
|
||||
/// Try to evict the given `layers_to_evict` by
|
||||
@@ -1013,15 +1011,6 @@ impl Timeline {
|
||||
// now lock out layer removal (compaction, gc, timeline deletion)
|
||||
let layer_removal_guard = self.layer_removal_cs.lock().await;
|
||||
|
||||
{
|
||||
// to avoid racing with detach and delete_timeline
|
||||
let state = self.current_state();
|
||||
anyhow::ensure!(
|
||||
state == TimelineState::Active,
|
||||
"timeline is not active but {state:?}"
|
||||
);
|
||||
}
|
||||
|
||||
// start the batch update
|
||||
let mut layer_map = self.layers.write().unwrap();
|
||||
let mut batch_updates = layer_map.batch_update();
|
||||
@@ -1055,28 +1044,12 @@ impl Timeline {
|
||||
use super::layer_map::Replacement;
|
||||
|
||||
if local_layer.is_remote_layer() {
|
||||
// TODO(issue #3851): consider returning an err here instead of false,
|
||||
// which is the same out the match later
|
||||
return Ok(false);
|
||||
}
|
||||
|
||||
let layer_file_size = local_layer.file_size();
|
||||
|
||||
let local_layer_mtime = local_layer
|
||||
.local_path()
|
||||
.expect("local layer should have a local path")
|
||||
.metadata()
|
||||
.context("get local layer file stat")?
|
||||
.modified()
|
||||
.context("get mtime of layer file")?;
|
||||
let local_layer_residence_duration =
|
||||
match SystemTime::now().duration_since(local_layer_mtime) {
|
||||
Err(e) => {
|
||||
warn!("layer mtime is in the future: {}", e);
|
||||
None
|
||||
}
|
||||
Ok(delta) => Some(delta),
|
||||
};
|
||||
let layer_file_size = local_layer
|
||||
.file_size()
|
||||
.expect("Local layer should have a file size");
|
||||
|
||||
let layer_metadata = LayerFileMetadata::new(layer_file_size);
|
||||
|
||||
@@ -1120,14 +1093,6 @@ impl Timeline {
|
||||
.resident_physical_size_gauge
|
||||
.sub(layer_file_size);
|
||||
|
||||
self.metrics.evictions.inc();
|
||||
|
||||
if let Some(delta) = local_layer_residence_duration {
|
||||
self.metrics
|
||||
.evictions_with_low_residence_duration
|
||||
.observe(delta);
|
||||
}
|
||||
|
||||
true
|
||||
}
|
||||
Replacement::NotFound => {
|
||||
@@ -1202,7 +1167,7 @@ impl Timeline {
|
||||
pub(super) fn new(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_conf: Arc<RwLock<TenantConfOpt>>,
|
||||
metadata: &TimelineMetadata,
|
||||
metadata: TimelineMetadata,
|
||||
ancestor: Option<Arc<Timeline>>,
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
@@ -1243,14 +1208,7 @@ impl Timeline {
|
||||
ancestor_timeline: ancestor,
|
||||
ancestor_lsn: metadata.ancestor_lsn(),
|
||||
|
||||
metrics: TimelineMetrics::new(
|
||||
&tenant_id,
|
||||
&timeline_id,
|
||||
crate::metrics::EvictionsWithLowResidenceDurationBuilder::new(
|
||||
"mtime",
|
||||
conf.evictions_low_residence_duration_metric_threshold,
|
||||
),
|
||||
),
|
||||
metrics: TimelineMetrics::new(&tenant_id, &timeline_id),
|
||||
|
||||
flush_loop_state: Mutex::new(FlushLoopState::NotStarted),
|
||||
|
||||
@@ -1287,10 +1245,6 @@ impl Timeline {
|
||||
download_all_remote_layers_task_info: RwLock::new(None),
|
||||
|
||||
state,
|
||||
|
||||
eviction_task_timeline_state: tokio::sync::Mutex::new(
|
||||
EvictionTaskTimelineState::default(),
|
||||
),
|
||||
};
|
||||
result.repartition_threshold = result.get_checkpoint_distance() / 10;
|
||||
result
|
||||
@@ -1381,7 +1335,6 @@ impl Timeline {
|
||||
lagging_wal_timeout,
|
||||
max_lsn_wal_lag,
|
||||
crate::config::SAFEKEEPER_AUTH_TOKEN.get().cloned(),
|
||||
self.conf.availability_zone.clone(),
|
||||
background_ctx,
|
||||
);
|
||||
}
|
||||
@@ -1529,12 +1482,7 @@ impl Timeline {
|
||||
.layer_metadata
|
||||
.get(remote_layer_name)
|
||||
.map(LayerFileMetadata::from)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"No remote layer metadata found for layer {}",
|
||||
remote_layer_name.file_name()
|
||||
)
|
||||
})?;
|
||||
.unwrap_or(LayerFileMetadata::MISSING);
|
||||
|
||||
// Is the local layer's size different from the size stored in the
|
||||
// remote index file?
|
||||
@@ -1550,27 +1498,34 @@ impl Timeline {
|
||||
local_layer_path.display()
|
||||
);
|
||||
|
||||
let remote_size = remote_layer_metadata.file_size();
|
||||
let metadata = local_layer_path.metadata().with_context(|| {
|
||||
format!(
|
||||
"get file size of local layer {}",
|
||||
local_layer_path.display()
|
||||
)
|
||||
})?;
|
||||
let local_size = metadata.len();
|
||||
if local_size != remote_size {
|
||||
warn!("removing local file {local_layer_path:?} because it has unexpected length {local_size}; length in remote index is {remote_size}");
|
||||
if let Err(err) = rename_to_backup(&local_layer_path) {
|
||||
assert!(local_layer_path.exists(), "we would leave the local_layer without a file if this does not hold: {}", local_layer_path.display());
|
||||
anyhow::bail!("could not rename file {local_layer_path:?}: {err:?}");
|
||||
if let Some(remote_size) = remote_layer_metadata.file_size() {
|
||||
let metadata = local_layer_path.metadata().with_context(|| {
|
||||
format!(
|
||||
"get file size of local layer {}",
|
||||
local_layer_path.display()
|
||||
)
|
||||
})?;
|
||||
let local_size = metadata.len();
|
||||
if local_size != remote_size {
|
||||
warn!("removing local file {local_layer_path:?} because it has unexpected length {local_size}; length in remote index is {remote_size}");
|
||||
if let Err(err) = rename_to_backup(&local_layer_path) {
|
||||
assert!(local_layer_path.exists(), "we would leave the local_layer without a file if this does not hold: {}", local_layer_path.display());
|
||||
anyhow::bail!("could not rename file {local_layer_path:?}: {err:?}");
|
||||
} else {
|
||||
self.metrics.resident_physical_size_gauge.sub(local_size);
|
||||
updates.remove_historic(local_layer);
|
||||
// fall-through to adding the remote layer
|
||||
}
|
||||
} else {
|
||||
self.metrics.resident_physical_size_gauge.sub(local_size);
|
||||
updates.remove_historic(local_layer);
|
||||
// fall-through to adding the remote layer
|
||||
debug!(
|
||||
"layer is present locally and file size matches remote, using it: {}",
|
||||
local_layer_path.display()
|
||||
);
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
debug!(
|
||||
"layer is present locally and file size matches remote, using it: {}",
|
||||
"layer is present locally and remote does not have file size, using it: {}",
|
||||
local_layer_path.display()
|
||||
);
|
||||
continue;
|
||||
@@ -1672,8 +1627,6 @@ impl Timeline {
|
||||
.map(|l| (l.filename(), l))
|
||||
.collect::<HashMap<_, _>>();
|
||||
|
||||
// If no writes happen, new branches do not have any layers, only the metadata file.
|
||||
let has_local_layers = !local_layers.is_empty();
|
||||
let local_only_layers = match index_part {
|
||||
Some(index_part) => {
|
||||
info!(
|
||||
@@ -1691,47 +1644,28 @@ impl Timeline {
|
||||
}
|
||||
};
|
||||
|
||||
if has_local_layers {
|
||||
// Are there local files that don't exist remotely? Schedule uploads for them.
|
||||
// Local timeline metadata will get uploaded to remove along witht he layers.
|
||||
for (layer_name, layer) in &local_only_layers {
|
||||
// XXX solve this in the type system
|
||||
let layer_path = layer
|
||||
.local_path()
|
||||
.expect("local_only_layers only contains local layers");
|
||||
let layer_size = layer_path
|
||||
.metadata()
|
||||
.with_context(|| format!("failed to get file {layer_path:?} metadata"))?
|
||||
.len();
|
||||
info!("scheduling {layer_path:?} for upload");
|
||||
remote_client
|
||||
.schedule_layer_file_upload(layer_name, &LayerFileMetadata::new(layer_size))?;
|
||||
}
|
||||
remote_client.schedule_index_upload_for_file_changes()?;
|
||||
} else if index_part.is_none() {
|
||||
// No data on the remote storage, no local layers, local metadata file.
|
||||
//
|
||||
// TODO https://github.com/neondatabase/neon/issues/3865
|
||||
// Currently, console does not wait for the timeline data upload to the remote storage
|
||||
// and considers the timeline created, expecting other pageserver nodes to work with it.
|
||||
// Branch metadata upload could get interrupted (e.g pageserver got killed),
|
||||
// hence any locally existing branch metadata with no remote counterpart should be uploaded,
|
||||
// otherwise any other pageserver won't see the branch on `attach`.
|
||||
//
|
||||
// After the issue gets implemented, pageserver should rather remove the branch,
|
||||
// since absence on S3 means we did not acknowledge the branch creation and console will have to retry,
|
||||
// no need to keep the old files.
|
||||
remote_client.schedule_index_upload_for_metadata_update(up_to_date_metadata)?;
|
||||
} else {
|
||||
// Local timeline has a metadata file, remote one too, both have no layers to sync.
|
||||
// Are there local files that don't exist remotely? Schedule uploads for them
|
||||
for (layer_name, layer) in &local_only_layers {
|
||||
// XXX solve this in the type system
|
||||
let layer_path = layer
|
||||
.local_path()
|
||||
.expect("local_only_layers only contains local layers");
|
||||
let layer_size = layer_path
|
||||
.metadata()
|
||||
.with_context(|| format!("failed to get file {layer_path:?} metadata"))?
|
||||
.len();
|
||||
info!("scheduling {layer_path:?} for upload");
|
||||
remote_client
|
||||
.schedule_layer_file_upload(layer_name, &LayerFileMetadata::new(layer_size))?;
|
||||
}
|
||||
remote_client.schedule_index_upload_for_file_changes()?;
|
||||
|
||||
info!("Done");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn try_spawn_size_init_task(self: &Arc<Self>, lsn: Lsn, ctx: &RequestContext) {
|
||||
fn try_spawn_size_init_task(self: &Arc<Self>, init_lsn: Lsn, ctx: &RequestContext) {
|
||||
let permit = match Arc::clone(&self.current_logical_size.initial_size_computation)
|
||||
.try_acquire_owned()
|
||||
{
|
||||
@@ -1769,7 +1703,7 @@ impl Timeline {
|
||||
// NB: don't log errors here, task_mgr will do that.
|
||||
async move {
|
||||
let calculated_size = match self_clone
|
||||
.logical_size_calculation_task(lsn, &background_ctx)
|
||||
.logical_size_calculation_task(init_lsn, &background_ctx)
|
||||
.await
|
||||
{
|
||||
Ok(s) => s,
|
||||
@@ -1854,7 +1788,7 @@ impl Timeline {
|
||||
#[instrument(skip_all, fields(tenant = %self.tenant_id, timeline = %self.timeline_id))]
|
||||
async fn logical_size_calculation_task(
|
||||
self: &Arc<Self>,
|
||||
lsn: Lsn,
|
||||
init_lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<u64, CalculateLogicalSizeError> {
|
||||
let mut timeline_state_updates = self.subscribe_for_state_updates();
|
||||
@@ -1865,7 +1799,7 @@ impl Timeline {
|
||||
let cancel = cancel.child_token();
|
||||
let ctx = ctx.attached_child();
|
||||
self_calculation
|
||||
.calculate_logical_size(lsn, cancel, &ctx)
|
||||
.calculate_logical_size(init_lsn, cancel, &ctx)
|
||||
.await
|
||||
};
|
||||
let timeline_state_cancellation = async {
|
||||
@@ -1949,12 +1883,21 @@ impl Timeline {
|
||||
// need to return something
|
||||
Ok(0)
|
||||
});
|
||||
// See if we've already done the work for initial size calculation.
|
||||
// This is a short-cut for timelines that are mostly unused.
|
||||
if let Some(size) = self.current_logical_size.initialized_size(up_to_lsn) {
|
||||
return Ok(size);
|
||||
}
|
||||
let timer = self.metrics.logical_size_histo.start_timer();
|
||||
let timer = if up_to_lsn == self.initdb_lsn {
|
||||
if let Some(size) = self.current_logical_size.initialized_size() {
|
||||
if size != 0 {
|
||||
// non-zero size means that the size has already been calculated by this method
|
||||
// after startup. if the logical size is for a new timeline without layers the
|
||||
// size will be zero, and we cannot use that, or this caching strategy until
|
||||
// pageserver restart.
|
||||
return Ok(size);
|
||||
}
|
||||
}
|
||||
|
||||
self.metrics.init_logical_size_histo.start_timer()
|
||||
} else {
|
||||
self.metrics.logical_size_histo.start_timer()
|
||||
};
|
||||
let logical_size = self
|
||||
.get_current_logical_size_non_incremental(up_to_lsn, cancel, ctx)
|
||||
.await?;
|
||||
@@ -2009,7 +1952,9 @@ impl Timeline {
|
||||
) -> anyhow::Result<()> {
|
||||
if !layer.is_remote_layer() {
|
||||
layer.delete_resident_layer_file()?;
|
||||
let layer_file_size = layer.file_size();
|
||||
let layer_file_size = layer
|
||||
.file_size()
|
||||
.expect("Local layer should have a file size");
|
||||
self.metrics
|
||||
.resident_physical_size_gauge
|
||||
.sub(layer_file_size);
|
||||
@@ -4037,67 +3982,6 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
pub struct DiskUsageEvictionInfo {
|
||||
/// Timeline's largest layer (remote or resident)
|
||||
pub max_layer_size: Option<u64>,
|
||||
/// Timeline's resident layers
|
||||
pub resident_layers: Vec<LocalLayerInfoForDiskUsageEviction>,
|
||||
}
|
||||
|
||||
pub struct LocalLayerInfoForDiskUsageEviction {
|
||||
pub layer: Arc<dyn PersistentLayer>,
|
||||
pub last_activity_ts: SystemTime,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for LocalLayerInfoForDiskUsageEviction {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
// format the tv_sec, tv_nsec into rfc3339 in case someone is looking at it
|
||||
// having to allocate a string to this is bad, but it will rarely be formatted
|
||||
let ts = chrono::DateTime::<chrono::Utc>::from(self.last_activity_ts);
|
||||
let ts = ts.to_rfc3339_opts(chrono::SecondsFormat::Nanos, true);
|
||||
f.debug_struct("LocalLayerInfoForDiskUsageEviction")
|
||||
.field("layer", &self.layer)
|
||||
.field("last_activity", &ts)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl LocalLayerInfoForDiskUsageEviction {
|
||||
pub fn file_size(&self) -> u64 {
|
||||
self.layer.file_size()
|
||||
}
|
||||
}
|
||||
|
||||
impl Timeline {
|
||||
pub(crate) fn get_local_layers_for_disk_usage_eviction(&self) -> DiskUsageEvictionInfo {
|
||||
let layers = self.layers.read().unwrap();
|
||||
|
||||
let mut max_layer_size: Option<u64> = None;
|
||||
let mut resident_layers = Vec::new();
|
||||
|
||||
for l in layers.iter_historic_layers() {
|
||||
let file_size = l.file_size();
|
||||
max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size)));
|
||||
|
||||
if l.is_remote_layer() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let last_activity_ts = l.access_stats().latest_activity();
|
||||
|
||||
resident_layers.push(LocalLayerInfoForDiskUsageEviction {
|
||||
layer: l,
|
||||
last_activity_ts,
|
||||
});
|
||||
}
|
||||
|
||||
DiskUsageEvictionInfo {
|
||||
max_layer_size,
|
||||
resident_layers,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
type TraversalPathItem = (
|
||||
ValueReconstructResult,
|
||||
Lsn,
|
||||
|
||||
@@ -1,30 +1,17 @@
|
||||
//! The per-timeline layer eviction task, which evicts data which has not been accessed for more
|
||||
//! than a given threshold.
|
||||
//!
|
||||
//! Data includes all kinds of caches, namely:
|
||||
//! - (in-memory layers)
|
||||
//! - on-demand downloaded layer files on disk
|
||||
//! - (cached layer file pages)
|
||||
//! - derived data from layer file contents, namely:
|
||||
//! - initial logical size
|
||||
//! - partitioning
|
||||
//! - (other currently missing unknowns)
|
||||
//!
|
||||
//! Items with parentheses are not (yet) touched by this task.
|
||||
//!
|
||||
//! See write-up on restart on-demand download spike: <https://gist.github.com/problame/2265bf7b8dc398be834abfead36c76b5>
|
||||
//! The per-timeline layer eviction task.
|
||||
|
||||
use std::{
|
||||
ops::ControlFlow,
|
||||
sync::Arc,
|
||||
time::{Duration, SystemTime},
|
||||
};
|
||||
|
||||
use either::Either;
|
||||
use tokio::time::Instant;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, error, info, instrument, warn};
|
||||
|
||||
use crate::{
|
||||
context::{DownloadBehavior, RequestContext},
|
||||
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
|
||||
tenant::{
|
||||
config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold},
|
||||
@@ -34,11 +21,6 @@ use crate::{
|
||||
|
||||
use super::Timeline;
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct EvictionTaskTimelineState {
|
||||
last_refresh_required_in_restart: Option<tokio::time::Instant>,
|
||||
}
|
||||
|
||||
impl Timeline {
|
||||
pub(super) fn launch_eviction_task(self: &Arc<Self>) {
|
||||
let self_clone = Arc::clone(self);
|
||||
@@ -72,10 +54,9 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::Eviction, DownloadBehavior::Warn);
|
||||
loop {
|
||||
let policy = self.get_eviction_policy();
|
||||
let cf = self.eviction_iteration(&policy, &cancel, &ctx).await;
|
||||
let cf = self.eviction_iteration(&policy, cancel.clone()).await;
|
||||
|
||||
match cf {
|
||||
ControlFlow::Break(()) => break,
|
||||
@@ -96,8 +77,7 @@ impl Timeline {
|
||||
async fn eviction_iteration(
|
||||
self: &Arc<Self>,
|
||||
policy: &EvictionPolicy,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
cancel: CancellationToken,
|
||||
) -> ControlFlow<(), Instant> {
|
||||
debug!("eviction iteration: {policy:?}");
|
||||
match policy {
|
||||
@@ -107,7 +87,7 @@ impl Timeline {
|
||||
}
|
||||
EvictionPolicy::LayerAccessThreshold(p) => {
|
||||
let start = Instant::now();
|
||||
match self.eviction_iteration_threshold(p, cancel, ctx).await {
|
||||
match self.eviction_iteration_threshold(p, cancel).await {
|
||||
ControlFlow::Break(()) => return ControlFlow::Break(()),
|
||||
ControlFlow::Continue(()) => (),
|
||||
}
|
||||
@@ -121,8 +101,7 @@ impl Timeline {
|
||||
async fn eviction_iteration_threshold(
|
||||
self: &Arc<Self>,
|
||||
p: &EvictionPolicyLayerAccessThreshold,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
cancel: CancellationToken,
|
||||
) -> ControlFlow<()> {
|
||||
let now = SystemTime::now();
|
||||
|
||||
@@ -135,28 +114,6 @@ impl Timeline {
|
||||
not_evictable: usize,
|
||||
skipped_for_shutdown: usize,
|
||||
}
|
||||
|
||||
// what we want is to invalidate any caches which haven't been accessed for `p.threshold`,
|
||||
// but we cannot actually do it for current limitations except by restarting pageserver. we
|
||||
// just recompute the values which would be recomputed on startup.
|
||||
//
|
||||
// for active tenants this will likely materialized page cache or in-memory layers. for
|
||||
// inactive tenants it will refresh the last_access timestamps so that we will not evict
|
||||
// and re-download on restart these layers.
|
||||
let mut state = self.eviction_task_timeline_state.lock().await;
|
||||
match state.last_refresh_required_in_restart {
|
||||
Some(ts) if ts.elapsed() < p.threshold => { /* no need to run */ }
|
||||
_ => {
|
||||
self.refresh_layers_required_in_restart(cancel, ctx).await;
|
||||
state.last_refresh_required_in_restart = Some(tokio::time::Instant::now())
|
||||
}
|
||||
}
|
||||
drop(state);
|
||||
|
||||
if cancel.is_cancelled() {
|
||||
return ControlFlow::Break(());
|
||||
}
|
||||
|
||||
let mut stats = EvictionStats::default();
|
||||
// Gather layers for eviction.
|
||||
// NB: all the checks can be invalidated as soon as we release the layer map lock.
|
||||
@@ -169,7 +126,13 @@ impl Timeline {
|
||||
if hist_layer.is_remote_layer() {
|
||||
continue;
|
||||
}
|
||||
let last_activity_ts = hist_layer.access_stats().latest_activity();
|
||||
let last_activity_ts = match hist_layer
|
||||
.access_stats()
|
||||
.most_recent_access_or_residence_event()
|
||||
{
|
||||
Either::Left(mra) => mra.when,
|
||||
Either::Right(re) => re.timestamp,
|
||||
};
|
||||
let no_activity_for = match now.duration_since(last_activity_ts) {
|
||||
Ok(d) => d,
|
||||
Err(_e) => {
|
||||
@@ -211,7 +174,7 @@ impl Timeline {
|
||||
};
|
||||
|
||||
let results = match self
|
||||
.evict_layer_batch(remote_client, &candidates[..], cancel.clone())
|
||||
.evict_layer_batch(remote_client, &candidates[..], cancel)
|
||||
.await
|
||||
{
|
||||
Err(pre_err) => {
|
||||
@@ -253,40 +216,4 @@ impl Timeline {
|
||||
}
|
||||
ControlFlow::Continue(())
|
||||
}
|
||||
|
||||
/// Recompute the values which would cause on-demand downloads during restart.
|
||||
async fn refresh_layers_required_in_restart(
|
||||
&self,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) {
|
||||
let lsn = self.get_last_record_lsn();
|
||||
|
||||
// imitiate on-restart initial logical size
|
||||
let size = self.calculate_logical_size(lsn, cancel.clone(), ctx).await;
|
||||
|
||||
match &size {
|
||||
Ok(_size) => {
|
||||
// good, don't log it to avoid confusion
|
||||
}
|
||||
Err(_) => {
|
||||
// we have known issues for which we already log this on consumption metrics,
|
||||
// gc, and compaction. leave logging out for now.
|
||||
//
|
||||
// https://github.com/neondatabase/neon/issues/2539
|
||||
}
|
||||
}
|
||||
|
||||
// imitiate repartiting on first compactation
|
||||
if let Err(e) = self.collect_keyspace(lsn, ctx).await {
|
||||
// if this failed, we probably failed logical size because these use the same keys
|
||||
if size.is_err() {
|
||||
// ignore, see above comment
|
||||
} else {
|
||||
warn!(
|
||||
"failed to collect keyspace but succeeded in calculating logical size: {e:#}"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -45,7 +45,6 @@ pub fn spawn_connection_manager_task(
|
||||
lagging_wal_timeout: Duration,
|
||||
max_lsn_wal_lag: NonZeroU64,
|
||||
auth_token: Option<Arc<String>>,
|
||||
availability_zone: Option<String>,
|
||||
ctx: RequestContext,
|
||||
) {
|
||||
let mut broker_client = get_broker_client().clone();
|
||||
@@ -68,7 +67,6 @@ pub fn spawn_connection_manager_task(
|
||||
lagging_wal_timeout,
|
||||
max_lsn_wal_lag,
|
||||
auth_token,
|
||||
availability_zone,
|
||||
);
|
||||
loop {
|
||||
select! {
|
||||
@@ -336,7 +334,6 @@ struct WalreceiverState {
|
||||
/// Data about all timelines, available for connection, fetched from storage broker, grouped by their corresponding safekeeper node id.
|
||||
wal_stream_candidates: HashMap<NodeId, BrokerSkTimeline>,
|
||||
auth_token: Option<Arc<String>>,
|
||||
availability_zone: Option<String>,
|
||||
}
|
||||
|
||||
/// Current connection data.
|
||||
@@ -384,7 +381,6 @@ impl WalreceiverState {
|
||||
lagging_wal_timeout: Duration,
|
||||
max_lsn_wal_lag: NonZeroU64,
|
||||
auth_token: Option<Arc<String>>,
|
||||
availability_zone: Option<String>,
|
||||
) -> Self {
|
||||
let id = TenantTimelineId {
|
||||
tenant_id: timeline.tenant_id,
|
||||
@@ -400,7 +396,6 @@ impl WalreceiverState {
|
||||
wal_stream_candidates: HashMap::new(),
|
||||
wal_connection_retries: HashMap::new(),
|
||||
auth_token,
|
||||
availability_zone,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -745,7 +740,6 @@ impl WalreceiverState {
|
||||
None => None,
|
||||
Some(x) => Some(x),
|
||||
},
|
||||
self.availability_zone.as_deref(),
|
||||
) {
|
||||
Ok(connstr) => Some((*sk_id, info, connstr)),
|
||||
Err(e) => {
|
||||
@@ -830,24 +824,17 @@ fn wal_stream_connection_config(
|
||||
}: TenantTimelineId,
|
||||
listen_pg_addr_str: &str,
|
||||
auth_token: Option<&str>,
|
||||
availability_zone: Option<&str>,
|
||||
) -> anyhow::Result<PgConnectionConfig> {
|
||||
let (host, port) =
|
||||
parse_host_port(listen_pg_addr_str).context("Unable to parse listen_pg_addr_str")?;
|
||||
let port = port.unwrap_or(5432);
|
||||
let mut connstr = PgConnectionConfig::new_host_port(host, port)
|
||||
Ok(PgConnectionConfig::new_host_port(host, port)
|
||||
.extend_options([
|
||||
"-c".to_owned(),
|
||||
format!("timeline_id={}", timeline_id),
|
||||
format!("tenant_id={}", tenant_id),
|
||||
])
|
||||
.set_password(auth_token.map(|s| s.to_owned()));
|
||||
|
||||
if let Some(availability_zone) = availability_zone {
|
||||
connstr = connstr.extend_options([format!("availability_zone={}", availability_zone)]);
|
||||
}
|
||||
|
||||
Ok(connstr)
|
||||
.set_password(auth_token.map(|s| s.to_owned())))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -1286,7 +1273,6 @@ mod tests {
|
||||
wal_stream_candidates: HashMap::new(),
|
||||
wal_connection_retries: HashMap::new(),
|
||||
auth_token: None,
|
||||
availability_zone: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -127,21 +127,12 @@ impl UploadQueue {
|
||||
|
||||
let mut files = HashMap::with_capacity(index_part.timeline_layers.len());
|
||||
for layer_name in &index_part.timeline_layers {
|
||||
match index_part
|
||||
let layer_metadata = index_part
|
||||
.layer_metadata
|
||||
.get(layer_name)
|
||||
.map(LayerFileMetadata::from)
|
||||
{
|
||||
Some(layer_metadata) => {
|
||||
files.insert(layer_name.to_owned(), layer_metadata);
|
||||
}
|
||||
None => {
|
||||
anyhow::bail!(
|
||||
"No remote layer metadata found for layer {}",
|
||||
layer_name.file_name()
|
||||
);
|
||||
}
|
||||
}
|
||||
.unwrap_or(LayerFileMetadata::MISSING);
|
||||
files.insert(layer_name.to_owned(), layer_metadata);
|
||||
}
|
||||
|
||||
let index_part_metadata = index_part.parse_metadata()?;
|
||||
|
||||
@@ -46,12 +46,8 @@ PGconn *pageserver_conn = NULL;
|
||||
*/
|
||||
WaitEventSet *pageserver_conn_wes = NULL;
|
||||
|
||||
/* GUCs */
|
||||
char *neon_timeline;
|
||||
char *neon_tenant;
|
||||
int32 max_cluster_size;
|
||||
char *page_server_connstring;
|
||||
char *neon_auth_token;
|
||||
char *page_server_connstring_raw;
|
||||
char *safekeeper_token_env;
|
||||
|
||||
int n_unflushed_requests = 0;
|
||||
int flush_every_n_requests = 8;
|
||||
@@ -64,37 +60,10 @@ pageserver_connect(int elevel)
|
||||
{
|
||||
char *query;
|
||||
int ret;
|
||||
const char *keywords[3];
|
||||
const char *values[3];
|
||||
int n;
|
||||
|
||||
Assert(!connected);
|
||||
|
||||
/*
|
||||
* Connect using the connection string we got from the
|
||||
* neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment
|
||||
* variable was set, use that as the password.
|
||||
*
|
||||
* The connection options are parsed in the order they're given, so
|
||||
* when we set the password before the connection string, the
|
||||
* connection string can override the password from the env variable.
|
||||
* Seems useful, although we don't currently use that capability
|
||||
* anywhere.
|
||||
*/
|
||||
n = 0;
|
||||
if (neon_auth_token)
|
||||
{
|
||||
keywords[n] = "password";
|
||||
values[n] = neon_auth_token;
|
||||
n++;
|
||||
}
|
||||
keywords[n] = "dbname";
|
||||
values[n] = page_server_connstring;
|
||||
n++;
|
||||
keywords[n] = NULL;
|
||||
values[n] = NULL;
|
||||
n++;
|
||||
pageserver_conn = PQconnectdbParams(keywords, values, 1);
|
||||
pageserver_conn = PQconnectdb(page_server_connstring);
|
||||
|
||||
if (PQstatus(pageserver_conn) == CONNECTION_BAD)
|
||||
{
|
||||
@@ -156,7 +125,7 @@ pageserver_connect(int elevel)
|
||||
}
|
||||
}
|
||||
|
||||
neon_log(LOG, "libpagestore: connected to '%s'", page_server_connstring);
|
||||
neon_log(LOG, "libpagestore: connected to '%s'", page_server_connstring_raw);
|
||||
|
||||
connected = true;
|
||||
return true;
|
||||
@@ -385,6 +354,105 @@ check_neon_id(char **newval, void **extra, GucSource source)
|
||||
return **newval == '\0' || HexDecodeString(id, *newval, 16);
|
||||
}
|
||||
|
||||
static char *
|
||||
substitute_pageserver_password(const char *page_server_connstring_raw)
|
||||
{
|
||||
char *host = NULL;
|
||||
char *port = NULL;
|
||||
char *user = NULL;
|
||||
char *auth_token = NULL;
|
||||
char *err = NULL;
|
||||
char *page_server_connstring = NULL;
|
||||
PQconninfoOption *conn_options;
|
||||
PQconninfoOption *conn_option;
|
||||
MemoryContext oldcontext;
|
||||
|
||||
/*
|
||||
* Here we substitute password in connection string with an environment
|
||||
* variable. To simplify things we construct a connection string back with
|
||||
* only known options. In particular: host port user and password. We do
|
||||
* not currently use other options and constructing full connstring in an
|
||||
* URI shape is quite messy.
|
||||
*/
|
||||
|
||||
if (page_server_connstring_raw == NULL || page_server_connstring_raw[0] == '\0')
|
||||
return NULL;
|
||||
|
||||
/* extract the auth token from the connection string */
|
||||
conn_options = PQconninfoParse(page_server_connstring_raw, &err);
|
||||
if (conn_options == NULL)
|
||||
{
|
||||
/* The error string is malloc'd, so we must free it explicitly */
|
||||
char *errcopy = err ? pstrdup(err) : "out of memory";
|
||||
|
||||
PQfreemem(err);
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_SYNTAX_ERROR),
|
||||
errmsg("invalid connection string syntax: %s", errcopy)));
|
||||
}
|
||||
|
||||
/*
|
||||
* Trying to populate pageserver connection string with auth token from
|
||||
* environment. We are looking for password in with placeholder value like
|
||||
* $ENV_VAR_NAME, so if password field is present and starts with $ we try
|
||||
* to fetch environment variable value and fail loudly if it is not set.
|
||||
*/
|
||||
for (conn_option = conn_options; conn_option->keyword != NULL; conn_option++)
|
||||
{
|
||||
if (strcmp(conn_option->keyword, "host") == 0)
|
||||
{
|
||||
if (conn_option->val != NULL && conn_option->val[0] != '\0')
|
||||
host = conn_option->val;
|
||||
}
|
||||
else if (strcmp(conn_option->keyword, "port") == 0)
|
||||
{
|
||||
if (conn_option->val != NULL && conn_option->val[0] != '\0')
|
||||
port = conn_option->val;
|
||||
}
|
||||
else if (strcmp(conn_option->keyword, "user") == 0)
|
||||
{
|
||||
if (conn_option->val != NULL && conn_option->val[0] != '\0')
|
||||
user = conn_option->val;
|
||||
}
|
||||
else if (strcmp(conn_option->keyword, "password") == 0)
|
||||
{
|
||||
if (conn_option->val != NULL && conn_option->val[0] != '\0')
|
||||
{
|
||||
/* ensure that this is a template */
|
||||
if (strncmp(conn_option->val, "$", 1) != 0)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_CONNECTION_EXCEPTION),
|
||||
errmsg("expected placeholder value in pageserver password starting from $ but found: %s", &conn_option->val[1])));
|
||||
|
||||
neon_log(LOG, "found auth token placeholder in pageserver conn string '%s'", &conn_option->val[1]);
|
||||
auth_token = getenv(&conn_option->val[1]);
|
||||
if (!auth_token)
|
||||
{
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_CONNECTION_EXCEPTION),
|
||||
errmsg("cannot get auth token, environment variable %s is not set", &conn_option->val[1])));
|
||||
}
|
||||
else
|
||||
{
|
||||
neon_log(LOG, "using auth token from environment passed via env");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* allocate connection string in TopMemoryContext to make sure it is not
|
||||
* freed
|
||||
*/
|
||||
oldcontext = CurrentMemoryContext;
|
||||
MemoryContextSwitchTo(TopMemoryContext);
|
||||
page_server_connstring = psprintf("postgresql://%s:%s@%s:%s", user, auth_token ? auth_token : "", host, port);
|
||||
MemoryContextSwitchTo(oldcontext);
|
||||
|
||||
PQconninfoFree(conn_options);
|
||||
return page_server_connstring;
|
||||
}
|
||||
|
||||
/*
|
||||
* Module initialization function
|
||||
*/
|
||||
@@ -394,12 +462,21 @@ pg_init_libpagestore(void)
|
||||
DefineCustomStringVariable("neon.pageserver_connstring",
|
||||
"connection string to the page server",
|
||||
NULL,
|
||||
&page_server_connstring,
|
||||
&page_server_connstring_raw,
|
||||
"",
|
||||
PGC_POSTMASTER,
|
||||
0, /* no flags required */
|
||||
NULL, NULL, NULL);
|
||||
|
||||
DefineCustomStringVariable("neon.safekeeper_token_env",
|
||||
"the environment variable containing JWT token for authentication with Safekeepers, the convention is to either unset or set to $NEON_AUTH_TOKEN",
|
||||
NULL,
|
||||
&safekeeper_token_env,
|
||||
NULL,
|
||||
PGC_POSTMASTER,
|
||||
0, /* no flags required */
|
||||
NULL, NULL, NULL);
|
||||
|
||||
DefineCustomStringVariable("neon.timeline_id",
|
||||
"Neon timeline_id the server is running on",
|
||||
NULL,
|
||||
@@ -456,10 +533,30 @@ pg_init_libpagestore(void)
|
||||
neon_log(PageStoreTrace, "libpagestore already loaded");
|
||||
page_server = &api;
|
||||
|
||||
/* Retrieve the auth token to use when connecting to pageserver and safekeepers */
|
||||
neon_auth_token = getenv("NEON_AUTH_TOKEN");
|
||||
if (neon_auth_token)
|
||||
neon_log(LOG, "using storage auth token from NEON_AUTH_TOKEN environment variable");
|
||||
/* substitute password in pageserver_connstring */
|
||||
page_server_connstring = substitute_pageserver_password(page_server_connstring_raw);
|
||||
|
||||
/* Is there more correct way to pass CustomGUC to postgres code? */
|
||||
neon_timeline_walproposer = neon_timeline;
|
||||
neon_tenant_walproposer = neon_tenant;
|
||||
|
||||
/* retrieve the token for Safekeeper, if present */
|
||||
if (safekeeper_token_env != NULL) {
|
||||
if (safekeeper_token_env[0] != '$') {
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_CONNECTION_EXCEPTION),
|
||||
errmsg("expected safekeeper auth token environment variable's name starting with $ but found: %s",
|
||||
safekeeper_token_env)));
|
||||
}
|
||||
neon_safekeeper_token_walproposer = getenv(&safekeeper_token_env[1]);
|
||||
if (!neon_safekeeper_token_walproposer) {
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_CONNECTION_EXCEPTION),
|
||||
errmsg("cannot get safekeeper auth token, environment variable %s is not set",
|
||||
&safekeeper_token_env[1])));
|
||||
}
|
||||
neon_log(LOG, "using safekeeper auth token from environment variable");
|
||||
}
|
||||
|
||||
if (page_server_connstring && page_server_connstring[0])
|
||||
{
|
||||
|
||||
@@ -51,39 +51,12 @@ walprop_status(WalProposerConn *conn)
|
||||
}
|
||||
|
||||
WalProposerConn *
|
||||
walprop_connect_start(char *conninfo, char *password)
|
||||
walprop_connect_start(char *conninfo)
|
||||
{
|
||||
WalProposerConn *conn;
|
||||
PGconn *pg_conn;
|
||||
const char *keywords[3];
|
||||
const char *values[3];
|
||||
int n;
|
||||
|
||||
/*
|
||||
* Connect using the given connection string. If the
|
||||
* NEON_AUTH_TOKEN environment variable was set, use that as
|
||||
* the password.
|
||||
*
|
||||
* The connection options are parsed in the order they're given, so
|
||||
* when we set the password before the connection string, the
|
||||
* connection string can override the password from the env variable.
|
||||
* Seems useful, although we don't currently use that capability
|
||||
* anywhere.
|
||||
*/
|
||||
n = 0;
|
||||
if (password)
|
||||
{
|
||||
keywords[n] = "password";
|
||||
values[n] = neon_auth_token;
|
||||
n++;
|
||||
}
|
||||
keywords[n] = "dbname";
|
||||
values[n] = conninfo;
|
||||
n++;
|
||||
keywords[n] = NULL;
|
||||
values[n] = NULL;
|
||||
n++;
|
||||
pg_conn = PQconnectStartParams(keywords, values, 1);
|
||||
pg_conn = PQconnectStart(conninfo);
|
||||
|
||||
/*
|
||||
* Allocation of a PQconn can fail, and will return NULL. We want to fully
|
||||
|
||||
@@ -12,11 +12,6 @@
|
||||
#ifndef NEON_H
|
||||
#define NEON_H
|
||||
|
||||
/* GUCs */
|
||||
extern char *neon_auth_token;
|
||||
extern char *neon_timeline;
|
||||
extern char *neon_tenant;
|
||||
|
||||
extern void pg_init_libpagestore(void);
|
||||
extern void pg_init_walproposer(void);
|
||||
|
||||
|
||||
@@ -92,6 +92,14 @@ const int SmgrTrace = DEBUG5;
|
||||
|
||||
page_server_api *page_server;
|
||||
|
||||
/* GUCs */
|
||||
char *page_server_connstring;
|
||||
|
||||
/*with substituted password*/
|
||||
char *neon_timeline;
|
||||
char *neon_tenant;
|
||||
int32 max_cluster_size;
|
||||
|
||||
/* unlogged relation build states */
|
||||
typedef enum
|
||||
{
|
||||
|
||||
@@ -78,6 +78,10 @@ int wal_acceptor_reconnect_timeout;
|
||||
int wal_acceptor_connection_timeout;
|
||||
bool am_wal_proposer;
|
||||
|
||||
char *neon_timeline_walproposer = NULL;
|
||||
char *neon_tenant_walproposer = NULL;
|
||||
char *neon_safekeeper_token_walproposer = NULL;
|
||||
|
||||
#define WAL_PROPOSER_SLOT_NAME "wal_proposer_slot"
|
||||
|
||||
static int n_safekeepers = 0;
|
||||
@@ -510,9 +514,17 @@ WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId)
|
||||
Safekeeper *sk = &safekeeper[n_safekeepers];
|
||||
int written = 0;
|
||||
|
||||
written = snprintf((char *) &sk->conninfo, MAXCONNINFO,
|
||||
"host=%s port=%s dbname=replication options='-c timeline_id=%s tenant_id=%s'",
|
||||
sk->host, sk->port, neon_timeline, neon_tenant);
|
||||
if (neon_safekeeper_token_walproposer != NULL) {
|
||||
written = snprintf((char *) &sk->conninfo, MAXCONNINFO,
|
||||
"host=%s port=%s password=%s dbname=replication options='-c timeline_id=%s tenant_id=%s'",
|
||||
sk->host, sk->port, neon_safekeeper_token_walproposer, neon_timeline_walproposer,
|
||||
neon_tenant_walproposer);
|
||||
} else {
|
||||
written = snprintf((char *) &sk->conninfo, MAXCONNINFO,
|
||||
"host=%s port=%s dbname=replication options='-c timeline_id=%s tenant_id=%s'",
|
||||
sk->host, sk->port, neon_timeline_walproposer, neon_tenant_walproposer);
|
||||
}
|
||||
|
||||
if (written > MAXCONNINFO || written < 0)
|
||||
elog(FATAL, "could not create connection string for safekeeper %s:%s", sk->host, sk->port);
|
||||
}
|
||||
@@ -538,16 +550,16 @@ WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId)
|
||||
greetRequest.pgVersion = PG_VERSION_NUM;
|
||||
pg_strong_random(&greetRequest.proposerId, sizeof(greetRequest.proposerId));
|
||||
greetRequest.systemId = systemId;
|
||||
if (!neon_timeline)
|
||||
if (!neon_timeline_walproposer)
|
||||
elog(FATAL, "neon.timeline_id is not provided");
|
||||
if (*neon_timeline != '\0' &&
|
||||
!HexDecodeString(greetRequest.timeline_id, neon_timeline, 16))
|
||||
elog(FATAL, "Could not parse neon.timeline_id, %s", neon_timeline);
|
||||
if (!neon_tenant)
|
||||
if (*neon_timeline_walproposer != '\0' &&
|
||||
!HexDecodeString(greetRequest.timeline_id, neon_timeline_walproposer, 16))
|
||||
elog(FATAL, "Could not parse neon.timeline_id, %s", neon_timeline_walproposer);
|
||||
if (!neon_tenant_walproposer)
|
||||
elog(FATAL, "neon.tenant_id is not provided");
|
||||
if (*neon_tenant != '\0' &&
|
||||
!HexDecodeString(greetRequest.tenant_id, neon_tenant, 16))
|
||||
elog(FATAL, "Could not parse neon.tenant_id, %s", neon_tenant);
|
||||
if (*neon_tenant_walproposer != '\0' &&
|
||||
!HexDecodeString(greetRequest.tenant_id, neon_tenant_walproposer, 16))
|
||||
elog(FATAL, "Could not parse neon.tenant_id, %s", neon_tenant_walproposer);
|
||||
|
||||
#if PG_VERSION_NUM >= 150000
|
||||
/* FIXME don't use hardcoded timeline id */
|
||||
@@ -688,7 +700,7 @@ ResetConnection(Safekeeper *sk)
|
||||
/*
|
||||
* Try to establish new connection
|
||||
*/
|
||||
sk->conn = walprop_connect_start((char *) &sk->conninfo, neon_auth_token);
|
||||
sk->conn = walprop_connect_start((char *) &sk->conninfo);
|
||||
|
||||
/*
|
||||
* "If the result is null, then libpq has been unable to allocate a new
|
||||
|
||||
@@ -39,6 +39,10 @@ typedef struct WalProposerConn WalProposerConn;
|
||||
struct WalMessage;
|
||||
typedef struct WalMessage WalMessage;
|
||||
|
||||
extern char *neon_timeline_walproposer;
|
||||
extern char *neon_tenant_walproposer;
|
||||
extern char *neon_safekeeper_token_walproposer;
|
||||
|
||||
/* Possible return values from ReadPGAsync */
|
||||
typedef enum
|
||||
{
|
||||
@@ -454,7 +458,7 @@ extern char *walprop_error_message(WalProposerConn *conn);
|
||||
extern WalProposerConnStatusType walprop_status(WalProposerConn *conn);
|
||||
|
||||
/* Re-exported PQconnectStart */
|
||||
extern WalProposerConn * walprop_connect_start(char *conninfo, char *password);
|
||||
extern WalProposerConn * walprop_connect_start(char *conninfo);
|
||||
|
||||
/* Re-exported PQconectPoll */
|
||||
extern WalProposerConnectPollStatusType walprop_connect_poll(WalProposerConn *conn);
|
||||
|
||||
@@ -209,17 +209,9 @@ async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
if let Some(tls) = tls.take() {
|
||||
// Upgrade raw stream into a secure TLS-backed stream.
|
||||
// NOTE: We've consumed `tls`; this fact will be used later.
|
||||
|
||||
let (raw, read_buf) = stream.into_inner();
|
||||
// TODO: Normally, client doesn't send any data before
|
||||
// server says TLS handshake is ok and read_buf is empy.
|
||||
// However, you could imagine pipelining of postgres
|
||||
// SSLRequest + TLS ClientHello in one hunk similar to
|
||||
// pipelining in our node js driver. We should probably
|
||||
// support that by chaining read_buf with the stream.
|
||||
if !read_buf.is_empty() {
|
||||
bail!("data is sent before server replied with EncryptionResponse");
|
||||
}
|
||||
// Client shouldn't send any data before TLS handshake,
|
||||
// so ignore pqstream read buffer.
|
||||
let (raw, _) = stream.into_inner();
|
||||
stream = PqStream::new(raw.upgrade(tls.to_server_config()).await?);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -71,9 +71,6 @@ struct Args {
|
||||
/// Listen http endpoint for management and metrics in the form host:port.
|
||||
#[arg(long, default_value = DEFAULT_HTTP_LISTEN_ADDR)]
|
||||
listen_http: String,
|
||||
/// Availability zone of the safekeeper.
|
||||
#[arg(long)]
|
||||
availability_zone: Option<String>,
|
||||
/// Do not wait for changes to be written safely to disk. Unsafe.
|
||||
#[arg(short, long)]
|
||||
no_sync: bool,
|
||||
@@ -111,7 +108,7 @@ struct Args {
|
||||
/// WAL backup horizon.
|
||||
#[arg(long)]
|
||||
disable_wal_backup: bool,
|
||||
/// Path to a .pem public key which is used to check JWT tokens.
|
||||
/// Path to an RSA .pem public key which is used to check JWT tokens.
|
||||
#[arg(long)]
|
||||
auth_validation_public_key_path: Option<PathBuf>,
|
||||
/// Format for logging, either 'plain' or 'json'.
|
||||
@@ -169,7 +166,6 @@ fn main() -> anyhow::Result<()> {
|
||||
my_id: id,
|
||||
listen_pg_addr: args.listen_pg,
|
||||
listen_http_addr: args.listen_http,
|
||||
availability_zone: args.availability_zone,
|
||||
no_sync: args.no_sync,
|
||||
broker_endpoint: args.broker_endpoint,
|
||||
broker_keepalive_interval: args.broker_keepalive_interval,
|
||||
|
||||
@@ -9,7 +9,6 @@ use tracing::{info, info_span, Instrument};
|
||||
use crate::auth::check_permission;
|
||||
use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage};
|
||||
|
||||
use crate::metrics::TrafficMetrics;
|
||||
use crate::wal_service::ConnectionId;
|
||||
use crate::{GlobalTimelines, SafeKeeperConf};
|
||||
use postgres_backend::QueryError;
|
||||
@@ -34,7 +33,6 @@ pub struct SafekeeperPostgresHandler {
|
||||
/// Unique connection id is logged in spans for observability.
|
||||
pub conn_id: ConnectionId,
|
||||
claims: Option<Claims>,
|
||||
io_metrics: Option<TrafficMetrics>,
|
||||
}
|
||||
|
||||
/// Parsed Postgres command.
|
||||
@@ -96,11 +94,6 @@ impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
|
||||
format!("Failed to parse {value} as timeline id")
|
||||
})?);
|
||||
}
|
||||
Some(("availability_zone", client_az)) => {
|
||||
if let Some(metrics) = self.io_metrics.as_ref() {
|
||||
metrics.set_client_az(client_az)
|
||||
}
|
||||
}
|
||||
_ => continue,
|
||||
}
|
||||
}
|
||||
@@ -108,9 +101,6 @@ impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
|
||||
|
||||
if let Some(app_name) = params.get("application_name") {
|
||||
self.appname = Some(app_name.to_owned());
|
||||
if let Some(metrics) = self.io_metrics.as_ref() {
|
||||
metrics.set_app_name(app_name)
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -197,7 +187,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
|
||||
}
|
||||
|
||||
impl SafekeeperPostgresHandler {
|
||||
pub fn new(conf: SafeKeeperConf, conn_id: u32, io_metrics: Option<TrafficMetrics>) -> Self {
|
||||
pub fn new(conf: SafeKeeperConf, conn_id: u32) -> Self {
|
||||
SafekeeperPostgresHandler {
|
||||
conf,
|
||||
appname: None,
|
||||
@@ -206,7 +196,6 @@ impl SafekeeperPostgresHandler {
|
||||
ttid: TenantTimelineId::empty(),
|
||||
conn_id,
|
||||
claims: None,
|
||||
io_metrics,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -52,7 +52,6 @@ pub struct SafeKeeperConf {
|
||||
pub my_id: NodeId,
|
||||
pub listen_pg_addr: String,
|
||||
pub listen_http_addr: String,
|
||||
pub availability_zone: Option<String>,
|
||||
pub no_sync: bool,
|
||||
pub broker_endpoint: Uri,
|
||||
pub broker_keepalive_interval: Duration,
|
||||
@@ -83,7 +82,6 @@ impl SafeKeeperConf {
|
||||
no_sync: false,
|
||||
listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(),
|
||||
listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(),
|
||||
availability_zone: None,
|
||||
remote_storage: None,
|
||||
my_id: NodeId(0),
|
||||
broker_endpoint: storage_broker::DEFAULT_ENDPOINT
|
||||
|
||||
@@ -1,19 +1,15 @@
|
||||
//! Global safekeeper mertics and per-timeline safekeeper metrics.
|
||||
|
||||
use std::{
|
||||
sync::{Arc, RwLock},
|
||||
time::{Instant, SystemTime},
|
||||
};
|
||||
use std::time::{Instant, SystemTime};
|
||||
|
||||
use ::metrics::{register_histogram, GaugeVec, Histogram, IntGauge, DISK_WRITE_SECONDS_BUCKETS};
|
||||
use anyhow::Result;
|
||||
use metrics::{
|
||||
core::{AtomicU64, Collector, Desc, GenericCounter, GenericGaugeVec, Opts},
|
||||
core::{AtomicU64, Collector, Desc, GenericGaugeVec, Opts},
|
||||
proto::MetricFamily,
|
||||
register_int_counter_vec, Gauge, IntCounterVec, IntGaugeVec,
|
||||
};
|
||||
use once_cell::sync::Lazy;
|
||||
|
||||
use postgres_ffi::XLogSegNo;
|
||||
use utils::{id::TenantTimelineId, lsn::Lsn};
|
||||
|
||||
@@ -67,132 +63,13 @@ pub static PERSIST_CONTROL_FILE_SECONDS: Lazy<Histogram> = Lazy::new(|| {
|
||||
});
|
||||
pub static PG_IO_BYTES: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"safekeeper_pg_io_bytes_total",
|
||||
"safekeeper_pg_io_bytes",
|
||||
"Bytes read from or written to any PostgreSQL connection",
|
||||
&["client_az", "sk_az", "app_name", "dir", "same_az"]
|
||||
&["direction"]
|
||||
)
|
||||
.expect("Failed to register safekeeper_pg_io_bytes gauge")
|
||||
});
|
||||
|
||||
pub const LABEL_UNKNOWN: &str = "unknown";
|
||||
|
||||
/// Labels for traffic metrics.
|
||||
#[derive(Clone)]
|
||||
struct ConnectionLabels {
|
||||
/// Availability zone of the connection origin.
|
||||
client_az: String,
|
||||
/// Availability zone of the current safekeeper.
|
||||
sk_az: String,
|
||||
/// Client application name.
|
||||
app_name: String,
|
||||
}
|
||||
|
||||
impl ConnectionLabels {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
client_az: LABEL_UNKNOWN.to_string(),
|
||||
sk_az: LABEL_UNKNOWN.to_string(),
|
||||
app_name: LABEL_UNKNOWN.to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
fn build_metrics(
|
||||
&self,
|
||||
) -> (
|
||||
GenericCounter<metrics::core::AtomicU64>,
|
||||
GenericCounter<metrics::core::AtomicU64>,
|
||||
) {
|
||||
let same_az = match (self.client_az.as_str(), self.sk_az.as_str()) {
|
||||
(LABEL_UNKNOWN, _) | (_, LABEL_UNKNOWN) => LABEL_UNKNOWN,
|
||||
(client_az, sk_az) => {
|
||||
if client_az == sk_az {
|
||||
"true"
|
||||
} else {
|
||||
"false"
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let read = PG_IO_BYTES.with_label_values(&[
|
||||
&self.client_az,
|
||||
&self.sk_az,
|
||||
&self.app_name,
|
||||
"read",
|
||||
same_az,
|
||||
]);
|
||||
let write = PG_IO_BYTES.with_label_values(&[
|
||||
&self.client_az,
|
||||
&self.sk_az,
|
||||
&self.app_name,
|
||||
"write",
|
||||
same_az,
|
||||
]);
|
||||
(read, write)
|
||||
}
|
||||
}
|
||||
|
||||
struct TrafficMetricsState {
|
||||
/// Labels for traffic metrics.
|
||||
labels: ConnectionLabels,
|
||||
/// Total bytes read from this connection.
|
||||
read: GenericCounter<metrics::core::AtomicU64>,
|
||||
/// Total bytes written to this connection.
|
||||
write: GenericCounter<metrics::core::AtomicU64>,
|
||||
}
|
||||
|
||||
/// Metrics for measuring traffic (r/w bytes) in a single PostgreSQL connection.
|
||||
#[derive(Clone)]
|
||||
pub struct TrafficMetrics {
|
||||
state: Arc<RwLock<TrafficMetricsState>>,
|
||||
}
|
||||
|
||||
impl Default for TrafficMetrics {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl TrafficMetrics {
|
||||
pub fn new() -> Self {
|
||||
let labels = ConnectionLabels::new();
|
||||
let (read, write) = labels.build_metrics();
|
||||
let state = TrafficMetricsState {
|
||||
labels,
|
||||
read,
|
||||
write,
|
||||
};
|
||||
Self {
|
||||
state: Arc::new(RwLock::new(state)),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn set_client_az(&self, value: &str) {
|
||||
let mut state = self.state.write().unwrap();
|
||||
state.labels.client_az = value.to_string();
|
||||
(state.read, state.write) = state.labels.build_metrics();
|
||||
}
|
||||
|
||||
pub fn set_sk_az(&self, value: &str) {
|
||||
let mut state = self.state.write().unwrap();
|
||||
state.labels.sk_az = value.to_string();
|
||||
(state.read, state.write) = state.labels.build_metrics();
|
||||
}
|
||||
|
||||
pub fn set_app_name(&self, value: &str) {
|
||||
let mut state = self.state.write().unwrap();
|
||||
state.labels.app_name = value.to_string();
|
||||
(state.read, state.write) = state.labels.build_metrics();
|
||||
}
|
||||
|
||||
pub fn observe_read(&self, cnt: usize) {
|
||||
self.state.read().unwrap().read.inc_by(cnt as u64)
|
||||
}
|
||||
|
||||
pub fn observe_write(&self, cnt: usize) {
|
||||
self.state.read().unwrap().write.inc_by(cnt as u64)
|
||||
}
|
||||
}
|
||||
|
||||
/// Metrics for WalStorage in a single timeline.
|
||||
#[derive(Clone, Default)]
|
||||
pub struct WalStorageMetrics {
|
||||
|
||||
@@ -9,9 +9,8 @@ use tokio::net::TcpStream;
|
||||
use tracing::*;
|
||||
use utils::measured_stream::MeasuredStream;
|
||||
|
||||
use crate::handler::SafekeeperPostgresHandler;
|
||||
use crate::metrics::TrafficMetrics;
|
||||
use crate::SafeKeeperConf;
|
||||
use crate::{handler::SafekeeperPostgresHandler, metrics::PG_IO_BYTES};
|
||||
use postgres_backend::{AuthType, PostgresBackend};
|
||||
|
||||
/// Accept incoming TCP connections and spawn them into a background thread.
|
||||
@@ -69,21 +68,20 @@ fn handle_socket(
|
||||
.build()?;
|
||||
let local = tokio::task::LocalSet::new();
|
||||
|
||||
let read_metrics = PG_IO_BYTES.with_label_values(&["read"]);
|
||||
let write_metrics = PG_IO_BYTES.with_label_values(&["write"]);
|
||||
|
||||
socket.set_nodelay(true)?;
|
||||
let peer_addr = socket.peer_addr()?;
|
||||
|
||||
let traffic_metrics = TrafficMetrics::new();
|
||||
if let Some(current_az) = conf.availability_zone.as_deref() {
|
||||
traffic_metrics.set_sk_az(current_az);
|
||||
}
|
||||
|
||||
// TODO: measure cross-az traffic
|
||||
let socket = MeasuredStream::new(
|
||||
socket,
|
||||
|cnt| {
|
||||
traffic_metrics.observe_read(cnt);
|
||||
read_metrics.inc_by(cnt as u64);
|
||||
},
|
||||
|cnt| {
|
||||
traffic_metrics.observe_write(cnt);
|
||||
write_metrics.inc_by(cnt as u64);
|
||||
},
|
||||
);
|
||||
|
||||
@@ -91,8 +89,7 @@ fn handle_socket(
|
||||
None => AuthType::Trust,
|
||||
Some(_) => AuthType::NeonJWT,
|
||||
};
|
||||
let mut conn_handler =
|
||||
SafekeeperPostgresHandler::new(conf, conn_id, Some(traffic_metrics.clone()));
|
||||
let mut conn_handler = SafekeeperPostgresHandler::new(conf, conn_id);
|
||||
let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;
|
||||
// libpq protocol between safekeeper and walproposer / pageserver
|
||||
// We don't use shutdown.
|
||||
|
||||
@@ -78,7 +78,5 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
|
||||
"pageserver_created_persistent_files_total",
|
||||
"pageserver_written_persistent_bytes_total",
|
||||
"pageserver_tenant_states_count",
|
||||
"pageserver_evictions_total",
|
||||
"pageserver_evictions_with_low_residence_duration_total",
|
||||
*PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
|
||||
)
|
||||
|
||||
@@ -431,7 +431,7 @@ class AuthKeys:
|
||||
priv: str
|
||||
|
||||
def generate_token(self, *, scope: str, **token_data: str) -> str:
|
||||
token = jwt.encode({"scope": scope, **token_data}, self.priv, algorithm="EdDSA")
|
||||
token = jwt.encode({"scope": scope, **token_data}, self.priv, algorithm="RS256")
|
||||
# cast(Any, self.priv)
|
||||
|
||||
# jwt.encode can return 'bytes' or 'str', depending on Python version or type
|
||||
@@ -643,7 +643,6 @@ class NeonEnvBuilder:
|
||||
f"Services started, creating initial tenant {env.initial_tenant} and its initial timeline"
|
||||
)
|
||||
initial_tenant, initial_timeline = env.neon_cli.create_tenant(tenant_id=env.initial_tenant)
|
||||
env.initial_timeline = initial_timeline
|
||||
log.info(f"Initial timeline {initial_tenant}/{initial_timeline} created successfully")
|
||||
|
||||
return env
|
||||
@@ -905,7 +904,6 @@ class NeonEnv:
|
||||
# generate initial tenant ID here instead of letting 'neon init' generate it,
|
||||
# so that we don't need to dig it out of the config file afterwards.
|
||||
self.initial_tenant = config.initial_tenant
|
||||
self.initial_timeline: Optional[TimelineId] = None
|
||||
|
||||
# Create a config file corresponding to the options
|
||||
toml = textwrap.dedent(
|
||||
@@ -926,8 +924,7 @@ class NeonEnv:
|
||||
pg=self.port_distributor.get_port(),
|
||||
http=self.port_distributor.get_port(),
|
||||
)
|
||||
http_auth_type = "NeonJWT" if config.auth_enabled else "Trust"
|
||||
pg_auth_type = "NeonJWT" if config.auth_enabled else "Trust"
|
||||
pageserver_auth_type = "NeonJWT" if config.auth_enabled else "Trust"
|
||||
|
||||
toml += textwrap.dedent(
|
||||
f"""
|
||||
@@ -935,8 +932,7 @@ class NeonEnv:
|
||||
id=1
|
||||
listen_pg_addr = 'localhost:{pageserver_port.pg}'
|
||||
listen_http_addr = 'localhost:{pageserver_port.http}'
|
||||
pg_auth_type = '{pg_auth_type}'
|
||||
http_auth_type = '{http_auth_type}'
|
||||
auth_type = '{pageserver_auth_type}'
|
||||
"""
|
||||
)
|
||||
|
||||
@@ -1119,9 +1115,7 @@ def neon_env_builder(
|
||||
|
||||
|
||||
class PageserverApiException(Exception):
|
||||
def __init__(self, message, status_code: int):
|
||||
super().__init__(message)
|
||||
self.status_code = status_code
|
||||
pass
|
||||
|
||||
|
||||
class PageserverHttpClient(requests.Session):
|
||||
@@ -1142,7 +1136,7 @@ class PageserverHttpClient(requests.Session):
|
||||
msg = res.json()["msg"]
|
||||
except: # noqa: E722
|
||||
msg = ""
|
||||
raise PageserverApiException(msg, res.status_code) from e
|
||||
raise PageserverApiException(msg) from e
|
||||
|
||||
def check_status(self):
|
||||
self.get(f"http://localhost:{self.port}/v1/status").raise_for_status()
|
||||
@@ -1192,12 +1186,8 @@ class PageserverHttpClient(requests.Session):
|
||||
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/attach")
|
||||
self.verbose_error(res)
|
||||
|
||||
def tenant_detach(self, tenant_id: TenantId, detach_ignored=False):
|
||||
params = {}
|
||||
if detach_ignored:
|
||||
params["detach_ignored"] = "true"
|
||||
|
||||
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach", params=params)
|
||||
def tenant_detach(self, tenant_id: TenantId):
|
||||
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach")
|
||||
self.verbose_error(res)
|
||||
|
||||
def tenant_load(self, tenant_id: TenantId):
|
||||
@@ -1220,28 +1210,6 @@ class PageserverHttpClient(requests.Session):
|
||||
self.verbose_error(res)
|
||||
return TenantConfig.from_json(res.json())
|
||||
|
||||
def set_tenant_config(self, tenant_id: TenantId, config: dict[str, Any]):
|
||||
assert "tenant_id" not in config.keys()
|
||||
res = self.put(
|
||||
f"http://localhost:{self.port}/v1/tenant/config",
|
||||
json={**config, "tenant_id": str(tenant_id)},
|
||||
)
|
||||
self.verbose_error(res)
|
||||
|
||||
def patch_tenant_config_client_side(
|
||||
self,
|
||||
tenant_id: TenantId,
|
||||
inserts: Optional[Dict[str, Any]] = None,
|
||||
removes: Optional[List[str]] = None,
|
||||
):
|
||||
current = self.tenant_config(tenant_id).tenant_specific_overrides
|
||||
if inserts is not None:
|
||||
current.update(inserts)
|
||||
if removes is not None:
|
||||
for key in removes:
|
||||
del current[key]
|
||||
self.set_tenant_config(tenant_id, current)
|
||||
|
||||
def tenant_size(self, tenant_id: TenantId) -> int:
|
||||
return self.tenant_size_and_modelinputs(tenant_id)[0]
|
||||
|
||||
@@ -1558,18 +1526,6 @@ class PageserverHttpClient(requests.Session):
|
||||
for layer in info.historic_layers:
|
||||
self.evict_layer(tenant_id, timeline_id, layer.layer_file_name)
|
||||
|
||||
def disk_usage_eviction_run(self, request: dict[str, Any]):
|
||||
res = self.put(
|
||||
f"http://localhost:{self.port}/v1/disk_usage_eviction/run",
|
||||
json=request,
|
||||
)
|
||||
self.verbose_error(res)
|
||||
return res.json()
|
||||
|
||||
def tenant_break(self, tenant_id: TenantId):
|
||||
res = self.put(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/break")
|
||||
self.verbose_error(res)
|
||||
|
||||
|
||||
@dataclass
|
||||
class TenantConfig:
|
||||
@@ -2128,8 +2084,8 @@ class NeonPageserver(PgProtocol):
|
||||
# https://github.com/neondatabase/neon/issues/2442
|
||||
".*could not remove ephemeral file.*No such file or directory.*",
|
||||
# FIXME: These need investigation
|
||||
".*gc_loop.*Failed to get a tenant .* Tenant .* not found.*",
|
||||
".*compaction_loop.*Failed to get a tenant .* Tenant .* not found.*",
|
||||
".*gc_loop.*Failed to get a tenant .* Tenant .* not found in the local state.*",
|
||||
".*compaction_loop.*Failed to get a tenant .* Tenant .* not found in the local state.*",
|
||||
".*manual_gc.*is_shutdown_requested\\(\\) called in an unexpected task or thread.*",
|
||||
".*tenant_list: timeline is not found in remote index while it is present in the tenants registry.*",
|
||||
".*Removing intermediate uninit mark file.*",
|
||||
|
||||
@@ -1,10 +0,0 @@
|
||||
# pg_clients
|
||||
|
||||
To run a single test locally:
|
||||
|
||||
```bash
|
||||
export BENCHMARK_CONNSTR=postgres://user:pass@ep-abc-xyz-123.us-east-2.aws.neon.build/neondb
|
||||
|
||||
# will filter only tests with "serverless" in the name
|
||||
./scripts/pytest -m remote_cluster -k serverless
|
||||
```
|
||||
@@ -389,9 +389,9 @@ checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3"
|
||||
|
||||
[[package]]
|
||||
name = "openssl"
|
||||
version = "0.10.48"
|
||||
version = "0.10.45"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "518915b97df115dd36109bfa429a48b8f737bd05508cf9588977b599648926d2"
|
||||
checksum = "b102428fd03bc5edf97f62620f7298614c45cedf287c271e7ed450bbaf83f2e1"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"cfg-if",
|
||||
@@ -421,9 +421,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"
|
||||
|
||||
[[package]]
|
||||
name = "openssl-sys"
|
||||
version = "0.9.83"
|
||||
version = "0.9.80"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "666416d899cf077260dac8698d60a60b435a46d57e82acb1be3d0dad87284e5b"
|
||||
checksum = "23bbbf7854cd45b83958ebe919f0e8e516793727652e27fda10a8384cfc790b7"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"cc",
|
||||
|
||||
@@ -22,7 +22,6 @@ from fixtures.utils import subprocess_capture
|
||||
),
|
||||
"swift/PostgresNIOExample",
|
||||
"typescript/postgresql-client",
|
||||
"typescript/serverless-driver",
|
||||
],
|
||||
)
|
||||
def test_pg_clients(test_output_dir: Path, remote_pg: RemotePostgres, client: str):
|
||||
|
||||
@@ -1 +0,0 @@
|
||||
node_modules/
|
||||
@@ -1 +0,0 @@
|
||||
node_modules/
|
||||
@@ -1,7 +0,0 @@
|
||||
FROM node:18
|
||||
WORKDIR /source
|
||||
|
||||
COPY . .
|
||||
RUN npm clean-install
|
||||
|
||||
CMD ["/source/index.js"]
|
||||
@@ -1,20 +0,0 @@
|
||||
#! /usr/bin/env node
|
||||
|
||||
import { Client } from '@neondatabase/serverless'
|
||||
|
||||
(async () => {
|
||||
const client = new Client({
|
||||
host: process.env.NEON_HOST,
|
||||
database: process.env.NEON_DATABASE,
|
||||
user: process.env.NEON_USER,
|
||||
password: process.env.NEON_PASSWORD,
|
||||
});
|
||||
client.connect();
|
||||
const result = await client.query({
|
||||
text: 'select 1',
|
||||
rowMode: 'array',
|
||||
});
|
||||
const rows = result.rows;
|
||||
await client.end();
|
||||
console.log(rows[0][0]);
|
||||
})()
|
||||
@@ -1,51 +0,0 @@
|
||||
{
|
||||
"name": "serverless-driver",
|
||||
"lockfileVersion": 2,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"dependencies": {
|
||||
"@neondatabase/serverless": "^0.2.8",
|
||||
"ws": "^8.13.0"
|
||||
}
|
||||
},
|
||||
"node_modules/@neondatabase/serverless": {
|
||||
"version": "0.2.8",
|
||||
"resolved": "https://registry.npmjs.org/@neondatabase/serverless/-/serverless-0.2.8.tgz",
|
||||
"integrity": "sha512-+yWjIOJsFnrtt2xvtLVEzWM2lfvemawk/DBg4mD2cZOF/IC6Jn4wEctZyk60TscZMSxfozNkPoxmZvBmNuQ0vA=="
|
||||
},
|
||||
"node_modules/ws": {
|
||||
"version": "8.13.0",
|
||||
"resolved": "https://registry.npmjs.org/ws/-/ws-8.13.0.tgz",
|
||||
"integrity": "sha512-x9vcZYTrFPC7aSIbj7sRCYo7L/Xb8Iy+pW0ng0wt2vCJv7M9HOMy0UoN3rr+IFC7hb7vXoqS+P9ktyLLLhO+LA==",
|
||||
"engines": {
|
||||
"node": ">=10.0.0"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"bufferutil": "^4.0.1",
|
||||
"utf-8-validate": ">=5.0.2"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
"bufferutil": {
|
||||
"optional": true
|
||||
},
|
||||
"utf-8-validate": {
|
||||
"optional": true
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"dependencies": {
|
||||
"@neondatabase/serverless": {
|
||||
"version": "0.2.8",
|
||||
"resolved": "https://registry.npmjs.org/@neondatabase/serverless/-/serverless-0.2.8.tgz",
|
||||
"integrity": "sha512-+yWjIOJsFnrtt2xvtLVEzWM2lfvemawk/DBg4mD2cZOF/IC6Jn4wEctZyk60TscZMSxfozNkPoxmZvBmNuQ0vA=="
|
||||
},
|
||||
"ws": {
|
||||
"version": "8.13.0",
|
||||
"resolved": "https://registry.npmjs.org/ws/-/ws-8.13.0.tgz",
|
||||
"integrity": "sha512-x9vcZYTrFPC7aSIbj7sRCYo7L/Xb8Iy+pW0ng0wt2vCJv7M9HOMy0UoN3rr+IFC7hb7vXoqS+P9ktyLLLhO+LA==",
|
||||
"requires": {}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,7 +0,0 @@
|
||||
{
|
||||
"type": "module",
|
||||
"dependencies": {
|
||||
"@neondatabase/serverless": "^0.2.8",
|
||||
"ws": "^8.13.0"
|
||||
}
|
||||
}
|
||||
@@ -246,13 +246,6 @@ def prepare_snapshot(
|
||||
if get_neon_version(neon_binpath) == "49da498f651b9f3a53b56c7c0697636d880ddfe0":
|
||||
pageserver_config["broker_endpoints"] = etcd_broker_endpoints # old etcd version
|
||||
|
||||
# Older pageserver versions had just one `auth_type` setting. Now there
|
||||
# are separate settings for pg and http ports. We don't use authentication
|
||||
# in compatibility tests so just remove authentication related settings.
|
||||
pageserver_config.pop("auth_type", None)
|
||||
pageserver_config.pop("pg_auth_type", None)
|
||||
pageserver_config.pop("http_auth_type", None)
|
||||
|
||||
if pg_distrib_dir:
|
||||
pageserver_config["pg_distrib_dir"] = str(pg_distrib_dir)
|
||||
|
||||
@@ -365,9 +358,11 @@ def check_neon_works(
|
||||
tenant_id = snapshot_config["default_tenant_id"]
|
||||
timeline_id = dict(snapshot_config["branch_name_mappings"]["main"])[tenant_id]
|
||||
pageserver_port = snapshot_config["pageserver"]["listen_http_addr"].split(":")[-1]
|
||||
auth_token = snapshot_config["pageserver"]["auth_token"]
|
||||
pageserver_http = PageserverHttpClient(
|
||||
port=pageserver_port,
|
||||
is_testing_enabled_or_skip=lambda: True, # TODO: check if testing really enabled
|
||||
auth_token=auth_token,
|
||||
)
|
||||
|
||||
shutil.rmtree(repo_dir / "local_fs_remote_storage")
|
||||
|
||||
@@ -1,541 +0,0 @@
|
||||
import shutil
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Dict, Tuple
|
||||
|
||||
import pytest
|
||||
import toml
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import (
|
||||
LocalFsStorage,
|
||||
NeonEnv,
|
||||
NeonEnvBuilder,
|
||||
PageserverHttpClient,
|
||||
PgBin,
|
||||
RemoteStorageKind,
|
||||
wait_for_last_flush_lsn,
|
||||
wait_for_upload_queue_empty,
|
||||
wait_until,
|
||||
)
|
||||
from fixtures.types import Lsn, TenantId, TimelineId
|
||||
|
||||
GLOBAL_LRU_LOG_LINE = "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("config_level_override", [None, 400])
|
||||
def test_min_resident_size_override_handling(
|
||||
neon_env_builder: NeonEnvBuilder, config_level_override: int
|
||||
):
|
||||
env = neon_env_builder.init_start()
|
||||
ps_http = env.pageserver.http_client()
|
||||
|
||||
def assert_config(tenant_id, expect_override, expect_effective):
|
||||
config = ps_http.tenant_config(tenant_id)
|
||||
assert config.tenant_specific_overrides.get("min_resident_size_override") == expect_override
|
||||
assert config.effective_config.get("min_resident_size_override") == expect_effective
|
||||
|
||||
def assert_overrides(tenant_id, default_tenant_conf_value):
|
||||
ps_http.set_tenant_config(tenant_id, {"min_resident_size_override": 200})
|
||||
assert_config(tenant_id, 200, 200)
|
||||
|
||||
ps_http.set_tenant_config(tenant_id, {"min_resident_size_override": 0})
|
||||
assert_config(tenant_id, 0, 0)
|
||||
|
||||
ps_http.set_tenant_config(tenant_id, {})
|
||||
assert_config(tenant_id, None, default_tenant_conf_value)
|
||||
|
||||
env.pageserver.stop()
|
||||
if config_level_override is not None:
|
||||
env.pageserver.start(
|
||||
overrides=(
|
||||
"--pageserver-config-override=tenant_config={ min_resident_size_override = "
|
||||
+ str(config_level_override)
|
||||
+ " }",
|
||||
)
|
||||
)
|
||||
else:
|
||||
env.pageserver.start()
|
||||
|
||||
tenant_id, _ = env.neon_cli.create_tenant()
|
||||
assert_overrides(tenant_id, config_level_override)
|
||||
|
||||
# Also ensure that specifying the paramter to create_tenant works, in addition to http-level recconfig.
|
||||
tenant_id, _ = env.neon_cli.create_tenant(conf={"min_resident_size_override": "100"})
|
||||
assert_config(tenant_id, 100, 100)
|
||||
ps_http.set_tenant_config(tenant_id, {})
|
||||
assert_config(tenant_id, None, config_level_override)
|
||||
|
||||
|
||||
@dataclass
|
||||
class EvictionEnv:
|
||||
timelines: list[Tuple[TenantId, TimelineId]]
|
||||
neon_env: NeonEnv
|
||||
pg_bin: PgBin
|
||||
pageserver_http: PageserverHttpClient
|
||||
layer_size: int
|
||||
pgbench_init_lsns: Dict[TenantId, Lsn]
|
||||
|
||||
def timelines_du(self) -> Tuple[int, int, int]:
|
||||
return poor_mans_du(self.neon_env, [(tid, tlid) for tid, tlid in self.timelines])
|
||||
|
||||
def du_by_timeline(self) -> Dict[Tuple[TenantId, TimelineId], int]:
|
||||
return {
|
||||
(tid, tlid): poor_mans_du(self.neon_env, [(tid, tlid)])[0]
|
||||
for tid, tlid in self.timelines
|
||||
}
|
||||
|
||||
def warm_up_tenant(self, tenant_id: TenantId):
|
||||
"""
|
||||
Start a read-only compute at the LSN after pgbench -i, and run pgbench -S against it.
|
||||
This assumes that the tenant is still at the state after pbench -i.
|
||||
"""
|
||||
lsn = self.pgbench_init_lsns[tenant_id]
|
||||
with self.neon_env.postgres.create_start("main", tenant_id=tenant_id, lsn=lsn) as pg:
|
||||
self.pg_bin.run(["pgbench", "-S", pg.connstr()])
|
||||
|
||||
def pageserver_start_with_disk_usage_eviction(
|
||||
self, period, max_usage_pct, min_avail_bytes, mock_behavior
|
||||
):
|
||||
disk_usage_config = {
|
||||
"period": period,
|
||||
"max_usage_pct": max_usage_pct,
|
||||
"min_avail_bytes": min_avail_bytes,
|
||||
"mock_statvfs": mock_behavior,
|
||||
}
|
||||
|
||||
enc = toml.TomlEncoder()
|
||||
|
||||
self.neon_env.pageserver.start(
|
||||
overrides=(
|
||||
"--pageserver-config-override=disk_usage_based_eviction="
|
||||
+ enc.dump_inline_table(disk_usage_config).replace("\n", " "),
|
||||
),
|
||||
)
|
||||
|
||||
def statvfs_called():
|
||||
assert self.neon_env.pageserver.log_contains(".*running mocked statvfs.*")
|
||||
|
||||
wait_until(10, 1, statvfs_called)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> EvictionEnv:
|
||||
"""
|
||||
Creates two tenants, one somewhat larger than the other.
|
||||
"""
|
||||
|
||||
log.info(f"setting up eviction_env for test {request.node.name}")
|
||||
|
||||
neon_env_builder.enable_remote_storage(RemoteStorageKind.LOCAL_FS, f"{request.node.name}")
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
|
||||
# allow because we are invoking this manually; we always warn on executing disk based eviction
|
||||
env.pageserver.allowed_errors.append(r".* running disk usage based eviction due to pressure.*")
|
||||
|
||||
# remove the initial tenant
|
||||
## why wait for upload queue? => https://github.com/neondatabase/neon/issues/3865
|
||||
assert env.initial_timeline
|
||||
wait_for_upload_queue_empty(env.pageserver, env.initial_tenant, env.initial_timeline)
|
||||
pageserver_http.tenant_detach(env.initial_tenant)
|
||||
assert isinstance(env.remote_storage, LocalFsStorage)
|
||||
tenant_remote_storage = env.remote_storage.root / "tenants" / str(env.initial_tenant)
|
||||
assert tenant_remote_storage.is_dir()
|
||||
shutil.rmtree(tenant_remote_storage)
|
||||
env.initial_tenant = TenantId("0" * 32)
|
||||
env.initial_timeline = None
|
||||
|
||||
# Choose small layer_size so that we can use low pgbench_scales and still get a large count of layers.
|
||||
# Large count of layers and small layer size is good for testing because it makes evictions predictable.
|
||||
# Predictable in the sense that many layer evictions will be required to reach the eviction target, because
|
||||
# each eviction only makes small progress. That means little overshoot, and thereby stable asserts.
|
||||
pgbench_scales = [4, 6]
|
||||
layer_size = 5 * 1024**2
|
||||
|
||||
pgbench_init_lsns = {}
|
||||
|
||||
timelines = []
|
||||
for scale in pgbench_scales:
|
||||
tenant_id, timeline_id = env.neon_cli.create_tenant(
|
||||
conf={
|
||||
"gc_period": "0s",
|
||||
"compaction_period": "0s",
|
||||
"checkpoint_distance": f"{layer_size}",
|
||||
"image_creation_threshold": "100",
|
||||
"compaction_target_size": f"{layer_size}",
|
||||
}
|
||||
)
|
||||
|
||||
with env.postgres.create_start("main", tenant_id=tenant_id) as pg:
|
||||
pg_bin.run(["pgbench", "-i", f"-s{scale}", pg.connstr()])
|
||||
wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
|
||||
|
||||
timelines.append((tenant_id, timeline_id))
|
||||
|
||||
# stop the safekeepers to avoid on-demand downloads caused by
|
||||
# initial logical size calculation triggered by walreceiver connection status
|
||||
# when we restart the pageserver process in any of the tests
|
||||
env.neon_cli.safekeeper_stop()
|
||||
|
||||
# after stopping the safekeepers, we know that no new WAL will be coming in
|
||||
for tenant_id, timeline_id in timelines:
|
||||
pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
|
||||
wait_for_upload_queue_empty(env.pageserver, tenant_id, timeline_id)
|
||||
tl_info = pageserver_http.timeline_detail(tenant_id, timeline_id)
|
||||
assert tl_info["last_record_lsn"] == tl_info["disk_consistent_lsn"]
|
||||
assert tl_info["disk_consistent_lsn"] == tl_info["remote_consistent_lsn"]
|
||||
pgbench_init_lsns[tenant_id] = Lsn(tl_info["last_record_lsn"])
|
||||
|
||||
layers = pageserver_http.layer_map_info(tenant_id, timeline_id)
|
||||
log.info(f"{layers}")
|
||||
assert (
|
||||
len(layers.historic_layers) >= 10
|
||||
), "evictions happen at layer granularity, but we often assert at byte-granularity"
|
||||
|
||||
eviction_env = EvictionEnv(
|
||||
timelines=timelines,
|
||||
neon_env=env,
|
||||
pageserver_http=pageserver_http,
|
||||
layer_size=layer_size,
|
||||
pg_bin=pg_bin,
|
||||
pgbench_init_lsns=pgbench_init_lsns,
|
||||
)
|
||||
|
||||
return eviction_env
|
||||
|
||||
|
||||
def test_broken_tenants_are_skipped(eviction_env: EvictionEnv):
|
||||
env = eviction_env
|
||||
|
||||
env.neon_env.pageserver.allowed_errors.append(
|
||||
r".* Changing Active tenant to Broken state, reason: broken from test"
|
||||
)
|
||||
broken_tenant_id, broken_timeline_id = env.timelines[0]
|
||||
env.pageserver_http.tenant_break(broken_tenant_id)
|
||||
|
||||
healthy_tenant_id, healthy_timeline_id = env.timelines[1]
|
||||
|
||||
broken_size_pre, _, _ = poor_mans_du(env.neon_env, [(broken_tenant_id, broken_timeline_id)])
|
||||
healthy_size_pre, _, _ = poor_mans_du(env.neon_env, [(healthy_tenant_id, healthy_timeline_id)])
|
||||
|
||||
# try to evict everything, then validate that broken tenant wasn't touched
|
||||
target = broken_size_pre + healthy_size_pre
|
||||
|
||||
response = env.pageserver_http.disk_usage_eviction_run({"evict_bytes": target})
|
||||
log.info(f"{response}")
|
||||
|
||||
broken_size_post, _, _ = poor_mans_du(env.neon_env, [(broken_tenant_id, broken_timeline_id)])
|
||||
healthy_size_post, _, _ = poor_mans_du(env.neon_env, [(healthy_tenant_id, healthy_timeline_id)])
|
||||
|
||||
assert broken_size_pre == broken_size_post, "broken tenant should not be touched"
|
||||
assert healthy_size_post < healthy_size_pre
|
||||
assert healthy_size_post == 0
|
||||
env.neon_env.pageserver.allowed_errors.append(".*" + GLOBAL_LRU_LOG_LINE)
|
||||
|
||||
|
||||
def test_pageserver_evicts_until_pressure_is_relieved(eviction_env: EvictionEnv):
|
||||
"""
|
||||
Basic test to ensure that we evict enough to relieve pressure.
|
||||
"""
|
||||
env = eviction_env
|
||||
pageserver_http = env.pageserver_http
|
||||
|
||||
(total_on_disk, _, _) = env.timelines_du()
|
||||
|
||||
target = total_on_disk // 2
|
||||
|
||||
response = pageserver_http.disk_usage_eviction_run({"evict_bytes": target})
|
||||
log.info(f"{response}")
|
||||
|
||||
(later_total_on_disk, _, _) = env.timelines_du()
|
||||
|
||||
actual_change = total_on_disk - later_total_on_disk
|
||||
|
||||
assert 0 <= actual_change, "nothing can load layers during this test"
|
||||
assert actual_change >= target, "must evict more than half"
|
||||
assert (
|
||||
response["Finished"]["assumed"]["projected_after"]["freed_bytes"] >= actual_change
|
||||
), "report accurately evicted bytes"
|
||||
assert response["Finished"]["assumed"]["failed"]["count"] == 0, "zero failures expected"
|
||||
|
||||
|
||||
def test_pageserver_respects_overridden_resident_size(eviction_env: EvictionEnv):
|
||||
"""
|
||||
Override tenant min resident and ensure that it will be respected by eviction.
|
||||
"""
|
||||
env = eviction_env
|
||||
ps_http = env.pageserver_http
|
||||
|
||||
(total_on_disk, _, _) = env.timelines_du()
|
||||
du_by_timeline = env.du_by_timeline()
|
||||
log.info("du_by_timeline: %s", du_by_timeline)
|
||||
|
||||
assert len(du_by_timeline) == 2, "this test assumes two tenants"
|
||||
large_tenant = max(du_by_timeline, key=du_by_timeline.__getitem__)
|
||||
small_tenant = min(du_by_timeline, key=du_by_timeline.__getitem__)
|
||||
assert du_by_timeline[large_tenant] > du_by_timeline[small_tenant]
|
||||
assert (
|
||||
du_by_timeline[large_tenant] - du_by_timeline[small_tenant] > 5 * env.layer_size
|
||||
), "ensure this test will do more than 1 eviction"
|
||||
|
||||
# Give the larger tenant a haircut while preventing the smaller tenant from getting one.
|
||||
# To prevent the smaller from getting a haircut, we set min_resident_size to its current size.
|
||||
# To ensure the larger tenant is getting a haircut, any non-zero `target` will do.
|
||||
min_resident_size = du_by_timeline[small_tenant]
|
||||
target = 1
|
||||
assert (
|
||||
du_by_timeline[large_tenant] > min_resident_size
|
||||
), "ensure the larger tenant will get a haircut"
|
||||
ps_http.patch_tenant_config_client_side(
|
||||
small_tenant[0], {"min_resident_size_override": min_resident_size}
|
||||
)
|
||||
ps_http.patch_tenant_config_client_side(
|
||||
large_tenant[0], {"min_resident_size_override": min_resident_size}
|
||||
)
|
||||
|
||||
# Make the large tenant more-recently used. An incorrect implemention would try to evict
|
||||
# the smaller tenant completely first, before turning to the larger tenant,
|
||||
# since the smaller tenant's layers are least-recently-used.
|
||||
env.warm_up_tenant(large_tenant[0])
|
||||
|
||||
# do one run
|
||||
response = ps_http.disk_usage_eviction_run({"evict_bytes": target})
|
||||
log.info(f"{response}")
|
||||
|
||||
time.sleep(1) # give log time to flush
|
||||
assert not env.neon_env.pageserver.log_contains(
|
||||
GLOBAL_LRU_LOG_LINE,
|
||||
), "this test is pointless if it fell back to global LRU"
|
||||
|
||||
(later_total_on_disk, _, _) = env.timelines_du()
|
||||
later_du_by_timeline = env.du_by_timeline()
|
||||
log.info("later_du_by_timeline: %s", later_du_by_timeline)
|
||||
|
||||
actual_change = total_on_disk - later_total_on_disk
|
||||
assert 0 <= actual_change, "nothing can load layers during this test"
|
||||
assert actual_change >= target, "eviction must always evict more than target"
|
||||
assert (
|
||||
response["Finished"]["assumed"]["projected_after"]["freed_bytes"] >= actual_change
|
||||
), "report accurately evicted bytes"
|
||||
assert response["Finished"]["assumed"]["failed"]["count"] == 0, "zero failures expected"
|
||||
|
||||
assert (
|
||||
later_du_by_timeline[small_tenant] == du_by_timeline[small_tenant]
|
||||
), "small tenant sees no haircut"
|
||||
assert (
|
||||
later_du_by_timeline[large_tenant] < du_by_timeline[large_tenant]
|
||||
), "large tenant gets a haircut"
|
||||
assert du_by_timeline[large_tenant] - later_du_by_timeline[large_tenant] >= target
|
||||
|
||||
|
||||
def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv):
|
||||
"""
|
||||
If we can't relieve pressure using tenant_min_resident_size-respecting eviction,
|
||||
we should continue to evict layers following global LRU.
|
||||
"""
|
||||
env = eviction_env
|
||||
ps_http = env.pageserver_http
|
||||
|
||||
(total_on_disk, _, _) = env.timelines_du()
|
||||
target = total_on_disk
|
||||
|
||||
response = ps_http.disk_usage_eviction_run({"evict_bytes": target})
|
||||
log.info(f"{response}")
|
||||
|
||||
(later_total_on_disk, _, _) = env.timelines_du()
|
||||
actual_change = total_on_disk - later_total_on_disk
|
||||
assert 0 <= actual_change, "nothing can load layers during this test"
|
||||
assert actual_change >= target, "eviction must always evict more than target"
|
||||
|
||||
time.sleep(1) # give log time to flush
|
||||
assert env.neon_env.pageserver.log_contains(GLOBAL_LRU_LOG_LINE)
|
||||
env.neon_env.pageserver.allowed_errors.append(".*" + GLOBAL_LRU_LOG_LINE)
|
||||
|
||||
|
||||
def test_partial_evict_tenant(eviction_env: EvictionEnv):
|
||||
"""
|
||||
Warm up a tenant, then build up pressure to cause in evictions in both.
|
||||
We expect
|
||||
* the default min resident size to be respect (largest layer file size)
|
||||
* the warmed-up tenants layers above min resident size to be evicted after the cold tenant's.
|
||||
"""
|
||||
env = eviction_env
|
||||
ps_http = env.pageserver_http
|
||||
|
||||
(total_on_disk, _, _) = env.timelines_du()
|
||||
du_by_timeline = env.du_by_timeline()
|
||||
|
||||
# pick any tenant
|
||||
[our_tenant, other_tenant] = list(du_by_timeline.keys())
|
||||
(tenant_id, timeline_id) = our_tenant
|
||||
|
||||
# make our tenant more recently used than the other one
|
||||
env.warm_up_tenant(tenant_id)
|
||||
|
||||
# Build up enough pressure to require evictions from both tenants,
|
||||
# but not enough to fall into global LRU.
|
||||
# So, set target to all occipied space, except 2*env.layer_size per tenant
|
||||
target = (
|
||||
du_by_timeline[other_tenant] + (du_by_timeline[our_tenant] // 2) - 2 * 2 * env.layer_size
|
||||
)
|
||||
response = ps_http.disk_usage_eviction_run({"evict_bytes": target})
|
||||
log.info(f"{response}")
|
||||
|
||||
(later_total_on_disk, _, _) = env.timelines_du()
|
||||
actual_change = total_on_disk - later_total_on_disk
|
||||
assert 0 <= actual_change, "nothing can load layers during this test"
|
||||
assert actual_change >= target, "eviction must always evict more than target"
|
||||
|
||||
later_du_by_timeline = env.du_by_timeline()
|
||||
for tenant, later_tenant_usage in later_du_by_timeline.items():
|
||||
assert (
|
||||
later_tenant_usage < du_by_timeline[tenant]
|
||||
), "all tenants should have lost some layers"
|
||||
|
||||
assert (
|
||||
later_du_by_timeline[our_tenant] > 0.5 * du_by_timeline[our_tenant]
|
||||
), "our warmed up tenant should be at about half capacity, part 1"
|
||||
assert (
|
||||
# We don't know exactly whether the cold tenant needs 2 or just 1 env.layer_size wiggle room.
|
||||
# So, check for up to 3 here.
|
||||
later_du_by_timeline[our_tenant]
|
||||
< 0.5 * du_by_timeline[our_tenant] + 3 * env.layer_size
|
||||
), "our warmed up tenant should be at about half capacity, part 2"
|
||||
assert (
|
||||
later_du_by_timeline[other_tenant] < 2 * env.layer_size
|
||||
), "the other tenant should be evicted to is min_resident_size, i.e., max layer file size"
|
||||
|
||||
|
||||
def poor_mans_du(
|
||||
env: NeonEnv, timelines: list[Tuple[TenantId, TimelineId]]
|
||||
) -> Tuple[int, int, int]:
|
||||
"""
|
||||
Disk usage, largest, smallest layer for layer files over the given (tenant, timeline) tuples;
|
||||
this could be done over layers endpoint just as well.
|
||||
"""
|
||||
total_on_disk = 0
|
||||
largest_layer = 0
|
||||
smallest_layer = None
|
||||
for tenant_id, timeline_id in timelines:
|
||||
dir = Path(env.repo_dir) / "tenants" / str(tenant_id) / "timelines" / str(timeline_id)
|
||||
assert dir.exists(), f"timeline dir does not exist: {dir}"
|
||||
sum = 0
|
||||
for file in dir.iterdir():
|
||||
if "__" not in file.name:
|
||||
continue
|
||||
size = file.stat().st_size
|
||||
sum += size
|
||||
largest_layer = max(largest_layer, size)
|
||||
if smallest_layer:
|
||||
smallest_layer = min(smallest_layer, size)
|
||||
else:
|
||||
smallest_layer = size
|
||||
log.info(f"{tenant_id}/{timeline_id} => {file.name} {size}")
|
||||
|
||||
log.info(f"{tenant_id}/{timeline_id}: sum {sum}")
|
||||
total_on_disk += sum
|
||||
|
||||
assert smallest_layer is not None or total_on_disk == 0 and largest_layer == 0
|
||||
return (total_on_disk, largest_layer, smallest_layer or 0)
|
||||
|
||||
|
||||
def test_statvfs_error_handling(eviction_env: EvictionEnv):
|
||||
"""
|
||||
We should log an error that statvfs fails.
|
||||
"""
|
||||
env = eviction_env
|
||||
env.neon_env.pageserver.stop()
|
||||
env.pageserver_start_with_disk_usage_eviction(
|
||||
period="1s",
|
||||
max_usage_pct=90,
|
||||
min_avail_bytes=0,
|
||||
mock_behavior={
|
||||
"type": "Failure",
|
||||
"mocked_error": "EIO",
|
||||
},
|
||||
)
|
||||
|
||||
assert env.neon_env.pageserver.log_contains(".*statvfs failed.*EIO")
|
||||
env.neon_env.pageserver.allowed_errors.append(".*statvfs failed.*EIO")
|
||||
|
||||
|
||||
def test_statvfs_pressure_usage(eviction_env: EvictionEnv):
|
||||
"""
|
||||
If statvfs data shows 100% usage, the eviction task will drive it down to
|
||||
the configured max_usage_pct.
|
||||
"""
|
||||
env = eviction_env
|
||||
|
||||
env.neon_env.pageserver.stop()
|
||||
|
||||
# make it seem like we're at 100% utilization by setting total bytes to the used bytes
|
||||
total_size, _, _ = env.timelines_du()
|
||||
blocksize = 512
|
||||
total_blocks = (total_size + (blocksize - 1)) // blocksize
|
||||
|
||||
env.pageserver_start_with_disk_usage_eviction(
|
||||
period="1s",
|
||||
max_usage_pct=33,
|
||||
min_avail_bytes=0,
|
||||
mock_behavior={
|
||||
"type": "Success",
|
||||
"blocksize": blocksize,
|
||||
"total_blocks": total_blocks,
|
||||
# Only count layer files towards used bytes in the mock_statvfs.
|
||||
# This avoids accounting for metadata files & tenant conf in the tests.
|
||||
"name_filter": ".*__.*",
|
||||
},
|
||||
)
|
||||
|
||||
def relieved_log_message():
|
||||
assert env.neon_env.pageserver.log_contains(".*disk usage pressure relieved")
|
||||
|
||||
wait_until(10, 1, relieved_log_message)
|
||||
|
||||
post_eviction_total_size, _, _ = env.timelines_du()
|
||||
|
||||
assert post_eviction_total_size <= 0.33 * total_size, "we requested max 33% usage"
|
||||
|
||||
|
||||
def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv):
|
||||
"""
|
||||
If statvfs data shows 100% usage, the eviction task will drive it down to
|
||||
at least the configured min_avail_bytes.
|
||||
"""
|
||||
env = eviction_env
|
||||
|
||||
env.neon_env.pageserver.stop()
|
||||
|
||||
# make it seem like we're at 100% utilization by setting total bytes to the used bytes
|
||||
total_size, _, _ = env.timelines_du()
|
||||
blocksize = 512
|
||||
total_blocks = (total_size + (blocksize - 1)) // blocksize
|
||||
|
||||
min_avail_bytes = total_size // 3
|
||||
|
||||
env.pageserver_start_with_disk_usage_eviction(
|
||||
period="1s",
|
||||
max_usage_pct=100,
|
||||
min_avail_bytes=min_avail_bytes,
|
||||
mock_behavior={
|
||||
"type": "Success",
|
||||
"blocksize": blocksize,
|
||||
"total_blocks": total_blocks,
|
||||
# Only count layer files towards used bytes in the mock_statvfs.
|
||||
# This avoids accounting for metadata files & tenant conf in the tests.
|
||||
"name_filter": ".*__.*",
|
||||
},
|
||||
)
|
||||
|
||||
def relieved_log_message():
|
||||
assert env.neon_env.pageserver.log_contains(".*disk usage pressure relieved")
|
||||
|
||||
wait_until(10, 1, relieved_log_message)
|
||||
|
||||
post_eviction_total_size, _, _ = env.timelines_du()
|
||||
|
||||
assert (
|
||||
total_size - post_eviction_total_size >= min_avail_bytes
|
||||
), "we requested at least min_avail_bytes worth of free space"
|
||||
@@ -17,12 +17,6 @@ def test_pageserver_restarts_under_worload(neon_simple_env: NeonEnv, pg_bin: PgB
|
||||
n_restarts = 10
|
||||
scale = 10
|
||||
|
||||
# the background task may complete the init task delay after finding an
|
||||
# active tenant, but shutdown starts right before Tenant::gc_iteration
|
||||
env.pageserver.allowed_errors.append(
|
||||
r".*Gc failed, retrying in \S+: Cannot run GC iteration on inactive tenant"
|
||||
)
|
||||
|
||||
def run_pgbench(pg: Postgres):
|
||||
connstr = pg.connstr()
|
||||
log.info(f"Start a pgbench workload on pg {connstr}")
|
||||
|
||||
@@ -11,10 +11,8 @@ from typing import Dict, List, Tuple
|
||||
import pytest
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import (
|
||||
LocalFsStorage,
|
||||
NeonEnvBuilder,
|
||||
PageserverApiException,
|
||||
PageserverHttpClient,
|
||||
RemoteStorageKind,
|
||||
available_remote_storages,
|
||||
wait_for_last_flush_lsn,
|
||||
@@ -423,6 +421,23 @@ def test_remote_timeline_client_calls_started_metric(
|
||||
)
|
||||
wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
|
||||
|
||||
def get_queued_count(file_kind, op_kind):
|
||||
val = client.get_remote_timeline_client_metric(
|
||||
"pageserver_remote_timeline_client_calls_unfinished",
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
file_kind,
|
||||
op_kind,
|
||||
)
|
||||
if val is None:
|
||||
return val
|
||||
return int(val)
|
||||
|
||||
def wait_upload_queue_empty():
|
||||
wait_until(2, 1, lambda: get_queued_count(file_kind="layer", op_kind="upload") == 0)
|
||||
wait_until(2, 1, lambda: get_queued_count(file_kind="index", op_kind="upload") == 0)
|
||||
wait_until(2, 1, lambda: get_queued_count(file_kind="layer", op_kind="delete") == 0)
|
||||
|
||||
calls_started: Dict[Tuple[str, str], List[int]] = {
|
||||
("layer", "upload"): [0],
|
||||
("index", "upload"): [0],
|
||||
@@ -463,7 +478,7 @@ def test_remote_timeline_client_calls_started_metric(
|
||||
# create some layers & wait for uploads to finish
|
||||
churn("a", "b")
|
||||
|
||||
wait_upload_queue_empty(client, tenant_id, timeline_id)
|
||||
wait_upload_queue_empty()
|
||||
|
||||
# ensure that we updated the calls_started metric
|
||||
fetch_calls_started()
|
||||
@@ -622,147 +637,4 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
|
||||
time.sleep(10)
|
||||
|
||||
|
||||
# Branches off a root branch, but does not write anything to the new branch, so it has a metadata file only.
|
||||
# Ensures that such branch is still persisted on the remote storage, and can be restored during tenant (re)attach.
|
||||
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
|
||||
def test_empty_branch_remote_storage_upload(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
remote_storage_kind: RemoteStorageKind,
|
||||
):
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_empty_branch_remote_storage_upload",
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
client = env.pageserver.http_client()
|
||||
|
||||
new_branch_name = "new_branch"
|
||||
new_branch_timeline_id = env.neon_cli.create_branch(new_branch_name, "main", env.initial_tenant)
|
||||
|
||||
with env.postgres.create_start(new_branch_name, tenant_id=env.initial_tenant) as pg:
|
||||
wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_branch_timeline_id)
|
||||
wait_upload_queue_empty(client, env.initial_tenant, new_branch_timeline_id)
|
||||
|
||||
timelines_before_detach = set(
|
||||
map(
|
||||
lambda t: TimelineId(t["timeline_id"]),
|
||||
client.timeline_list(env.initial_tenant),
|
||||
)
|
||||
)
|
||||
expected_timelines = set([env.initial_timeline, new_branch_timeline_id])
|
||||
assert (
|
||||
timelines_before_detach == expected_timelines
|
||||
), f"Expected to have an initial timeline and the branch timeline only, but got {timelines_before_detach}"
|
||||
|
||||
client.tenant_detach(env.initial_tenant)
|
||||
client.tenant_attach(env.initial_tenant)
|
||||
wait_until_tenant_state(client, env.initial_tenant, "Active", 5)
|
||||
|
||||
timelines_after_detach = set(
|
||||
map(
|
||||
lambda t: TimelineId(t["timeline_id"]),
|
||||
client.timeline_list(env.initial_tenant),
|
||||
)
|
||||
)
|
||||
|
||||
assert (
|
||||
timelines_before_detach == timelines_after_detach
|
||||
), f"Expected to have same timelines after reattach, but got {timelines_after_detach}"
|
||||
|
||||
|
||||
# Branches off a root branch, but does not write anything to the new branch, so it has a metadata file only.
|
||||
# Ensures the branch is not on the remote storage and restarts the pageserver — the branch should be uploaded after the restart.
|
||||
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
|
||||
def test_empty_branch_remote_storage_upload_on_restart(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
remote_storage_kind: RemoteStorageKind,
|
||||
):
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_empty_branch_remote_storage_upload_on_restart",
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
client = env.pageserver.http_client()
|
||||
|
||||
new_branch_name = "new_branch"
|
||||
new_branch_timeline_id = env.neon_cli.create_branch(new_branch_name, "main", env.initial_tenant)
|
||||
|
||||
with env.postgres.create_start(new_branch_name, tenant_id=env.initial_tenant) as pg:
|
||||
wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_branch_timeline_id)
|
||||
wait_upload_queue_empty(client, env.initial_tenant, new_branch_timeline_id)
|
||||
|
||||
env.pageserver.stop()
|
||||
|
||||
# Remove new branch from the remote storage
|
||||
assert isinstance(env.remote_storage, LocalFsStorage)
|
||||
new_branch_on_remote_storage = (
|
||||
env.remote_storage.root
|
||||
/ "tenants"
|
||||
/ str(env.initial_tenant)
|
||||
/ "timelines"
|
||||
/ str(new_branch_timeline_id)
|
||||
)
|
||||
assert (
|
||||
new_branch_on_remote_storage.is_dir()
|
||||
), f"'{new_branch_on_remote_storage}' path does not exist on the remote storage"
|
||||
shutil.rmtree(new_branch_on_remote_storage)
|
||||
|
||||
env.pageserver.start()
|
||||
|
||||
wait_upload_queue_empty(client, env.initial_tenant, new_branch_timeline_id)
|
||||
assert (
|
||||
new_branch_on_remote_storage.is_dir()
|
||||
), f"New branch should have been reuploaded on pageserver restart to the remote storage path '{new_branch_on_remote_storage}'"
|
||||
|
||||
|
||||
def wait_upload_queue_empty(
|
||||
client: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId
|
||||
):
|
||||
wait_until(
|
||||
2,
|
||||
1,
|
||||
lambda: get_queued_count(
|
||||
client, tenant_id, timeline_id, file_kind="layer", op_kind="upload"
|
||||
)
|
||||
== 0,
|
||||
)
|
||||
wait_until(
|
||||
2,
|
||||
1,
|
||||
lambda: get_queued_count(
|
||||
client, tenant_id, timeline_id, file_kind="index", op_kind="upload"
|
||||
)
|
||||
== 0,
|
||||
)
|
||||
wait_until(
|
||||
2,
|
||||
1,
|
||||
lambda: get_queued_count(
|
||||
client, tenant_id, timeline_id, file_kind="layer", op_kind="delete"
|
||||
)
|
||||
== 0,
|
||||
)
|
||||
|
||||
|
||||
def get_queued_count(
|
||||
client: PageserverHttpClient,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
file_kind: str,
|
||||
op_kind: str,
|
||||
):
|
||||
val = client.get_remote_timeline_client_metric(
|
||||
"pageserver_remote_timeline_client_calls_unfinished",
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
file_kind,
|
||||
op_kind,
|
||||
)
|
||||
if val is None:
|
||||
return val
|
||||
return int(val)
|
||||
|
||||
|
||||
# TODO Test that we correctly handle GC of files that are stuck in upload queue.
|
||||
|
||||
@@ -225,7 +225,7 @@ def test_tenant_reattach_while_busy(
|
||||
|
||||
# Attempts to connect from compute to pageserver while the tenant is
|
||||
# temporarily detached produces these errors in the pageserver log.
|
||||
env.pageserver.allowed_errors.append(".*Tenant .* not found.*")
|
||||
env.pageserver.allowed_errors.append(".*Tenant .* not found in the local state.*")
|
||||
env.pageserver.allowed_errors.append(
|
||||
".*Tenant .* will not become active\\. Current state: Stopping.*"
|
||||
)
|
||||
@@ -257,20 +257,18 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
|
||||
env = neon_env_builder.init_start()
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
|
||||
env.pageserver.allowed_errors.append(".*NotFound: Tenant .*")
|
||||
env.pageserver.allowed_errors.append(".*NotFound: Tenant .* not found")
|
||||
|
||||
# first check for non existing tenant
|
||||
tenant_id = TenantId.generate()
|
||||
with pytest.raises(
|
||||
expected_exception=PageserverApiException,
|
||||
match=f"NotFound: tenant {tenant_id}",
|
||||
) as excinfo:
|
||||
match=f"Tenant not found for id {tenant_id}",
|
||||
):
|
||||
pageserver_http.tenant_detach(tenant_id)
|
||||
|
||||
assert excinfo.value.status_code == 404
|
||||
|
||||
# the error will be printed to the log too
|
||||
env.pageserver.allowed_errors.append(".*NotFound: tenant *")
|
||||
env.pageserver.allowed_errors.append(".*Tenant not found for id.*")
|
||||
|
||||
# create new nenant
|
||||
tenant_id, timeline_id = env.neon_cli.create_tenant()
|
||||
@@ -296,7 +294,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
# the error will be printed to the log too
|
||||
env.pageserver.allowed_errors.append(".*gc target timeline does not exist.*")
|
||||
# Timelines get stopped during detach, ignore the gc calls that error, witnessing that
|
||||
# Timelines get stopped during detach, ignore the gc calls that error, whitnessing that
|
||||
env.pageserver.allowed_errors.append(".*InternalServerError\\(timeline is Stopping.*")
|
||||
|
||||
# Detach while running manual GC.
|
||||
@@ -322,96 +320,12 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
|
||||
assert not (env.repo_dir / "tenants" / str(tenant_id)).exists()
|
||||
|
||||
with pytest.raises(
|
||||
expected_exception=PageserverApiException, match=f"NotFound: tenant {tenant_id}"
|
||||
expected_exception=PageserverApiException, match=f"Tenant {tenant_id} not found"
|
||||
):
|
||||
pageserver_http.timeline_gc(tenant_id, timeline_id, 0)
|
||||
|
||||
|
||||
# Creates and ignores a tenant, then detaches it: first, with no parameters (should fail),
|
||||
# then with parameters to force ignored tenant detach (should not fail).
|
||||
def test_tenant_detach_ignored_tenant(neon_simple_env: NeonEnv):
|
||||
env = neon_simple_env
|
||||
client = env.pageserver.http_client()
|
||||
|
||||
# create a new tenant
|
||||
tenant_id, _ = env.neon_cli.create_tenant()
|
||||
|
||||
# assert tenant exists on disk
|
||||
assert (env.repo_dir / "tenants" / str(tenant_id)).exists()
|
||||
|
||||
pg = env.postgres.create_start("main", tenant_id=tenant_id)
|
||||
# we rely upon autocommit after each statement
|
||||
pg.safe_psql_many(
|
||||
queries=[
|
||||
"CREATE TABLE t(key int primary key, value text)",
|
||||
"INSERT INTO t SELECT generate_series(1,100000), 'payload'",
|
||||
]
|
||||
)
|
||||
|
||||
# ignore tenant
|
||||
client.tenant_ignore(tenant_id)
|
||||
env.pageserver.allowed_errors.append(".*NotFound: tenant .*")
|
||||
# ensure tenant couldn't be detached without the special flag for ignored tenant
|
||||
log.info("detaching ignored tenant WITHOUT required flag")
|
||||
with pytest.raises(
|
||||
expected_exception=PageserverApiException, match=f"NotFound: tenant {tenant_id}"
|
||||
):
|
||||
client.tenant_detach(tenant_id)
|
||||
|
||||
log.info("tenant detached failed as expected")
|
||||
|
||||
# ensure tenant is detached with ignore state
|
||||
log.info("detaching ignored tenant with required flag")
|
||||
client.tenant_detach(tenant_id, True)
|
||||
log.info("ignored tenant detached without error")
|
||||
|
||||
# check that nothing is left on disk for deleted tenant
|
||||
assert not (env.repo_dir / "tenants" / str(tenant_id)).exists()
|
||||
|
||||
# assert the tenant does not exists in the Pageserver
|
||||
tenants_after_detach = [tenant["id"] for tenant in client.tenant_list()]
|
||||
assert (
|
||||
tenant_id not in tenants_after_detach
|
||||
), f"Ignored and then detached tenant {tenant_id} \
|
||||
should not be present in pageserver's memory"
|
||||
|
||||
|
||||
# Creates a tenant, and detaches it with extra paremeter that forces ignored tenant detach.
|
||||
# Tenant should be detached without issues.
|
||||
def test_tenant_detach_regular_tenant(neon_simple_env: NeonEnv):
|
||||
env = neon_simple_env
|
||||
client = env.pageserver.http_client()
|
||||
|
||||
# create a new tenant
|
||||
tenant_id, _ = env.neon_cli.create_tenant()
|
||||
|
||||
# assert tenant exists on disk
|
||||
assert (env.repo_dir / "tenants" / str(tenant_id)).exists()
|
||||
|
||||
pg = env.postgres.create_start("main", tenant_id=tenant_id)
|
||||
# we rely upon autocommit after each statement
|
||||
pg.safe_psql_many(
|
||||
queries=[
|
||||
"CREATE TABLE t(key int primary key, value text)",
|
||||
"INSERT INTO t SELECT generate_series(1,100000), 'payload'",
|
||||
]
|
||||
)
|
||||
|
||||
log.info("detaching regular tenant with detach ignored flag")
|
||||
client.tenant_detach(tenant_id, True)
|
||||
log.info("regular tenant detached without error")
|
||||
|
||||
# check that nothing is left on disk for deleted tenant
|
||||
assert not (env.repo_dir / "tenants" / str(tenant_id)).exists()
|
||||
|
||||
# assert the tenant does not exists in the Pageserver
|
||||
tenants_after_detach = [tenant["id"] for tenant in client.tenant_list()]
|
||||
assert (
|
||||
tenant_id not in tenants_after_detach
|
||||
), f"Ignored and then detached tenant {tenant_id} \
|
||||
should not be present in pageserver's memory"
|
||||
|
||||
|
||||
#
|
||||
@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
|
||||
def test_detach_while_attaching(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
|
||||
@@ -434,13 +434,10 @@ def test_tenant_relocation(
|
||||
)
|
||||
|
||||
# rewrite neon cli config to use new pageserver for basebackup to start new compute
|
||||
lines = (env.repo_dir / "config").read_text().splitlines()
|
||||
for i, line in enumerate(lines):
|
||||
if line.startswith("listen_http_addr"):
|
||||
lines[i] = f"listen_http_addr = 'localhost:{new_pageserver_http_port}'"
|
||||
if line.startswith("listen_pg_addr"):
|
||||
lines[i] = f"listen_pg_addr = 'localhost:{new_pageserver_pg_port}'"
|
||||
(env.repo_dir / "config").write_text("\n".join(lines))
|
||||
cli_config_lines = (env.repo_dir / "config").read_text().splitlines()
|
||||
cli_config_lines[-2] = f"listen_http_addr = 'localhost:{new_pageserver_http_port}'"
|
||||
cli_config_lines[-1] = f"listen_pg_addr = 'localhost:{new_pageserver_pg_port}'"
|
||||
(env.repo_dir / "config").write_text("\n".join(cli_config_lines))
|
||||
|
||||
old_local_path_main = switch_pg_to_new_pageserver(
|
||||
env,
|
||||
@@ -499,10 +496,7 @@ def test_tenant_relocation(
|
||||
|
||||
# bring old pageserver back for clean shutdown via neon cli
|
||||
# new pageserver will be shut down by the context manager
|
||||
lines = (env.repo_dir / "config").read_text().splitlines()
|
||||
for i, line in enumerate(lines):
|
||||
if line.startswith("listen_http_addr"):
|
||||
lines[i] = f"listen_http_addr = 'localhost:{env.pageserver.service_port.http}'"
|
||||
if line.startswith("listen_pg_addr"):
|
||||
lines[i] = f"listen_pg_addr = 'localhost:{env.pageserver.service_port.pg}'"
|
||||
(env.repo_dir / "config").write_text("\n".join(lines))
|
||||
cli_config_lines = (env.repo_dir / "config").read_text().splitlines()
|
||||
cli_config_lines[-2] = f"listen_http_addr = 'localhost:{env.pageserver.service_port.http}'"
|
||||
cli_config_lines[-1] = f"listen_pg_addr = 'localhost:{env.pageserver.service_port.pg}'"
|
||||
(env.repo_dir / "config").write_text("\n".join(cli_config_lines))
|
||||
|
||||
@@ -3,7 +3,6 @@ import shutil
|
||||
import time
|
||||
from contextlib import closing
|
||||
from datetime import datetime
|
||||
from itertools import chain
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
@@ -88,7 +87,6 @@ def test_tenants_normal_work(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder):
|
||||
neon_env_builder.num_safekeepers = 3
|
||||
neon_env_builder.pageserver_config_override = "availability_zone='test_ps_az'"
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
tenant_1, _ = env.neon_cli.create_tenant()
|
||||
@@ -124,17 +122,6 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder):
|
||||
ps_metrics = all_metrics[0]
|
||||
sk_metrics = all_metrics[1:]
|
||||
|
||||
# Find all metrics among all safekeepers, accepts the same arguments as query_all()
|
||||
def query_all_safekeepers(name, filter):
|
||||
return list(
|
||||
chain.from_iterable(
|
||||
map(
|
||||
lambda sk: sk.query_all(name, filter),
|
||||
sk_metrics,
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
ttids = [
|
||||
{"tenant_id": str(tenant_1), "timeline_id": str(timeline_1)},
|
||||
{"tenant_id": str(tenant_2), "timeline_id": str(timeline_2)},
|
||||
@@ -175,40 +162,6 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder):
|
||||
f"process_start_time_seconds (UTC): {datetime.fromtimestamp(metrics.query_one('process_start_time_seconds').value)}"
|
||||
)
|
||||
|
||||
for io_direction in ["read", "write"]:
|
||||
# Querying all metrics for number of bytes read/written by pageserver in another AZ
|
||||
io_metrics = query_all_safekeepers(
|
||||
"safekeeper_pg_io_bytes_total",
|
||||
{
|
||||
"app_name": "pageserver",
|
||||
"client_az": "test_ps_az",
|
||||
"dir": io_direction,
|
||||
"same_az": "false",
|
||||
},
|
||||
)
|
||||
total_bytes = sum(int(metric.value) for metric in io_metrics)
|
||||
log.info(f"Pageserver {io_direction} bytes from another AZ: {total_bytes}")
|
||||
# We expect some bytes to be read/written, to make sure metrics are working
|
||||
assert total_bytes > 0
|
||||
|
||||
# Test (a subset of) safekeeper global metrics
|
||||
for sk_m in sk_metrics:
|
||||
# Test that every safekeeper has read some bytes
|
||||
assert any(
|
||||
map(
|
||||
lambda x: x.value > 0,
|
||||
sk_m.query_all("safekeeper_pg_io_bytes_total", {"dir": "read"}),
|
||||
)
|
||||
), f"{sk_m.name} has not read bytes"
|
||||
|
||||
# Test that every safekeeper has written some bytes
|
||||
assert any(
|
||||
map(
|
||||
lambda x: x.value > 0,
|
||||
sk_m.query_all("safekeeper_pg_io_bytes_total", {"dir": "write"}),
|
||||
)
|
||||
), f"{sk_m.name} has not written bytes"
|
||||
|
||||
# Test (a subset of) pageserver global metrics
|
||||
for metric in PAGESERVER_GLOBAL_METRICS:
|
||||
ps_samples = ps_metrics.query_all(metric, {})
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from typing import List, Tuple
|
||||
|
||||
@@ -216,9 +217,208 @@ def test_tenants_attached_after_download(
|
||||
assert env.pageserver.log_contains(".*download .* succeeded after 1 retries.*")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
|
||||
def test_tenant_upgrades_index_json_from_v0(
|
||||
neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
|
||||
):
|
||||
# the "image" for the v0 index_part.json. the fields themselves are
|
||||
# replaced with values read from the later version because of #2592 (initdb
|
||||
# lsn not reproducible).
|
||||
v0_skeleton = json.loads(
|
||||
"""{
|
||||
"timeline_layers":[
|
||||
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"
|
||||
],
|
||||
"missing_layers":["This should not fail as its not used anymore"],
|
||||
"disk_consistent_lsn":"0/16960E8",
|
||||
"metadata_bytes":[]
|
||||
}"""
|
||||
)
|
||||
|
||||
# getting a too eager compaction happening for this test would not play
|
||||
# well with the strict assertions.
|
||||
neon_env_builder.pageserver_config_override = "tenant_config.compaction_period='1h'"
|
||||
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind, "test_tenant_upgrades_index_json_from_v0"
|
||||
)
|
||||
|
||||
# launch pageserver, populate the default tenants timeline, wait for it to be uploaded,
|
||||
# then go ahead and modify the "remote" version as if it was downgraded, needing upgrade
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
pg = env.postgres.create_start("main")
|
||||
|
||||
tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0])
|
||||
timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0])
|
||||
|
||||
with pg.cursor() as cur:
|
||||
cur.execute("CREATE TABLE t0 AS VALUES (123, 'second column as text');")
|
||||
current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
|
||||
|
||||
# flush, wait until in remote storage
|
||||
wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn)
|
||||
pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
|
||||
wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn)
|
||||
env.postgres.stop_all()
|
||||
env.pageserver.stop()
|
||||
|
||||
# remove all local data for the tenant to force redownloading and subsequent upgrade
|
||||
shutil.rmtree(Path(env.repo_dir) / "tenants" / str(tenant_id))
|
||||
|
||||
# downgrade the remote file
|
||||
timeline_path = local_fs_index_part_path(env, tenant_id, timeline_id)
|
||||
with open(timeline_path, "r+") as timeline_file:
|
||||
# keep the deserialized for later inspection
|
||||
orig_index_part = json.load(timeline_file)
|
||||
|
||||
v0_index_part = {
|
||||
key: orig_index_part[key]
|
||||
for key in v0_skeleton.keys() - ["missing_layers"] # pgserver doesn't have it anymore
|
||||
}
|
||||
|
||||
timeline_file.seek(0)
|
||||
json.dump(v0_index_part, timeline_file)
|
||||
timeline_file.truncate(timeline_file.tell())
|
||||
|
||||
env.pageserver.start()
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
pageserver_http.tenant_attach(tenant_id)
|
||||
|
||||
wait_until(
|
||||
number_of_iterations=5,
|
||||
interval=1,
|
||||
func=lambda: assert_tenant_status(pageserver_http, tenant_id, "Active"),
|
||||
)
|
||||
|
||||
pg = env.postgres.create_start("main")
|
||||
|
||||
with pg.cursor() as cur:
|
||||
cur.execute("INSERT INTO t0 VALUES (234, 'test data');")
|
||||
current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
|
||||
|
||||
wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn)
|
||||
pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
|
||||
wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn)
|
||||
|
||||
# not needed anymore
|
||||
env.postgres.stop_all()
|
||||
env.pageserver.stop()
|
||||
|
||||
# make sure the file has been upgraded back to how it started
|
||||
index_part = local_fs_index_part(env, tenant_id, timeline_id)
|
||||
assert index_part["version"] == orig_index_part["version"]
|
||||
assert "missing_layers" not in index_part.keys()
|
||||
|
||||
# expect one more layer because of the forced checkpoint
|
||||
assert len(index_part["timeline_layers"]) == len(orig_index_part["timeline_layers"]) + 1
|
||||
|
||||
# all of the same layer files are there, but they might be shuffled around
|
||||
orig_layers = set(orig_index_part["timeline_layers"])
|
||||
later_layers = set(index_part["timeline_layers"])
|
||||
assert later_layers.issuperset(orig_layers)
|
||||
|
||||
added_layers = later_layers - orig_layers
|
||||
assert len(added_layers) == 1
|
||||
|
||||
# all of metadata has been regenerated (currently just layer file size)
|
||||
all_metadata_keys = set()
|
||||
for layer in orig_layers:
|
||||
orig_metadata = orig_index_part["layer_metadata"][layer]
|
||||
new_metadata = index_part["layer_metadata"][layer]
|
||||
assert (
|
||||
orig_metadata == new_metadata
|
||||
), f"metadata for layer {layer} should not have changed {orig_metadata} vs. {new_metadata}"
|
||||
all_metadata_keys |= set(orig_metadata.keys())
|
||||
|
||||
one_new_layer = next(iter(added_layers))
|
||||
assert one_new_layer in index_part["layer_metadata"], "new layer should have metadata"
|
||||
|
||||
only_new_metadata = index_part["layer_metadata"][one_new_layer]
|
||||
|
||||
assert (
|
||||
set(only_new_metadata.keys()).symmetric_difference(all_metadata_keys) == set()
|
||||
), "new layer metadata has same metadata as others"
|
||||
|
||||
|
||||
# FIXME: test index_part.json getting downgraded from imaginary new version
|
||||
|
||||
|
||||
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
|
||||
def test_tenant_ignores_backup_file(
|
||||
neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
|
||||
):
|
||||
# getting a too eager compaction happening for this test would not play
|
||||
# well with the strict assertions.
|
||||
neon_env_builder.pageserver_config_override = "tenant_config.compaction_period='1h'"
|
||||
|
||||
neon_env_builder.enable_remote_storage(remote_storage_kind, "test_tenant_ignores_backup_file")
|
||||
|
||||
# launch pageserver, populate the default tenants timeline, wait for it to be uploaded,
|
||||
# then go ahead and modify the "remote" version as if it was downgraded, needing upgrade
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
env.pageserver.allowed_errors.append(".*got backup file on the remote storage, ignoring it.*")
|
||||
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
pg = env.postgres.create_start("main")
|
||||
|
||||
tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0])
|
||||
timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0])
|
||||
|
||||
with pg.cursor() as cur:
|
||||
cur.execute("CREATE TABLE t0 AS VALUES (123, 'second column as text');")
|
||||
current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
|
||||
|
||||
# flush, wait until in remote storage
|
||||
wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn)
|
||||
pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
|
||||
wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn)
|
||||
|
||||
env.postgres.stop_all()
|
||||
env.pageserver.stop()
|
||||
|
||||
# change the remote file to have entry with .0.old suffix
|
||||
timeline_path = local_fs_index_part_path(env, tenant_id, timeline_id)
|
||||
with open(timeline_path, "r+") as timeline_file:
|
||||
# keep the deserialized for later inspection
|
||||
orig_index_part = json.load(timeline_file)
|
||||
backup_layer_name = orig_index_part["timeline_layers"][0] + ".0.old"
|
||||
orig_index_part["timeline_layers"].append(backup_layer_name)
|
||||
|
||||
timeline_file.seek(0)
|
||||
json.dump(orig_index_part, timeline_file)
|
||||
|
||||
env.pageserver.start()
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
|
||||
wait_until(
|
||||
number_of_iterations=5,
|
||||
interval=1,
|
||||
func=lambda: assert_tenant_status(pageserver_http, tenant_id, "Active"),
|
||||
)
|
||||
|
||||
pg = env.postgres.create_start("main")
|
||||
|
||||
with pg.cursor() as cur:
|
||||
cur.execute("INSERT INTO t0 VALUES (234, 'test data');")
|
||||
current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
|
||||
|
||||
wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn)
|
||||
pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
|
||||
wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn)
|
||||
|
||||
# not needed anymore
|
||||
env.postgres.stop_all()
|
||||
env.pageserver.stop()
|
||||
|
||||
# the .old file is gone from newly serialized index_part
|
||||
new_index_part = local_fs_index_part(env, tenant_id, timeline_id)
|
||||
backup_layers = filter(lambda x: x.endswith(".old"), new_index_part["timeline_layers"])
|
||||
assert len(list(backup_layers)) == 0
|
||||
|
||||
|
||||
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
|
||||
def test_tenant_redownloads_truncated_file_on_startup(
|
||||
neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
|
||||
|
||||
@@ -10,7 +10,7 @@ def test_timeline_delete(neon_simple_env: NeonEnv):
|
||||
env.pageserver.allowed_errors.append(".*Timeline .* was not found.*")
|
||||
env.pageserver.allowed_errors.append(".*timeline not found.*")
|
||||
env.pageserver.allowed_errors.append(".*Cannot delete timeline which has child timelines.*")
|
||||
env.pageserver.allowed_errors.append(".*NotFound: tenant .*")
|
||||
env.pageserver.allowed_errors.append(".*Tenant .* not found in the local state.*")
|
||||
|
||||
ps_http = env.pageserver.http_client()
|
||||
|
||||
@@ -24,12 +24,10 @@ def test_timeline_delete(neon_simple_env: NeonEnv):
|
||||
invalid_tenant_id = TenantId.generate()
|
||||
with pytest.raises(
|
||||
PageserverApiException,
|
||||
match=f"NotFound: tenant {invalid_tenant_id}",
|
||||
) as exc:
|
||||
match=f"Tenant {invalid_tenant_id} not found in the local state",
|
||||
):
|
||||
ps_http.timeline_delete(tenant_id=invalid_tenant_id, timeline_id=invalid_timeline_id)
|
||||
|
||||
assert exc.value.status_code == 404
|
||||
|
||||
# construct pair of branches to validate that pageserver prohibits
|
||||
# deletion of ancestor timelines when they have child branches
|
||||
parent_timeline_id = env.neon_cli.create_branch("test_ancestor_branch_delete_parent", "empty")
|
||||
@@ -41,7 +39,7 @@ def test_timeline_delete(neon_simple_env: NeonEnv):
|
||||
ps_http = env.pageserver.http_client()
|
||||
with pytest.raises(
|
||||
PageserverApiException, match="Cannot delete timeline which has child timelines"
|
||||
) as exc:
|
||||
):
|
||||
timeline_path = (
|
||||
env.repo_dir
|
||||
/ "tenants"
|
||||
@@ -55,8 +53,6 @@ def test_timeline_delete(neon_simple_env: NeonEnv):
|
||||
|
||||
assert not timeline_path.exists()
|
||||
|
||||
assert exc.value.status_code == 400
|
||||
|
||||
timeline_path = (
|
||||
env.repo_dir / "tenants" / str(env.initial_tenant) / "timelines" / str(leaf_timeline_id)
|
||||
)
|
||||
@@ -75,7 +71,7 @@ def test_timeline_delete(neon_simple_env: NeonEnv):
|
||||
with pytest.raises(
|
||||
PageserverApiException,
|
||||
match=f"Timeline {env.initial_tenant}/{leaf_timeline_id} was not found",
|
||||
) as exc:
|
||||
):
|
||||
ps_http.timeline_detail(env.initial_tenant, leaf_timeline_id)
|
||||
|
||||
# FIXME leaves tenant without timelines, should we prevent deletion of root timeline?
|
||||
@@ -84,5 +80,3 @@ def test_timeline_delete(neon_simple_env: NeonEnv):
|
||||
interval=0.2,
|
||||
func=lambda: ps_http.timeline_delete(env.initial_tenant, parent_timeline_id),
|
||||
)
|
||||
|
||||
assert exc.value.status_code == 404
|
||||
|
||||
@@ -23,7 +23,7 @@ def assert_child_processes(pageserver_pid, wal_redo_present=False, defunct_prese
|
||||
def test_walredo_not_left_behind_on_detach(neon_env_builder: NeonEnvBuilder):
|
||||
env = neon_env_builder.init_start()
|
||||
# We intentionally test for a non-existent tenant.
|
||||
env.pageserver.allowed_errors.append(".*NotFound: tenant.*")
|
||||
env.pageserver.allowed_errors.append(".*Tenant not found.*")
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
|
||||
pagserver_pid = int((env.repo_dir / "pageserver.pid").read_text())
|
||||
@@ -34,7 +34,7 @@ def test_walredo_not_left_behind_on_detach(neon_env_builder: NeonEnvBuilder):
|
||||
tenant_id = TenantId.generate()
|
||||
with pytest.raises(
|
||||
expected_exception=PageserverApiException,
|
||||
match=f"NotFound: tenant {tenant_id}",
|
||||
match=f"Tenant not found for id {tenant_id}",
|
||||
):
|
||||
pageserver_http.tenant_detach(tenant_id)
|
||||
|
||||
|
||||
12
tmp-proxy-test/Dockerfile
Normal file
12
tmp-proxy-test/Dockerfile
Normal file
@@ -0,0 +1,12 @@
|
||||
FROM neondatabase/neon:local
|
||||
|
||||
USER root
|
||||
RUN apt-get update && apt-get install -y curl
|
||||
RUN curl -fsSL https://deb.nodesource.com/setup_19.x | bash - && apt-get install -y nodejs
|
||||
|
||||
COPY . /tmp-proxy-test
|
||||
RUN chown -R neon:neon /tmp-proxy-test
|
||||
USER neon
|
||||
RUN cd /tmp-proxy-test && npm install
|
||||
|
||||
ENTRYPOINT ["/bin/bash", "/tmp-proxy-test/entrypoint.sh"]
|
||||
15
tmp-proxy-test/entrypoint.sh
Normal file
15
tmp-proxy-test/entrypoint.sh
Normal file
@@ -0,0 +1,15 @@
|
||||
#!/bin/bash
|
||||
|
||||
# start pg
|
||||
export LD_LIBRARY_PATH=/usr/local/v15/lib
|
||||
/usr/local/v15/bin/initdb -D /data/data
|
||||
/usr/local/v15/bin/pg_ctl -D /data/data -l logfile start
|
||||
/usr/local/v15/bin/psql postgres://neon@localhost:5432/postgres -c "alter user neon with password 'test'"
|
||||
|
||||
# start proxy
|
||||
openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key -subj "/CN=*.localtest.me"
|
||||
/usr/local/bin/proxy --wss 127.0.0.1:7500 -c server.crt -k server.key --auth-backend=postgres --auth-endpoint=postgres://neon@localhost:5432/postgres &
|
||||
|
||||
# run test
|
||||
cd /tmp-proxy-test
|
||||
node index.js
|
||||
27
tmp-proxy-test/index.js
Normal file
27
tmp-proxy-test/index.js
Normal file
@@ -0,0 +1,27 @@
|
||||
import { Client, neonConfig } from '@neondatabase/serverless'
|
||||
|
||||
process.env.NODE_TLS_REJECT_UNAUTHORIZED = "0";
|
||||
|
||||
(async () => {
|
||||
try {
|
||||
neonConfig.wsProxy = (host, _port) => host + ':7500/v2'
|
||||
neonConfig.pipelineConnect = "password";
|
||||
|
||||
const client = new Client({
|
||||
connectionString: 'postgres://neon:test@pg.localtest.me/postgres',
|
||||
});
|
||||
client.connect();
|
||||
const { rows: [{ now }] } = await client.query('SELECT now()');
|
||||
console.log(now);
|
||||
console.log("==================================");
|
||||
console.log("WSS proxy success!");
|
||||
console.log("==================================");
|
||||
process.exit(0);
|
||||
} catch (e) {
|
||||
console.log(e);
|
||||
console.log("==================================");
|
||||
console.log("WSS proxy failed!");
|
||||
console.log("==================================");
|
||||
process.exit(1);
|
||||
}
|
||||
})();
|
||||
181
tmp-proxy-test/package-lock.json
generated
Normal file
181
tmp-proxy-test/package-lock.json
generated
Normal file
@@ -0,0 +1,181 @@
|
||||
{
|
||||
"name": "proxy-test",
|
||||
"version": "1.0.0",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"name": "proxy-test",
|
||||
"version": "1.0.0",
|
||||
"license": "ISC",
|
||||
"dependencies": {
|
||||
"@neondatabase/serverless": "^0.2.8",
|
||||
"pg": "^8.10.0",
|
||||
"ws": "^8.13.0"
|
||||
}
|
||||
},
|
||||
"node_modules/@neondatabase/serverless": {
|
||||
"version": "0.2.8",
|
||||
"resolved": "https://registry.npmjs.org/@neondatabase/serverless/-/serverless-0.2.8.tgz",
|
||||
"integrity": "sha512-+yWjIOJsFnrtt2xvtLVEzWM2lfvemawk/DBg4mD2cZOF/IC6Jn4wEctZyk60TscZMSxfozNkPoxmZvBmNuQ0vA=="
|
||||
},
|
||||
"node_modules/buffer-writer": {
|
||||
"version": "2.0.0",
|
||||
"resolved": "https://registry.npmjs.org/buffer-writer/-/buffer-writer-2.0.0.tgz",
|
||||
"integrity": "sha512-a7ZpuTZU1TRtnwyCNW3I5dc0wWNC3VR9S++Ewyk2HHZdrO3CQJqSpd+95Us590V6AL7JqUAH2IwZ/398PmNFgw==",
|
||||
"engines": {
|
||||
"node": ">=4"
|
||||
}
|
||||
},
|
||||
"node_modules/packet-reader": {
|
||||
"version": "1.0.0",
|
||||
"resolved": "https://registry.npmjs.org/packet-reader/-/packet-reader-1.0.0.tgz",
|
||||
"integrity": "sha512-HAKu/fG3HpHFO0AA8WE8q2g+gBJaZ9MG7fcKk+IJPLTGAD6Psw4443l+9DGRbOIh3/aXr7Phy0TjilYivJo5XQ=="
|
||||
},
|
||||
"node_modules/pg": {
|
||||
"version": "8.10.0",
|
||||
"resolved": "https://registry.npmjs.org/pg/-/pg-8.10.0.tgz",
|
||||
"integrity": "sha512-ke7o7qSTMb47iwzOSaZMfeR7xToFdkE71ifIipOAAaLIM0DYzfOAXlgFFmYUIE2BcJtvnVlGCID84ZzCegE8CQ==",
|
||||
"dependencies": {
|
||||
"buffer-writer": "2.0.0",
|
||||
"packet-reader": "1.0.0",
|
||||
"pg-connection-string": "^2.5.0",
|
||||
"pg-pool": "^3.6.0",
|
||||
"pg-protocol": "^1.6.0",
|
||||
"pg-types": "^2.1.0",
|
||||
"pgpass": "1.x"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">= 8.0.0"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"pg-native": ">=3.0.1"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
"pg-native": {
|
||||
"optional": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/pg-connection-string": {
|
||||
"version": "2.5.0",
|
||||
"resolved": "https://registry.npmjs.org/pg-connection-string/-/pg-connection-string-2.5.0.tgz",
|
||||
"integrity": "sha512-r5o/V/ORTA6TmUnyWZR9nCj1klXCO2CEKNRlVuJptZe85QuhFayC7WeMic7ndayT5IRIR0S0xFxFi2ousartlQ=="
|
||||
},
|
||||
"node_modules/pg-int8": {
|
||||
"version": "1.0.1",
|
||||
"resolved": "https://registry.npmjs.org/pg-int8/-/pg-int8-1.0.1.tgz",
|
||||
"integrity": "sha512-WCtabS6t3c8SkpDBUlb1kjOs7l66xsGdKpIPZsg4wR+B3+u9UAum2odSsF9tnvxg80h4ZxLWMy4pRjOsFIqQpw==",
|
||||
"engines": {
|
||||
"node": ">=4.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/pg-pool": {
|
||||
"version": "3.6.0",
|
||||
"resolved": "https://registry.npmjs.org/pg-pool/-/pg-pool-3.6.0.tgz",
|
||||
"integrity": "sha512-clFRf2ksqd+F497kWFyM21tMjeikn60oGDmqMT8UBrynEwVEX/5R5xd2sdvdo1cZCFlguORNpVuqxIj+aK4cfQ==",
|
||||
"peerDependencies": {
|
||||
"pg": ">=8.0"
|
||||
}
|
||||
},
|
||||
"node_modules/pg-protocol": {
|
||||
"version": "1.6.0",
|
||||
"resolved": "https://registry.npmjs.org/pg-protocol/-/pg-protocol-1.6.0.tgz",
|
||||
"integrity": "sha512-M+PDm637OY5WM307051+bsDia5Xej6d9IR4GwJse1qA1DIhiKlksvrneZOYQq42OM+spubpcNYEo2FcKQrDk+Q=="
|
||||
},
|
||||
"node_modules/pg-types": {
|
||||
"version": "2.2.0",
|
||||
"resolved": "https://registry.npmjs.org/pg-types/-/pg-types-2.2.0.tgz",
|
||||
"integrity": "sha512-qTAAlrEsl8s4OiEQY69wDvcMIdQN6wdz5ojQiOy6YRMuynxenON0O5oCpJI6lshc6scgAY8qvJ2On/p+CXY0GA==",
|
||||
"dependencies": {
|
||||
"pg-int8": "1.0.1",
|
||||
"postgres-array": "~2.0.0",
|
||||
"postgres-bytea": "~1.0.0",
|
||||
"postgres-date": "~1.0.4",
|
||||
"postgres-interval": "^1.1.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=4"
|
||||
}
|
||||
},
|
||||
"node_modules/pgpass": {
|
||||
"version": "1.0.5",
|
||||
"resolved": "https://registry.npmjs.org/pgpass/-/pgpass-1.0.5.tgz",
|
||||
"integrity": "sha512-FdW9r/jQZhSeohs1Z3sI1yxFQNFvMcnmfuj4WBMUTxOrAyLMaTcE1aAMBiTlbMNaXvBCQuVi0R7hd8udDSP7ug==",
|
||||
"dependencies": {
|
||||
"split2": "^4.1.0"
|
||||
}
|
||||
},
|
||||
"node_modules/postgres-array": {
|
||||
"version": "2.0.0",
|
||||
"resolved": "https://registry.npmjs.org/postgres-array/-/postgres-array-2.0.0.tgz",
|
||||
"integrity": "sha512-VpZrUqU5A69eQyW2c5CA1jtLecCsN2U/bD6VilrFDWq5+5UIEVO7nazS3TEcHf1zuPYO/sqGvUvW62g86RXZuA==",
|
||||
"engines": {
|
||||
"node": ">=4"
|
||||
}
|
||||
},
|
||||
"node_modules/postgres-bytea": {
|
||||
"version": "1.0.0",
|
||||
"resolved": "https://registry.npmjs.org/postgres-bytea/-/postgres-bytea-1.0.0.tgz",
|
||||
"integrity": "sha512-xy3pmLuQqRBZBXDULy7KbaitYqLcmxigw14Q5sj8QBVLqEwXfeybIKVWiqAXTlcvdvb0+xkOtDbfQMOf4lST1w==",
|
||||
"engines": {
|
||||
"node": ">=0.10.0"
|
||||
}
|
||||
},
|
||||
"node_modules/postgres-date": {
|
||||
"version": "1.0.7",
|
||||
"resolved": "https://registry.npmjs.org/postgres-date/-/postgres-date-1.0.7.tgz",
|
||||
"integrity": "sha512-suDmjLVQg78nMK2UZ454hAG+OAW+HQPZ6n++TNDUX+L0+uUlLywnoxJKDou51Zm+zTCjrCl0Nq6J9C5hP9vK/Q==",
|
||||
"engines": {
|
||||
"node": ">=0.10.0"
|
||||
}
|
||||
},
|
||||
"node_modules/postgres-interval": {
|
||||
"version": "1.2.0",
|
||||
"resolved": "https://registry.npmjs.org/postgres-interval/-/postgres-interval-1.2.0.tgz",
|
||||
"integrity": "sha512-9ZhXKM/rw350N1ovuWHbGxnGh/SNJ4cnxHiM0rxE4VN41wsg8P8zWn9hv/buK00RP4WvlOyr/RBDiptyxVbkZQ==",
|
||||
"dependencies": {
|
||||
"xtend": "^4.0.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=0.10.0"
|
||||
}
|
||||
},
|
||||
"node_modules/split2": {
|
||||
"version": "4.1.0",
|
||||
"resolved": "https://registry.npmjs.org/split2/-/split2-4.1.0.tgz",
|
||||
"integrity": "sha512-VBiJxFkxiXRlUIeyMQi8s4hgvKCSjtknJv/LVYbrgALPwf5zSKmEwV9Lst25AkvMDnvxODugjdl6KZgwKM1WYQ==",
|
||||
"engines": {
|
||||
"node": ">= 10.x"
|
||||
}
|
||||
},
|
||||
"node_modules/ws": {
|
||||
"version": "8.13.0",
|
||||
"resolved": "https://registry.npmjs.org/ws/-/ws-8.13.0.tgz",
|
||||
"integrity": "sha512-x9vcZYTrFPC7aSIbj7sRCYo7L/Xb8Iy+pW0ng0wt2vCJv7M9HOMy0UoN3rr+IFC7hb7vXoqS+P9ktyLLLhO+LA==",
|
||||
"engines": {
|
||||
"node": ">=10.0.0"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"bufferutil": "^4.0.1",
|
||||
"utf-8-validate": ">=5.0.2"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
"bufferutil": {
|
||||
"optional": true
|
||||
},
|
||||
"utf-8-validate": {
|
||||
"optional": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/xtend": {
|
||||
"version": "4.0.2",
|
||||
"resolved": "https://registry.npmjs.org/xtend/-/xtend-4.0.2.tgz",
|
||||
"integrity": "sha512-LKYU1iAXJXUgAXn9URjiu+MWhyUXHsvfp7mcuYm9dSUKK0/CjtrUwFAxD82/mCWbtLsGjFIad0wIsod4zrTAEQ==",
|
||||
"engines": {
|
||||
"node": ">=0.4"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user