Compare commits

..

12 Commits

Author SHA1 Message Date
John Spray
2ce2574aa4 docs: pageserver controller rfc 2023-09-29 18:24:24 +01:00
John Spray
dc5f107170 docs: sharding phase 1 RFC 2023-09-29 18:20:13 +01:00
John Spray
1569446396 clarifications 2023-09-27 10:38:52 +01:00
John Spray
a8143a3bed Align cutover downloads with heatmap 2023-09-27 10:38:52 +01:00
John Spray
689b6f14b7 Apply suggestions from code review
Co-authored-by: Christian Schwarz <christian@neon.tech>
2023-09-27 10:38:41 +01:00
John Spray
9c1c06ad17 Bump number of migration RFC 2023-09-26 10:57:11 +01:00
John Spray
40d2a73a0c Merge remote-tracking branch 'upstream/main' into jcsp/rfc-migration 2023-09-26 10:56:57 +01:00
John Spray
89ddefb428 Note safety requirement for AttachedMulti & out of scope item 2023-09-12 14:15:46 +01:00
John Spray
cad0799521 Mention disabling consumption metrics in AttachedStale 2023-09-12 14:04:01 +01:00
John Spray
1143e2e9ce Clarifications 2023-09-12 12:13:29 +01:00
Christian Schwarz
ef3e75abc3 for #5029 (rfc tenant migrations): editorial fixes (#5185) 2023-09-01 18:10:44 +01:00
John Spray
cfb285139c docs/rfcs: add RFC for fast tenant migration/failover 2023-08-31 10:55:17 +01:00
153 changed files with 5154 additions and 10490 deletions

View File

@@ -76,8 +76,8 @@ runs:
rm -f ${ALLURE_ZIP}
fi
env:
ALLURE_VERSION: 2.24.0
ALLURE_ZIP_SHA256: 60b1d6ce65d9ef24b23cf9c2c19fd736a123487c38e54759f1ed1a7a77353c90
ALLURE_VERSION: 2.23.1
ALLURE_ZIP_SHA256: 11141bfe727504b3fd80c0f9801eb317407fd0ac983ebb57e671f14bac4bcd86
# Potentially we could have several running build for the same key (for example, for the main branch), so we use improvised lock for this
- name: Acquire lock

View File

@@ -834,7 +834,7 @@ jobs:
run:
shell: sh -eu {0}
env:
VM_BUILDER_VERSION: v0.17.12
VM_BUILDER_VERSION: v0.17.11
steps:
- name: Checkout
@@ -1092,10 +1092,8 @@ jobs:
run: |
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false
# TODO: move deployPreprodRegion to release (`"$GITHUB_REF_NAME" == "release"` block), once Staging support different compute tag prefixes for different regions
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=true
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=true
gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f disclamerAcknowledged=true
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"

View File

@@ -5,7 +5,7 @@
/libs/remote_storage/ @neondatabase/storage
/libs/safekeeper_api/ @neondatabase/safekeepers
/libs/vm_monitor/ @neondatabase/autoscaling @neondatabase/compute
/pageserver/ @neondatabase/storage
/pageserver/ @neondatabase/compute @neondatabase/storage
/pgxn/ @neondatabase/compute
/proxy/ @neondatabase/proxy
/safekeeper/ @neondatabase/safekeepers

57
Cargo.lock generated
View File

@@ -798,22 +798,6 @@ dependencies = [
"either",
]
[[package]]
name = "camino"
version = "1.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c59e92b5a388f549b863a7bea62612c09f24c8393560709a54558a9abdfb3b9c"
[[package]]
name = "camino-tempfile"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d2ab15a83d13f75dbd86f082bdefd160b628476ef58d3b900a0ef74e001bb097"
dependencies = [
"camino",
"tempfile",
]
[[package]]
name = "cast"
version = "0.3.0"
@@ -1069,7 +1053,6 @@ name = "control_plane"
version = "0.1.0"
dependencies = [
"anyhow",
"camino",
"clap",
"comfy-table",
"compute_api",
@@ -1797,9 +1780,18 @@ checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
[[package]]
name = "hermit-abi"
version = "0.3.3"
version = "0.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7"
checksum = "ee512640fe35acbfb4bb779db6f0d80704c2cacfa2e39b601ef3e3f47d1ae4c7"
dependencies = [
"libc",
]
[[package]]
name = "hermit-abi"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286"
[[package]]
name = "hex"
@@ -2061,7 +2053,7 @@ version = "1.0.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2"
dependencies = [
"hermit-abi",
"hermit-abi 0.3.1",
"libc",
"windows-sys 0.48.0",
]
@@ -2078,7 +2070,7 @@ version = "0.4.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "adcf93614601c8129ddf72e2d5633df827ba6551541c6d8c59520a371475be1f"
dependencies = [
"hermit-abi",
"hermit-abi 0.3.1",
"io-lifetimes",
"rustix 0.37.19",
"windows-sys 0.48.0",
@@ -2452,11 +2444,11 @@ dependencies = [
[[package]]
name = "num_cpus"
version = "1.16.0"
version = "1.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43"
checksum = "0fac9e2da13b5eb447a6ce3d392f23a29d8694bff781bf03a16cd9ac8697593b"
dependencies = [
"hermit-abi",
"hermit-abi 0.2.6",
"libc",
]
@@ -2658,7 +2650,6 @@ version = "0.1.0"
dependencies = [
"anyhow",
"bytes",
"camino",
"clap",
"git-version",
"pageserver",
@@ -2679,8 +2670,6 @@ dependencies = [
"async-trait",
"byteorder",
"bytes",
"camino",
"camino-tempfile",
"chrono",
"clap",
"close_fds",
@@ -2732,6 +2721,7 @@ dependencies = [
"strum_macros",
"svg_fmt",
"sync_wrapper",
"tempfile",
"tenant_size_model",
"thiserror",
"tokio",
@@ -3256,7 +3246,6 @@ dependencies = [
"reqwest-tracing",
"routerify",
"rstest",
"rustc-hash",
"rustls",
"rustls-pemfile",
"scopeguard",
@@ -3424,16 +3413,14 @@ dependencies = [
"aws-sdk-s3",
"aws-smithy-http",
"aws-types",
"camino",
"camino-tempfile",
"hyper",
"metrics",
"once_cell",
"pin-project-lite",
"rand",
"scopeguard",
"serde",
"serde_json",
"tempfile",
"test-context",
"tokio",
"tokio-util",
@@ -3785,8 +3772,6 @@ dependencies = [
"async-trait",
"byteorder",
"bytes",
"camino",
"camino-tempfile",
"chrono",
"clap",
"const_format",
@@ -3815,6 +3800,7 @@ dependencies = [
"serde_with",
"signal-hook",
"storage_broker",
"tempfile",
"thiserror",
"tokio",
"tokio-io-timeout",
@@ -5113,8 +5099,6 @@ dependencies = [
"bincode",
"byteorder",
"bytes",
"camino",
"camino-tempfile",
"chrono",
"const_format",
"criterion",
@@ -5140,6 +5124,7 @@ dependencies = [
"signal-hook",
"strum",
"strum_macros",
"tempfile",
"thiserror",
"tokio",
"tokio-stream",
@@ -5213,7 +5198,6 @@ name = "wal_craft"
version = "0.1.0"
dependencies = [
"anyhow",
"camino-tempfile",
"clap",
"env_logger",
"log",
@@ -5221,6 +5205,7 @@ dependencies = [
"postgres",
"postgres_ffi",
"regex",
"tempfile",
"utils",
"workspace_hack",
]

View File

@@ -51,7 +51,6 @@ bindgen = "0.65"
bstr = "1.0"
byteorder = "1.4"
bytes = "1.0"
camino = "1.1.6"
cfg-if = "1.0.0"
chrono = { version = "0.4", default-features = false, features = ["clock"] }
clap = { version = "4.0", features = ["derive"] }
@@ -108,7 +107,6 @@ reqwest-middleware = "0.2.0"
reqwest-retry = "0.2.2"
routerify = "3"
rpds = "0.13"
rustc-hash = "1.1.0"
rustls = "0.21"
rustls-pemfile = "1"
rustls-split = "0.3"
@@ -188,7 +186,7 @@ workspace_hack = { version = "0.1", path = "./workspace_hack/" }
criterion = "0.5.1"
rcgen = "0.11"
rstest = "0.18"
camino-tempfile = "1.0.2"
tempfile = "3.4"
tonic-build = "0.9"
[patch.crates-io]

View File

@@ -368,8 +368,8 @@ RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar
FROM build-deps AS plpgsql-check-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.5.3.tar.gz -O plpgsql_check.tar.gz && \
echo "6631ec3e7fb3769eaaf56e3dfedb829aa761abf163d13dba354b4c218508e1c0 plpgsql_check.tar.gz" | sha256sum --check && \
RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.4.0.tar.gz -O plpgsql_check.tar.gz && \
echo "9ba58387a279b35a3bfa39ee611e5684e6cddb2ba046ddb2c5190b3bd2ca254a plpgsql_check.tar.gz" | sha256sum --check && \
mkdir plpgsql_check-src && cd plpgsql_check-src && tar xvzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
@@ -615,7 +615,11 @@ RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/1.1.0/postgre
#########################################################################################
#
# Layer "rust extensions"
# This layer is used to build `pgrx` deps
# This layer is used to build `pgx` deps
#
# FIXME: This needs to be updated to latest version of 'pgrx' (it was renamed from
# 'pgx' to 'pgrx') for PostgreSQL 16. And that in turn requires bumping the pgx
# dependency on all the rust extension that depend on it, too.
#
#########################################################################################
FROM build-deps AS rust-extensions-build
@@ -631,12 +635,22 @@ USER nonroot
WORKDIR /home/nonroot
ARG PG_VERSION
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
RUN case "${PG_VERSION}" in \
"v14" | "v15") \
;; \
"v16") \
echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
;; \
*) \
echo "unexpected PostgreSQL version ${PG_VERSION}" && exit 1 \
;; \
esac && \
curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
chmod +x rustup-init && \
./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \
rm rustup-init && \
cargo install --locked --version 0.10.2 cargo-pgrx && \
/bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'
cargo install --locked --version 0.7.3 cargo-pgx && \
/bin/bash -c 'cargo pgx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'
USER root
@@ -650,11 +664,23 @@ USER root
FROM rust-extensions-build AS pg-jsonschema-pg-build
ARG PG_VERSION
RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.2.0.tar.gz -O pg_jsonschema.tar.gz && \
echo "9118fc508a6e231e7a39acaa6f066fcd79af17a5db757b47d2eefbe14f7794f0 pg_jsonschema.tar.gz" | sha256sum --check && \
# caeab60d70b2fd3ae421ec66466a3abbb37b7ee6 made on 06/03/2023
# there is no release tag yet, but we need it due to the superuser fix in the control file, switch to git tag after release >= 0.1.5
RUN case "${PG_VERSION}" in \
"v14" | "v15") \
;; \
"v16") \
echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
;; \
*) \
echo "unexpected PostgreSQL version \"${PG_VERSION}\"" && exit 1 \
;; \
esac && \
wget https://github.com/supabase/pg_jsonschema/archive/caeab60d70b2fd3ae421ec66466a3abbb37b7ee6.tar.gz -O pg_jsonschema.tar.gz && \
echo "54129ce2e7ee7a585648dbb4cef6d73f795d94fe72f248ac01119992518469a4 pg_jsonschema.tar.gz" | sha256sum --check && \
mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xvzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \
sed -i 's/pgrx = "0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
cargo pgrx install --release && \
sed -i 's/pgx = "0.7.1"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
cargo pgx install --release && \
echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_jsonschema.control
#########################################################################################
@@ -667,11 +693,26 @@ RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.2.0.tar.
FROM rust-extensions-build AS pg-graphql-pg-build
ARG PG_VERSION
RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.4.0.tar.gz -O pg_graphql.tar.gz && \
echo "bd8dc7230282b3efa9ae5baf053a54151ed0e66881c7c53750e2d0c765776edc pg_graphql.tar.gz" | sha256sum --check && \
# b4988843647450a153439be367168ed09971af85 made on 22/02/2023 (from remove-pgx-contrib-spiext branch)
# Currently pgx version bump to >= 0.7.2 causes "call to unsafe function" compliation errors in
# pgx-contrib-spiext. There is a branch that removes that dependency, so use it. It is on the
# same 1.1 version we've used before.
RUN case "${PG_VERSION}" in \
"v14" | "v15") \
;; \
"v16") \
echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
;; \
*) \
echo "unexpected PostgreSQL version" && exit 1 \
;; \
esac && \
wget https://github.com/yrashk/pg_graphql/archive/b4988843647450a153439be367168ed09971af85.tar.gz -O pg_graphql.tar.gz && \
echo "0c7b0e746441b2ec24187d0e03555faf935c2159e2839bddd14df6dafbc8c9bd pg_graphql.tar.gz" | sha256sum --check && \
mkdir pg_graphql-src && cd pg_graphql-src && tar xvzf ../pg_graphql.tar.gz --strip-components=1 -C . && \
sed -i 's/pgrx = "=0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
cargo pgrx install --release && \
sed -i 's/pgx = "~0.7.1"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
sed -i 's/pgx-tests = "~0.7.1"/pgx-tests = "0.7.3"/g' Cargo.toml && \
cargo pgx install --release && \
# it's needed to enable extension because it uses untrusted C language
sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_graphql.control && \
echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_graphql.control
@@ -686,11 +727,21 @@ RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.4.0.tar.gz
FROM rust-extensions-build AS pg-tiktoken-pg-build
ARG PG_VERSION
# 26806147b17b60763039c6a6878884c41a262318 made on 26/09/2023
RUN wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6878884c41a262318.tar.gz -O pg_tiktoken.tar.gz && \
echo "e64e55aaa38c259512d3e27c572da22c4637418cf124caba904cd50944e5004e pg_tiktoken.tar.gz" | sha256sum --check && \
# 801f84f08c6881c8aa30f405fafbf00eec386a72 made on 10/03/2023
RUN case "${PG_VERSION}" in \
"v14" | "v15") \
;; \
"v16") \
echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
;; \
*) \
echo "unexpected PostgreSQL version" && exit 1 \
;; \
esac && \
wget https://github.com/kelvich/pg_tiktoken/archive/801f84f08c6881c8aa30f405fafbf00eec386a72.tar.gz -O pg_tiktoken.tar.gz && \
echo "52f60ac800993a49aa8c609961842b611b6b1949717b69ce2ec9117117e16e4a pg_tiktoken.tar.gz" | sha256sum --check && \
mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xvzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \
cargo pgrx install --release && \
cargo pgx install --release && \
echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control
#########################################################################################
@@ -703,15 +754,21 @@ RUN wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6
FROM rust-extensions-build AS pg-pgx-ulid-build
ARG PG_VERSION
RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.3.tar.gz -O pgx_ulid.tar.gz && \
echo "ee5db82945d2d9f2d15597a80cf32de9dca67b897f605beb830561705f12683c pgx_ulid.tar.gz" | sha256sum --check && \
RUN case "${PG_VERSION}" in \
"v14" | "v15") \
;; \
"v16") \
echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
;; \
*) \
echo "unexpected PostgreSQL version" && exit 1 \
;; \
esac && \
wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.0.tar.gz -O pgx_ulid.tar.gz && \
echo "908b7358e6f846e87db508ae5349fb56a88ee6305519074b12f3d5b0ff09f791 pgx_ulid.tar.gz" | sha256sum --check && \
mkdir pgx_ulid-src && cd pgx_ulid-src && tar xvzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \
echo "******************* Apply a patch for Postgres 16 support; delete in the next release ******************" && \
wget https://github.com/pksunkara/pgx_ulid/commit/f84954cf63fc8c80d964ac970d9eceed3c791196.patch && \
patch -p1 < f84954cf63fc8c80d964ac970d9eceed3c791196.patch && \
echo "********************************************************************************************************" && \
sed -i 's/pgrx = "=0.10.2"/pgrx = { version = "=0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
cargo pgrx install --release && \
sed -i 's/pgx = "=0.7.3"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
cargo pgx install --release && \
echo "trusted = true" >> /usr/local/pgsql/share/extension/ulid.control
#########################################################################################

View File

@@ -1039,7 +1039,7 @@ LIMIT 100",
let remote_extensions = spec
.remote_extensions
.as_ref()
.ok_or(anyhow::anyhow!("Remote extensions are not configured"))?;
.ok_or(anyhow::anyhow!("Remote extensions are not configured",))?;
info!("parse shared_preload_libraries from spec.cluster.settings");
let mut libs_vec = Vec::new();

View File

@@ -1,5 +1,5 @@
use std::sync::Arc;
use std::{thread, time::Duration};
use std::{thread, time};
use chrono::{DateTime, Utc};
use postgres::{Client, NoTls};
@@ -7,7 +7,7 @@ use tracing::{debug, info};
use crate::compute::ComputeNode;
const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500);
const MONITOR_CHECK_INTERVAL: u64 = 500; // milliseconds
// Spin in a loop and figure out the last activity time in the Postgres.
// Then update it in the shared state. This function never errors out.
@@ -17,12 +17,13 @@ fn watch_compute_activity(compute: &ComputeNode) {
let connstr = compute.connstr.as_str();
// Define `client` outside of the loop to reuse existing connection if it's active.
let mut client = Client::connect(connstr, NoTls);
let timeout = time::Duration::from_millis(MONITOR_CHECK_INTERVAL);
info!("watching Postgres activity at {}", connstr);
loop {
// Should be outside of the write lock to allow others to read while we sleep.
thread::sleep(MONITOR_CHECK_INTERVAL);
thread::sleep(timeout);
match &mut client {
Ok(cli) => {

View File

@@ -6,7 +6,6 @@ license.workspace = true
[dependencies]
anyhow.workspace = true
camino.workspace = true
clap.workspace = true
comfy-table.workspace = true
git-version.workspace = true

View File

@@ -1,6 +1,5 @@
use crate::{background_process, local_env::LocalEnv};
use anyhow::anyhow;
use camino::Utf8PathBuf;
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use std::{path::PathBuf, process::Child};
@@ -48,9 +47,8 @@ impl AttachmentService {
}
}
fn pid_file(&self) -> Utf8PathBuf {
Utf8PathBuf::from_path_buf(self.env.base_data_dir.join("attachment_service.pid"))
.expect("non-Unicode path")
fn pid_file(&self) -> PathBuf {
self.env.base_data_dir.join("attachment_service.pid")
}
pub fn start(&self) -> anyhow::Result<Child> {

View File

@@ -16,13 +16,12 @@ use std::ffi::OsStr;
use std::io::Write;
use std::os::unix::prelude::AsRawFd;
use std::os::unix::process::CommandExt;
use std::path::Path;
use std::path::{Path, PathBuf};
use std::process::{Child, Command};
use std::time::Duration;
use std::{fs, io, thread};
use anyhow::Context;
use camino::{Utf8Path, Utf8PathBuf};
use nix::errno::Errno;
use nix::fcntl::{FcntlArg, FdFlag};
use nix::sys::signal::{kill, Signal};
@@ -46,9 +45,9 @@ const NOTICE_AFTER_RETRIES: u64 = 50;
/// it itself.
pub enum InitialPidFile<'t> {
/// Create a pidfile, to allow future CLI invocations to manipulate the process.
Create(&'t Utf8Path),
Create(&'t Path),
/// The process will create the pidfile itself, need to wait for that event.
Expect(&'t Utf8Path),
Expect(&'t Path),
}
/// Start a background child process using the parameters given.
@@ -138,11 +137,7 @@ where
}
/// Stops the process, using the pid file given. Returns Ok also if the process is already not running.
pub fn stop_process(
immediate: bool,
process_name: &str,
pid_file: &Utf8Path,
) -> anyhow::Result<()> {
pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> anyhow::Result<()> {
let pid = match pid_file::read(pid_file)
.with_context(|| format!("read pid_file {pid_file:?}"))?
{
@@ -257,9 +252,9 @@ fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command {
/// will remain held until the cmd exits.
fn pre_exec_create_pidfile<P>(cmd: &mut Command, path: P) -> &mut Command
where
P: Into<Utf8PathBuf>,
P: Into<PathBuf>,
{
let path: Utf8PathBuf = path.into();
let path: PathBuf = path.into();
// SAFETY
// pre_exec is marked unsafe because it runs between fork and exec.
// Why is that dangerous in various ways?
@@ -316,7 +311,7 @@ where
fn process_started<F>(
pid: Pid,
pid_file_to_check: Option<&Utf8Path>,
pid_file_to_check: Option<&Path>,
status_check: &F,
) -> anyhow::Result<bool>
where

View File

@@ -223,7 +223,6 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
if attach_req.pageserver_id.is_some() {
tenant_state.generation += 1;
}
tenant_state.pageserver = attach_req.pageserver_id;
let generation = tenant_state.generation;
locked.save().await.map_err(ApiError::InternalServerError)?;

View File

@@ -116,7 +116,6 @@ fn main() -> Result<()> {
"attachment_service" => handle_attachment_service(sub_args, &env),
"safekeeper" => handle_safekeeper(sub_args, &env),
"endpoint" => handle_endpoint(sub_args, &env),
"mappings" => handle_mappings(sub_args, &mut env),
"pg" => bail!("'pg' subcommand has been renamed to 'endpoint'"),
_ => bail!("unexpected subcommand {sub_name}"),
};
@@ -817,38 +816,6 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
Ok(())
}
fn handle_mappings(sub_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> {
let (sub_name, sub_args) = match sub_match.subcommand() {
Some(ep_subcommand_data) => ep_subcommand_data,
None => bail!("no mappings subcommand provided"),
};
match sub_name {
"map" => {
let branch_name = sub_args
.get_one::<String>("branch-name")
.expect("branch-name argument missing");
let tenant_id = sub_args
.get_one::<String>("tenant-id")
.map(|x| TenantId::from_str(x))
.expect("tenant-id argument missing")
.expect("malformed tenant-id arg");
let timeline_id = sub_args
.get_one::<String>("timeline-id")
.map(|x| TimelineId::from_str(x))
.expect("timeline-id argument missing")
.expect("malformed timeline-id arg");
env.register_branch_mapping(branch_name.to_owned(), tenant_id, timeline_id)?;
Ok(())
}
other => unimplemented!("mappings subcommand {other}"),
}
}
fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageServerNode> {
let node_id = if let Some(id_str) = args.get_one::<String>("pageserver-id") {
@@ -1117,7 +1084,6 @@ fn cli() -> Command {
// --id, when using a pageserver command
let pageserver_id_arg = Arg::new("pageserver-id")
.long("id")
.global(true)
.help("pageserver id")
.required(false);
// --pageserver-id when using a non-pageserver command
@@ -1288,20 +1254,17 @@ fn cli() -> Command {
Command::new("pageserver")
.arg_required_else_help(true)
.about("Manage pageserver")
.arg(pageserver_id_arg)
.subcommand(Command::new("status"))
.subcommand(Command::new("start")
.about("Start local pageserver")
.arg(pageserver_config_args.clone())
)
.subcommand(Command::new("stop")
.about("Stop local pageserver")
.arg(stop_mode_arg.clone())
)
.subcommand(Command::new("restart")
.about("Restart local pageserver")
.arg(pageserver_config_args.clone())
)
.arg(pageserver_id_arg.clone())
.subcommand(Command::new("start").about("Start local pageserver")
.arg(pageserver_id_arg.clone())
.arg(pageserver_config_args.clone()))
.subcommand(Command::new("stop").about("Stop local pageserver")
.arg(pageserver_id_arg.clone())
.arg(stop_mode_arg.clone()))
.subcommand(Command::new("restart").about("Restart local pageserver")
.arg(pageserver_id_arg.clone())
.arg(pageserver_config_args.clone()))
)
.subcommand(
Command::new("attachment_service")
@@ -1358,8 +1321,8 @@ fn cli() -> Command {
.about("Start postgres.\n If the endpoint doesn't exist yet, it is created.")
.arg(endpoint_id_arg.clone())
.arg(tenant_id_arg.clone())
.arg(branch_name_arg.clone())
.arg(timeline_id_arg.clone())
.arg(branch_name_arg)
.arg(timeline_id_arg)
.arg(lsn_arg)
.arg(pg_port_arg)
.arg(http_port_arg)
@@ -1372,7 +1335,7 @@ fn cli() -> Command {
.subcommand(
Command::new("stop")
.arg(endpoint_id_arg)
.arg(tenant_id_arg.clone())
.arg(tenant_id_arg)
.arg(
Arg::new("destroy")
.help("Also delete data directory (now optional, should be default in future)")
@@ -1383,18 +1346,6 @@ fn cli() -> Command {
)
)
.subcommand(
Command::new("mappings")
.arg_required_else_help(true)
.about("Manage neon_local branch name mappings")
.subcommand(
Command::new("map")
.about("Create new mapping which cannot exist already")
.arg(branch_name_arg.clone())
.arg(tenant_id_arg.clone())
.arg(timeline_id_arg.clone())
)
)
// Obsolete old name for 'endpoint'. We now just print an error if it's used.
.subcommand(
Command::new("pg")

View File

@@ -7,7 +7,7 @@
//! ```
use anyhow::Context;
use camino::Utf8PathBuf;
use std::path::PathBuf;
use crate::{background_process, local_env};
@@ -30,7 +30,7 @@ pub fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
|| {
let url = broker.client_url();
let status_url = url.join("status").with_context(|| {
format!("Failed to append /status path to broker endpoint {url}")
format!("Failed to append /status path to broker endpoint {url}",)
})?;
let request = client
.get(status_url)
@@ -50,7 +50,6 @@ pub fn stop_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
background_process::stop_process(true, "storage_broker", &storage_broker_pid_file_path(env))
}
fn storage_broker_pid_file_path(env: &local_env::LocalEnv) -> Utf8PathBuf {
Utf8PathBuf::from_path_buf(env.base_data_dir.join("storage_broker.pid"))
.expect("non-Unicode path")
fn storage_broker_pid_file_path(env: &local_env::LocalEnv) -> PathBuf {
env.base_data_dir.join("storage_broker.pid")
}

View File

@@ -14,7 +14,6 @@ use std::process::{Child, Command};
use std::{io, result};
use anyhow::{bail, Context};
use camino::Utf8PathBuf;
use pageserver_api::models::{self, TenantInfo, TimelineInfo};
use postgres_backend::AuthType;
use postgres_connection::{parse_host_port, PgConnectionConfig};
@@ -145,7 +144,7 @@ impl PageServerNode {
pub fn initialize(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
// First, run `pageserver --init` and wait for it to write a config into FS and exit.
self.pageserver_init(config_overrides)
.with_context(|| format!("Failed to run init for pageserver node {}", self.conf.id))
.with_context(|| format!("Failed to run init for pageserver node {}", self.conf.id,))
}
pub fn repo_path(&self) -> PathBuf {
@@ -155,9 +154,8 @@ impl PageServerNode {
/// The pid file is created by the pageserver process, with its pid stored inside.
/// Other pageservers cannot lock the same file and overwrite it for as long as the current
/// pageserver runs. (Unless someone removes the file manually; never do that!)
fn pid_file(&self) -> Utf8PathBuf {
Utf8PathBuf::from_path_buf(self.repo_path().join("pageserver.pid"))
.expect("non-Unicode path")
fn pid_file(&self) -> PathBuf {
self.repo_path().join("pageserver.pid")
}
pub fn start(&self, config_overrides: &[&str]) -> anyhow::Result<Child> {

View File

@@ -11,7 +11,6 @@ use std::process::Child;
use std::{io, result};
use anyhow::Context;
use camino::Utf8PathBuf;
use postgres_connection::PgConnectionConfig;
use reqwest::blocking::{Client, RequestBuilder, Response};
use reqwest::{IntoUrl, Method};
@@ -98,9 +97,8 @@ impl SafekeeperNode {
SafekeeperNode::datadir_path_by_id(&self.env, self.id)
}
pub fn pid_file(&self) -> Utf8PathBuf {
Utf8PathBuf::from_path_buf(self.datadir_path().join("safekeeper.pid"))
.expect("non-Unicode path")
pub fn pid_file(&self) -> PathBuf {
self.datadir_path().join("safekeeper.pid")
}
pub fn start(&self, extra_opts: Vec<String>) -> anyhow::Result<Child> {

View File

@@ -23,7 +23,7 @@ vulnerability = "deny"
unmaintained = "warn"
yanked = "warn"
notice = "warn"
ignore = []
ignore = ["RUSTSEC-2023-0052"]
# This section is considered when running `cargo deny check licenses`
# More documentation for the licenses section can be found here:

View File

@@ -0,0 +1,244 @@
# Sharding Phase 1: Static Key-space Sharding
## Summary
To enable databases with sizes approaching the capacity of a pageserver's disk,
it is necessary to break up the storage for the database, or _shard_ it.
Sharding in general is a complex area. This RFC aims to define a modest initial
capability that will permit creating large-capacity databases using a static configuration
defined at time of Tenant creation.
## Motivation
Currently, all data for a Tenant, including all its timelines, is stored on a single
pageserver. The local storage required may be several times larger than the actual
database size, due to LSM write inflation.
If a database is larger than what one pageserver can hold, then it becomes impossible
for the pageserver to hold it in local storage, as it must do to provide service to
clients.
### Prior art
Numerous: sharding is a long-discussed feature for the pageserver.
Prior art in other distributed systems is too broad to capture here: pretty much
any scale out storage system does something like this.
## Requirements
- Enable creating a large (for example, 16TiB) database without requiring dedicated
pageserver nodes.
- Share read/write bandwidth costs for large databases across pageservers, as well
as storage capacity, in order to avoid large capacity databases acting as I/O hotspots
that disrupt service to other tenants.
- Our data distribution scheme should handle sparse/nonuniform keys well, since postgres
does not write out a single contiguous ranges of page numbers.
*Note: the definition of 'large database' is arbitrary, but the lower bound is to ensure that a database
that a user might create on a current-gen enterprise SSD should also work well on
Neon. The upper bound is whatever postgres can handle: i.e. we must make sure that the
pageserver backend is not the limiting factor in the database size*.
## Non Goals
- Independently distributing timelines within the same tenant. If a tenant has many
timelines, then sharding may be a less efficient mechanism for distributing load than
sharing out timelines between pageservers.
- Distributing work in the LSN dimension: this RFC focuses on the Key dimension only,
based on the idea that separate mechanisms will make sense for each dimension.
## Impacted Components
pageserver, control plane, safekeeper (optional)
## Terminology
**Key**: a postgres page number. In the sense that the pageserver is a versioned key-value store,
the page number is the key in that store.
**LSN dimension**: this just means the range of LSNs (history), when talking about the range
of keys and LSNs as a two dimensional space.
## Implementation
### Key sharding vs. LSN sharding
When we think of sharding across the two dimensional key/lsn space, this is an
opportunity to think about how the two dimensions differ:
- Sharding the key space distributes the _write_ workload of ingesting data
and compacting. This work must be carefully managed so that exactly one
node owns a given key.
- Sharding the LSN space distributes the _historical read_ workload. This work
can be done by anyone without any special coordination, as long as they can
see the remote index and layers.
The key sharding is the harder part, and also the more urgent one, to support larger
capacity databases. Because distributing historical LSN read work is a relatively
simpler problem that most users don't have, we defer it to future work. It is anticipated
that some quite simple P2P offload model will enable distributing work for historical
reads: a node which is low on space can call out to peer to ask it to download and
serve reads from a historical layer.
### Key mapping scheme
Having decided to focus on key sharding, we must next decide how we will map
keys to shards.
It is proposed to use a "wide striping" approach, to obtain a good compromise
between data locality and avoiding entire large relations mapping to the same shard.
The mapping is quite simple:
- Define a stripe size, such as 256MiB. Map this to a key count, such that a contiguous
range of 256MiB keys would all fall into this stripe, i.e. divide by 8kiB to get 32k.
- Map a key to a stripe by integer division.
- Map a stripe to a shard by taking the shard index modulo the shard count.
This scheme will achieve a good balance as long as there is no aliasing of the keys
to the stripe width. In the example above, if someone had 4 shards and wrote
keys that were all 4*32k apart, they would all map to the same shard. However, we do
not have to worry about this, since end users do not control page numbers: as long as
we do not pick stripe sizes that map to any problematic postgres behaviors, we'll be fine.
### Important Types
#### `ShardMap`
Provides all the information needed to route a request for a particular
key to the correct pageserver:
- Stripe size
- Shard count
- Address of the pageserver hosting each shard
This structure's size is linear with the number of shards.
#### `ShardIdentity`
Provides the information needed to know whether a particular key belongs
to a particular shard:
- Stripe size
- Shard count
- Shard index
This structure's size is constant.
### Pageserver changes
Everywhere the Pageserver currently deals with Tenants, it will move to dealing with
TenantShards, which are just a `Tenant` plus a `ShardIdentity` telling it which part
of the keyspace it owns.
When the pageserver subscribes to a safekeeper for WAL updates, it must provide
its `ShardIdentity` to receive the relevant subset of the WAL.
When the pageserver writes layers and index_part.json to remote storage, it must
include the shard index & count in the name, to avoid collisions (the count is
necessary for future-proofing: the count will vary in time). These keys
will also include a generation number: the [generation numbers](025-generation-numbers.md) system will work
exactly the same for TenantShards as it does for Tenants today: each shard will have
its own generation number.
The pageserver doesn't have to do anything special during ingestion, compaction
or GC. It is implicitly operating on the subset of keys that map to its ShardIdentity.
This will result in sparse layer files, containing keys only in the stripes that this
shard owns. Where optimizations currently exist in compaction for spotting "gaps" in
the key range, these should be updated to ignore gaps that are due to sharding, to
avoid spuriously splitting up layers ito stripe-sized pieces.
### Pageserver Controller changes
The pageserver controller is a new component, which is responsible for abstracting
away the business of managing individual tenant placement on pagservers. It will
also act as the abstraction on top of sharding, so that the control plane continue
to see a Tenant as a single object, even though the reality is that it is many
TenantShards.
For the rest of this RFC, think of the Pageserver Controller as a component of
the control plane. The actual implementation is beyond the scope of this RFC
and will be described in more detail elsewhere.
### Safekeeper changes
The safekeeper's API for subscribing to a WAL will be extended to enable callers
to provide a `ShardIdentity`. In this mode it will only send WAL entries that
fall within the keyspace belonging to the shard, and WAL entries that are to
be mirrored to all shards.
Metadata updates describing databases+relations are mirrored to
all shards, and other WAL messages are only provided to the shard
that owns the key being updated. For any operation that updates multiple
keys, it will be provided to all the shards whose key ranges intersect with
one or more of the keys referenced in the WAL message.
### Pageserver Controller
### Endpoints
Compute endpoints will need to:
- Accept a ShardMap as part of their configuration from the control plane
- Route pageserver requests according to that ShardMap
### Control Plane
#### Publishing ShardMap updates
The control plane will provide an API for the pageserver controller to publish updates
to the ShardMap for a tenant. When such an update is provided, it will be used to
update the configuration of any endpoints currently active for the tenant.
The ShardMap will be opaque to the Control Plane: it doesn't need to do anything with it
other than storing and passing on to endpoints.
#### Attaching via the Pageserver Controller
The Control Plane will issue attach/create API calls to the pageserver controller
instead of directly to pageservers. This will relieve the control plane of the need
to know about sharding.
#### Enabling sharding for large tenants
When a Tenant is created, it is up to the control plane to provide a hint to
the pageserver about how large it will be. This may be implemented as a service tier,
where users creating very large databases would be onboarded to the tier, and then
the Tenants they create would be created with a larger number of shards. For the
general population of users we should continue to use 1 shard by default.
## Next Steps
Clearly, the mechanism described in this RFC has substantial limitations:
- A) the number of shards in a tenant is defined at creation time.
- B) data is not distributed across the LSN dimension
To address `A`, a _splitting_ feature will later be added. One shard can split its
data into a number of children by doing a special compaction operation to generate
image layers broken up child-shard-wise, and then writing out an index_part.json for
each child. This will then require coordination with the pageserver controller to
safely attach these new child shards and then move them around to distribute work.
The opposite _merging_ operation can also be imagined, but is unlikely to be implemented:
once a Tenant has been sharded, there is little value in merging it again.
To address `B`, it is envisaged to have some gossip mechanism for pageservers to communicate
about their workload, and then a getpageatlsn offload mechanism where one pageserver can
ask another to go read the necessary layers from remote storage to serve the read. This
requires relativly little coordination because it is read-only: any node can service any
read. All reads to a particular shard would still flow through one node, but the
disk capactity & I/O impact of servicing the read would be distributed.
## FAQ/Alternatives
### Why stripe the data, rather than using contiguous ranges of keyspace for each shard?
When a database is growing under a write workload, writes may predominantly hit the
end of the keyspace, creating a bandwidth hotspot on that shard. Similarly, if the user
is intensively re-writing a particular relation, if that relation lived in a particular
shard then it would not achieve our goal of distributing the write work across shards.
### Why not proxy read requests through one pageserver, so that endpoints don't have to change?
Two reasons:
1. This would not achieve scale-out of network bandwidth: a busy tenant with a large
database would still cause a load hotspot on the pageserver routing its read requests.
2. Implementing a proxy model as a stop-gap would not be a cheap option, because
it requires making pageservers aware of their peers, and adding synchronisation to
keep pageservers aware of their peers as they come and go.

View File

@@ -0,0 +1,119 @@
# Pageserver Controller Phase 1: Generations
## Summary
In the [generation numbers RFC](025-generation-numbers.md), it was proposed that
the console/control plane would act as the central coordinator for issuing generation
numbers.
That approach has not proven practical, so this RFC proposes an alternative implementation
where generation numbers are managed in a different service.
Calls to generation-aware pageserver APIs like create/attach will call out to this
new _pageserver controller_ to acquire generation numbers. This service will also
form the basis for satisfying future pageserver management requirements, such as
coordinating sharding, doing automatic capacity balancing, and many more.
## Motivation
This is a dependency for delivering high availability.
### Prior art
None
## Requirements
- Provide a hook for the pageserver to use when it receives an attach/create/load API
call, which will yield a generation that is safe for the pageserver to use.
- Implement the /re-attach and /validate APIs required for the generation numbers feature
to work.
## Non Goals
- This is not intended to interact with any components other than the pageserver, or
to integrate with the broader control plane in any way.
## Impacted Components
pageserver, pageserver controller (new)
## Implementation
We may start from the minimal `attachment_service` used in automated tests.
### Data store
For generation numbers, we need a persistent, linearizable data store. Postgres is sufficient for
this: we already have postgres instances used for other control plane work.
The storage for the Pageserver Controller will be independent of other components:
it might use the same physical database server but would use an independent database.
### Deployment
There will be one instance per region. In future we would aim to define the concept
of a pageserver cluster and have one controller per cluster, but in the short term
one per region will be functionally okay for current scale.
The pageserver controller will be deployed within kubernetes, in the same way as
the storage broker (which is currently via a [helm chart](https://github.com/neondatabase/helm-charts/tree/main/charts/neon-storage-broker)).
### Security
The pageserver controller's API will do authentication with JWT, the same as
the pageserver's existing API.
### Correctness
It is essential that pageservers call into the controller at the _very start_ of
handling attach/create/load API requests. They should not do any work at all until
they have acquired that generation number.
If the call fails, they must retry: it is not safe to proceed without a generation number.
## Future
Having a call chain that goes `Control plane -> Pageserver -> Pageserver controller`
is clearly a little strange: we are only doing this to avoid needing to make changes
to the control plane.
In future, we will change the control plane to call directly into the pageserver
controller, which would then call onwards into the pageserver. This would be a fairly
small change to the controller, since all the logic around storing and updating
generation numbers would stay the same: just the behavior of the API frontend
would be different.
The work to enable pageservers to communicate with the controller is not wasted,
because they still communicate in that direction when invoking `/re-attach`
and `/validate`
## Alternatives considered
### Run in the console/control plane codebase
The control plane is a large Go codebase that uses extensive code generation, and
has to be quite generic to manage many different types of component.
### Direct DB access
We could have pageservers call directly into a shared database to acquire and update
generation numbers (with carefully crafted transactions to protect against concurrent
attaches getting the same generation, etc).
Pros:
- No extra service required, simpler deployment
Cons:
- No future path to a cleaner architecture: the pageserver controller can be implemented
as an extensible place for implement more functionality in future, whereas a mechanism
to do generation numbers via SQL queries from the pageserver would be specialized
and the code would probably be disposed of in the relatively near future.
- Puts onus entirely on SQL query correctness to mediate concurrent access.
The pageserver controller also has to be correct in this respect in case there
is more than one instance running, but it is much less likely to hit this path,
so the overall risk of issues is lower when using a central service.
The main downside to that approach is that it doesn't provide the future path that
the pageserver controller does

View File

@@ -107,7 +107,7 @@ pub const CHUNK_SIZE: usize = 1000;
// Just a wrapper around a slice of events
// to serialize it as `{"events" : [ ] }
#[derive(serde::Serialize, serde::Deserialize)]
#[derive(serde::Serialize)]
pub struct EventChunk<'a, T: Clone> {
pub events: std::borrow::Cow<'a, [T]>,
}

View File

@@ -10,7 +10,6 @@ use serde_with::{serde_as, DisplayFromStr};
use strum_macros;
use utils::{
completion,
generation::Generation,
history_buffer::HistoryBufferWithDropCounter,
id::{NodeId, TenantId, TimelineId},
lsn::Lsn,
@@ -219,8 +218,6 @@ impl std::ops::Deref for TenantCreateRequest {
}
}
/// An alternative representation of `pageserver::tenant::TenantConf` with
/// simpler types.
#[derive(Serialize, Deserialize, Debug, Default)]
pub struct TenantConfig {
pub checkpoint_distance: Option<u64>,
@@ -246,39 +243,6 @@ pub struct TenantConfig {
pub gc_feedback: Option<bool>,
}
/// A flattened analog of a `pagesever::tenant::LocationMode`, which
/// lists out all possible states (and the virtual "Detached" state)
/// in a flat form rather than using rust-style enums.
#[derive(Serialize, Deserialize, Debug)]
pub enum LocationConfigMode {
AttachedSingle,
AttachedMulti,
AttachedStale,
Secondary,
Detached,
}
#[derive(Serialize, Deserialize, Debug)]
pub struct LocationConfigSecondary {
pub warm: bool,
}
/// An alternative representation of `pageserver::tenant::LocationConf`,
/// for use in external-facing APIs.
#[derive(Serialize, Deserialize, Debug)]
pub struct LocationConfig {
pub mode: LocationConfigMode,
/// If attaching, in what generation?
#[serde(default)]
pub generation: Option<Generation>,
#[serde(default)]
pub secondary_conf: Option<LocationConfigSecondary>,
// If requesting mode `Secondary`, configuration for that.
// Custom storage configuration for the tenant, if any
pub tenant_conf: TenantConfig,
}
#[serde_as]
#[derive(Serialize, Deserialize)]
#[serde(transparent)]
@@ -289,16 +253,6 @@ pub struct StatusResponse {
pub id: NodeId,
}
#[serde_as]
#[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct TenantLocationConfigRequest {
#[serde_as(as = "DisplayFromStr")]
pub tenant_id: TenantId,
#[serde(flatten)]
pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
}
#[serde_as]
#[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
@@ -409,15 +363,8 @@ pub struct TimelineInfo {
pub latest_gc_cutoff_lsn: Lsn,
#[serde_as(as = "DisplayFromStr")]
pub disk_consistent_lsn: Lsn,
/// The LSN that we have succesfully uploaded to remote storage
#[serde_as(as = "DisplayFromStr")]
pub remote_consistent_lsn: Lsn,
/// The LSN that we are advertizing to safekeepers
#[serde_as(as = "DisplayFromStr")]
pub remote_consistent_lsn_visible: Lsn,
pub current_logical_size: Option<u64>, // is None when timeline is Unloaded
/// Sum of the size of all layer files.
/// If a layer is present in both local FS and S3, it counts only once.

View File

@@ -442,20 +442,10 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
trace!("got message {:?}", msg);
let result = self.process_message(handler, msg, &mut query_string).await;
tokio::select!(
biased;
_ = shutdown_watcher() => {
// We were requested to shut down.
tracing::info!("shutdown request received during response flush");
return Ok(())
},
flush_r = self.flush() => {
flush_r?;
}
);
self.flush().await?;
match result? {
ProcessMsgResult::Continue => {
self.flush().await?;
continue;
}
ProcessMsgResult::Break => break,

View File

@@ -12,7 +12,7 @@ log.workspace = true
once_cell.workspace = true
postgres.workspace = true
postgres_ffi.workspace = true
camino-tempfile.workspace = true
tempfile.workspace = true
workspace_hack.workspace = true

View File

@@ -1,5 +1,4 @@
use anyhow::{bail, ensure};
use camino_tempfile::{tempdir, Utf8TempDir};
use log::*;
use postgres::types::PgLsn;
use postgres::Client;
@@ -9,6 +8,7 @@ use std::cmp::Ordering;
use std::path::{Path, PathBuf};
use std::process::Command;
use std::time::{Duration, Instant};
use tempfile::{tempdir, TempDir};
macro_rules! xlog_utils_test {
($version:ident) => {
@@ -33,7 +33,7 @@ pub struct Conf {
pub struct PostgresServer {
process: std::process::Child,
_unix_socket_dir: Utf8TempDir,
_unix_socket_dir: TempDir,
client_config: postgres::Config,
}

View File

@@ -13,7 +13,6 @@ aws-types.workspace = true
aws-config.workspace = true
aws-sdk-s3.workspace = true
aws-credential-types.workspace = true
camino.workspace = true
hyper = { workspace = true, features = ["stream"] }
serde.workspace = true
serde_json.workspace = true
@@ -28,6 +27,5 @@ pin-project-lite.workspace = true
workspace_hack.workspace = true
[dev-dependencies]
camino-tempfile.workspace = true
tempfile.workspace = true
test-context.workspace = true
rand.workspace = true

View File

@@ -13,14 +13,13 @@ use std::{
collections::HashMap,
fmt::Debug,
num::{NonZeroU32, NonZeroUsize},
path::{Path, PathBuf},
pin::Pin,
sync::Arc,
};
use anyhow::{bail, Context};
use camino::{Utf8Path, Utf8PathBuf};
use serde::{Deserialize, Serialize};
use tokio::io;
use toml_edit::Item;
use tracing::info;
@@ -43,44 +42,22 @@ pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
/// <https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax>
pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None;
/// As defined in S3 docs
pub const MAX_KEYS_PER_DELETE: usize = 1000;
const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
/// Path on the remote storage, relative to some inner prefix.
/// The prefix is an implementation detail, that allows representing local paths
/// as the remote ones, stripping the local storage prefix away.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct RemotePath(Utf8PathBuf);
impl Serialize for RemotePath {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
serializer.collect_str(self)
}
}
impl<'de> Deserialize<'de> for RemotePath {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
let str = String::deserialize(deserializer)?;
Ok(Self(Utf8PathBuf::from(&str)))
}
}
pub struct RemotePath(PathBuf);
impl std::fmt::Display for RemotePath {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
std::fmt::Display::fmt(&self.0, f)
write!(f, "{}", self.0.display())
}
}
impl RemotePath {
pub fn new(relative_path: &Utf8Path) -> anyhow::Result<Self> {
pub fn new(relative_path: &Path) -> anyhow::Result<Self> {
anyhow::ensure!(
relative_path.is_relative(),
"Path {relative_path:?} is not relative"
@@ -89,31 +66,27 @@ impl RemotePath {
}
pub fn from_string(relative_path: &str) -> anyhow::Result<Self> {
Self::new(Utf8Path::new(relative_path))
Self::new(Path::new(relative_path))
}
pub fn with_base(&self, base_path: &Utf8Path) -> Utf8PathBuf {
pub fn with_base(&self, base_path: &Path) -> PathBuf {
base_path.join(&self.0)
}
pub fn object_name(&self) -> Option<&str> {
self.0.file_name()
self.0.file_name().and_then(|os_str| os_str.to_str())
}
pub fn join(&self, segment: &Utf8Path) -> Self {
pub fn join(&self, segment: &Path) -> Self {
Self(self.0.join(segment))
}
pub fn get_path(&self) -> &Utf8PathBuf {
pub fn get_path(&self) -> &PathBuf {
&self.0
}
pub fn extension(&self) -> Option<&str> {
self.0.extension()
}
pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Utf8Path, std::path::StripPrefixError> {
self.0.strip_prefix(&p.0)
self.0.extension()?.to_str()
}
}
@@ -311,7 +284,7 @@ impl GenericRemoteStorage {
pub fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result<Self> {
Ok(match &storage_config.storage {
RemoteStorageKind::LocalFs(root) => {
info!("Using fs root '{root}' as a remote storage");
info!("Using fs root '{}' as a remote storage", root.display());
Self::LocalFs(LocalFs::new(root.clone())?)
}
RemoteStorageKind::AwsS3(s3_config) => {
@@ -379,7 +352,7 @@ pub struct RemoteStorageConfig {
pub enum RemoteStorageKind {
/// Storage based on local file system.
/// Specify a root folder to place all stored files into.
LocalFs(Utf8PathBuf),
LocalFs(PathBuf),
/// AWS S3 based storage, storing all files in the S3 bucket
/// specified by the config
AwsS3(S3Config),
@@ -474,7 +447,7 @@ impl RemoteStorageConfig {
concurrency_limit,
max_keys_per_list_response,
}),
(Some(local_path), None, None) => RemoteStorageKind::LocalFs(Utf8PathBuf::from(
(Some(local_path), None, None) => RemoteStorageKind::LocalFs(PathBuf::from(
parse_toml_string("local_path", local_path)?,
)),
(Some(_), Some(_), _) => bail!("local_path and bucket_name are mutually exclusive"),
@@ -519,23 +492,23 @@ mod tests {
#[test]
fn test_object_name() {
let k = RemotePath::new(Utf8Path::new("a/b/c")).unwrap();
let k = RemotePath::new(Path::new("a/b/c")).unwrap();
assert_eq!(k.object_name(), Some("c"));
let k = RemotePath::new(Utf8Path::new("a/b/c/")).unwrap();
let k = RemotePath::new(Path::new("a/b/c/")).unwrap();
assert_eq!(k.object_name(), Some("c"));
let k = RemotePath::new(Utf8Path::new("a/")).unwrap();
let k = RemotePath::new(Path::new("a/")).unwrap();
assert_eq!(k.object_name(), Some("a"));
// XXX is it impossible to have an empty key?
let k = RemotePath::new(Utf8Path::new("")).unwrap();
let k = RemotePath::new(Path::new("")).unwrap();
assert_eq!(k.object_name(), None);
}
#[test]
fn rempte_path_cannot_be_created_from_absolute_ones() {
let err = RemotePath::new(Utf8Path::new("/")).expect_err("Should fail on absolute paths");
let err = RemotePath::new(Path::new("/")).expect_err("Should fail on absolute paths");
assert_eq!(err.to_string(), "Path \"/\" is not relative");
}
}

View File

@@ -4,10 +4,15 @@
//! This storage used in tests, but can also be used in cases when a certain persistent
//! volume is mounted to the local FS.
use std::{borrow::Cow, future::Future, io::ErrorKind, pin::Pin};
use std::{
borrow::Cow,
future::Future,
io::ErrorKind,
path::{Path, PathBuf},
pin::Pin,
};
use anyhow::{bail, ensure, Context};
use camino::{Utf8Path, Utf8PathBuf};
use tokio::{
fs,
io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
@@ -23,20 +28,20 @@ const LOCAL_FS_TEMP_FILE_SUFFIX: &str = "___temp";
#[derive(Debug, Clone)]
pub struct LocalFs {
storage_root: Utf8PathBuf,
storage_root: PathBuf,
}
impl LocalFs {
/// Attempts to create local FS storage, along with its root directory.
/// Storage root will be created (if does not exist) and transformed into an absolute path (if passed as relative).
pub fn new(mut storage_root: Utf8PathBuf) -> anyhow::Result<Self> {
pub fn new(mut storage_root: PathBuf) -> anyhow::Result<Self> {
if !storage_root.exists() {
std::fs::create_dir_all(&storage_root).with_context(|| {
format!("Failed to create all directories in the given root path {storage_root:?}")
})?;
}
if !storage_root.is_absolute() {
storage_root = storage_root.canonicalize_utf8().with_context(|| {
storage_root = storage_root.canonicalize().with_context(|| {
format!("Failed to represent path {storage_root:?} as an absolute path")
})?;
}
@@ -45,7 +50,7 @@ impl LocalFs {
}
// mirrors S3Bucket::s3_object_to_relative_path
fn local_file_to_relative_path(&self, key: Utf8PathBuf) -> RemotePath {
fn local_file_to_relative_path(&self, key: PathBuf) -> RemotePath {
let relative_path = key
.strip_prefix(&self.storage_root)
.expect("relative path must contain storage_root as prefix");
@@ -54,18 +59,22 @@ impl LocalFs {
async fn read_storage_metadata(
&self,
file_path: &Utf8Path,
file_path: &Path,
) -> anyhow::Result<Option<StorageMetadata>> {
let metadata_path = storage_metadata_path(file_path);
if metadata_path.exists() && metadata_path.is_file() {
let metadata_string = fs::read_to_string(&metadata_path).await.with_context(|| {
format!("Failed to read metadata from the local storage at '{metadata_path}'")
format!(
"Failed to read metadata from the local storage at '{}'",
metadata_path.display()
)
})?;
serde_json::from_str(&metadata_string)
.with_context(|| {
format!(
"Failed to deserialize metadata from the local storage at '{metadata_path}'",
"Failed to deserialize metadata from the local storage at '{}'",
metadata_path.display()
)
})
.map(|metadata| Some(StorageMetadata(metadata)))
@@ -162,21 +171,25 @@ impl RemoteStorage for LocalFs {
}
}
// Note that Utf8PathBuf starts_with only considers full path segments, but
// Note that PathBuf starts_with only considers full path segments, but
// object prefixes are arbitrary strings, so we need the strings for doing
// starts_with later.
let prefix = full_path.as_str();
let prefix = full_path.to_string_lossy();
let mut files = vec![];
let mut directory_queue = vec![initial_dir];
let mut directory_queue = vec![initial_dir.clone()];
while let Some(cur_folder) = directory_queue.pop() {
let mut entries = cur_folder.read_dir_utf8()?;
while let Some(Ok(entry)) = entries.next() {
let file_name = entry.file_name();
let full_file_name = cur_folder.join(file_name);
if full_file_name.as_str().starts_with(prefix) {
let mut entries = fs::read_dir(cur_folder.clone()).await?;
while let Some(entry) = entries.next_entry().await? {
let file_name: PathBuf = entry.file_name().into();
let full_file_name = cur_folder.clone().join(&file_name);
if full_file_name
.to_str()
.map(|s| s.starts_with(prefix.as_ref()))
.unwrap_or(false)
{
let file_remote_path = self.local_file_to_relative_path(full_file_name.clone());
files.push(file_remote_path);
files.push(file_remote_path.clone());
if full_file_name.is_dir() {
directory_queue.push(full_file_name);
}
@@ -217,7 +230,10 @@ impl RemoteStorage for LocalFs {
.open(&temp_file_path)
.await
.with_context(|| {
format!("Failed to open target fs destination at '{target_file_path}'")
format!(
"Failed to open target fs destination at '{}'",
target_file_path.display()
)
})?,
);
@@ -228,7 +244,8 @@ impl RemoteStorage for LocalFs {
.await
.with_context(|| {
format!(
"Failed to upload file (write temp) to the local storage at '{temp_file_path}'",
"Failed to upload file (write temp) to the local storage at '{}'",
temp_file_path.display()
)
})?;
@@ -245,7 +262,8 @@ impl RemoteStorage for LocalFs {
destination.flush().await.with_context(|| {
format!(
"Failed to upload (flush temp) file to the local storage at '{temp_file_path}'",
"Failed to upload (flush temp) file to the local storage at '{}'",
temp_file_path.display()
)
})?;
@@ -253,7 +271,8 @@ impl RemoteStorage for LocalFs {
.await
.with_context(|| {
format!(
"Failed to upload (rename) file to the local storage at '{target_file_path}'",
"Failed to upload (rename) file to the local storage at '{}'",
target_file_path.display()
)
})?;
@@ -267,7 +286,8 @@ impl RemoteStorage for LocalFs {
.await
.with_context(|| {
format!(
"Failed to write metadata to the local storage at '{storage_metadata_path}'",
"Failed to write metadata to the local storage at '{}'",
storage_metadata_path.display()
)
})?;
}
@@ -373,16 +393,16 @@ impl RemoteStorage for LocalFs {
}
}
fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf {
fn storage_metadata_path(original_path: &Path) -> PathBuf {
path_with_suffix_extension(original_path, "metadata")
}
fn get_all_files<'a, P>(
directory_path: P,
recursive: bool,
) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<Utf8PathBuf>>> + Send + Sync + 'a>>
) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<PathBuf>>> + Send + Sync + 'a>>
where
P: AsRef<Utf8Path> + Send + Sync + 'a,
P: AsRef<Path> + Send + Sync + 'a,
{
Box::pin(async move {
let directory_path = directory_path.as_ref();
@@ -392,13 +412,7 @@ where
let mut dir_contents = fs::read_dir(directory_path).await?;
while let Some(dir_entry) = dir_contents.next_entry().await? {
let file_type = dir_entry.file_type().await?;
let entry_path =
Utf8PathBuf::from_path_buf(dir_entry.path()).map_err(|pb| {
anyhow::Error::msg(format!(
"non-Unicode path: {}",
pb.to_string_lossy()
))
})?;
let entry_path = dir_entry.path();
if file_type.is_symlink() {
debug!("{entry_path:?} is a symlink, skipping")
} else if file_type.is_dir() {
@@ -421,10 +435,13 @@ where
})
}
async fn create_target_directory(target_file_path: &Utf8Path) -> anyhow::Result<()> {
async fn create_target_directory(target_file_path: &Path) -> anyhow::Result<()> {
let target_dir = match target_file_path.parent() {
Some(parent_dir) => parent_dir,
None => bail!("File path '{target_file_path}' has no parent directory"),
None => bail!(
"File path '{}' has no parent directory",
target_file_path.display()
),
};
if !target_dir.exists() {
fs::create_dir_all(target_dir).await?;
@@ -432,9 +449,13 @@ async fn create_target_directory(target_file_path: &Utf8Path) -> anyhow::Result<
Ok(())
}
fn file_exists(file_path: &Utf8Path) -> anyhow::Result<bool> {
fn file_exists(file_path: &Path) -> anyhow::Result<bool> {
if file_path.exists() {
ensure!(file_path.is_file(), "file path '{file_path}' is not a file");
ensure!(
file_path.is_file(),
"file path '{}' is not a file",
file_path.display()
);
Ok(true)
} else {
Ok(false)
@@ -445,13 +466,13 @@ fn file_exists(file_path: &Utf8Path) -> anyhow::Result<bool> {
mod fs_tests {
use super::*;
use camino_tempfile::tempdir;
use std::{collections::HashMap, io::Write};
use tempfile::tempdir;
async fn read_and_assert_remote_file_contents(
storage: &LocalFs,
#[allow(clippy::ptr_arg)]
// have to use &Utf8PathBuf due to `storage.local_path` parameter requirements
// have to use &PathBuf due to `storage.local_path` parameter requirements
remote_storage_path: &RemotePath,
expected_metadata: Option<&StorageMetadata>,
) -> anyhow::Result<String> {
@@ -498,7 +519,7 @@ mod fs_tests {
async fn upload_file_negatives() -> anyhow::Result<()> {
let storage = create_storage()?;
let id = RemotePath::new(Utf8Path::new("dummy"))?;
let id = RemotePath::new(Path::new("dummy"))?;
let content = std::io::Cursor::new(b"12345");
// Check that you get an error if the size parameter doesn't match the actual
@@ -523,8 +544,7 @@ mod fs_tests {
}
fn create_storage() -> anyhow::Result<LocalFs> {
let storage_root = tempdir()?.path().to_path_buf();
LocalFs::new(storage_root)
LocalFs::new(tempdir()?.path().to_owned())
}
#[tokio::test]
@@ -541,7 +561,7 @@ mod fs_tests {
);
let non_existing_path = "somewhere/else";
match storage.download(&RemotePath::new(Utf8Path::new(non_existing_path))?).await {
match storage.download(&RemotePath::new(Path::new(non_existing_path))?).await {
Err(DownloadError::NotFound) => {} // Should get NotFound for non existing keys
other => panic!("Should get a NotFound error when downloading non-existing storage files, but got: {other:?}"),
}
@@ -755,7 +775,7 @@ mod fs_tests {
}
async fn create_file_for_upload(
path: &Utf8Path,
path: &Path,
contents: &str,
) -> anyhow::Result<(io::BufReader<fs::File>, usize)> {
std::fs::create_dir_all(path.parent().unwrap())?;

View File

@@ -33,10 +33,11 @@ use tracing::debug;
use super::StorageMetadata;
use crate::{
Download, DownloadError, RemotePath, RemoteStorage, S3Config, MAX_KEYS_PER_DELETE,
REMOTE_STORAGE_PREFIX_SEPARATOR,
Download, DownloadError, RemotePath, RemoteStorage, S3Config, REMOTE_STORAGE_PREFIX_SEPARATOR,
};
const MAX_DELETE_OBJECTS_REQUEST_SIZE: usize = 1000;
pub(super) mod metrics;
use self::metrics::{AttemptOutcome, RequestKind};
@@ -47,47 +48,10 @@ pub struct S3Bucket {
bucket_name: String,
prefix_in_bucket: Option<String>,
max_keys_per_list_response: Option<i32>,
concurrency_limiter: ConcurrencyLimiter,
}
struct ConcurrencyLimiter {
// Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded.
// Same goes to IAM, which is queried before every S3 request, if enabled. IAM has even lower RPS threshold.
// The helps to ensure we don't exceed the thresholds.
write: Arc<Semaphore>,
read: Arc<Semaphore>,
}
impl ConcurrencyLimiter {
fn for_kind(&self, kind: RequestKind) -> &Arc<Semaphore> {
match kind {
RequestKind::Get => &self.read,
RequestKind::Put => &self.write,
RequestKind::List => &self.read,
RequestKind::Delete => &self.write,
}
}
async fn acquire(
&self,
kind: RequestKind,
) -> Result<tokio::sync::SemaphorePermit<'_>, tokio::sync::AcquireError> {
self.for_kind(kind).acquire().await
}
async fn acquire_owned(
&self,
kind: RequestKind,
) -> Result<tokio::sync::OwnedSemaphorePermit, tokio::sync::AcquireError> {
Arc::clone(self.for_kind(kind)).acquire_owned().await
}
fn new(limit: usize) -> ConcurrencyLimiter {
Self {
read: Arc::new(Semaphore::new(limit)),
write: Arc::new(Semaphore::new(limit)),
}
}
concurrency_limiter: Arc<Semaphore>,
}
#[derive(Default)]
@@ -154,7 +118,7 @@ impl S3Bucket {
bucket_name: aws_config.bucket_name.clone(),
max_keys_per_list_response: aws_config.max_keys_per_list_response,
prefix_in_bucket,
concurrency_limiter: ConcurrencyLimiter::new(aws_config.concurrency_limit.get()),
concurrency_limiter: Arc::new(Semaphore::new(aws_config.concurrency_limit.get())),
})
}
@@ -180,11 +144,12 @@ impl S3Bucket {
assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR);
let path_string = path
.get_path()
.as_str()
.trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR);
.to_string_lossy()
.trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR)
.to_string();
match &self.prefix_in_bucket {
Some(prefix) => prefix.clone() + "/" + path_string,
None => path_string.to_string(),
Some(prefix) => prefix.clone() + "/" + &path_string,
None => path_string,
}
}
@@ -192,7 +157,7 @@ impl S3Bucket {
let started_at = start_counting_cancelled_wait(kind);
let permit = self
.concurrency_limiter
.acquire(kind)
.acquire()
.await
.expect("semaphore is never closed");
@@ -208,7 +173,8 @@ impl S3Bucket {
let started_at = start_counting_cancelled_wait(kind);
let permit = self
.concurrency_limiter
.acquire_owned(kind)
.clone()
.acquire_owned()
.await
.expect("semaphore is never closed");
@@ -534,7 +500,7 @@ impl RemoteStorage for S3Bucket {
delete_objects.push(obj_id);
}
for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE) {
for chunk in delete_objects.chunks(MAX_DELETE_OBJECTS_REQUEST_SIZE) {
let started_at = start_measuring_requests(kind);
let resp = self
@@ -600,8 +566,8 @@ fn start_measuring_requests(
#[cfg(test)]
mod tests {
use camino::Utf8Path;
use std::num::NonZeroUsize;
use std::path::Path;
use crate::{RemotePath, S3Bucket, S3Config};
@@ -610,7 +576,7 @@ mod tests {
let all_paths = ["", "some/path", "some/path/"];
let all_paths: Vec<RemotePath> = all_paths
.iter()
.map(|x| RemotePath::new(Utf8Path::new(x)).expect("bad path"))
.map(|x| RemotePath::new(Path::new(x)).expect("bad path"))
.collect();
let prefixes = [
None,

View File

@@ -2,12 +2,11 @@ use std::collections::HashSet;
use std::env;
use std::num::{NonZeroU32, NonZeroUsize};
use std::ops::ControlFlow;
use std::path::PathBuf;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use std::time::UNIX_EPOCH;
use anyhow::Context;
use camino::Utf8Path;
use once_cell::sync::OnceCell;
use remote_storage::{
GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
@@ -56,7 +55,7 @@ async fn s3_pagination_should_work(ctx: &mut MaybeEnabledS3WithTestBlobs) -> any
let test_client = Arc::clone(&ctx.enabled.client);
let expected_remote_prefixes = ctx.remote_prefixes.clone();
let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix))
let base_prefix = RemotePath::new(Path::new(ctx.enabled.base_prefix))
.context("common_prefix construction")?;
let root_remote_prefixes = test_client
.list_prefixes(None)
@@ -109,7 +108,7 @@ async fn s3_list_files_works(ctx: &mut MaybeEnabledS3WithSimpleTestBlobs) -> any
};
let test_client = Arc::clone(&ctx.enabled.client);
let base_prefix =
RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?;
RemotePath::new(Path::new("folder1")).context("common_prefix construction")?;
let root_files = test_client
.list_files(None)
.await
@@ -130,9 +129,9 @@ async fn s3_list_files_works(ctx: &mut MaybeEnabledS3WithSimpleTestBlobs) -> any
let trim_remote_blobs: HashSet<_> = ctx
.remote_blobs
.iter()
.map(|x| x.get_path())
.map(|x| x.get_path().to_str().expect("must be valid name"))
.filter(|x| x.starts_with("folder1"))
.map(|x| RemotePath::new(x).expect("must be valid path"))
.map(|x| RemotePath::new(Path::new(x)).expect("must be valid name"))
.collect();
assert_eq!(
nested_remote_files, trim_remote_blobs,
@@ -149,9 +148,10 @@ async fn s3_delete_non_exising_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result
MaybeEnabledS3::Disabled => return Ok(()),
};
let path = RemotePath::new(Utf8Path::new(
format!("{}/for_sure_there_is_nothing_there_really", ctx.base_prefix).as_str(),
))
let path = RemotePath::new(&PathBuf::from(format!(
"{}/for_sure_there_is_nothing_there_really",
ctx.base_prefix,
)))
.with_context(|| "RemotePath conversion")?;
ctx.client.delete(&path).await.expect("should succeed");
@@ -167,13 +167,13 @@ async fn s3_delete_objects_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()>
MaybeEnabledS3::Disabled => return Ok(()),
};
let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str()))
let path1 = RemotePath::new(&PathBuf::from(format!("{}/path1", ctx.base_prefix,)))
.with_context(|| "RemotePath conversion")?;
let path2 = RemotePath::new(Utf8Path::new(format!("{}/path2", ctx.base_prefix).as_str()))
let path2 = RemotePath::new(&PathBuf::from(format!("{}/path2", ctx.base_prefix,)))
.with_context(|| "RemotePath conversion")?;
let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
let path3 = RemotePath::new(&PathBuf::from(format!("{}/path3", ctx.base_prefix,)))
.with_context(|| "RemotePath conversion")?;
let data1 = "remote blob data1".as_bytes();
@@ -378,30 +378,21 @@ impl AsyncTestContext for MaybeEnabledS3WithSimpleTestBlobs {
fn create_s3_client(
max_keys_per_list_response: Option<i32>,
) -> anyhow::Result<Arc<GenericRemoteStorage>> {
use rand::Rng;
let remote_storage_s3_bucket = env::var("REMOTE_STORAGE_S3_BUCKET")
.context("`REMOTE_STORAGE_S3_BUCKET` env var is not set, but real S3 tests are enabled")?;
let remote_storage_s3_region = env::var("REMOTE_STORAGE_S3_REGION")
.context("`REMOTE_STORAGE_S3_REGION` env var is not set, but real S3 tests are enabled")?;
// due to how time works, we've had test runners use the same nanos as bucket prefixes.
// millis is just a debugging aid for easier finding the prefix later.
let millis = std::time::SystemTime::now()
let random_prefix_part = std::time::SystemTime::now()
.duration_since(UNIX_EPOCH)
.context("random s3 test prefix part calculation")?
.as_millis();
// because nanos can be the same for two threads so can millis, add randomness
let random = rand::thread_rng().gen::<u32>();
.as_nanos();
let remote_storage_config = RemoteStorageConfig {
max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
max_sync_errors: NonZeroU32::new(5).unwrap(),
storage: RemoteStorageKind::AwsS3(S3Config {
bucket_name: remote_storage_s3_bucket,
bucket_region: remote_storage_s3_region,
prefix_in_bucket: Some(format!("test_{millis}_{random:08x}/")),
prefix_in_bucket: Some(format!("pagination_should_work_test_{random_prefix_part}/")),
endpoint: None,
concurrency_limit: NonZeroUsize::new(100).unwrap(),
max_keys_per_list_response,
@@ -427,10 +418,10 @@ async fn upload_s3_data(
for i in 1..upload_tasks_count + 1 {
let task_client = Arc::clone(client);
upload_tasks.spawn(async move {
let prefix = format!("{base_prefix_str}/sub_prefix_{i}/");
let blob_prefix = RemotePath::new(Utf8Path::new(&prefix))
let prefix = PathBuf::from(format!("{base_prefix_str}/sub_prefix_{i}/"));
let blob_prefix = RemotePath::new(&prefix)
.with_context(|| format!("{prefix:?} to RemotePath conversion"))?;
let blob_path = blob_prefix.join(Utf8Path::new(&format!("blob_{i}")));
let blob_path = blob_prefix.join(Path::new(&format!("blob_{i}")));
debug!("Creating remote item {i} at path {blob_path:?}");
let data = format!("remote blob data {i}").into_bytes();
@@ -512,10 +503,8 @@ async fn upload_simple_s3_data(
let task_client = Arc::clone(client);
upload_tasks.spawn(async move {
let blob_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i));
let blob_path = RemotePath::new(
Utf8Path::from_path(blob_path.as_path()).expect("must be valid blob path"),
)
.with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
let blob_path = RemotePath::new(&blob_path)
.with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
debug!("Creating remote item {i} at path {blob_path:?}");
let data = format!("remote blob data {i}").into_bytes();

View File

@@ -10,7 +10,6 @@ async-trait.workspace = true
anyhow.workspace = true
bincode.workspace = true
bytes.workspace = true
camino.workspace = true
chrono.workspace = true
heapless.workspace = true
hex = { workspace = true, features = ["serde"] }
@@ -54,7 +53,7 @@ byteorder.workspace = true
bytes.workspace = true
criterion.workspace = true
hex-literal.workspace = true
camino-tempfile.workspace = true
tempfile.workspace = true
[[bench]]
name = "benchmarks"

View File

@@ -2,9 +2,9 @@
use serde;
use std::fs;
use std::path::Path;
use anyhow::Result;
use camino::Utf8Path;
use jsonwebtoken::{
decode, encode, Algorithm, DecodingKey, EncodingKey, Header, TokenData, Validation,
};
@@ -65,7 +65,7 @@ impl JwtAuth {
}
}
pub fn from_key_path(key_path: &Utf8Path) -> Result<Self> {
pub fn from_key_path(key_path: &Path) -> Result<Self> {
let public_key = fs::read(key_path)?;
Ok(Self::new(DecodingKey::from_ed_pem(&public_key)?))
}

View File

@@ -1,14 +1,14 @@
use std::{
borrow::Cow,
ffi::OsStr,
fs::{self, File},
io,
path::{Path, PathBuf},
};
use camino::{Utf8Path, Utf8PathBuf};
/// Similar to [`std::fs::create_dir`], except we fsync the
/// created directory and its parent.
pub fn create_dir(path: impl AsRef<Utf8Path>) -> io::Result<()> {
pub fn create_dir(path: impl AsRef<Path>) -> io::Result<()> {
let path = path.as_ref();
fs::create_dir(path)?;
@@ -18,7 +18,7 @@ pub fn create_dir(path: impl AsRef<Utf8Path>) -> io::Result<()> {
/// Similar to [`std::fs::create_dir_all`], except we fsync all
/// newly created directories and the pre-existing parent.
pub fn create_dir_all(path: impl AsRef<Utf8Path>) -> io::Result<()> {
pub fn create_dir_all(path: impl AsRef<Path>) -> io::Result<()> {
let mut path = path.as_ref();
let mut dirs_to_create = Vec::new();
@@ -30,7 +30,7 @@ pub fn create_dir_all(path: impl AsRef<Utf8Path>) -> io::Result<()> {
Ok(_) => {
return Err(io::Error::new(
io::ErrorKind::AlreadyExists,
format!("non-directory found in path: {path}"),
format!("non-directory found in path: {}", path.display()),
));
}
Err(ref e) if e.kind() == io::ErrorKind::NotFound => {}
@@ -44,7 +44,7 @@ pub fn create_dir_all(path: impl AsRef<Utf8Path>) -> io::Result<()> {
None => {
return Err(io::Error::new(
io::ErrorKind::InvalidInput,
format!("can't find parent of path '{path}'"),
format!("can't find parent of path '{}'", path.display()).as_str(),
));
}
}
@@ -70,18 +70,21 @@ pub fn create_dir_all(path: impl AsRef<Utf8Path>) -> io::Result<()> {
/// Adds a suffix to the file(directory) name, either appending the suffix to the end of its extension,
/// or if there's no extension, creates one and puts a suffix there.
pub fn path_with_suffix_extension(
original_path: impl AsRef<Utf8Path>,
suffix: &str,
) -> Utf8PathBuf {
let new_extension = match original_path.as_ref().extension() {
pub fn path_with_suffix_extension(original_path: impl AsRef<Path>, suffix: &str) -> PathBuf {
let new_extension = match original_path
.as_ref()
.extension()
.map(OsStr::to_string_lossy)
{
Some(extension) => Cow::Owned(format!("{extension}.{suffix}")),
None => Cow::Borrowed(suffix),
};
original_path.as_ref().with_extension(new_extension)
original_path
.as_ref()
.with_extension(new_extension.as_ref())
}
pub fn fsync_file_and_parent(file_path: &Utf8Path) -> io::Result<()> {
pub fn fsync_file_and_parent(file_path: &Path) -> io::Result<()> {
let parent = file_path.parent().ok_or_else(|| {
io::Error::new(
io::ErrorKind::Other,
@@ -94,7 +97,7 @@ pub fn fsync_file_and_parent(file_path: &Utf8Path) -> io::Result<()> {
Ok(())
}
pub fn fsync(path: &Utf8Path) -> io::Result<()> {
pub fn fsync(path: &Path) -> io::Result<()> {
File::open(path)
.map_err(|e| io::Error::new(e.kind(), format!("Failed to open the file {path:?}: {e}")))
.and_then(|file| {
@@ -108,18 +111,19 @@ pub fn fsync(path: &Utf8Path) -> io::Result<()> {
.map_err(|e| io::Error::new(e.kind(), format!("Failed to fsync file {path:?}: {e}")))
}
pub async fn fsync_async(path: impl AsRef<Utf8Path>) -> Result<(), std::io::Error> {
tokio::fs::File::open(path.as_ref()).await?.sync_all().await
pub async fn fsync_async(path: impl AsRef<std::path::Path>) -> Result<(), std::io::Error> {
tokio::fs::File::open(path).await?.sync_all().await
}
#[cfg(test)]
mod tests {
use tempfile::tempdir;
use super::*;
#[test]
fn test_create_dir_fsyncd() {
let dir = camino_tempfile::tempdir().unwrap();
let dir = tempdir().unwrap();
let existing_dir_path = dir.path();
let err = create_dir(existing_dir_path).unwrap_err();
@@ -135,7 +139,7 @@ mod tests {
#[test]
fn test_create_dir_all_fsyncd() {
let dir = camino_tempfile::tempdir().unwrap();
let dir = tempdir().unwrap();
let existing_dir_path = dir.path();
create_dir_all(existing_dir_path).unwrap();
@@ -162,29 +166,29 @@ mod tests {
#[test]
fn test_path_with_suffix_extension() {
let p = Utf8PathBuf::from("/foo/bar");
let p = PathBuf::from("/foo/bar");
assert_eq!(
&path_with_suffix_extension(p, "temp").to_string(),
&path_with_suffix_extension(p, "temp").to_string_lossy(),
"/foo/bar.temp"
);
let p = Utf8PathBuf::from("/foo/bar");
let p = PathBuf::from("/foo/bar");
assert_eq!(
&path_with_suffix_extension(p, "temp.temp").to_string(),
&path_with_suffix_extension(p, "temp.temp").to_string_lossy(),
"/foo/bar.temp.temp"
);
let p = Utf8PathBuf::from("/foo/bar.baz");
let p = PathBuf::from("/foo/bar.baz");
assert_eq!(
&path_with_suffix_extension(p, "temp.temp").to_string(),
&path_with_suffix_extension(p, "temp.temp").to_string_lossy(),
"/foo/bar.baz.temp.temp"
);
let p = Utf8PathBuf::from("/foo/bar.baz");
let p = PathBuf::from("/foo/bar.baz");
assert_eq!(
&path_with_suffix_extension(p, ".temp").to_string(),
&path_with_suffix_extension(p, ".temp").to_string_lossy(),
"/foo/bar.baz..temp"
);
let p = Utf8PathBuf::from("/foo/bar/dir/");
let p = PathBuf::from("/foo/bar/dir/");
assert_eq!(
&path_with_suffix_extension(p, ".temp").to_string(),
&path_with_suffix_extension(p, ".temp").to_string_lossy(),
"/foo/bar/dir..temp"
);
}

View File

@@ -55,6 +55,8 @@ where
#[cfg(test)]
mod test {
use std::path::PathBuf;
use crate::fs_ext::{is_directory_empty, list_dir};
use super::ignore_absent_files;
@@ -63,7 +65,7 @@ mod test {
fn is_empty_dir() {
use super::PathExt;
let dir = camino_tempfile::tempdir().unwrap();
let dir = tempfile::tempdir().unwrap();
let dir_path = dir.path();
// test positive case
@@ -73,7 +75,7 @@ mod test {
);
// invoke on a file to ensure it returns an error
let file_path = dir_path.join("testfile");
let file_path: PathBuf = dir_path.join("testfile");
let f = std::fs::File::create(&file_path).unwrap();
drop(f);
assert!(file_path.is_empty_dir().is_err());
@@ -85,7 +87,7 @@ mod test {
#[tokio::test]
async fn is_empty_dir_async() {
let dir = camino_tempfile::tempdir().unwrap();
let dir = tempfile::tempdir().unwrap();
let dir_path = dir.path();
// test positive case
@@ -95,7 +97,7 @@ mod test {
);
// invoke on a file to ensure it returns an error
let file_path = dir_path.join("testfile");
let file_path: PathBuf = dir_path.join("testfile");
let f = std::fs::File::create(&file_path).unwrap();
drop(f);
assert!(is_directory_empty(&file_path).await.is_err());
@@ -107,9 +109,10 @@ mod test {
#[test]
fn ignore_absent_files_works() {
let dir = camino_tempfile::tempdir().unwrap();
let dir = tempfile::tempdir().unwrap();
let dir_path = dir.path();
let file_path = dir.path().join("testfile");
let file_path: PathBuf = dir_path.join("testfile");
ignore_absent_files(|| std::fs::remove_file(&file_path)).expect("should execute normally");
@@ -123,17 +126,17 @@ mod test {
#[tokio::test]
async fn list_dir_works() {
let dir = camino_tempfile::tempdir().unwrap();
let dir = tempfile::tempdir().unwrap();
let dir_path = dir.path();
assert!(list_dir(dir_path).await.unwrap().is_empty());
let file_path = dir_path.join("testfile");
let file_path: PathBuf = dir_path.join("testfile");
let _ = std::fs::File::create(&file_path).unwrap();
assert_eq!(&list_dir(dir_path).await.unwrap(), &["testfile"]);
let another_dir_path = dir_path.join("testdir");
let another_dir_path: PathBuf = dir_path.join("testdir");
std::fs::create_dir(another_dir_path).unwrap();
let expected = &["testdir", "testfile"];

View File

@@ -89,22 +89,6 @@ impl Generation {
Self::Broken => panic!("Attempted to use a broken generation"),
}
}
pub fn next(&self) -> Generation {
match self {
Self::Valid(n) => Self::Valid(*n + 1),
Self::None => Self::Valid(1),
Self::Broken => panic!("Attempted to use a broken generation"),
}
}
pub fn into(self) -> Option<u32> {
if let Self::Valid(v) = self {
Some(v)
} else {
None
}
}
}
impl Serialize for Generation {

View File

@@ -1,9 +1,8 @@
use hyper::{header, Body, Response, StatusCode};
use serde::{Deserialize, Serialize};
use std::borrow::Cow;
use std::error::Error as StdError;
use thiserror::Error;
use tracing::{error, info};
use tracing::error;
#[derive(Debug, Error)]
pub enum ApiError {
@@ -25,12 +24,6 @@ pub enum ApiError {
#[error("Precondition failed: {0}")]
PreconditionFailed(Box<str>),
#[error("Resource temporarily unavailable: {0}")]
ResourceUnavailable(Cow<'static, str>),
#[error("Shutting down")]
ShuttingDown,
#[error(transparent)]
InternalServerError(anyhow::Error),
}
@@ -59,14 +52,6 @@ impl ApiError {
self.to_string(),
StatusCode::PRECONDITION_FAILED,
),
ApiError::ShuttingDown => HttpErrorBody::response_from_msg_and_status(
"Shutting down".to_string(),
StatusCode::SERVICE_UNAVAILABLE,
),
ApiError::ResourceUnavailable(err) => HttpErrorBody::response_from_msg_and_status(
err.to_string(),
StatusCode::SERVICE_UNAVAILABLE,
),
ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
err.to_string(),
StatusCode::INTERNAL_SERVER_ERROR,
@@ -116,12 +101,10 @@ pub async fn route_error_handler(err: routerify::RouteError) -> Response<Body> {
pub fn api_error_handler(api_error: ApiError) -> Response<Body> {
// Print a stack trace for Internal Server errors
match api_error {
ApiError::ResourceUnavailable(_) => info!("Error processing HTTP request: {api_error:#}"),
ApiError::NotFound(_) => info!("Error processing HTTP request: {api_error:#}"),
ApiError::InternalServerError(_) => error!("Error processing HTTP request: {api_error:?}"),
_ => error!("Error processing HTTP request: {api_error:#}"),
if let ApiError::InternalServerError(_) = api_error {
error!("Error processing HTTP request: {api_error:?}");
} else {
error!("Error processing HTTP request: {api_error:#}");
}
api_error.into_response()

View File

@@ -1,3 +1,4 @@
use std::ffi::OsStr;
use std::{fmt, str::FromStr};
use anyhow::Context;
@@ -214,11 +215,12 @@ pub struct TimelineId(Id);
id_newtype!(TimelineId);
impl TryFrom<Option<&str>> for TimelineId {
impl TryFrom<Option<&OsStr>> for TimelineId {
type Error = anyhow::Error;
fn try_from(value: Option<&str>) -> Result<Self, Self::Error> {
fn try_from(value: Option<&OsStr>) -> Result<Self, Self::Error> {
value
.and_then(OsStr::to_str)
.unwrap_or_default()
.parse::<TimelineId>()
.with_context(|| format!("Could not parse timeline id from {:?}", value))

View File

@@ -11,10 +11,10 @@ use std::{
io::{Read, Write},
ops::Deref,
os::unix::prelude::AsRawFd,
path::{Path, PathBuf},
};
use anyhow::Context;
use camino::{Utf8Path, Utf8PathBuf};
use nix::{errno::Errno::EAGAIN, fcntl};
use crate::crashsafe;
@@ -23,7 +23,7 @@ use crate::crashsafe;
/// Returned by [`create_exclusive`].
#[must_use]
pub struct UnwrittenLockFile {
path: Utf8PathBuf,
path: PathBuf,
file: fs::File,
}
@@ -60,7 +60,7 @@ impl UnwrittenLockFile {
///
/// It is not an error if the file already exists.
/// It is an error if the file is already locked.
pub fn create_exclusive(lock_file_path: &Utf8Path) -> anyhow::Result<UnwrittenLockFile> {
pub fn create_exclusive(lock_file_path: &Path) -> anyhow::Result<UnwrittenLockFile> {
let lock_file = fs::OpenOptions::new()
.create(true) // O_CREAT
.write(true)
@@ -101,7 +101,7 @@ pub enum LockFileRead {
/// Open & try to lock the lock file at the given `path`, returning a [handle][`LockFileRead`] to
/// inspect its content. It is not an `Err(...)` if the file does not exist or is already locked.
/// Check the [`LockFileRead`] variants for details.
pub fn read_and_hold_lock_file(path: &Utf8Path) -> anyhow::Result<LockFileRead> {
pub fn read_and_hold_lock_file(path: &Path) -> anyhow::Result<LockFileRead> {
let res = fs::OpenOptions::new().read(true).open(path);
let mut lock_file = match res {
Ok(f) => f,

View File

@@ -216,30 +216,6 @@ impl std::fmt::Debug for PrettyLocation<'_, '_> {
}
}
/// When you will store a secret but want to make sure it won't
/// be accidentally logged, wrap it in a SecretString, whose Debug
/// implementation does not expose the contents.
#[derive(Clone, Eq, PartialEq)]
pub struct SecretString(String);
impl SecretString {
pub fn get_contents(&self) -> &str {
self.0.as_str()
}
}
impl From<String> for SecretString {
fn from(s: String) -> Self {
Self(s)
}
}
impl std::fmt::Debug for SecretString {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "[SECRET]")
}
}
#[cfg(test)]
mod tests {
use metrics::{core::Opts, IntCounterVec};

View File

@@ -1,9 +1,9 @@
#![warn(missing_docs)]
use camino::Utf8Path;
use serde::{Deserialize, Serialize};
use std::fmt;
use std::ops::{Add, AddAssign};
use std::path::Path;
use std::str::FromStr;
use std::sync::atomic::{AtomicU64, Ordering};
@@ -44,9 +44,11 @@ impl Lsn {
/// Parse an LSN from a filename in the form `0000000000000000`
pub fn from_filename<F>(filename: F) -> Result<Self, LsnParseError>
where
F: AsRef<Utf8Path>,
F: AsRef<Path>,
{
Lsn::from_hex(filename.as_ref().as_str())
let filename: &Path = filename.as_ref();
let filename = filename.to_str().ok_or(LsnParseError)?;
Lsn::from_hex(filename)
}
/// Parse an LSN from a string in the form `0000000000000000`

View File

@@ -49,10 +49,9 @@
//! At this point, `B` and `C` are running, which is hazardous.
//! Morale of the story: don't unlink pidfiles, ever.
use std::ops::Deref;
use std::{ops::Deref, path::Path};
use anyhow::Context;
use camino::Utf8Path;
use nix::unistd::Pid;
use crate::lock_file::{self, LockFileRead};
@@ -85,7 +84,7 @@ impl Deref for PidFileGuard {
/// The claim ends as soon as the returned guard object is dropped.
/// To maintain the claim for the remaining lifetime of the current process,
/// use [`std::mem::forget`] or similar.
pub fn claim_for_current_process(path: &Utf8Path) -> anyhow::Result<PidFileGuard> {
pub fn claim_for_current_process(path: &Path) -> anyhow::Result<PidFileGuard> {
let unwritten_lock_file = lock_file::create_exclusive(path).context("lock file")?;
// if any of the next steps fail, we drop the file descriptor and thereby release the lock
let guard = unwritten_lock_file
@@ -133,7 +132,7 @@ pub enum PidFileRead {
///
/// On success, this function returns a [`PidFileRead`].
/// Check its docs for a description of the meaning of its different variants.
pub fn read(pidfile: &Utf8Path) -> anyhow::Result<PidFileRead> {
pub fn read(pidfile: &Path) -> anyhow::Result<PidFileRead> {
let res = lock_file::read_and_hold_lock_file(pidfile).context("read and hold pid file")?;
let ret = match res {
LockFileRead::NotExist => PidFileRead::NotExist,

View File

@@ -58,7 +58,7 @@ where
// to get that.
impl<T: Ord> PartialOrd for Waiter<T> {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
other.wake_num.partial_cmp(&self.wake_num)
}
}

View File

@@ -431,14 +431,14 @@ impl CgroupWatcher {
.context("failed to request upscale")?;
let memory_high =
self.get_memory_high_bytes().context("failed to get memory.high")?;
self.get_high_bytes().context("failed to get memory.high")?;
let new_high = memory_high + self.config.memory_high_increase_by_bytes;
info!(
current_high_bytes = memory_high,
new_high_bytes = new_high,
"updating memory.high"
);
self.set_memory_high_bytes(new_high)
self.set_high_bytes(new_high)
.context("failed to set memory.high")?;
last_memory_high_increase_at = Some(Instant::now());
continue;
@@ -556,6 +556,14 @@ impl CgroupWatcher {
}
}
/// Represents a set of limits we apply to a cgroup to control memory usage.
///
/// Setting these values also affects the thresholds for receiving usage alerts.
#[derive(Debug)]
pub struct MemoryLimits {
pub high: u64,
}
// Methods for manipulating the actual cgroup
impl CgroupWatcher {
/// Get a handle on the freezer subsystem.
@@ -616,29 +624,50 @@ impl CgroupWatcher {
}
/// Set cgroup memory.high threshold.
pub fn set_memory_high_bytes(&self, bytes: u64) -> anyhow::Result<()> {
self.set_memory_high_internal(MaxValue::Value(u64::min(bytes, i64::MAX as u64) as i64))
}
/// Set the cgroup's memory.high to 'max', disabling it.
pub fn unset_memory_high(&self) -> anyhow::Result<()> {
self.set_memory_high_internal(MaxValue::Max)
}
fn set_memory_high_internal(&self, value: MaxValue) -> anyhow::Result<()> {
pub fn set_high_bytes(&self, bytes: u64) -> anyhow::Result<()> {
self.memory()
.context("failed to get memory subsystem")?
.set_mem(cgroups_rs::memory::SetMemory {
low: None,
high: Some(value),
high: Some(MaxValue::Value(u64::min(bytes, i64::MAX as u64) as i64)),
min: None,
max: None,
})
.map_err(anyhow::Error::from)
.context("failed to set memory.high")
}
/// Set cgroup memory.high and memory.max.
pub fn set_limits(&self, limits: &MemoryLimits) -> anyhow::Result<()> {
info!(limits.high, path = self.path(), "writing new memory limits",);
self.memory()
.context("failed to get memory subsystem while setting memory limits")?
.set_mem(cgroups_rs::memory::SetMemory {
min: None,
low: None,
high: Some(MaxValue::Value(
u64::min(limits.high, i64::MAX as u64) as i64
)),
max: None,
})
.context("failed to set memory limits")
}
/// Given some amount of available memory, set the desired cgroup memory limits
pub fn set_memory_limits(&mut self, available_memory: u64) -> anyhow::Result<()> {
let new_high = self.config.calculate_memory_high_value(available_memory);
let limits = MemoryLimits { high: new_high };
info!(
path = self.path(),
memory = ?limits,
"setting cgroup memory",
);
self.set_limits(&limits)
.context("failed to set cgroup memory limits")?;
Ok(())
}
/// Get memory.high threshold.
pub fn get_memory_high_bytes(&self) -> anyhow::Result<u64> {
pub fn get_high_bytes(&self) -> anyhow::Result<u64> {
let high = self
.memory()
.context("failed to get memory subsystem while getting memory statistics")?

View File

@@ -4,9 +4,9 @@
//! This is the "Monitor" part of the monitor binary and is the main entrypoint for
//! all functionality.
use std::fmt::Debug;
use std::sync::Arc;
use std::time::{Duration, Instant};
use std::{fmt::Debug, mem};
use anyhow::{bail, Context};
use axum::extract::ws::{Message, WebSocket};
@@ -16,7 +16,7 @@ use tokio::sync::mpsc;
use tokio_util::sync::CancellationToken;
use tracing::{error, info, warn};
use crate::cgroup::{CgroupWatcher, Sequenced};
use crate::cgroup::{CgroupWatcher, MemoryLimits, Sequenced};
use crate::dispatcher::Dispatcher;
use crate::filecache::{FileCacheConfig, FileCacheState};
use crate::protocol::{InboundMsg, InboundMsgKind, OutboundMsg, OutboundMsgKind, Resources};
@@ -106,43 +106,6 @@ impl Runner {
kill,
};
// If we have both the cgroup and file cache integrations enabled, it's possible for
// temporary failures to result in cgroup throttling (from memory.high), that in turn makes
// it near-impossible to connect to the file cache (because it times out). Unfortunately,
// we *do* still want to determine the file cache size before setting the cgroup's
// memory.high, so it's not as simple as just swapping the order.
//
// Instead, the resolution here is that on vm-monitor startup (note: happens on each
// connection from autoscaler-agent, possibly multiple times per compute_ctl lifecycle), we
// temporarily unset memory.high, to allow any existing throttling to dissipate. It's a bit
// of a hacky solution, but helps with reliability.
if let Some(name) = &args.cgroup {
// Best not to set up cgroup stuff more than once, so we'll initialize cgroup state
// now, and then set limits later.
info!("initializing cgroup");
let (cgroup, cgroup_event_stream) = CgroupWatcher::new(name.clone(), requesting_send)
.context("failed to create cgroup manager")?;
info!("temporarily unsetting memory.high");
// Temporarily un-set cgroup memory.high; see above.
cgroup
.unset_memory_high()
.context("failed to unset memory.high")?;
let cgroup = Arc::new(cgroup);
let cgroup_clone = Arc::clone(&cgroup);
spawn_with_cancel(
token.clone(),
|_| error!("cgroup watcher terminated"),
async move { cgroup_clone.watch(notified_recv, cgroup_event_stream).await },
);
state.cgroup = Some(cgroup);
}
let mut file_cache_reserved_bytes = 0;
let mem = get_total_system_memory();
@@ -156,7 +119,7 @@ impl Runner {
false => FileCacheConfig::default_in_memory(),
};
let mut file_cache = FileCacheState::new(connstr, config, token)
let mut file_cache = FileCacheState::new(connstr, config, token.clone())
.await
.context("failed to create file cache")?;
@@ -189,15 +152,35 @@ impl Runner {
state.filecache = Some(file_cache);
}
if let Some(cgroup) = &state.cgroup {
let available = mem - file_cache_reserved_bytes;
let value = cgroup.config.calculate_memory_high_value(available);
if let Some(name) = &args.cgroup {
let (mut cgroup, cgroup_event_stream) =
CgroupWatcher::new(name.clone(), requesting_send)
.context("failed to create cgroup manager")?;
info!(value, "setting memory.high");
let available = mem - file_cache_reserved_bytes;
cgroup
.set_memory_high_bytes(value)
.context("failed to set cgroup memory.high")?;
.set_memory_limits(available)
.context("failed to set cgroup memory limits")?;
let cgroup = Arc::new(cgroup);
// Some might call this . . . cgroup v2
let cgroup_clone = Arc::clone(&cgroup);
spawn_with_cancel(token, |_| error!("cgroup watcher terminated"), async move {
cgroup_clone.watch(notified_recv, cgroup_event_stream).await
});
state.cgroup = Some(cgroup);
} else {
// *NOTE*: We need to forget the sender so that its drop impl does not get ran.
// This allows us to poll it in `Monitor::run` regardless of whether we
// are managing a cgroup or not. If we don't forget it, all receives will
// immediately return an error because the sender is droped and it will
// claim all select! statements, effectively turning `Monitor::run` into
// `loop { fail to receive }`.
mem::forget(requesting_send);
}
Ok(state)
@@ -274,11 +257,14 @@ impl Runner {
new_cgroup_mem_high = cgroup.config.calculate_memory_high_value(available_memory);
}
// new_cgroup_mem_high is initialized to 0 but it is guaranteed to not be here
// since it is properly initialized in the previous cgroup if let block
let limits = MemoryLimits {
// new_cgroup_mem_high is initialized to 0 but it is guarancontextd to not be here
// since it is properly initialized in the previous cgroup if let block
high: new_cgroup_mem_high,
};
cgroup
.set_memory_high_bytes(new_cgroup_mem_high)
.context("failed to set cgroup memory.high")?;
.set_limits(&limits)
.context("failed to set cgroup memory limits")?;
let message = format!(
"set cgroup memory.high to {} MiB, of new max {} MiB",
@@ -341,9 +327,12 @@ impl Runner {
name = cgroup.path(),
"updating cgroup memory.high",
);
let limits = MemoryLimits {
high: new_cgroup_mem_high,
};
cgroup
.set_memory_high_bytes(new_cgroup_mem_high)
.context("failed to set cgroup memory.high")?;
.set_limits(&limits)
.context("failed to set file cache size")?;
}
Ok(())
@@ -409,7 +398,7 @@ impl Runner {
}
}
// we need to propagate an upscale request
request = self.dispatcher.request_upscale_events.recv(), if self.cgroup.is_some() => {
request = self.dispatcher.request_upscale_events.recv() => {
if request.is_none() {
bail!("failed to listen for upscale event from cgroup")
}

View File

@@ -17,8 +17,6 @@ async-stream.workspace = true
async-trait.workspace = true
byteorder.workspace = true
bytes.workspace = true
camino.workspace = true
camino-tempfile.workspace = true
chrono = { workspace = true, features = ["serde"] }
clap = { workspace = true, features = ["string"] }
close_fds.workspace = true
@@ -82,6 +80,7 @@ enum-map.workspace = true
enumset.workspace = true
strum.workspace = true
strum_macros.workspace = true
tempfile.workspace = true
[dev-dependencies]
criterion.workspace = true

View File

@@ -25,7 +25,7 @@ fn redo_scenarios(c: &mut Criterion) {
// input to the stderr.
// utils::logging::init(utils::logging::LogFormat::Plain).unwrap();
let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap();
let repo_dir = tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap();
let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
let conf = Box::leak(Box::new(conf));

View File

@@ -9,7 +9,6 @@ license.workspace = true
[dependencies]
anyhow.workspace = true
bytes.workspace = true
camino.workspace = true
clap = { workspace = true, features = ["string"] }
git-version.workspace = true
pageserver = { path = ".." }

View File

@@ -3,14 +3,13 @@
//! Currently it only analyzes holes, which are regions within the layer range that the layer contains no updates for. In the future it might do more analysis (maybe key quantiles?) but it should never return sensitive data.
use anyhow::Result;
use camino::{Utf8Path, Utf8PathBuf};
use pageserver::context::{DownloadBehavior, RequestContext};
use pageserver::task_mgr::TaskKind;
use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
use std::cmp::Ordering;
use std::collections::BinaryHeap;
use std::ops::Range;
use std::{fs, str};
use std::{fs, path::Path, str};
use pageserver::page_cache::PAGE_SZ;
use pageserver::repository::{Key, KEY_SIZE};
@@ -99,7 +98,7 @@ pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {
}
// Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH"
async fn get_holes(path: &Utf8Path, max_holes: usize, ctx: &RequestContext) -> Result<Vec<Hole>> {
async fn get_holes(path: &Path, max_holes: usize, ctx: &RequestContext) -> Result<Vec<Hole>> {
let file = FileBlockReader::new(VirtualFile::open(path).await?);
let summary_blk = file.read_blk(0, ctx).await?;
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
@@ -168,9 +167,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
parse_filename(&layer.file_name().into_string().unwrap())
{
if layer_file.is_delta {
let layer_path =
Utf8PathBuf::from_path_buf(layer.path()).expect("non-Unicode path");
layer_file.holes = get_holes(&layer_path, max_holes, &ctx).await?;
layer_file.holes = get_holes(&layer.path(), max_holes, &ctx).await?;
n_deltas += 1;
}
layers.push(layer_file);

View File

@@ -1,7 +1,6 @@
use std::path::{Path, PathBuf};
use anyhow::Result;
use camino::Utf8Path;
use clap::Subcommand;
use pageserver::context::{DownloadBehavior, RequestContext};
use pageserver::task_mgr::TaskKind;
@@ -48,7 +47,7 @@ pub(crate) enum LayerCmd {
}
async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result<()> {
let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
let path = path.as_ref();
virtual_file::init(10);
page_cache::init(100);
let file = FileBlockReader::new(VirtualFile::open(path).await?);

View File

@@ -8,7 +8,6 @@ mod draw_timeline_dir;
mod layer_map_analyzer;
mod layers;
use camino::{Utf8Path, Utf8PathBuf};
use clap::{Parser, Subcommand};
use layers::LayerCmd;
use pageserver::{
@@ -19,6 +18,7 @@ use pageserver::{
virtual_file,
};
use postgres_ffi::ControlFileData;
use std::path::{Path, PathBuf};
use utils::{lsn::Lsn, project_git_version};
project_git_version!(GIT_VERSION);
@@ -49,7 +49,7 @@ enum Commands {
#[derive(Parser)]
struct MetadataCmd {
/// Input metadata file path
metadata_path: Utf8PathBuf,
metadata_path: PathBuf,
/// Replace disk consistent Lsn
disk_consistent_lsn: Option<Lsn>,
/// Replace previous record Lsn
@@ -61,13 +61,13 @@ struct MetadataCmd {
#[derive(Parser)]
struct PrintLayerFileCmd {
/// Pageserver data path
path: Utf8PathBuf,
path: PathBuf,
}
#[derive(Parser)]
struct AnalyzeLayerMapCmd {
/// Pageserver data path
path: Utf8PathBuf,
path: PathBuf,
/// Max holes
max_holes: Option<usize>,
}
@@ -102,7 +102,7 @@ async fn main() -> anyhow::Result<()> {
Ok(())
}
fn read_pg_control_file(control_file_path: &Utf8Path) -> anyhow::Result<()> {
fn read_pg_control_file(control_file_path: &Path) -> anyhow::Result<()> {
let control_file = ControlFileData::decode(&std::fs::read(control_file_path)?)?;
println!("{control_file:?}");
let control_file_initdb = Lsn(control_file.checkPoint);
@@ -114,7 +114,7 @@ fn read_pg_control_file(control_file_path: &Utf8Path) -> anyhow::Result<()> {
Ok(())
}
async fn print_layerfile(path: &Utf8Path) -> anyhow::Result<()> {
async fn print_layerfile(path: &Path) -> anyhow::Result<()> {
// Basic initialization of things that don't change after startup
virtual_file::init(10);
page_cache::init(100);

View File

@@ -2,14 +2,12 @@
use std::env::{var, VarError};
use std::sync::Arc;
use std::{env, ops::ControlFlow, str::FromStr};
use std::{env, ops::ControlFlow, path::Path, str::FromStr};
use anyhow::{anyhow, Context};
use camino::Utf8Path;
use clap::{Arg, ArgAction, Command};
use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp};
use pageserver::control_plane_client::ControlPlaneClient;
use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
use pageserver::task_mgr::WALRECEIVER_RUNTIME;
@@ -22,7 +20,6 @@ use metrics::set_build_info_metric;
use pageserver::{
config::{defaults::*, PageServerConf},
context::{DownloadBehavior, RequestContext},
deletion_queue::DeletionQueue,
http, page_cache, page_service, task_mgr,
task_mgr::TaskKind,
task_mgr::{BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME},
@@ -66,17 +63,21 @@ fn main() -> anyhow::Result<()> {
let workdir = arg_matches
.get_one::<String>("workdir")
.map(Utf8Path::new)
.unwrap_or_else(|| Utf8Path::new(".neon"));
.map(Path::new)
.unwrap_or_else(|| Path::new(".neon"));
let workdir = workdir
.canonicalize_utf8()
.with_context(|| format!("Error opening workdir '{workdir}'"))?;
.canonicalize()
.with_context(|| format!("Error opening workdir '{}'", workdir.display()))?;
let cfg_file_path = workdir.join("pageserver.toml");
// Set CWD to workdir for non-daemon modes
env::set_current_dir(&workdir)
.with_context(|| format!("Failed to set application's current dir to '{workdir}'"))?;
env::set_current_dir(&workdir).with_context(|| {
format!(
"Failed to set application's current dir to '{}'",
workdir.display()
)
})?;
let conf = match initialize_config(&cfg_file_path, arg_matches, &workdir)? {
ControlFlow::Continue(conf) => conf,
@@ -112,8 +113,12 @@ fn main() -> anyhow::Result<()> {
let tenants_path = conf.tenants_path();
if !tenants_path.exists() {
utils::crashsafe::create_dir_all(conf.tenants_path())
.with_context(|| format!("Failed to create tenants root dir at '{tenants_path}'"))?;
utils::crashsafe::create_dir_all(conf.tenants_path()).with_context(|| {
format!(
"Failed to create tenants root dir at '{}'",
tenants_path.display()
)
})?;
}
// Initialize up failpoints support
@@ -130,9 +135,9 @@ fn main() -> anyhow::Result<()> {
}
fn initialize_config(
cfg_file_path: &Utf8Path,
cfg_file_path: &Path,
arg_matches: clap::ArgMatches,
workdir: &Utf8Path,
workdir: &Path,
) -> anyhow::Result<ControlFlow<(), &'static PageServerConf>> {
let init = arg_matches.get_flag("init");
let update_config = init || arg_matches.get_flag("update-config");
@@ -140,22 +145,33 @@ fn initialize_config(
let (mut toml, config_file_exists) = if cfg_file_path.is_file() {
if init {
anyhow::bail!(
"Config file '{cfg_file_path}' already exists, cannot init it, use --update-config to update it",
"Config file '{}' already exists, cannot init it, use --update-config to update it",
cfg_file_path.display()
);
}
// Supplement the CLI arguments with the config file
let cfg_file_contents = std::fs::read_to_string(cfg_file_path)
.with_context(|| format!("Failed to read pageserver config at '{cfg_file_path}'"))?;
let cfg_file_contents = std::fs::read_to_string(cfg_file_path).with_context(|| {
format!(
"Failed to read pageserver config at '{}'",
cfg_file_path.display()
)
})?;
(
cfg_file_contents
.parse::<toml_edit::Document>()
.with_context(|| {
format!("Failed to parse '{cfg_file_path}' as pageserver config")
format!(
"Failed to parse '{}' as pageserver config",
cfg_file_path.display()
)
})?,
true,
)
} else if cfg_file_path.exists() {
anyhow::bail!("Config file '{cfg_file_path}' exists but is not a regular file");
anyhow::bail!(
"Config file '{}' exists but is not a regular file",
cfg_file_path.display()
);
} else {
// We're initializing the tenant, so there's no config file yet
(
@@ -174,7 +190,7 @@ fn initialize_config(
for (key, item) in doc.iter() {
if config_file_exists && update_config && key == "id" && toml.contains_key(key) {
anyhow::bail!("Pageserver config file exists at '{cfg_file_path}' and has node id already, it cannot be overridden");
anyhow::bail!("Pageserver config file exists at '{}' and has node id already, it cannot be overridden", cfg_file_path.display());
}
toml.insert(key, item.clone());
}
@@ -186,11 +202,18 @@ fn initialize_config(
.context("Failed to parse pageserver configuration")?;
if update_config {
info!("Writing pageserver config to '{cfg_file_path}'");
info!("Writing pageserver config to '{}'", cfg_file_path.display());
std::fs::write(cfg_file_path, toml.to_string())
.with_context(|| format!("Failed to write pageserver config to '{cfg_file_path}'"))?;
info!("Config successfully written to '{cfg_file_path}'")
std::fs::write(cfg_file_path, toml.to_string()).with_context(|| {
format!(
"Failed to write pageserver config to '{}'",
cfg_file_path.display()
)
})?;
info!(
"Config successfully written to '{}'",
cfg_file_path.display()
)
}
Ok(if init {
@@ -323,22 +346,9 @@ fn start_pageserver(
}
};
// Top-level cancellation token for the process
let shutdown_pageserver = tokio_util::sync::CancellationToken::new();
// Set up remote storage client
let remote_storage = create_remote_storage_client(conf)?;
// Set up deletion queue
let (deletion_queue, deletion_workers) = DeletionQueue::new(
remote_storage.clone(),
ControlPlaneClient::new(conf, &shutdown_pageserver),
conf,
);
if let Some(deletion_workers) = deletion_workers {
deletion_workers.spawn_with(BACKGROUND_RUNTIME.handle());
}
// Up to this point no significant I/O has been done: this should have been fast. Record
// duration prior to starting I/O intensive phase of startup.
startup_checkpoint("initial", "Starting loading tenants");
@@ -369,13 +379,13 @@ fn start_pageserver(
};
// Scan the local 'tenants/' directory and start loading the tenants
let deletion_queue_client = deletion_queue.new_client();
let shutdown_pageserver = tokio_util::sync::CancellationToken::new();
BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
conf,
TenantSharedResources {
broker_client: broker_client.clone(),
remote_storage: remote_storage.clone(),
deletion_queue_client,
},
order,
shutdown_pageserver.clone(),
@@ -471,10 +481,9 @@ fn start_pageserver(
http::routes::State::new(
conf,
http_auth.clone(),
remote_storage.clone(),
remote_storage,
broker_client.clone(),
disk_usage_eviction_state,
deletion_queue.new_client(),
)
.context("Failed to initialize router state")?,
);
@@ -602,12 +611,7 @@ fn start_pageserver(
// Right now that tree doesn't reach very far, and `task_mgr` is used instead.
// The plan is to change that over time.
shutdown_pageserver.take();
let bg_remote_storage = remote_storage.clone();
let bg_deletion_queue = deletion_queue.clone();
BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(
bg_remote_storage.map(|_| bg_deletion_queue),
0,
));
BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(0));
unreachable!()
}
})
@@ -619,7 +623,7 @@ fn create_remote_storage_client(
let config = if let Some(config) = &conf.remote_storage_config {
config
} else {
tracing::warn!("no remote storage configured, this is a deprecated configuration");
// No remote storage configured.
return Ok(None);
};

View File

@@ -8,22 +8,20 @@ use anyhow::{anyhow, bail, ensure, Context, Result};
use remote_storage::{RemotePath, RemoteStorageConfig};
use serde::de::IntoDeserializer;
use std::env;
use std::ops::Deref;
use storage_broker::Uri;
use utils::crashsafe::path_with_suffix_extension;
use utils::id::ConnectionId;
use utils::logging::SecretString;
use once_cell::sync::OnceCell;
use reqwest::Url;
use std::num::NonZeroUsize;
use std::path::{Path, PathBuf};
use std::str::FromStr;
use std::sync::Arc;
use std::time::Duration;
use toml_edit;
use toml_edit::{Document, Item};
use camino::{Utf8Path, Utf8PathBuf};
use postgres_backend::AuthType;
use utils::{
id::{NodeId, TenantId, TimelineId},
@@ -38,8 +36,8 @@ use crate::tenant::{
TIMELINES_SEGMENT_NAME,
};
use crate::{
IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_LOCATION_CONFIG_NAME,
TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX,
IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX,
TIMELINE_UNINIT_MARK_SUFFIX,
};
pub mod defaults {
@@ -154,9 +152,9 @@ pub struct PageServerConf {
// that during unit testing, because the current directory is global
// to the process but different unit tests work on different
// repositories.
pub workdir: PageserverConfWorkdir,
pub workdir: PathBuf,
pub pg_distrib_dir: Utf8PathBuf,
pub pg_distrib_dir: PathBuf,
// Authentication
/// authentication method for the HTTP mgmt API
@@ -165,7 +163,7 @@ pub struct PageServerConf {
pub pg_auth_type: AuthType,
/// Path to a file containing public key for verifying JWT tokens.
/// Used for both mgmt and compute auth, if enabled.
pub auth_validation_public_key_path: Option<Utf8PathBuf>,
pub auth_validation_public_key_path: Option<PathBuf>,
pub remote_storage_config: Option<RemoteStorageConfig>,
@@ -209,13 +207,6 @@ pub struct PageServerConf {
pub background_task_maximum_delay: Duration,
pub control_plane_api: Option<Url>,
/// JWT token for use with the control plane API.
pub control_plane_api_token: Option<SecretString>,
/// If true, pageserver will make best-effort to operate without a control plane: only
/// for use in major incidents.
pub control_plane_emergency_mode: bool,
}
/// We do not want to store this in a PageServerConf because the latter may be logged
@@ -242,45 +233,6 @@ impl<T> BuilderValue<T> {
}
}
#[derive(Clone, PartialEq, Eq)]
pub struct PageserverConfWorkdir {
d: Utf8PathBuf,
}
impl Deref for PageServerConf {
type Target = PageserverConfWorkdir;
fn deref(&self) -> &Self::Target {
&self.workdir
}
}
impl Deref for PageserverConfWorkdir {
type Target = Utf8Path;
fn deref(&self) -> &Self::Target {
&self.d
}
}
impl AsRef<std::path::Path> for PageserverConfWorkdir {
fn as_ref(&self) -> &std::path::Path {
self.d.as_ref()
}
}
impl std::fmt::Debug for PageserverConfWorkdir {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
self.d.fmt(f)
}
}
impl std::fmt::Display for PageserverConfWorkdir {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
self.d.fmt(f)
}
}
// needed to simplify config construction
struct PageServerConfigBuilder {
listen_pg_addr: BuilderValue<String>,
@@ -297,15 +249,15 @@ struct PageServerConfigBuilder {
page_cache_size: BuilderValue<usize>,
max_file_descriptors: BuilderValue<usize>,
workdir: BuilderValue<PageserverConfWorkdir>,
workdir: BuilderValue<PathBuf>,
pg_distrib_dir: BuilderValue<Utf8PathBuf>,
pg_distrib_dir: BuilderValue<PathBuf>,
http_auth_type: BuilderValue<AuthType>,
pg_auth_type: BuilderValue<AuthType>,
//
auth_validation_public_key_path: BuilderValue<Option<Utf8PathBuf>>,
auth_validation_public_key_path: BuilderValue<Option<PathBuf>>,
remote_storage_config: BuilderValue<Option<RemoteStorageConfig>>,
id: BuilderValue<NodeId>,
@@ -331,8 +283,6 @@ struct PageServerConfigBuilder {
background_task_maximum_delay: BuilderValue<Duration>,
control_plane_api: BuilderValue<Option<Url>>,
control_plane_api_token: BuilderValue<Option<SecretString>>,
control_plane_emergency_mode: BuilderValue<bool>,
}
impl Default for PageServerConfigBuilder {
@@ -350,14 +300,10 @@ impl Default for PageServerConfigBuilder {
superuser: Set(DEFAULT_SUPERUSER.to_string()),
page_cache_size: Set(DEFAULT_PAGE_CACHE_SIZE),
max_file_descriptors: Set(DEFAULT_MAX_FILE_DESCRIPTORS),
workdir: Set(PageserverConfWorkdir {
d: Utf8PathBuf::new(),
}),
pg_distrib_dir: Set(Utf8PathBuf::from_path_buf(
env::current_dir().expect("cannot access current directory"),
)
.expect("non-Unicode path")
.join("pg_install")),
workdir: Set(PathBuf::new()),
pg_distrib_dir: Set(env::current_dir()
.expect("cannot access current directory")
.join("pg_install")),
http_auth_type: Set(AuthType::Trust),
pg_auth_type: Set(AuthType::Trust),
auth_validation_public_key_path: Set(None),
@@ -401,8 +347,6 @@ impl Default for PageServerConfigBuilder {
.unwrap()),
control_plane_api: Set(None),
control_plane_api_token: Set(None),
control_plane_emergency_mode: Set(false),
}
}
}
@@ -440,11 +384,11 @@ impl PageServerConfigBuilder {
self.max_file_descriptors = BuilderValue::Set(max_file_descriptors)
}
pub fn workdir(&mut self, workdir: Utf8PathBuf) {
self.workdir = BuilderValue::Set(PageserverConfWorkdir { d: workdir })
pub fn workdir(&mut self, workdir: PathBuf) {
self.workdir = BuilderValue::Set(workdir)
}
pub fn pg_distrib_dir(&mut self, pg_distrib_dir: Utf8PathBuf) {
pub fn pg_distrib_dir(&mut self, pg_distrib_dir: PathBuf) {
self.pg_distrib_dir = BuilderValue::Set(pg_distrib_dir)
}
@@ -458,7 +402,7 @@ impl PageServerConfigBuilder {
pub fn auth_validation_public_key_path(
&mut self,
auth_validation_public_key_path: Option<Utf8PathBuf>,
auth_validation_public_key_path: Option<PathBuf>,
) {
self.auth_validation_public_key_path = BuilderValue::Set(auth_validation_public_key_path)
}
@@ -531,16 +475,8 @@ impl PageServerConfigBuilder {
self.background_task_maximum_delay = BuilderValue::Set(delay);
}
pub fn control_plane_api(&mut self, api: Option<Url>) {
self.control_plane_api = BuilderValue::Set(api)
}
pub fn control_plane_api_token(&mut self, token: Option<SecretString>) {
self.control_plane_api_token = BuilderValue::Set(token)
}
pub fn control_plane_emergency_mode(&mut self, enabled: bool) {
self.control_plane_emergency_mode = BuilderValue::Set(enabled)
pub fn control_plane_api(&mut self, api: Url) {
self.control_plane_api = BuilderValue::Set(Some(api))
}
pub fn build(self) -> anyhow::Result<PageServerConf> {
@@ -631,78 +567,43 @@ impl PageServerConfigBuilder {
control_plane_api: self
.control_plane_api
.ok_or(anyhow!("missing control_plane_api"))?,
control_plane_api_token: self
.control_plane_api_token
.ok_or(anyhow!("missing control_plane_api_token"))?,
control_plane_emergency_mode: self
.control_plane_emergency_mode
.ok_or(anyhow!("missing control_plane_emergency_mode"))?,
})
}
}
impl PageserverConfWorkdir {
impl PageServerConf {
//
// Repository paths, relative to workdir.
//
pub fn tenants_path(&self) -> Utf8PathBuf {
self.d.join(TENANTS_SEGMENT_NAME)
pub fn tenants_path(&self) -> PathBuf {
self.workdir.join(TENANTS_SEGMENT_NAME)
}
pub fn deletion_prefix(&self) -> Utf8PathBuf {
self.d.join("deletion")
}
pub fn deletion_list_path(&self, sequence: u64) -> Utf8PathBuf {
// Encode a version in the filename, so that if we ever switch away from JSON we can
// increment this.
const VERSION: u8 = 1;
self.deletion_prefix()
.join(format!("{sequence:016x}-{VERSION:02x}.list"))
}
pub fn deletion_header_path(&self) -> Utf8PathBuf {
// Encode a version in the filename, so that if we ever switch away from JSON we can
// increment this.
const VERSION: u8 = 1;
self.deletion_prefix().join(format!("header-{VERSION:02x}"))
}
pub fn tenant_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
pub fn tenant_path(&self, tenant_id: &TenantId) -> PathBuf {
self.tenants_path().join(tenant_id.to_string())
}
pub fn tenant_attaching_mark_file_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
pub fn tenant_attaching_mark_file_path(&self, tenant_id: &TenantId) -> PathBuf {
self.tenant_path(tenant_id)
.join(TENANT_ATTACHING_MARKER_FILENAME)
}
pub fn tenant_ignore_mark_file_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
pub fn tenant_ignore_mark_file_path(&self, tenant_id: &TenantId) -> PathBuf {
self.tenant_path(tenant_id).join(IGNORED_TENANT_FILE_NAME)
}
/// Points to a place in pageserver's local directory,
/// where certain tenant's tenantconf file should be located.
///
/// Legacy: superseded by tenant_location_config_path. Eventually
/// remove this function.
pub fn tenant_config_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
pub fn tenant_config_path(&self, tenant_id: &TenantId) -> PathBuf {
self.tenant_path(tenant_id).join(TENANT_CONFIG_NAME)
}
pub fn tenant_location_config_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
self.tenant_path(tenant_id)
.join(TENANT_LOCATION_CONFIG_NAME)
}
pub fn timelines_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
pub fn timelines_path(&self, tenant_id: &TenantId) -> PathBuf {
self.tenant_path(tenant_id).join(TIMELINES_SEGMENT_NAME)
}
pub fn timeline_path(&self, tenant_id: &TenantId, timeline_id: &TimelineId) -> Utf8PathBuf {
pub fn timeline_path(&self, tenant_id: &TenantId, timeline_id: &TimelineId) -> PathBuf {
self.timelines_path(tenant_id).join(timeline_id.to_string())
}
@@ -710,7 +611,7 @@ impl PageserverConfWorkdir {
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> Utf8PathBuf {
) -> PathBuf {
path_with_suffix_extension(
self.timeline_path(&tenant_id, &timeline_id),
TIMELINE_UNINIT_MARK_SUFFIX,
@@ -721,20 +622,20 @@ impl PageserverConfWorkdir {
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> Utf8PathBuf {
) -> PathBuf {
path_with_suffix_extension(
self.timeline_path(&tenant_id, &timeline_id),
TIMELINE_DELETE_MARK_SUFFIX,
)
}
pub fn tenant_deleted_mark_file_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
pub fn tenant_deleted_mark_file_path(&self, tenant_id: &TenantId) -> PathBuf {
self.tenant_path(tenant_id)
.join(TENANT_DELETED_MARKER_FILE_NAME)
}
pub fn traces_path(&self) -> Utf8PathBuf {
self.d.join("traces")
pub fn traces_path(&self) -> PathBuf {
self.workdir.join("traces")
}
pub fn trace_path(
@@ -742,7 +643,7 @@ impl PageserverConfWorkdir {
tenant_id: &TenantId,
timeline_id: &TimelineId,
connection_id: &ConnectionId,
) -> Utf8PathBuf {
) -> PathBuf {
self.traces_path()
.join(tenant_id.to_string())
.join(timeline_id.to_string())
@@ -751,22 +652,20 @@ impl PageserverConfWorkdir {
/// Points to a place in pageserver's local directory,
/// where certain timeline's metadata file should be located.
pub fn metadata_path(&self, tenant_id: &TenantId, timeline_id: &TimelineId) -> Utf8PathBuf {
pub fn metadata_path(&self, tenant_id: &TenantId, timeline_id: &TimelineId) -> PathBuf {
self.timeline_path(tenant_id, timeline_id)
.join(METADATA_FILE_NAME)
}
/// Turns storage remote path of a file into its local path.
pub fn local_path(&self, remote_path: &RemotePath) -> Utf8PathBuf {
remote_path.with_base(&self.d)
pub fn local_path(&self, remote_path: &RemotePath) -> PathBuf {
remote_path.with_base(&self.workdir)
}
}
impl PageServerConf {
//
// Postgres distribution paths
//
pub fn pg_distrib_dir(&self, pg_version: u32) -> anyhow::Result<Utf8PathBuf> {
pub fn pg_distrib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
let path = self.pg_distrib_dir.clone();
#[allow(clippy::manual_range_patterns)]
@@ -776,10 +675,10 @@ impl PageServerConf {
}
}
pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result<Utf8PathBuf> {
pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
Ok(self.pg_distrib_dir(pg_version)?.join("bin"))
}
pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result<Utf8PathBuf> {
pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
Ok(self.pg_distrib_dir(pg_version)?.join("lib"))
}
@@ -787,7 +686,7 @@ impl PageServerConf {
/// validating the input and failing on errors.
///
/// This leaves any options not present in the file in the built-in defaults.
pub fn parse_and_validate(toml: &Document, workdir: &Utf8Path) -> anyhow::Result<Self> {
pub fn parse_and_validate(toml: &Document, workdir: &Path) -> anyhow::Result<Self> {
let mut builder = PageServerConfigBuilder::default();
builder.workdir(workdir.to_owned());
@@ -806,10 +705,10 @@ impl PageServerConf {
builder.max_file_descriptors(parse_toml_u64(key, item)? as usize)
}
"pg_distrib_dir" => {
builder.pg_distrib_dir(Utf8PathBuf::from(parse_toml_string(key, item)?))
builder.pg_distrib_dir(PathBuf::from(parse_toml_string(key, item)?))
}
"auth_validation_public_key_path" => builder.auth_validation_public_key_path(Some(
Utf8PathBuf::from(parse_toml_string(key, item)?),
PathBuf::from(parse_toml_string(key, item)?),
)),
"http_auth_type" => builder.http_auth_type(parse_toml_from_str(key, item)?),
"pg_auth_type" => builder.pg_auth_type(parse_toml_from_str(key, item)?),
@@ -848,26 +747,7 @@ impl PageServerConf {
},
"ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?),
"background_task_maximum_delay" => builder.background_task_maximum_delay(parse_toml_duration(key, item)?),
"control_plane_api" => {
let parsed = parse_toml_string(key, item)?;
if parsed.is_empty() {
builder.control_plane_api(None)
} else {
builder.control_plane_api(Some(parsed.parse().context("failed to parse control plane URL")?))
}
},
"control_plane_api_token" => {
let parsed = parse_toml_string(key, item)?;
if parsed.is_empty() {
builder.control_plane_api_token(None)
} else {
builder.control_plane_api_token(Some(parsed.into()))
}
},
"control_plane_emergency_mode" => {
builder.control_plane_emergency_mode(parse_toml_bool(key, item)?)
},
"control_plane_api" => builder.control_plane_api(parse_toml_string(key, item)?.parse().context("failed to parse control plane URL")?),
_ => bail!("unrecognized pageserver option '{key}'"),
}
}
@@ -881,7 +761,8 @@ impl PageServerConf {
ensure!(
auth_validation_public_key_path.exists(),
format!(
"Can't find auth_validation_public_key at '{auth_validation_public_key_path}'",
"Can't find auth_validation_public_key at '{}'",
auth_validation_public_key_path.display()
)
);
}
@@ -997,15 +878,12 @@ impl PageServerConf {
}
#[cfg(test)]
pub fn test_repo_dir(test_name: &str) -> Utf8PathBuf {
Utf8PathBuf::from(format!("../tmp_check/test_{test_name}"))
pub fn test_repo_dir(test_name: &str) -> PathBuf {
PathBuf::from(format!("../tmp_check/test_{test_name}"))
}
pub fn dummy_conf(repo_dir: Utf8PathBuf) -> Self {
let repo_dir = PageserverConfWorkdir { d: repo_dir };
let pg_distrib_dir = Utf8PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../pg_install");
pub fn dummy_conf(repo_dir: PathBuf) -> Self {
let pg_distrib_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../pg_install");
PageServerConf {
id: NodeId(0),
@@ -1039,8 +917,6 @@ impl PageServerConf {
ondemand_download_behavior_treat_error_as_warn: false,
background_task_maximum_delay: Duration::ZERO,
control_plane_api: None,
control_plane_api_token: None,
control_plane_emergency_mode: false,
}
}
}
@@ -1173,8 +1049,8 @@ mod tests {
num::{NonZeroU32, NonZeroUsize},
};
use camino_tempfile::{tempdir, Utf8TempDir};
use remote_storage::{RemoteStorageKind, S3Config};
use tempfile::{tempdir, TempDir};
use utils::serde_percent::Percent;
use super::*;
@@ -1213,7 +1089,8 @@ background_task_maximum_delay = '334 s'
let broker_endpoint = storage_broker::DEFAULT_ENDPOINT;
// we have to create dummy values to overcome the validation errors
let config_string = format!(
"pg_distrib_dir='{pg_distrib_dir}'\nid=10\nbroker_endpoint = '{broker_endpoint}'",
"pg_distrib_dir='{}'\nid=10\nbroker_endpoint = '{broker_endpoint}'",
pg_distrib_dir.display()
);
let toml = config_string.parse()?;
@@ -1263,9 +1140,7 @@ background_task_maximum_delay = '334 s'
background_task_maximum_delay: humantime::parse_duration(
defaults::DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY
)?,
control_plane_api: None,
control_plane_api_token: None,
control_plane_emergency_mode: false
control_plane_api: None
},
"Correct defaults should be used when no config values are provided"
);
@@ -1280,7 +1155,8 @@ background_task_maximum_delay = '334 s'
let broker_endpoint = storage_broker::DEFAULT_ENDPOINT;
let config_string = format!(
"{ALL_BASE_VALUES_TOML}pg_distrib_dir='{pg_distrib_dir}'\nbroker_endpoint = '{broker_endpoint}'",
"{ALL_BASE_VALUES_TOML}pg_distrib_dir='{}'\nbroker_endpoint = '{broker_endpoint}'",
pg_distrib_dir.display()
);
let toml = config_string.parse()?;
@@ -1320,9 +1196,7 @@ background_task_maximum_delay = '334 s'
test_remote_failures: 0,
ondemand_download_behavior_treat_error_as_warn: false,
background_task_maximum_delay: Duration::from_secs(334),
control_plane_api: None,
control_plane_api_token: None,
control_plane_emergency_mode: false
control_plane_api: None
},
"Should be able to parse all basic config values correctly"
);
@@ -1341,18 +1215,23 @@ background_task_maximum_delay = '334 s'
let identical_toml_declarations = &[
format!(
r#"[remote_storage]
local_path = '{local_storage_path}'"#,
local_path = '{}'"#,
local_storage_path.display()
),
format!(
"remote_storage={{local_path='{}'}}",
local_storage_path.display()
),
format!("remote_storage={{local_path='{local_storage_path}'}}"),
];
for remote_storage_config_str in identical_toml_declarations {
let config_string = format!(
r#"{ALL_BASE_VALUES_TOML}
pg_distrib_dir='{pg_distrib_dir}'
pg_distrib_dir='{}'
broker_endpoint = '{broker_endpoint}'
{remote_storage_config_str}"#,
pg_distrib_dir.display(),
);
let toml = config_string.parse()?;
@@ -1415,10 +1294,11 @@ concurrency_limit = {s3_concurrency_limit}"#
for remote_storage_config_str in identical_toml_declarations {
let config_string = format!(
r#"{ALL_BASE_VALUES_TOML}
pg_distrib_dir='{pg_distrib_dir}'
pg_distrib_dir='{}'
broker_endpoint = '{broker_endpoint}'
{remote_storage_config_str}"#,
pg_distrib_dir.display(),
);
let toml = config_string.parse()?;
@@ -1460,11 +1340,12 @@ broker_endpoint = '{broker_endpoint}'
let config_string = format!(
r#"{ALL_BASE_VALUES_TOML}
pg_distrib_dir='{pg_distrib_dir}'
pg_distrib_dir='{}'
broker_endpoint = '{broker_endpoint}'
[tenant_config]
trace_read_requests = {trace_read_requests}"#,
pg_distrib_dir.display(),
);
let toml = config_string.parse()?;
@@ -1484,7 +1365,7 @@ trace_read_requests = {trace_read_requests}"#,
let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
let pageserver_conf_toml = format!(
r#"pg_distrib_dir = "{pg_distrib_dir}"
r#"pg_distrib_dir = "{}"
metric_collection_endpoint = "http://sample.url"
metric_collection_interval = "10min"
id = 222
@@ -1502,6 +1383,7 @@ kind = "LayerAccessThreshold"
period = "20m"
threshold = "20m"
"#,
pg_distrib_dir.display(),
);
let toml: Document = pageserver_conf_toml.parse()?;
let conf = PageServerConf::parse_and_validate(&toml, &workdir)?;
@@ -1542,12 +1424,10 @@ threshold = "20m"
Ok(())
}
fn prepare_fs(tempdir: &Utf8TempDir) -> anyhow::Result<(PageserverConfWorkdir, Utf8PathBuf)> {
fn prepare_fs(tempdir: &TempDir) -> anyhow::Result<(PathBuf, PathBuf)> {
let tempdir_path = tempdir.path();
let workdir = PageserverConfWorkdir {
d: tempdir_path.join("workdir"),
};
let workdir = tempdir_path.join("workdir");
fs::create_dir_all(&workdir)?;
let pg_distrib_dir = tempdir_path.join("pg_distrib");

View File

@@ -3,11 +3,11 @@
use crate::context::{DownloadBehavior, RequestContext};
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
use crate::tenant::{mgr, LogicalSizeCalculationCause};
use camino::Utf8PathBuf;
use consumption_metrics::EventType;
use pageserver_api::models::TenantState;
use reqwest::Url;
use std::collections::HashMap;
use std::path::PathBuf;
use std::sync::Arc;
use std::time::{Duration, SystemTime};
use tracing::*;
@@ -41,7 +41,7 @@ pub async fn collect_metrics(
_cached_metric_collection_interval: Duration,
synthetic_size_calculation_interval: Duration,
node_id: NodeId,
local_disk_storage: Utf8PathBuf,
local_disk_storage: PathBuf,
ctx: RequestContext,
) -> anyhow::Result<()> {
if _cached_metric_collection_interval != Duration::ZERO {
@@ -68,7 +68,7 @@ pub async fn collect_metrics(
},
);
let path: Arc<Utf8PathBuf> = Arc::new(local_disk_storage);
let path: Arc<PathBuf> = Arc::new(local_disk_storage);
let cancel = task_mgr::shutdown_token();
@@ -153,7 +153,7 @@ pub async fn collect_metrics(
///
/// Cancellation safe.
async fn restore_and_reschedule(
path: &Arc<Utf8PathBuf>,
path: &Arc<PathBuf>,
metric_collection_interval: Duration,
) -> Cache {
let (cached, earlier_metric_at) = match disk_cache::read_metrics_from_disk(path.clone()).await {

View File

@@ -1,12 +1,10 @@
use anyhow::Context;
use camino::{Utf8Path, Utf8PathBuf};
use std::path::PathBuf;
use std::sync::Arc;
use super::RawMetric;
pub(super) async fn read_metrics_from_disk(
path: Arc<Utf8PathBuf>,
) -> anyhow::Result<Vec<RawMetric>> {
pub(super) async fn read_metrics_from_disk(path: Arc<PathBuf>) -> anyhow::Result<Vec<RawMetric>> {
// do not add context to each error, callsite will log with full path
let span = tracing::Span::current();
tokio::task::spawn_blocking(move || {
@@ -27,10 +25,10 @@ pub(super) async fn read_metrics_from_disk(
.and_then(|x| x)
}
fn scan_and_delete_with_same_prefix(path: &Utf8Path) -> std::io::Result<()> {
fn scan_and_delete_with_same_prefix(path: &std::path::Path) -> std::io::Result<()> {
let it = std::fs::read_dir(path.parent().expect("caller checked"))?;
let prefix = path.file_name().expect("caller checked").to_string();
let prefix = path.file_name().expect("caller checked").to_string_lossy();
for entry in it {
let entry = entry?;
@@ -64,7 +62,7 @@ fn scan_and_delete_with_same_prefix(path: &Utf8Path) -> std::io::Result<()> {
pub(super) async fn flush_metrics_to_disk(
current_metrics: &Arc<Vec<RawMetric>>,
path: &Arc<Utf8PathBuf>,
path: &Arc<PathBuf>,
) -> anyhow::Result<()> {
use std::io::Write;
@@ -83,7 +81,7 @@ pub(super) async fn flush_metrics_to_disk(
let parent = path.parent().expect("existence checked");
let file_name = path.file_name().expect("existence checked");
let mut tempfile = camino_tempfile::Builder::new()
let mut tempfile = tempfile::Builder::new()
.prefix(file_name)
.suffix(".tmp")
.tempfile_in(parent)?;

View File

@@ -1,9 +1,7 @@
use std::collections::HashMap;
use pageserver_api::control_api::{
ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse,
};
use serde::{de::DeserializeOwned, Serialize};
use hyper::StatusCode;
use pageserver_api::control_api::{ReAttachRequest, ReAttachResponse};
use tokio_util::sync::CancellationToken;
use url::Url;
use utils::{
@@ -14,34 +12,25 @@ use utils::{
use crate::config::PageServerConf;
// Backoffs when control plane requests do not succeed: compromise between reducing load
// on control plane, and retrying frequently when we are blocked on a control plane
// response to make progress.
const BACKOFF_INCREMENT: f64 = 0.1;
const BACKOFF_MAX: f64 = 10.0;
/// The Pageserver's client for using the control plane API: this is a small subset
/// of the overall control plane API, for dealing with generations (see docs/rfcs/025-generation-numbers.md)
pub struct ControlPlaneClient {
pub(crate) struct ControlPlaneClient {
http_client: reqwest::Client,
base_url: Url,
node_id: NodeId,
cancel: CancellationToken,
}
/// Represent operations which internally retry on all errors other than
/// cancellation token firing: the only way they can fail is ShuttingDown.
pub enum RetryForeverError {
ShuttingDown,
}
#[async_trait::async_trait]
pub trait ControlPlaneGenerationsApi {
async fn re_attach(&self) -> Result<HashMap<TenantId, Generation>, RetryForeverError>;
async fn validate(
&self,
tenants: Vec<(TenantId, Generation)>,
) -> Result<HashMap<TenantId, bool>, RetryForeverError>;
}
impl ControlPlaneClient {
/// A None return value indicates that the input `conf` object does not have control
/// plane API enabled.
pub fn new(conf: &'static PageServerConf, cancel: &CancellationToken) -> Option<Self> {
pub(crate) fn new(conf: &'static PageServerConf, cancel: &CancellationToken) -> Option<Self> {
let mut url = match conf.control_plane_api.as_ref() {
Some(u) => u.clone(),
None => return None,
@@ -53,78 +42,39 @@ impl ControlPlaneClient {
segs.pop_if_empty().push("");
}
let mut client = reqwest::ClientBuilder::new();
if let Some(jwt) = &conf.control_plane_api_token {
let mut headers = hyper::HeaderMap::new();
headers.insert("Authorization", jwt.get_contents().parse().unwrap());
client = client.default_headers(headers);
}
let client = reqwest::ClientBuilder::new()
.build()
.expect("Failed to construct http client");
Some(Self {
http_client: client.build().expect("Failed to construct HTTP client"),
http_client: client,
base_url: url,
node_id: conf.id,
cancel: cancel.clone(),
})
}
async fn retry_http_forever<R, T>(
async fn try_re_attach(
&self,
url: &url::Url,
request: R,
) -> Result<T, RetryForeverError>
where
R: Serialize,
T: DeserializeOwned,
{
#[derive(thiserror::Error, Debug)]
enum RemoteAttemptError {
#[error("shutdown")]
Shutdown,
#[error("remote: {0}")]
Remote(reqwest::Error),
}
match backoff::retry(
|| async {
let response = self
.http_client
.post(url.clone())
.json(&request)
.send()
.await
.map_err(RemoteAttemptError::Remote)?;
response
.error_for_status_ref()
.map_err(RemoteAttemptError::Remote)?;
response
.json::<T>()
.await
.map_err(RemoteAttemptError::Remote)
},
|_| false,
3,
u32::MAX,
"calling control plane generation validation API",
backoff::Cancel::new(self.cancel.clone(), || RemoteAttemptError::Shutdown),
)
.await
{
Err(RemoteAttemptError::Shutdown) => Err(RetryForeverError::ShuttingDown),
Err(RemoteAttemptError::Remote(_)) => {
panic!("We retry forever, this should never be reached");
url: Url,
request: &ReAttachRequest,
) -> anyhow::Result<ReAttachResponse> {
match self.http_client.post(url).json(request).send().await {
Err(e) => Err(anyhow::Error::from(e)),
Ok(r) => {
if r.status() == StatusCode::OK {
r.json::<ReAttachResponse>()
.await
.map_err(anyhow::Error::from)
} else {
Err(anyhow::anyhow!("Unexpected status {}", r.status()))
}
}
Ok(r) => Ok(r),
}
}
}
#[async_trait::async_trait]
impl ControlPlaneGenerationsApi for ControlPlaneClient {
/// Block until we get a successful response, or error out if we are shut down
async fn re_attach(&self) -> Result<HashMap<TenantId, Generation>, RetryForeverError> {
/// Block until we get a successful response
pub(crate) async fn re_attach(&self) -> anyhow::Result<HashMap<TenantId, Generation>> {
let re_attach_path = self
.base_url
.join("re-attach")
@@ -133,51 +83,37 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
node_id: self.node_id,
};
fail::fail_point!("control-plane-client-re-attach");
let mut attempt = 0;
loop {
let result = self.try_re_attach(re_attach_path.clone(), &request).await;
match result {
Ok(res) => {
tracing::info!(
"Received re-attach response with {} tenants",
res.tenants.len()
);
let response: ReAttachResponse = self.retry_http_forever(&re_attach_path, request).await?;
tracing::info!(
"Received re-attach response with {} tenants",
response.tenants.len()
);
Ok(response
.tenants
.into_iter()
.map(|t| (t.id, Generation::new(t.generation)))
.collect::<HashMap<_, _>>())
}
/// Block until we get a successful response, or error out if we are shut down
async fn validate(
&self,
tenants: Vec<(TenantId, Generation)>,
) -> Result<HashMap<TenantId, bool>, RetryForeverError> {
let re_attach_path = self
.base_url
.join("validate")
.expect("Failed to build validate path");
let request = ValidateRequest {
tenants: tenants
.into_iter()
.map(|(id, gen)| ValidateRequestTenant {
id,
gen: gen
.into()
.expect("Generation should always be valid for a Tenant doing deletions"),
})
.collect(),
};
fail::fail_point!("control-plane-client-validate");
let response: ValidateResponse = self.retry_http_forever(&re_attach_path, request).await?;
Ok(response
.tenants
.into_iter()
.map(|rt| (rt.id, rt.valid))
.collect())
return Ok(res
.tenants
.into_iter()
.map(|t| (t.id, Generation::new(t.generation)))
.collect::<HashMap<_, _>>());
}
Err(e) => {
tracing::error!("Error re-attaching tenants, retrying: {e:#}");
backoff::exponential_backoff(
attempt,
BACKOFF_INCREMENT,
BACKOFF_MAX,
&self.cancel,
)
.await;
if self.cancel.is_cancelled() {
return Err(anyhow::anyhow!("Shutting down"));
}
attempt += 1;
}
}
}
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -1,156 +0,0 @@
//! The deleter is the final stage in the deletion queue. It accumulates remote
//! paths to delete, and periodically executes them in batches of up to 1000
//! using the DeleteObjects request.
//!
//! Its purpose is to increase efficiency of remote storage I/O by issuing a smaller
//! number of full-sized DeleteObjects requests, rather than a larger number of
//! smaller requests.
use remote_storage::GenericRemoteStorage;
use remote_storage::RemotePath;
use remote_storage::MAX_KEYS_PER_DELETE;
use std::time::Duration;
use tokio_util::sync::CancellationToken;
use tracing::info;
use tracing::warn;
use crate::metrics;
use super::DeletionQueueError;
use super::FlushOp;
const AUTOFLUSH_INTERVAL: Duration = Duration::from_secs(10);
pub(super) enum DeleterMessage {
Delete(Vec<RemotePath>),
Flush(FlushOp),
}
/// Non-persistent deletion queue, for coalescing multiple object deletes into
/// larger DeleteObjects requests.
pub(super) struct Deleter {
// Accumulate up to 1000 keys for the next deletion operation
accumulator: Vec<RemotePath>,
rx: tokio::sync::mpsc::Receiver<DeleterMessage>,
cancel: CancellationToken,
remote_storage: GenericRemoteStorage,
}
impl Deleter {
pub(super) fn new(
remote_storage: GenericRemoteStorage,
rx: tokio::sync::mpsc::Receiver<DeleterMessage>,
cancel: CancellationToken,
) -> Self {
Self {
remote_storage,
rx,
cancel,
accumulator: Vec::new(),
}
}
/// Wrap the remote `delete_objects` with a failpoint
async fn remote_delete(&self) -> Result<(), anyhow::Error> {
fail::fail_point!("deletion-queue-before-execute", |_| {
info!("Skipping execution, failpoint set");
metrics::DELETION_QUEUE
.remote_errors
.with_label_values(&["failpoint"])
.inc();
Err(anyhow::anyhow!("failpoint hit"))
});
self.remote_storage.delete_objects(&self.accumulator).await
}
/// Block until everything in accumulator has been executed
async fn flush(&mut self) -> Result<(), DeletionQueueError> {
while !self.accumulator.is_empty() && !self.cancel.is_cancelled() {
match self.remote_delete().await {
Ok(()) => {
// Note: we assume that the remote storage layer returns Ok(()) if some
// or all of the deleted objects were already gone.
metrics::DELETION_QUEUE
.keys_executed
.inc_by(self.accumulator.len() as u64);
info!(
"Executed deletion batch {}..{}",
self.accumulator
.first()
.expect("accumulator should be non-empty"),
self.accumulator
.last()
.expect("accumulator should be non-empty"),
);
self.accumulator.clear();
}
Err(e) => {
warn!("DeleteObjects request failed: {e:#}, will retry");
metrics::DELETION_QUEUE
.remote_errors
.with_label_values(&["execute"])
.inc();
}
};
}
if self.cancel.is_cancelled() {
// Expose an error because we may not have actually flushed everything
Err(DeletionQueueError::ShuttingDown)
} else {
Ok(())
}
}
pub(super) async fn background(&mut self) -> Result<(), DeletionQueueError> {
self.accumulator.reserve(MAX_KEYS_PER_DELETE);
loop {
if self.cancel.is_cancelled() {
return Err(DeletionQueueError::ShuttingDown);
}
let msg = match tokio::time::timeout(AUTOFLUSH_INTERVAL, self.rx.recv()).await {
Ok(Some(m)) => m,
Ok(None) => {
// All queue senders closed
info!("Shutting down");
return Err(DeletionQueueError::ShuttingDown);
}
Err(_) => {
// Timeout, we hit deadline to execute whatever we have in hand. These functions will
// return immediately if no work is pending
self.flush().await?;
continue;
}
};
match msg {
DeleterMessage::Delete(mut list) => {
while !list.is_empty() || self.accumulator.len() == MAX_KEYS_PER_DELETE {
if self.accumulator.len() == MAX_KEYS_PER_DELETE {
self.flush().await?;
// If we have received this number of keys, proceed with attempting to execute
assert_eq!(self.accumulator.len(), 0);
}
let available_slots = MAX_KEYS_PER_DELETE - self.accumulator.len();
let take_count = std::cmp::min(available_slots, list.len());
for path in list.drain(list.len() - take_count..) {
self.accumulator.push(path);
}
}
}
DeleterMessage::Flush(flush_op) => {
// If flush() errors, we drop the flush_op and the caller will get
// an error recv()'ing their oneshot channel.
self.flush().await?;
flush_op.notify();
}
}
}
}
}

View File

@@ -1,479 +0,0 @@
//! The list writer is the first stage in the deletion queue. It accumulates
//! layers to delete, and periodically writes out these layers into a persistent
//! DeletionList.
//!
//! The purpose of writing DeletionLists is to decouple the decision to
//! delete an object from the validation required to execute it: even if
//! validation is not possible, e.g. due to a control plane outage, we can
//! still persist our intent to delete an object, in a way that would
//! survive a restart.
//!
//! DeletionLists are passed onwards to the Validator.
use super::DeletionHeader;
use super::DeletionList;
use super::FlushOp;
use super::ValidatorQueueMessage;
use std::collections::HashMap;
use std::fs::create_dir_all;
use std::time::Duration;
use regex::Regex;
use remote_storage::RemotePath;
use tokio_util::sync::CancellationToken;
use tracing::debug;
use tracing::info;
use tracing::warn;
use utils::generation::Generation;
use utils::id::TenantId;
use utils::id::TimelineId;
use crate::config::PageServerConf;
use crate::deletion_queue::TEMP_SUFFIX;
use crate::metrics;
use crate::tenant::remote_timeline_client::remote_layer_path;
use crate::tenant::storage_layer::LayerFileName;
// The number of keys in a DeletionList before we will proactively persist it
// (without reaching a flush deadline). This aims to deliver objects of the order
// of magnitude 1MB when we are under heavy delete load.
const DELETION_LIST_TARGET_SIZE: usize = 16384;
// Ordinarily, we only flush to DeletionList periodically, to bound the window during
// which we might leak objects from not flushing a DeletionList after
// the objects are already unlinked from timeline metadata.
const FRONTEND_DEFAULT_TIMEOUT: Duration = Duration::from_millis(10000);
// If someone is waiting for a flush to DeletionList, only delay a little to accumulate
// more objects before doing the flush.
const FRONTEND_FLUSHING_TIMEOUT: Duration = Duration::from_millis(100);
#[derive(Debug)]
pub(super) struct DeletionOp {
pub(super) tenant_id: TenantId,
pub(super) timeline_id: TimelineId,
// `layers` and `objects` are both just lists of objects. `layers` is used if you do not
// have a config object handy to project it to a remote key, and need the consuming worker
// to do it for you.
pub(super) layers: Vec<(LayerFileName, Generation)>,
pub(super) objects: Vec<RemotePath>,
/// The _current_ generation of the Tenant attachment in which we are enqueuing
/// this deletion.
pub(super) generation: Generation,
}
#[derive(Debug)]
pub(super) struct RecoverOp {
pub(super) attached_tenants: HashMap<TenantId, Generation>,
}
#[derive(Debug)]
pub(super) enum ListWriterQueueMessage {
Delete(DeletionOp),
// Wait until all prior deletions make it into a persistent DeletionList
Flush(FlushOp),
// Wait until all prior deletions have been executed (i.e. objects are actually deleted)
FlushExecute(FlushOp),
// Call once after re-attaching to control plane, to notify the deletion queue about
// latest attached generations & load any saved deletion lists from disk.
Recover(RecoverOp),
}
pub(super) struct ListWriter {
conf: &'static PageServerConf,
// Incoming frontend requests to delete some keys
rx: tokio::sync::mpsc::Receiver<ListWriterQueueMessage>,
// Outbound requests to the backend to execute deletion lists we have composed.
tx: tokio::sync::mpsc::Sender<ValidatorQueueMessage>,
// The list we are currently building, contains a buffer of keys to delete
// and our next sequence number
pending: DeletionList,
// These FlushOps should notify the next time we flush
pending_flushes: Vec<FlushOp>,
// Worker loop is torn down when this fires.
cancel: CancellationToken,
// Safety guard to do recovery exactly once
recovered: bool,
}
impl ListWriter {
// Initially DeletionHeader.validated_sequence is zero. The place we start our
// sequence numbers must be higher than that.
const BASE_SEQUENCE: u64 = 1;
pub(super) fn new(
conf: &'static PageServerConf,
rx: tokio::sync::mpsc::Receiver<ListWriterQueueMessage>,
tx: tokio::sync::mpsc::Sender<ValidatorQueueMessage>,
cancel: CancellationToken,
) -> Self {
Self {
pending: DeletionList::new(Self::BASE_SEQUENCE),
conf,
rx,
tx,
pending_flushes: Vec::new(),
cancel,
recovered: false,
}
}
/// Try to flush `list` to persistent storage
///
/// This does not return errors, because on failure to flush we do not lose
/// any state: flushing will be retried implicitly on the next deadline
async fn flush(&mut self) {
if self.pending.is_empty() {
for f in self.pending_flushes.drain(..) {
f.notify();
}
return;
}
match self.pending.save(self.conf).await {
Ok(_) => {
info!(sequence = self.pending.sequence, "Stored deletion list");
for f in self.pending_flushes.drain(..) {
f.notify();
}
// Take the list we've accumulated, replace it with a fresh list for the next sequence
let next_list = DeletionList::new(self.pending.sequence + 1);
let list = std::mem::replace(&mut self.pending, next_list);
if let Err(e) = self.tx.send(ValidatorQueueMessage::Delete(list)).await {
// This is allowed to fail: it will only happen if the backend worker is shut down,
// so we can just drop this on the floor.
info!("Deletion list dropped, this is normal during shutdown ({e:#})");
}
}
Err(e) => {
metrics::DELETION_QUEUE.unexpected_errors.inc();
warn!(
sequence = self.pending.sequence,
"Failed to write deletion list, will retry later ({e:#})"
);
}
}
}
/// Load the header, to learn the sequence number up to which deletions
/// have been validated. We will apply validated=true to DeletionLists
/// <= this sequence when loading them.
///
/// It is not an error for the header to not exist: we return None, and
/// the caller should act as if validated_sequence is 0
async fn load_validated_sequence(&self) -> Result<Option<u64>, anyhow::Error> {
let header_path = self.conf.deletion_header_path();
match tokio::fs::read(&header_path).await {
Ok(header_bytes) => {
match serde_json::from_slice::<DeletionHeader>(&header_bytes) {
Ok(h) => Ok(Some(h.validated_sequence)),
Err(e) => {
warn!(
"Failed to deserialize deletion header, ignoring {header_path}: {e:#}",
);
// This should never happen unless we make a mistake with our serialization.
// Ignoring a deletion header is not consequential for correctnes because all deletions
// are ultimately allowed to fail: worst case we leak some objects for the scrubber to clean up.
metrics::DELETION_QUEUE.unexpected_errors.inc();
Ok(None)
}
}
}
Err(e) => {
if e.kind() == std::io::ErrorKind::NotFound {
debug!("Deletion header {header_path} not found, first start?");
Ok(None)
} else {
Err(anyhow::anyhow!(e))
}
}
}
}
async fn recover(
&mut self,
attached_tenants: HashMap<TenantId, Generation>,
) -> Result<(), anyhow::Error> {
debug!(
"recovering with {} attached tenants",
attached_tenants.len()
);
// Load the header
let validated_sequence = self.load_validated_sequence().await?.unwrap_or(0);
self.pending.sequence = validated_sequence + 1;
let deletion_directory = self.conf.deletion_prefix();
let mut dir = match tokio::fs::read_dir(&deletion_directory).await {
Ok(d) => d,
Err(e) => {
warn!("Failed to open deletion list directory {deletion_directory}: {e:#}");
// Give up: if we can't read the deletion list directory, we probably can't
// write lists into it later, so the queue won't work.
return Err(e.into());
}
};
let list_name_pattern =
Regex::new("(?<sequence>[a-zA-Z0-9]{16})-(?<version>[a-zA-Z0-9]{2}).list").unwrap();
let temp_extension = format!(".{TEMP_SUFFIX}");
let header_path = self.conf.deletion_header_path();
let mut seqs: Vec<u64> = Vec::new();
while let Some(dentry) = dir.next_entry().await? {
let file_name = dentry.file_name();
let dentry_str = file_name.to_string_lossy();
if file_name == header_path.file_name().unwrap_or("") {
// Don't try and parse the header's name like a list
continue;
}
if dentry_str.ends_with(&temp_extension) {
info!("Cleaning up temporary file {dentry_str}");
let absolute_path =
deletion_directory.join(dentry.file_name().to_str().expect("non-Unicode path"));
if let Err(e) = tokio::fs::remove_file(&absolute_path).await {
// Non-fatal error: we will just leave the file behind but not
// try and load it.
warn!("Failed to clean up temporary file {absolute_path}: {e:#}");
}
continue;
}
let file_name = dentry.file_name().to_owned();
let basename = file_name.to_string_lossy();
let seq_part = if let Some(m) = list_name_pattern.captures(&basename) {
m.name("sequence")
.expect("Non optional group should be present")
.as_str()
} else {
warn!("Unexpected key in deletion queue: {basename}");
metrics::DELETION_QUEUE.unexpected_errors.inc();
continue;
};
let seq: u64 = match u64::from_str_radix(seq_part, 16) {
Ok(s) => s,
Err(e) => {
warn!("Malformed key '{basename}': {e}");
metrics::DELETION_QUEUE.unexpected_errors.inc();
continue;
}
};
seqs.push(seq);
}
seqs.sort();
// Start our next deletion list from after the last location validated by
// previous process lifetime, or after the last location found (it is updated
// below after enumerating the deletion lists)
self.pending.sequence = validated_sequence + 1;
if let Some(max_list_seq) = seqs.last() {
self.pending.sequence = std::cmp::max(self.pending.sequence, max_list_seq + 1);
}
for s in seqs {
let list_path = self.conf.deletion_list_path(s);
let list_bytes = tokio::fs::read(&list_path).await?;
let mut deletion_list = match serde_json::from_slice::<DeletionList>(&list_bytes) {
Ok(l) => l,
Err(e) => {
// Drop the list on the floor: any objects it referenced will be left behind
// for scrubbing to clean up. This should never happen unless we have a serialization bug.
warn!(sequence = s, "Failed to deserialize deletion list: {e}");
metrics::DELETION_QUEUE.unexpected_errors.inc();
continue;
}
};
if deletion_list.sequence <= validated_sequence {
// If the deletion list falls below valid_seq, we may assume that it was
// already validated the last time this pageserver ran. Otherwise, we still
// load it, as it may still contain content valid in this generation.
deletion_list.validated = true;
} else {
// Special case optimization: if a tenant is still attached, and no other
// generation was issued to another node in the interval while we restarted,
// then we may treat deletion lists from the previous generation as if they
// belong to our currently attached generation, and proceed to validate & execute.
for (tenant_id, tenant_list) in &mut deletion_list.tenants {
if let Some(attached_gen) = attached_tenants.get(tenant_id) {
if attached_gen.previous() == tenant_list.generation {
tenant_list.generation = *attached_gen;
}
}
}
}
info!(
validated = deletion_list.validated,
sequence = deletion_list.sequence,
"Recovered deletion list"
);
// We will drop out of recovery if this fails: it indicates that we are shutting down
// or the backend has panicked
metrics::DELETION_QUEUE
.keys_submitted
.inc_by(deletion_list.len() as u64);
self.tx
.send(ValidatorQueueMessage::Delete(deletion_list))
.await?;
}
info!(next_sequence = self.pending.sequence, "Replay complete");
Ok(())
}
/// This is the front-end ingest, where we bundle up deletion requests into DeletionList
/// and write them out, for later validation by the backend and execution by the executor.
pub(super) async fn background(&mut self) {
info!("Started deletion frontend worker");
// Synchronous, but we only do it once per process lifetime so it's tolerable
if let Err(e) = create_dir_all(&self.conf.deletion_prefix()) {
tracing::error!(
"Failed to create deletion list directory {}, deletions will not be executed ({e})",
self.conf.deletion_prefix(),
);
metrics::DELETION_QUEUE.unexpected_errors.inc();
return;
}
while !self.cancel.is_cancelled() {
let timeout = if self.pending_flushes.is_empty() {
FRONTEND_DEFAULT_TIMEOUT
} else {
FRONTEND_FLUSHING_TIMEOUT
};
let msg = match tokio::time::timeout(timeout, self.rx.recv()).await {
Ok(Some(msg)) => msg,
Ok(None) => {
// Queue sender destroyed, shutting down
break;
}
Err(_) => {
// Hit deadline, flush.
self.flush().await;
continue;
}
};
match msg {
ListWriterQueueMessage::Delete(op) => {
assert!(
self.recovered,
"Cannot process deletions before recovery. This is a bug."
);
debug!(
"Delete: ingesting {} layers, {} other objects",
op.layers.len(),
op.objects.len()
);
let mut layer_paths = Vec::new();
for (layer, generation) in op.layers {
layer_paths.push(remote_layer_path(
&op.tenant_id,
&op.timeline_id,
&layer,
generation,
));
}
layer_paths.extend(op.objects);
if !self.pending.push(
&op.tenant_id,
&op.timeline_id,
op.generation,
&mut layer_paths,
) {
self.flush().await;
let retry_succeeded = self.pending.push(
&op.tenant_id,
&op.timeline_id,
op.generation,
&mut layer_paths,
);
if !retry_succeeded {
// Unexpected: after we flush, we should have
// drained self.pending, so a conflict on
// generation numbers should be impossible.
tracing::error!(
"Failed to enqueue deletions, leaking objects. This is a bug."
);
metrics::DELETION_QUEUE.unexpected_errors.inc();
}
}
}
ListWriterQueueMessage::Flush(op) => {
if self.pending.is_empty() {
// Execute immediately
debug!("Flush: No pending objects, flushing immediately");
op.notify()
} else {
// Execute next time we flush
debug!("Flush: adding to pending flush list for next deadline flush");
self.pending_flushes.push(op);
}
}
ListWriterQueueMessage::FlushExecute(op) => {
debug!("FlushExecute: passing through to backend");
// We do not flush to a deletion list here: the client sends a Flush before the FlushExecute
if let Err(e) = self.tx.send(ValidatorQueueMessage::Flush(op)).await {
info!("Can't flush, shutting down ({e})");
// Caller will get error when their oneshot sender was dropped.
}
}
ListWriterQueueMessage::Recover(op) => {
if self.recovered {
tracing::error!(
"Deletion queue recovery called more than once. This is a bug."
);
metrics::DELETION_QUEUE.unexpected_errors.inc();
// Non-fatal: although this is a bug, since we did recovery at least once we may proceed.
continue;
}
if let Err(e) = self.recover(op.attached_tenants).await {
// This should only happen in truly unrecoverable cases, like the recovery finding that the backend
// queue receiver has been dropped, or something is critically broken with
// the local filesystem holding deletion lists.
info!(
"Deletion queue recover aborted, deletion queue will not proceed ({e})"
);
metrics::DELETION_QUEUE.unexpected_errors.inc();
return;
} else {
self.recovered = true;
}
}
}
if self.pending.len() > DELETION_LIST_TARGET_SIZE || !self.pending_flushes.is_empty() {
self.flush().await;
}
}
info!("Deletion queue shut down.");
}
}

View File

@@ -1,416 +0,0 @@
//! The validator is responsible for validating DeletionLists for execution,
//! based on whethe the generation in the DeletionList is still the latest
//! generation for a tenant.
//!
//! The purpose of validation is to ensure split-brain safety in the cluster
//! of pageservers: a deletion may only be executed if the tenant generation
//! that originated it is still current. See docs/rfcs/025-generation-numbers.md
//! The purpose of accumulating lists before validating them is to reduce load
//! on the control plane API by issuing fewer, larger requests.
//!
//! In addition to validating DeletionLists, the validator validates updates to remote_consistent_lsn
//! for timelines: these are logically deletions because the safekeepers use remote_consistent_lsn
//! to decide when old
//!
//! Deletions are passed onward to the Deleter.
use std::collections::HashMap;
use std::sync::Arc;
use std::time::Duration;
use camino::Utf8PathBuf;
use tokio_util::sync::CancellationToken;
use tracing::debug;
use tracing::info;
use tracing::warn;
use crate::config::PageServerConf;
use crate::control_plane_client::ControlPlaneGenerationsApi;
use crate::control_plane_client::RetryForeverError;
use crate::metrics;
use super::deleter::DeleterMessage;
use super::DeletionHeader;
use super::DeletionList;
use super::DeletionQueueError;
use super::FlushOp;
use super::VisibleLsnUpdates;
// After this length of time, do any validation work that is pending,
// even if we haven't accumulated many keys to delete.
//
// This also causes updates to remote_consistent_lsn to be validated, even
// if there were no deletions enqueued.
const AUTOFLUSH_INTERVAL: Duration = Duration::from_secs(10);
// If we have received this number of keys, proceed with attempting to execute
const AUTOFLUSH_KEY_COUNT: usize = 16384;
#[derive(Debug)]
pub(super) enum ValidatorQueueMessage {
Delete(DeletionList),
Flush(FlushOp),
}
pub(super) struct Validator<C>
where
C: ControlPlaneGenerationsApi,
{
conf: &'static PageServerConf,
rx: tokio::sync::mpsc::Receiver<ValidatorQueueMessage>,
tx: tokio::sync::mpsc::Sender<DeleterMessage>,
// Client for calling into control plane API for validation of deletes
control_plane_client: Option<C>,
// DeletionLists which are waiting generation validation. Not safe to
// execute until [`validate`] has processed them.
pending_lists: Vec<DeletionList>,
// DeletionLists which have passed validation and are ready to execute.
validated_lists: Vec<DeletionList>,
// Sum of all the lengths of lists in pending_lists
pending_key_count: usize,
// Lsn validation state: we read projected LSNs and write back visible LSNs
// after validation. This is the LSN equivalent of `pending_validation_lists`:
// it is drained in [`validate`]
lsn_table: Arc<std::sync::RwLock<VisibleLsnUpdates>>,
// If we failed to rewrite a deletion list due to local filesystem I/O failure,
// we must remember that and refuse to advance our persistent validated sequence
// number past the failure.
list_write_failed: Option<u64>,
cancel: CancellationToken,
}
impl<C> Validator<C>
where
C: ControlPlaneGenerationsApi,
{
pub(super) fn new(
conf: &'static PageServerConf,
rx: tokio::sync::mpsc::Receiver<ValidatorQueueMessage>,
tx: tokio::sync::mpsc::Sender<DeleterMessage>,
control_plane_client: Option<C>,
lsn_table: Arc<std::sync::RwLock<VisibleLsnUpdates>>,
cancel: CancellationToken,
) -> Self {
Self {
conf,
rx,
tx,
control_plane_client,
lsn_table,
pending_lists: Vec::new(),
validated_lists: Vec::new(),
pending_key_count: 0,
list_write_failed: None,
cancel,
}
}
/// Process any outstanding validations of generations of pending LSN updates or pending
/// DeletionLists.
///
/// Valid LSN updates propagate back to Timelines immediately, valid DeletionLists
/// go into the queue of ready-to-execute lists.
async fn validate(&mut self) -> Result<(), DeletionQueueError> {
let mut tenant_generations = HashMap::new();
for list in &self.pending_lists {
for (tenant_id, tenant_list) in &list.tenants {
// Note: DeletionLists are in logical time order, so generation always
// goes up. By doing a simple insert() we will always end up with
// the latest generation seen for a tenant.
tenant_generations.insert(*tenant_id, tenant_list.generation);
}
}
let pending_lsn_updates = {
let mut lsn_table = self.lsn_table.write().expect("Lock should not be poisoned");
std::mem::take(&mut *lsn_table)
};
for (tenant_id, update) in &pending_lsn_updates.tenants {
let entry = tenant_generations
.entry(*tenant_id)
.or_insert(update.generation);
if update.generation > *entry {
*entry = update.generation;
}
}
if tenant_generations.is_empty() {
// No work to do
return Ok(());
}
let tenants_valid = if let Some(control_plane_client) = &self.control_plane_client {
match control_plane_client
.validate(tenant_generations.iter().map(|(k, v)| (*k, *v)).collect())
.await
{
Ok(tenants) => tenants,
Err(RetryForeverError::ShuttingDown) => {
// The only way a validation call returns an error is when the cancellation token fires
return Err(DeletionQueueError::ShuttingDown);
}
}
} else {
// Control plane API disabled. In legacy mode we consider everything valid.
tenant_generations.keys().map(|k| (*k, true)).collect()
};
let mut validated_sequence: Option<u64> = None;
// Apply the validation results to the pending LSN updates
for (tenant_id, tenant_lsn_state) in pending_lsn_updates.tenants {
let validated_generation = tenant_generations
.get(&tenant_id)
.expect("Map was built from the same keys we're reading");
let valid = tenants_valid
.get(&tenant_id)
.copied()
// If the tenant was missing from the validation response, it has been deleted.
// The Timeline that requested the LSN update is probably already torn down,
// or will be torn down soon. In this case, drop the update by setting valid=false.
.unwrap_or(false);
if valid && *validated_generation == tenant_lsn_state.generation {
for (_timeline_id, pending_lsn) in tenant_lsn_state.timelines {
pending_lsn.result_slot.store(pending_lsn.projected);
}
} else {
// If we failed validation, then do not apply any of the projected updates
warn!("Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}", tenant_lsn_state.generation);
metrics::DELETION_QUEUE.dropped_lsn_updates.inc();
}
}
// Apply the validation results to the pending deletion lists
for list in &mut self.pending_lists {
// Filter the list based on whether the server responded valid: true.
// If a tenant is omitted in the response, it has been deleted, and we should
// proceed with deletion.
let mut mutated = false;
list.tenants.retain(|tenant_id, tenant| {
let validated_generation = tenant_generations
.get(tenant_id)
.expect("Map was built from the same keys we're reading");
// If the tenant was missing from the validation response, it has been deleted.
// This means that a deletion is valid, but also redundant since the tenant's
// objects should have already been deleted. Treat it as invalid to drop the
// redundant deletion.
let valid = tenants_valid.get(tenant_id).copied().unwrap_or(false);
// A list is valid if it comes from the current _or previous_ generation.
// - The previous generation case is permitted due to how we store deletion lists locally:
// if we see the immediately previous generation in a locally stored deletion list,
// it proves that this node's disk was used for both current & previous generations,
// and therefore no other node was involved in between: the two generations may be
// logically treated as the same.
// - In that previous generation case, we rewrote it to the current generation
// in recover(), so the comparison here is simply an equality.
let this_list_valid = valid
&& (tenant.generation == *validated_generation);
if !this_list_valid {
warn!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation);
metrics::DELETION_QUEUE.keys_dropped.inc_by(tenant.len() as u64);
mutated = true;
} else {
metrics::DELETION_QUEUE.keys_validated.inc_by(tenant.len() as u64);
}
this_list_valid
});
list.validated = true;
if mutated {
// Save the deletion list if we had to make changes due to stale generations. The
// saved list is valid for execution.
if let Err(e) = list.save(self.conf).await {
// Highly unexpected. Could happen if e.g. disk full.
// If we didn't save the trimmed list, it is _not_ valid to execute.
warn!("Failed to save modified deletion list {list}: {e:#}");
metrics::DELETION_QUEUE.unexpected_errors.inc();
// Rather than have a complex retry process, just drop it and leak the objects,
// scrubber will clean up eventually.
list.tenants.clear(); // Result is a valid-but-empty list, which is a no-op for execution.
// We must remember this failure, to prevent later writing out a header that
// would imply the unwritable list was valid on disk.
if self.list_write_failed.is_none() {
self.list_write_failed = Some(list.sequence);
}
}
}
validated_sequence = Some(list.sequence);
}
if let Some(validated_sequence) = validated_sequence {
if let Some(list_write_failed) = self.list_write_failed {
// Rare error case: we failed to write out a deletion list to excise invalid
// entries, so we cannot advance the header's valid sequence number past that point.
//
// In this state we will continue to validate, execute and delete deletion lists,
// we just cannot update the header. It should be noticed and fixed by a human due to
// the nonzero value of our unexpected_errors metric.
warn!(
sequence_number = list_write_failed,
"Cannot write header because writing a deletion list failed earlier",
);
} else {
// Write the queue header to record how far validation progressed. This avoids having
// to rewrite each DeletionList to set validated=true in it.
let header = DeletionHeader::new(validated_sequence);
// Drop result because the validated_sequence is an optimization. If we fail to save it,
// then restart, we will drop some deletion lists, creating work for scrubber.
// The save() function logs a warning on error.
if let Err(e) = header.save(self.conf).await {
warn!("Failed to write deletion queue header: {e:#}");
metrics::DELETION_QUEUE.unexpected_errors.inc();
}
}
}
// Transfer the validated lists to the validated queue, for eventual execution
self.validated_lists.append(&mut self.pending_lists);
Ok(())
}
async fn cleanup_lists(&mut self, list_paths: Vec<Utf8PathBuf>) {
for list_path in list_paths {
debug!("Removing deletion list {list_path}");
if let Err(e) = tokio::fs::remove_file(&list_path).await {
// Unexpected: we should have permissions and nothing else should
// be touching these files. We will leave the file behind. Subsequent
// pageservers will try and load it again: hopefully whatever storage
// issue (probably permissions) has been fixed by then.
tracing::error!("Failed to delete {list_path}: {e:#}");
metrics::DELETION_QUEUE.unexpected_errors.inc();
break;
}
}
}
async fn flush(&mut self) -> Result<(), DeletionQueueError> {
tracing::debug!("Flushing with {} pending lists", self.pending_lists.len());
// Issue any required generation validation calls to the control plane
self.validate().await?;
// After successful validation, nothing is pending: any lists that
// made it through validation will be in validated_lists.
assert!(self.pending_lists.is_empty());
self.pending_key_count = 0;
tracing::debug!(
"Validation complete, have {} validated lists",
self.validated_lists.len()
);
// Return quickly if we have no validated lists to execute. This avoids flushing the
// executor when an idle backend hits its autoflush interval
if self.validated_lists.is_empty() {
return Ok(());
}
// Drain `validated_lists` into the executor
let mut executing_lists = Vec::new();
for list in self.validated_lists.drain(..) {
let list_path = self.conf.deletion_list_path(list.sequence);
let objects = list.into_remote_paths();
self.tx
.send(DeleterMessage::Delete(objects))
.await
.map_err(|_| DeletionQueueError::ShuttingDown)?;
executing_lists.push(list_path);
}
self.flush_executor().await?;
// Erase the deletion lists whose keys have all be deleted from remote storage
self.cleanup_lists(executing_lists).await;
Ok(())
}
async fn flush_executor(&mut self) -> Result<(), DeletionQueueError> {
// Flush the executor, so that all the keys referenced by these deletion lists
// are actually removed from remote storage. This is a precondition to deleting
// the deletion lists themselves.
let (flush_op, rx) = FlushOp::new();
self.tx
.send(DeleterMessage::Flush(flush_op))
.await
.map_err(|_| DeletionQueueError::ShuttingDown)?;
rx.await.map_err(|_| DeletionQueueError::ShuttingDown)
}
pub(super) async fn background(&mut self) {
tracing::info!("Started deletion backend worker");
while !self.cancel.is_cancelled() {
let msg = match tokio::time::timeout(AUTOFLUSH_INTERVAL, self.rx.recv()).await {
Ok(Some(m)) => m,
Ok(None) => {
// All queue senders closed
info!("Shutting down");
break;
}
Err(_) => {
// Timeout, we hit deadline to execute whatever we have in hand. These functions will
// return immediately if no work is pending.
match self.flush().await {
Ok(()) => {}
Err(DeletionQueueError::ShuttingDown) => {
// If we are shutting down, then auto-flush can safely be skipped
}
}
continue;
}
};
match msg {
ValidatorQueueMessage::Delete(list) => {
if list.validated {
// A pre-validated list may only be seen during recovery, if we are recovering
// a DeletionList whose on-disk state has validated=true
self.validated_lists.push(list)
} else {
self.pending_key_count += list.len();
self.pending_lists.push(list);
}
if self.pending_key_count > AUTOFLUSH_KEY_COUNT {
match self.flush().await {
Ok(()) => {}
Err(DeletionQueueError::ShuttingDown) => {
// If we are shutting down, then auto-flush can safely be skipped
}
}
}
}
ValidatorQueueMessage::Flush(op) => {
match self.flush().await {
Ok(()) => {
op.notify();
}
Err(DeletionQueueError::ShuttingDown) => {
// If we fail due to shutting down, we will just drop `op` to propagate that status.
}
}
}
}
}
}
}

View File

@@ -43,12 +43,12 @@
use std::{
collections::HashMap,
path::Path,
sync::Arc,
time::{Duration, SystemTime},
};
use anyhow::Context;
use camino::Utf8Path;
use remote_storage::GenericRemoteStorage;
use serde::{Deserialize, Serialize};
use tokio::time::Instant;
@@ -122,7 +122,7 @@ async fn disk_usage_eviction_task(
state: &State,
task_config: &DiskUsageEvictionTaskConfig,
storage: GenericRemoteStorage,
tenants_dir: &Utf8Path,
tenants_dir: &Path,
cancel: CancellationToken,
) {
scopeguard::defer! {
@@ -184,7 +184,7 @@ async fn disk_usage_eviction_task_iteration(
state: &State,
task_config: &DiskUsageEvictionTaskConfig,
storage: &GenericRemoteStorage,
tenants_dir: &Utf8Path,
tenants_dir: &Path,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
let usage_pre = filesystem_level_usage::get(tenants_dir, task_config)
@@ -620,8 +620,9 @@ impl std::ops::Deref for TimelineKey {
}
mod filesystem_level_usage {
use std::path::Path;
use anyhow::Context;
use camino::Utf8Path;
use crate::statvfs::Statvfs;
@@ -663,7 +664,7 @@ mod filesystem_level_usage {
}
pub fn get<'a>(
tenants_dir: &Utf8Path,
tenants_dir: &Path,
config: &'a DiskUsageEvictionTaskConfig,
) -> anyhow::Result<Usage<'a>> {
let mock_config = {

View File

@@ -93,16 +93,9 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
delete:
description: |
Attempts to delete specified tenant. 500, 503 and 409 errors should be retried until 404 is retrieved.
Attempts to delete specified tenant. 500 and 409 errors should be retried until 404 is retrieved.
404 means that deletion successfully finished"
responses:
"400":
@@ -141,13 +134,6 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_id}/timeline:
parameters:
@@ -192,13 +178,6 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_id}/timeline/{timeline_id}:
parameters:
@@ -247,13 +226,6 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
delete:
description: "Attempts to delete specified timeline. 500 and 409 errors should be retried"
responses:
@@ -293,19 +265,13 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/PreconditionFailedError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp:
parameters:
@@ -362,13 +328,6 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc:
parameters:
- name: tenant_id
@@ -416,13 +375,6 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_id}/attach:
parameters:
- name: tenant_id
@@ -513,13 +465,6 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_id}/detach:
parameters:
@@ -573,13 +518,6 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_id}/ignore:
parameters:
@@ -622,13 +560,6 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_id}/load:
parameters:
@@ -673,13 +604,6 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_id}/synthetic_size:
parameters:
@@ -717,12 +641,6 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_id}/size:
parameters:
@@ -786,13 +704,6 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_id}/timeline/:
parameters:
@@ -869,13 +780,6 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/:
get:
description: Get tenants list
@@ -906,13 +810,6 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
post:
description: |
Create a tenant. Returns new tenant id on success.
@@ -963,13 +860,6 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/config:
put:
@@ -1015,13 +905,6 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_id}/config/:
parameters:
- name: tenant_id
@@ -1071,13 +954,6 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
components:
securitySchemes:
JWT:
@@ -1217,9 +1093,6 @@ components:
remote_consistent_lsn:
type: string
format: hex
remote_consistent_lsn_visible:
type: string
format: hex
ancestor_timeline_id:
type: string
format: hex
@@ -1344,13 +1217,6 @@ components:
properties:
msg:
type: string
ServiceUnavailableError:
type: object
required:
- msg
properties:
msg:
type: string
NotFoundError:
type: object
required:

View File

@@ -5,14 +5,11 @@ use std::collections::HashMap;
use std::sync::Arc;
use anyhow::{anyhow, Context, Result};
use futures::TryFutureExt;
use hyper::header::CONTENT_TYPE;
use hyper::StatusCode;
use hyper::{Body, Request, Response, Uri};
use metrics::launch_timestamp::LaunchTimestamp;
use pageserver_api::models::{
DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest,
TenantLoadRequest, TenantLocationConfigRequest,
DownloadRemoteLayersTaskSpawnRequest, TenantAttachRequest, TenantLoadRequest,
};
use remote_storage::GenericRemoteStorage;
use tenant_size_model::{SizeResult, StorageModel};
@@ -27,18 +24,17 @@ use super::models::{
TimelineCreateRequest, TimelineGcRequest, TimelineInfo,
};
use crate::context::{DownloadBehavior, RequestContext};
use crate::deletion_queue::DeletionQueueClient;
use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL};
use crate::pgdatadir_mapping::LsnForTimestamp;
use crate::task_mgr::TaskKind;
use crate::tenant::config::{LocationConf, TenantConfOpt};
use crate::tenant::config::TenantConfOpt;
use crate::tenant::mgr::{
GetTenantError, SetNewTenantConfigError, TenantMapInsertError, TenantStateError,
};
use crate::tenant::size::ModelInputs;
use crate::tenant::storage_layer::LayerAccessStatsReset;
use crate::tenant::timeline::Timeline;
use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, TenantSharedResources};
use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError};
use crate::{config::PageServerConf, tenant::mgr};
use crate::{disk_usage_eviction_task, tenant};
use utils::{
@@ -65,7 +61,6 @@ pub struct State {
remote_storage: Option<GenericRemoteStorage>,
broker_client: storage_broker::BrokerClientChannel,
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
deletion_queue_client: DeletionQueueClient,
}
impl State {
@@ -75,7 +70,6 @@ impl State {
remote_storage: Option<GenericRemoteStorage>,
broker_client: storage_broker::BrokerClientChannel,
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
deletion_queue_client: DeletionQueueClient,
) -> anyhow::Result<Self> {
let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml"]
.iter()
@@ -88,17 +82,8 @@ impl State {
remote_storage,
broker_client,
disk_usage_eviction_state,
deletion_queue_client,
})
}
fn tenant_resources(&self) -> TenantSharedResources {
TenantSharedResources {
broker_client: self.broker_client.clone(),
remote_storage: self.remote_storage.clone(),
deletion_queue_client: self.deletion_queue_client.clone(),
}
}
}
#[inline(always)]
@@ -134,7 +119,7 @@ impl From<PageReconstructError> for ApiError {
ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
}
PageReconstructError::AncestorStopping(_) => {
ApiError::ResourceUnavailable(format!("{pre}").into())
ApiError::InternalServerError(anyhow::Error::new(pre))
}
PageReconstructError::WalRedo(pre) => {
ApiError::InternalServerError(anyhow::Error::new(pre))
@@ -147,15 +132,12 @@ impl From<TenantMapInsertError> for ApiError {
fn from(tmie: TenantMapInsertError) -> ApiError {
match tmie {
TenantMapInsertError::StillInitializing | TenantMapInsertError::ShuttingDown => {
ApiError::ResourceUnavailable(format!("{tmie}").into())
ApiError::InternalServerError(anyhow::Error::new(tmie))
}
TenantMapInsertError::TenantAlreadyExists(id, state) => {
ApiError::Conflict(format!("tenant {id} already exists, state: {state:?}"))
}
TenantMapInsertError::TenantExistsSecondary(id) => {
ApiError::Conflict(format!("tenant {id} already exists as secondary"))
}
TenantMapInsertError::Other(e) => ApiError::InternalServerError(e),
TenantMapInsertError::Closure(e) => ApiError::InternalServerError(e),
}
}
}
@@ -164,12 +146,6 @@ impl From<TenantStateError> for ApiError {
fn from(tse: TenantStateError) -> ApiError {
match tse {
TenantStateError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
TenantStateError::NotActive(_) => {
ApiError::ResourceUnavailable("Tenant not yet active".into())
}
TenantStateError::IsStopping(_) => {
ApiError::ResourceUnavailable("Tenant is stopping".into())
}
_ => ApiError::InternalServerError(anyhow::Error::new(tse)),
}
}
@@ -179,17 +155,14 @@ impl From<GetTenantError> for ApiError {
fn from(tse: GetTenantError) -> ApiError {
match tse {
GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
GetTenantError::Broken(reason) => {
ApiError::InternalServerError(anyhow!("tenant is broken: {}", reason))
}
GetTenantError::NotActive(_) => {
e @ GetTenantError::NotActive(_) => {
// Why is this not `ApiError::NotFound`?
// Because we must be careful to never return 404 for a tenant if it does
// in fact exist locally. If we did, the caller could draw the conclusion
// that it can attach the tenant to another PS and we'd be in split-brain.
//
// (We can produce this variant only in `mgr::get_tenant(..., active=true)` calls).
ApiError::ResourceUnavailable("Tenant not yet active".into())
ApiError::InternalServerError(anyhow::Error::new(e))
}
}
}
@@ -310,12 +283,7 @@ async fn build_timeline_info_common(
};
let current_physical_size = Some(timeline.layer_size_sum().await);
let state = timeline.current_state();
let remote_consistent_lsn_projected = timeline
.get_remote_consistent_lsn_projected()
.unwrap_or(Lsn(0));
let remote_consistent_lsn_visible = timeline
.get_remote_consistent_lsn_visible()
.unwrap_or(Lsn(0));
let remote_consistent_lsn = timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0));
let walreceiver_status = timeline.walreceiver_status();
@@ -325,8 +293,7 @@ async fn build_timeline_info_common(
ancestor_timeline_id,
ancestor_lsn,
disk_consistent_lsn: timeline.get_disk_consistent_lsn(),
remote_consistent_lsn: remote_consistent_lsn_projected,
remote_consistent_lsn_visible,
remote_consistent_lsn,
last_record_lsn,
prev_record_lsn: Some(timeline.get_prev_record_lsn()),
latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(),
@@ -396,9 +363,6 @@ async fn timeline_create_handler(
format!("{err:#}")
))
}
Err(e @ tenant::CreateTimelineError::AncestorNotActive) => {
json_response(StatusCode::SERVICE_UNAVAILABLE, HttpErrorBody::from_msg(e.to_string()))
}
Err(tenant::CreateTimelineError::Other(err)) => Err(ApiError::InternalServerError(err)),
}
}
@@ -528,23 +492,24 @@ async fn tenant_attach_handler(
let generation = get_request_generation(state, maybe_body.as_ref().and_then(|r| r.generation))?;
if state.remote_storage.is_none() {
if let Some(remote_storage) = &state.remote_storage {
mgr::attach_tenant(
state.conf,
tenant_id,
generation,
tenant_conf,
state.broker_client.clone(),
remote_storage.clone(),
&ctx,
)
.instrument(info_span!("tenant_attach", %tenant_id))
.await?;
} else {
return Err(ApiError::BadRequest(anyhow!(
"attach_tenant is not possible because pageserver was configured without remote storage"
)));
}
mgr::attach_tenant(
state.conf,
tenant_id,
generation,
tenant_conf,
state.tenant_resources(),
&ctx,
)
.instrument(info_span!("tenant_attach", %tenant_id))
.await?;
json_response(StatusCode::ACCEPTED, ())
}
@@ -605,7 +570,6 @@ async fn tenant_load_handler(
generation,
state.broker_client.clone(),
state.remote_storage.clone(),
state.deletion_queue_client.clone(),
&ctx,
)
.instrument(info_span!("load", %tenant_id))
@@ -639,9 +603,8 @@ async fn tenant_list_handler(
let response_data = mgr::list_tenants()
.instrument(info_span!("tenant_list"))
.await
.map_err(|_| {
ApiError::ResourceUnavailable("Tenant map is initializing or shutting down".into())
})?
.map_err(anyhow::Error::new)
.map_err(ApiError::InternalServerError)?
.iter()
.map(|(id, state)| TenantInfo {
id: *id,
@@ -948,7 +911,8 @@ async fn tenant_create_handler(
tenant_conf,
target_tenant_id,
generation,
state.tenant_resources(),
state.broker_client.clone(),
state.remote_storage.clone(),
&ctx,
)
.instrument(info_span!("tenant_create", tenant_id = %target_tenant_id))
@@ -1019,48 +983,6 @@ async fn update_tenant_config_handler(
json_response(StatusCode::OK, ())
}
async fn put_tenant_location_config_handler(
mut request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let request_data: TenantLocationConfigRequest = json_request(&mut request).await?;
let tenant_id = request_data.tenant_id;
check_permission(&request, Some(tenant_id))?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
let state = get_state(&request);
let conf = state.conf;
// The `Detached` state is special, it doesn't upsert a tenant, it removes
// its local disk content and drops it from memory.
if let LocationConfigMode::Detached = request_data.config.mode {
mgr::detach_tenant(conf, tenant_id, true)
.instrument(info_span!("tenant_detach", %tenant_id))
.await?;
return json_response(StatusCode::OK, ());
}
let location_conf =
LocationConf::try_from(&request_data.config).map_err(ApiError::BadRequest)?;
mgr::upsert_location(
state.conf,
tenant_id,
location_conf,
state.broker_client.clone(),
state.remote_storage.clone(),
state.deletion_queue_client.clone(),
&ctx,
)
.await
// TODO: badrequest assumes the caller was asking for something unreasonable, but in
// principle we might have hit something like concurrent API calls to the same tenant,
// which is not a 400 but a 409.
.map_err(ApiError::BadRequest)?;
json_response(StatusCode::OK, ())
}
/// Testing helper to transition a tenant to [`crate::tenant::TenantState::Broken`].
async fn handle_tenant_break(
r: Request<Body>,
@@ -1207,169 +1129,6 @@ async fn timeline_download_remote_layers_handler_get(
json_response(StatusCode::OK, info)
}
async fn deletion_queue_flush(
r: Request<Body>,
cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let state = get_state(&r);
if state.remote_storage.is_none() {
// Nothing to do if remote storage is disabled.
return json_response(StatusCode::OK, ());
}
let execute = parse_query_param(&r, "execute")?.unwrap_or(false);
let flush = async {
if execute {
state.deletion_queue_client.flush_execute().await
} else {
state.deletion_queue_client.flush().await
}
}
// DeletionQueueError's only case is shutting down.
.map_err(|_| ApiError::ShuttingDown);
tokio::select! {
res = flush => {
res.map(|()| json_response(StatusCode::OK, ()))?
}
_ = cancel.cancelled() => {
Err(ApiError::ShuttingDown)
}
}
}
/// Try if `GetPage@Lsn` is successful, useful for manual debugging.
async fn getpage_at_lsn_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_id))?;
struct Key(crate::repository::Key);
impl std::str::FromStr for Key {
type Err = anyhow::Error;
fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
crate::repository::Key::from_hex(s).map(Key)
}
}
let key: Key = parse_query_param(&request, "key")?
.ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'key' query parameter")))?;
let lsn: Lsn = parse_query_param(&request, "lsn")?
.ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'lsn' query parameter")))?;
async {
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
let page = timeline.get(key.0, lsn, &ctx).await?;
Result::<_, ApiError>::Ok(
Response::builder()
.status(StatusCode::OK)
.header(CONTENT_TYPE, "application/octet-stream")
.body(hyper::Body::from(page))
.unwrap(),
)
}
.instrument(info_span!("timeline_get", %tenant_id, %timeline_id))
.await
}
async fn timeline_collect_keyspace(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_id))?;
struct Partitioning {
keys: crate::keyspace::KeySpace,
at_lsn: Lsn,
}
impl serde::Serialize for Partitioning {
fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
use serde::ser::SerializeMap;
let mut map = serializer.serialize_map(Some(2))?;
map.serialize_key("keys")?;
map.serialize_value(&KeySpace(&self.keys))?;
map.serialize_key("at_lsn")?;
map.serialize_value(&WithDisplay(&self.at_lsn))?;
map.end()
}
}
struct WithDisplay<'a, T>(&'a T);
impl<'a, T: std::fmt::Display> serde::Serialize for WithDisplay<'a, T> {
fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
serializer.collect_str(&self.0)
}
}
struct KeySpace<'a>(&'a crate::keyspace::KeySpace);
impl<'a> serde::Serialize for KeySpace<'a> {
fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
use serde::ser::SerializeSeq;
let mut seq = serializer.serialize_seq(Some(self.0.ranges.len()))?;
for kr in &self.0.ranges {
seq.serialize_element(&KeyRange(kr))?;
}
seq.end()
}
}
struct KeyRange<'a>(&'a std::ops::Range<crate::repository::Key>);
impl<'a> serde::Serialize for KeyRange<'a> {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
use serde::ser::SerializeTuple;
let mut t = serializer.serialize_tuple(2)?;
t.serialize_element(&WithDisplay(&self.0.start))?;
t.serialize_element(&WithDisplay(&self.0.end))?;
t.end()
}
}
let at_lsn: Option<Lsn> = parse_query_param(&request, "at_lsn")?;
async {
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
let at_lsn = at_lsn.unwrap_or_else(|| timeline.get_last_record_lsn());
let keys = timeline
.collect_keyspace(at_lsn, &ctx)
.await
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, Partitioning { keys, at_lsn })
}
.instrument(info_span!("timeline_collect_keyspace", %tenant_id, %timeline_id))
.await
}
async fn active_timeline_of_active_tenant(
tenant_id: TenantId,
timeline_id: TimelineId,
@@ -1644,9 +1403,6 @@ pub fn make_router(
.get("/v1/tenant/:tenant_id/config", |r| {
api_handler(r, get_tenant_config_handler)
})
.put("/v1/tenant/:tenant_id/location_config", |r| {
api_handler(r, put_tenant_location_config_handler)
})
.get("/v1/tenant/:tenant_id/timeline", |r| {
api_handler(r, timeline_list_handler)
})
@@ -1707,9 +1463,6 @@ pub fn make_router(
.put("/v1/disk_usage_eviction/run", |r| {
api_handler(r, disk_usage_eviction_run)
})
.put("/v1/deletion_queue/flush", |r| {
api_handler(r, deletion_queue_flush)
})
.put("/v1/tenant/:tenant_id/break", |r| {
testing_api_handler("set tenant state to broken", r, handle_tenant_break)
})
@@ -1717,12 +1470,5 @@ pub fn make_router(
.post("/v1/tracing/event", |r| {
testing_api_handler("emit a tracing event", r, post_tracing_event_handler)
})
.get("/v1/tenant/:tenant_id/timeline/:timeline_id/getpage", |r| {
testing_api_handler("getpage@lsn", r, getpage_at_lsn_handler)
})
.get(
"/v1/tenant/:tenant_id/timeline/:timeline_id/keyspace",
|r| testing_api_handler("read out the keyspace", r, timeline_collect_keyspace),
)
.any(handler_404))
}

View File

@@ -6,7 +6,6 @@ use std::path::{Path, PathBuf};
use anyhow::{bail, ensure, Context, Result};
use bytes::Bytes;
use camino::Utf8Path;
use futures::StreamExt;
use tokio::io::{AsyncRead, AsyncReadExt};
use tokio_tar::Archive;
@@ -30,7 +29,7 @@ use postgres_ffi::{BLCKSZ, WAL_SEGMENT_SIZE};
use utils::lsn::Lsn;
// Returns checkpoint LSN from controlfile
pub fn get_lsn_from_controlfile(path: &Utf8Path) -> Result<Lsn> {
pub fn get_lsn_from_controlfile(path: &Path) -> Result<Lsn> {
// Read control file to extract the LSN
let controlfile_path = path.join("global").join("pg_control");
let controlfile = ControlFileData::decode(&std::fs::read(controlfile_path)?)?;
@@ -47,7 +46,7 @@ pub fn get_lsn_from_controlfile(path: &Utf8Path) -> Result<Lsn> {
/// cluster was not shut down cleanly.
pub async fn import_timeline_from_postgres_datadir(
tline: &Timeline,
pgdata_path: &Utf8Path,
pgdata_path: &Path,
pgdata_lsn: Lsn,
ctx: &RequestContext,
) -> Result<()> {
@@ -257,7 +256,7 @@ async fn import_slru(
/// Scan PostgreSQL WAL files in given directory and load all records between
/// 'startpoint' and 'endpoint' into the repository.
async fn import_wal(
walpath: &Utf8Path,
walpath: &Path,
tline: &Timeline,
startpoint: Lsn,
endpoint: Lsn,

View File

@@ -3,8 +3,7 @@ pub mod basebackup;
pub mod config;
pub mod consumption_metrics;
pub mod context;
pub mod control_plane_client;
pub mod deletion_queue;
mod control_plane_client;
pub mod disk_usage_eviction_task;
pub mod http;
pub mod import_datadir;
@@ -25,9 +24,9 @@ pub mod walredo;
pub mod failpoint_support;
use std::path::Path;
use crate::task_mgr::TaskKind;
use camino::Utf8Path;
use deletion_queue::DeletionQueue;
use tracing::info;
/// Current storage format version
@@ -49,8 +48,8 @@ static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);
pub use crate::metrics::preinitialize_metrics;
#[tracing::instrument(skip_all, fields(%exit_code))]
pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_code: i32) {
#[tracing::instrument]
pub async fn shutdown_pageserver(exit_code: i32) {
use std::time::Duration;
// Shut down the libpq endpoint task. This prevents new connections from
// being accepted.
@@ -78,11 +77,6 @@ pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_cod
)
.await;
// Best effort to persist any outstanding deletions, to avoid leaking objects
if let Some(mut deletion_queue) = deletion_queue {
deletion_queue.shutdown(Duration::from_secs(5)).await;
}
// Shut down the HTTP endpoint last, so that you can still check the server's
// status while it's shutting down.
// FIXME: We should probably stop accepting commands like attach/detach earlier.
@@ -112,10 +106,6 @@ pub const METADATA_FILE_NAME: &str = "metadata";
/// Full path: `tenants/<tenant_id>/config`.
pub const TENANT_CONFIG_NAME: &str = "config";
/// Per-tenant configuration file.
/// Full path: `tenants/<tenant_id>/config`.
pub const TENANT_LOCATION_CONFIG_NAME: &str = "config-v1";
/// A suffix used for various temporary files. Any temporary files found in the
/// data directory at pageserver startup can be automatically removed.
pub const TEMP_FILE_SUFFIX: &str = "___temp";
@@ -135,25 +125,25 @@ pub const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete";
/// Full path: `tenants/<tenant_id>/___ignored_tenant`.
pub const IGNORED_TENANT_FILE_NAME: &str = "___ignored_tenant";
pub fn is_temporary(path: &Utf8Path) -> bool {
pub fn is_temporary(path: &Path) -> bool {
match path.file_name() {
Some(name) => name.ends_with(TEMP_FILE_SUFFIX),
Some(name) => name.to_string_lossy().ends_with(TEMP_FILE_SUFFIX),
None => false,
}
}
fn ends_with_suffix(path: &Utf8Path, suffix: &str) -> bool {
fn ends_with_suffix(path: &Path, suffix: &str) -> bool {
match path.file_name() {
Some(name) => name.ends_with(suffix),
Some(name) => name.to_string_lossy().ends_with(suffix),
None => false,
}
}
pub fn is_uninit_mark(path: &Utf8Path) -> bool {
pub fn is_uninit_mark(path: &Path) -> bool {
ends_with_suffix(path, TIMELINE_UNINIT_MARK_SUFFIX)
}
pub fn is_delete_mark(path: &Utf8Path) -> bool {
pub fn is_delete_mark(path: &Path) -> bool {
ends_with_suffix(path, TIMELINE_DELETE_MARK_SUFFIX)
}

View File

@@ -94,35 +94,15 @@ pub(crate) static READ_NUM_FS_LAYERS: Lazy<Histogram> = Lazy::new(|| {
});
// Metrics collected on operations on the storage repository.
pub(crate) struct ReconstructTimeMetrics {
ok: Histogram,
err: Histogram,
}
pub(crate) static RECONSTRUCT_TIME: Lazy<ReconstructTimeMetrics> = Lazy::new(|| {
let inner = register_histogram_vec!(
pub(crate) static RECONSTRUCT_TIME: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_getpage_reconstruct_seconds",
"Time spent in reconstruct_value (reconstruct a page from deltas)",
&["result"],
CRITICAL_OP_BUCKETS.into(),
)
.expect("failed to define a metric");
ReconstructTimeMetrics {
ok: inner.get_metric_with_label_values(&["ok"]).unwrap(),
err: inner.get_metric_with_label_values(&["err"]).unwrap(),
}
.expect("failed to define a metric")
});
impl ReconstructTimeMetrics {
pub(crate) fn for_result<T, E>(&self, result: &Result<T, E>) -> &Histogram {
match result {
Ok(_) => &self.ok,
Err(_) => &self.err,
}
}
}
pub(crate) static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_materialized_cache_hits_direct_total",
@@ -284,46 +264,6 @@ pub static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> = Lazy::new(|| PageCacheS
},
});
pub(crate) static PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_page_cache_acquire_pinned_slot_seconds",
"Time spent acquiring a pinned slot in the page cache",
CRITICAL_OP_BUCKETS.into(),
)
.expect("failed to define a metric")
});
pub(crate) static PAGE_CACHE_FIND_VICTIMS_ITERS_TOTAL: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_page_cache_find_victim_iters_total",
"Counter for the number of iterations in the find_victim loop",
)
.expect("failed to define a metric")
});
static PAGE_CACHE_ERRORS: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"page_cache_errors_total",
"Number of timeouts while acquiring a pinned slot in the page cache",
&["error_kind"]
)
.expect("failed to define a metric")
});
#[derive(IntoStaticStr)]
#[strum(serialize_all = "kebab_case")]
pub(crate) enum PageCacheErrorKind {
AcquirePinnedSlotTimeout,
EvictIterLimit,
}
pub(crate) fn page_cache_errors_inc(error_kind: PageCacheErrorKind) {
PAGE_CACHE_ERRORS
.get_metric_with_label_values(&[error_kind.into()])
.unwrap()
.inc();
}
pub(crate) static WAIT_LSN_TIME: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_wait_lsn_seconds",
@@ -351,14 +291,6 @@ static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub(crate) static RESIDENT_PHYSICAL_SIZE_GLOBAL: Lazy<UIntGauge> = Lazy::new(|| {
register_uint_gauge!(
"pageserver_resident_physical_size_global",
"Like `pageserver_resident_physical_size`, but without tenant/timeline dimensions."
)
.expect("failed to define a metric")
});
static REMOTE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_remote_physical_size",
@@ -369,14 +301,6 @@ static REMOTE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
static REMOTE_PHYSICAL_SIZE_GLOBAL: Lazy<UIntGauge> = Lazy::new(|| {
register_uint_gauge!(
"pageserver_remote_physical_size_global",
"Like `pageserver_remote_physical_size`, but without tenant/timeline dimensions."
)
.expect("failed to define a metric")
});
pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_remote_ondemand_downloaded_layers_total",
@@ -691,9 +615,10 @@ impl StorageIoTime {
.expect("failed to define a metric");
let metrics = std::array::from_fn(|i| {
let op = StorageIoOperation::from_repr(i).unwrap();
storage_io_histogram_vec
let metric = storage_io_histogram_vec
.get_metric_with_label_values(&[op.as_str()])
.unwrap()
.unwrap();
metric
});
Self { metrics }
}
@@ -962,61 +887,6 @@ static REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER: Lazy<IntCounterVec> = Lazy
.expect("failed to define a metric")
});
pub(crate) struct DeletionQueueMetrics {
pub(crate) keys_submitted: IntCounter,
pub(crate) keys_dropped: IntCounter,
pub(crate) keys_executed: IntCounter,
pub(crate) keys_validated: IntCounter,
pub(crate) dropped_lsn_updates: IntCounter,
pub(crate) unexpected_errors: IntCounter,
pub(crate) remote_errors: IntCounterVec,
}
pub(crate) static DELETION_QUEUE: Lazy<DeletionQueueMetrics> = Lazy::new(|| {
DeletionQueueMetrics{
keys_submitted: register_int_counter!(
"pageserver_deletion_queue_submitted_total",
"Number of objects submitted for deletion"
)
.expect("failed to define a metric"),
keys_dropped: register_int_counter!(
"pageserver_deletion_queue_dropped_total",
"Number of object deletions dropped due to stale generation."
)
.expect("failed to define a metric"),
keys_executed: register_int_counter!(
"pageserver_deletion_queue_executed_total",
"Number of objects deleted. Only includes objects that we actually deleted, sum with pageserver_deletion_queue_dropped_total for the total number of keys processed to completion"
)
.expect("failed to define a metric"),
keys_validated: register_int_counter!(
"pageserver_deletion_queue_validated_total",
"Number of keys validated for deletion. Sum with pageserver_deletion_queue_dropped_total for the total number of keys that have passed through the validation stage."
)
.expect("failed to define a metric"),
dropped_lsn_updates: register_int_counter!(
"pageserver_deletion_queue_dropped_lsn_updates_total",
"Updates to remote_consistent_lsn dropped due to stale generation number."
)
.expect("failed to define a metric"),
unexpected_errors: register_int_counter!(
"pageserver_deletion_queue_unexpected_errors_total",
"Number of unexpected condiions that may stall the queue: any value above zero is unexpected."
)
.expect("failed to define a metric"),
remote_errors: register_int_counter_vec!(
"pageserver_deletion_queue_remote_errors_total",
"Retryable remote I/O errors while executing deletions, for example 503 responses to DeleteObjects",
&["op_kind"],
)
.expect("failed to define a metric")
}
});
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum RemoteOpKind {
Upload,
@@ -1291,7 +1161,7 @@ pub struct TimelineMetrics {
pub load_layer_map_histo: StorageTimeMetrics,
pub garbage_collect_histo: StorageTimeMetrics,
pub last_record_gauge: IntGauge,
resident_physical_size_gauge: UIntGauge,
pub resident_physical_size_gauge: UIntGauge,
/// copy of LayeredTimeline.current_logical_size
pub current_logical_size_gauge: UIntGauge,
pub num_persistent_files_created: IntCounter,
@@ -1369,29 +1239,10 @@ impl TimelineMetrics {
}
pub fn record_new_file_metrics(&self, sz: u64) {
self.resident_physical_size_add(sz);
self.resident_physical_size_gauge.add(sz);
self.num_persistent_files_created.inc_by(1);
self.persistent_bytes_written.inc_by(sz);
}
pub fn resident_physical_size_sub(&self, sz: u64) {
self.resident_physical_size_gauge.sub(sz);
crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(sz);
}
pub fn resident_physical_size_add(&self, sz: u64) {
self.resident_physical_size_gauge.add(sz);
crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.add(sz);
}
pub fn resident_physical_size_set(&self, sz: u64) {
self.resident_physical_size_gauge.set(sz);
crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.set(sz);
}
pub fn resident_physical_size_get(&self) -> u64 {
self.resident_physical_size_gauge.get()
}
}
impl Drop for TimelineMetrics {
@@ -1399,10 +1250,7 @@ impl Drop for TimelineMetrics {
let tenant_id = &self.tenant_id;
let timeline_id = &self.timeline_id;
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
{
RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
}
let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]);
let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]);
@@ -1456,43 +1304,10 @@ use std::time::{Duration, Instant};
use crate::context::{PageContentKind, RequestContext};
use crate::task_mgr::TaskKind;
/// Maintain a per timeline gauge in addition to the global gauge.
struct PerTimelineRemotePhysicalSizeGauge {
last_set: u64,
gauge: UIntGauge,
}
impl PerTimelineRemotePhysicalSizeGauge {
fn new(per_timeline_gauge: UIntGauge) -> Self {
Self {
last_set: per_timeline_gauge.get(),
gauge: per_timeline_gauge,
}
}
fn set(&mut self, sz: u64) {
self.gauge.set(sz);
if sz < self.last_set {
REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set - sz);
} else {
REMOTE_PHYSICAL_SIZE_GLOBAL.add(sz - self.last_set);
};
self.last_set = sz;
}
fn get(&self) -> u64 {
self.gauge.get()
}
}
impl Drop for PerTimelineRemotePhysicalSizeGauge {
fn drop(&mut self) {
REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set);
}
}
pub struct RemoteTimelineClientMetrics {
tenant_id: String,
timeline_id: String,
remote_physical_size_gauge: Mutex<Option<PerTimelineRemotePhysicalSizeGauge>>,
remote_physical_size_gauge: Mutex<Option<UIntGauge>>,
calls_unfinished_gauge: Mutex<HashMap<(&'static str, &'static str), IntGauge>>,
bytes_started_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
bytes_finished_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
@@ -1510,24 +1325,18 @@ impl RemoteTimelineClientMetrics {
}
}
pub(crate) fn remote_physical_size_set(&self, sz: u64) {
pub fn remote_physical_size_gauge(&self) -> UIntGauge {
let mut guard = self.remote_physical_size_gauge.lock().unwrap();
let gauge = guard.get_or_insert_with(|| {
PerTimelineRemotePhysicalSizeGauge::new(
guard
.get_or_insert_with(|| {
REMOTE_PHYSICAL_SIZE
.get_metric_with_label_values(&[
&self.tenant_id.to_string(),
&self.timeline_id.to_string(),
])
.unwrap(),
)
});
gauge.set(sz);
}
pub(crate) fn remote_physical_size_get(&self) -> u64 {
let guard = self.remote_physical_size_gauge.lock().unwrap();
guard.as_ref().map(|gauge| gauge.get()).unwrap_or(0)
.unwrap()
})
.clone()
}
pub fn remote_operation_time(
@@ -1866,9 +1675,6 @@ pub fn preinitialize_metrics() {
Lazy::force(c);
});
// Deletion queue stats
Lazy::force(&DELETION_QUEUE);
// countervecs
[&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
.into_iter()
@@ -1882,6 +1688,7 @@ pub fn preinitialize_metrics() {
// histograms
[
&READ_NUM_FS_LAYERS,
&RECONSTRUCT_TIME,
&WAIT_LSN_TIME,
&WAL_REDO_TIME,
&WAL_REDO_WAIT_TIME,
@@ -1892,7 +1699,4 @@ pub fn preinitialize_metrics() {
.for_each(|h| {
Lazy::force(h);
});
// Custom
Lazy::force(&RECONSTRUCT_TIME);
}

View File

@@ -66,7 +66,8 @@
//! inserted to the mapping, but you must hold the write-lock on the slot until
//! the contents are valid. If you need to release the lock without initializing
//! the contents, you must remove the mapping first. We make that easy for the
//! callers with PageWriteGuard: the caller must explicitly call guard.mark_valid() after it has
//! callers with PageWriteGuard: when lock_for_write() returns an uninitialized
//! page, the caller must explicitly call guard.mark_valid() after it has
//! initialized it. If the guard is dropped without calling mark_valid(), the
//! mapping is automatically removed and the slot is marked free.
//!
@@ -74,11 +75,7 @@
use std::{
collections::{hash_map::Entry, HashMap},
convert::TryInto,
sync::{
atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering},
Arc, Weak,
},
time::Duration,
sync::atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering},
};
use anyhow::Context;
@@ -168,8 +165,6 @@ struct Slot {
struct SlotInner {
key: Option<CacheKey>,
// for `coalesce_readers_permit`
permit: std::sync::Mutex<Weak<PinnedSlotsPermit>>,
buf: &'static mut [u8; PAGE_SZ],
}
@@ -212,22 +207,6 @@ impl Slot {
}
}
impl SlotInner {
/// If there is aready a reader, drop our permit and share its permit, just like we share read access.
fn coalesce_readers_permit(&self, permit: PinnedSlotsPermit) -> Arc<PinnedSlotsPermit> {
let mut guard = self.permit.lock().unwrap();
if let Some(existing_permit) = guard.upgrade() {
drop(guard);
drop(permit);
existing_permit
} else {
let permit = Arc::new(permit);
*guard = Arc::downgrade(&permit);
permit
}
}
}
pub struct PageCache {
/// This contains the mapping from the cache key to buffer slot that currently
/// contains the page, if any.
@@ -245,8 +224,6 @@ pub struct PageCache {
/// The actual buffers with their metadata.
slots: Box<[Slot]>,
pinned_slots: Arc<tokio::sync::Semaphore>,
/// Index of the next candidate to evict, for the Clock replacement algorithm.
/// This is interpreted modulo the page cache size.
next_evict_slot: AtomicUsize,
@@ -254,28 +231,23 @@ pub struct PageCache {
size_metrics: &'static PageCacheSizeMetrics,
}
struct PinnedSlotsPermit(tokio::sync::OwnedSemaphorePermit);
///
/// PageReadGuard is a "lease" on a buffer, for reading. The page is kept locked
/// until the guard is dropped.
///
pub struct PageReadGuard<'i> {
_permit: Arc<PinnedSlotsPermit>,
slot_guard: tokio::sync::RwLockReadGuard<'i, SlotInner>,
}
pub struct PageReadGuard<'i>(tokio::sync::RwLockReadGuard<'i, SlotInner>);
impl std::ops::Deref for PageReadGuard<'_> {
type Target = [u8; PAGE_SZ];
fn deref(&self) -> &Self::Target {
self.slot_guard.buf
self.0.buf
}
}
impl AsRef<[u8; PAGE_SZ]> for PageReadGuard<'_> {
fn as_ref(&self) -> &[u8; PAGE_SZ] {
self.slot_guard.buf
self.0.buf
}
}
@@ -285,25 +257,21 @@ impl AsRef<[u8; PAGE_SZ]> for PageReadGuard<'_> {
///
/// Counterintuitively, this is used even for a read, if the requested page is not
/// currently found in the page cache. In that case, the caller of lock_for_read()
/// is expected to fill in the page contents and call mark_valid().
/// is expected to fill in the page contents and call mark_valid(). Similarly
/// lock_for_write() can return an invalid buffer that the caller is expected to
/// to initialize.
///
pub struct PageWriteGuard<'i> {
state: PageWriteGuardState<'i>,
}
inner: tokio::sync::RwLockWriteGuard<'i, SlotInner>,
enum PageWriteGuardState<'i> {
Invalid {
inner: tokio::sync::RwLockWriteGuard<'i, SlotInner>,
_permit: PinnedSlotsPermit,
},
Downgraded,
// Are the page contents currently valid?
// Used to mark pages as invalid that are assigned but not yet filled with data.
valid: bool,
}
impl std::ops::DerefMut for PageWriteGuard<'_> {
fn deref_mut(&mut self) -> &mut Self::Target {
match &mut self.state {
PageWriteGuardState::Invalid { inner, _permit } => inner.buf,
PageWriteGuardState::Downgraded => unreachable!(),
}
self.inner.buf
}
}
@@ -311,37 +279,25 @@ impl std::ops::Deref for PageWriteGuard<'_> {
type Target = [u8; PAGE_SZ];
fn deref(&self) -> &Self::Target {
match &self.state {
PageWriteGuardState::Invalid { inner, _permit } => inner.buf,
PageWriteGuardState::Downgraded => unreachable!(),
}
self.inner.buf
}
}
impl AsMut<[u8; PAGE_SZ]> for PageWriteGuard<'_> {
fn as_mut(&mut self) -> &mut [u8; PAGE_SZ] {
match &mut self.state {
PageWriteGuardState::Invalid { inner, _permit } => inner.buf,
PageWriteGuardState::Downgraded => unreachable!(),
}
self.inner.buf
}
}
impl<'a> PageWriteGuard<'a> {
impl PageWriteGuard<'_> {
/// Mark that the buffer contents are now valid.
#[must_use]
pub fn mark_valid(mut self) -> PageReadGuard<'a> {
let prev = std::mem::replace(&mut self.state, PageWriteGuardState::Downgraded);
match prev {
PageWriteGuardState::Invalid { inner, _permit } => {
assert!(inner.key.is_some());
PageReadGuard {
_permit: Arc::new(_permit),
slot_guard: inner.downgrade(),
}
}
PageWriteGuardState::Downgraded => unreachable!(),
}
pub fn mark_valid(&mut self) {
assert!(self.inner.key.is_some());
assert!(
!self.valid,
"mark_valid called on a buffer that was already valid"
);
self.valid = true;
}
}
@@ -352,14 +308,11 @@ impl Drop for PageWriteGuard<'_> {
/// initializing it, remove the mapping from the page cache.
///
fn drop(&mut self) {
match &mut self.state {
PageWriteGuardState::Invalid { inner, _permit } => {
assert!(inner.key.is_some());
let self_key = inner.key.as_ref().unwrap();
PAGE_CACHE.get().unwrap().remove_mapping(self_key);
inner.key = None;
}
PageWriteGuardState::Downgraded => {}
assert!(self.inner.key.is_some());
if !self.valid {
let self_key = self.inner.key.as_ref().unwrap();
PAGE_CACHE.get().unwrap().remove_mapping(self_key);
self.inner.key = None;
}
}
}
@@ -370,6 +323,12 @@ pub enum ReadBufResult<'a> {
NotFound(PageWriteGuard<'a>),
}
/// lock_for_write() return value
pub enum WriteBufResult<'a> {
Found(PageWriteGuard<'a>),
NotFound(PageWriteGuard<'a>),
}
impl PageCache {
//
// Section 1.1: Public interface functions for looking up and memorizing materialized page
@@ -389,10 +348,6 @@ impl PageCache {
lsn: Lsn,
ctx: &RequestContext,
) -> Option<(Lsn, PageReadGuard)> {
let Ok(permit) = self.try_get_pinned_slot_permit().await else {
return None;
};
crate::metrics::PAGE_CACHE
.for_ctx(ctx)
.read_accesses_materialized_page
@@ -407,10 +362,7 @@ impl PageCache {
lsn,
};
if let Some(guard) = self
.try_lock_for_read(&mut cache_key, &mut Some(permit))
.await
{
if let Some(guard) = self.try_lock_for_read(&mut cache_key).await {
if let CacheKey::MaterializedPage {
hash_key: _,
lsn: available_lsn,
@@ -456,77 +408,20 @@ impl PageCache {
lsn,
};
let mut permit = Some(self.try_get_pinned_slot_permit().await?);
loop {
// First check if the key already exists in the cache.
if let Some(slot_idx) = self.search_mapping_exact(&cache_key) {
// The page was found in the mapping. Lock the slot, and re-check
// that it's still what we expected (because we don't released the mapping
// lock already, another thread could have evicted the page)
let slot = &self.slots[slot_idx];
let inner = slot.inner.write().await;
if inner.key.as_ref() == Some(&cache_key) {
slot.inc_usage_count();
debug_assert!(
{
let guard = inner.permit.lock().unwrap();
guard.upgrade().is_none()
},
"we hold a write lock, so, no one else should have a permit"
);
debug_assert_eq!(inner.buf.len(), img.len());
// We already had it in cache. Another thread must've put it there
// concurrently. Check that it had the same contents that we
// replayed.
assert!(inner.buf == img);
return Ok(());
}
match self.lock_for_write(&cache_key).await? {
WriteBufResult::Found(write_guard) => {
// We already had it in cache. Another thread must've put it there
// concurrently. Check that it had the same contents that we
// replayed.
assert!(*write_guard == img);
}
debug_assert!(permit.is_some());
// Not found. Find a victim buffer
let (slot_idx, mut inner) = self
.find_victim(permit.as_ref().unwrap())
.await
.context("Failed to find evict victim")?;
// Insert mapping for this. At this point, we may find that another
// thread did the same thing concurrently. In that case, we evicted
// our victim buffer unnecessarily. Put it into the free list and
// continue with the slot that the other thread chose.
if let Some(_existing_slot_idx) = self.try_insert_mapping(&cache_key, slot_idx) {
// TODO: put to free list
// We now just loop back to start from beginning. This is not
// optimal, we'll perform the lookup in the mapping again, which
// is not really necessary because we already got
// 'existing_slot_idx'. But this shouldn't happen often enough
// to matter much.
continue;
WriteBufResult::NotFound(mut write_guard) => {
write_guard.copy_from_slice(img);
write_guard.mark_valid();
}
// Make the slot ready
let slot = &self.slots[slot_idx];
inner.key = Some(cache_key.clone());
slot.set_usage_count(1);
// Create a write guard for the slot so we go through the expected motions.
debug_assert!(
{
let guard = inner.permit.lock().unwrap();
guard.upgrade().is_none()
},
"we hold a write lock, so, no one else should have a permit"
);
let mut write_guard = PageWriteGuard {
state: PageWriteGuardState::Invalid {
_permit: permit.take().unwrap(),
inner,
},
};
write_guard.copy_from_slice(img);
let _ = write_guard.mark_valid();
return Ok(());
}
Ok(())
}
// Section 1.2: Public interface functions for working with immutable file pages.
@@ -550,29 +445,6 @@ impl PageCache {
// "mappings" after this section. But the routines in this section should
// not require changes.
async fn try_get_pinned_slot_permit(&self) -> anyhow::Result<PinnedSlotsPermit> {
let timer = crate::metrics::PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME.start_timer();
match tokio::time::timeout(
// Choose small timeout, neon_smgr does its own retries.
// https://neondb.slack.com/archives/C04DGM6SMTM/p1694786876476869
Duration::from_secs(10),
Arc::clone(&self.pinned_slots).acquire_owned(),
)
.await
{
Ok(res) => Ok(PinnedSlotsPermit(
res.expect("this semaphore is never closed"),
)),
Err(_timeout) => {
timer.stop_and_discard();
crate::metrics::page_cache_errors_inc(
crate::metrics::PageCacheErrorKind::AcquirePinnedSlotTimeout,
);
anyhow::bail!("timeout: there were page guards alive for all page cache slots")
}
}
}
/// Look up a page in the cache.
///
/// If the search criteria is not exact, *cache_key is updated with the key
@@ -582,11 +454,7 @@ impl PageCache {
///
/// If no page is found, returns None and *cache_key is left unmodified.
///
async fn try_lock_for_read(
&self,
cache_key: &mut CacheKey,
permit: &mut Option<PinnedSlotsPermit>,
) -> Option<PageReadGuard> {
async fn try_lock_for_read(&self, cache_key: &mut CacheKey) -> Option<PageReadGuard> {
let cache_key_orig = cache_key.clone();
if let Some(slot_idx) = self.search_mapping(cache_key) {
// The page was found in the mapping. Lock the slot, and re-check
@@ -596,10 +464,7 @@ impl PageCache {
let inner = slot.inner.read().await;
if inner.key.as_ref() == Some(cache_key) {
slot.inc_usage_count();
return Some(PageReadGuard {
_permit: inner.coalesce_readers_permit(permit.take().unwrap()),
slot_guard: inner,
});
return Some(PageReadGuard(inner));
} else {
// search_mapping might have modified the search key; restore it.
*cache_key = cache_key_orig;
@@ -642,8 +507,6 @@ impl PageCache {
cache_key: &mut CacheKey,
ctx: &RequestContext,
) -> anyhow::Result<ReadBufResult> {
let mut permit = Some(self.try_get_pinned_slot_permit().await?);
let (read_access, hit) = match cache_key {
CacheKey::MaterializedPage { .. } => {
unreachable!("Materialized pages use lookup_materialized_page")
@@ -660,21 +523,17 @@ impl PageCache {
let mut is_first_iteration = true;
loop {
// First check if the key already exists in the cache.
if let Some(read_guard) = self.try_lock_for_read(cache_key, &mut permit).await {
debug_assert!(permit.is_none());
if let Some(read_guard) = self.try_lock_for_read(cache_key).await {
if is_first_iteration {
hit.inc();
}
return Ok(ReadBufResult::Found(read_guard));
}
debug_assert!(permit.is_some());
is_first_iteration = false;
// Not found. Find a victim buffer
let (slot_idx, mut inner) = self
.find_victim(permit.as_ref().unwrap())
.await
.context("Failed to find evict victim")?;
let (slot_idx, mut inner) =
self.find_victim().context("Failed to find evict victim")?;
// Insert mapping for this. At this point, we may find that another
// thread did the same thing concurrently. In that case, we evicted
@@ -696,19 +555,70 @@ impl PageCache {
inner.key = Some(cache_key.clone());
slot.set_usage_count(1);
debug_assert!(
{
let guard = inner.permit.lock().unwrap();
guard.upgrade().is_none()
},
"we hold a write lock, so, no one else should have a permit"
);
return Ok(ReadBufResult::NotFound(PageWriteGuard {
state: PageWriteGuardState::Invalid {
_permit: permit.take().unwrap(),
inner,
},
inner,
valid: false,
}));
}
}
/// Look up a page in the cache and lock it in write mode. If it's not
/// found, returns None.
///
/// When locking a page for writing, the search criteria is always "exact".
async fn try_lock_for_write(&self, cache_key: &CacheKey) -> Option<PageWriteGuard> {
if let Some(slot_idx) = self.search_mapping_for_write(cache_key) {
// The page was found in the mapping. Lock the slot, and re-check
// that it's still what we expected (because we don't released the mapping
// lock already, another thread could have evicted the page)
let slot = &self.slots[slot_idx];
let inner = slot.inner.write().await;
if inner.key.as_ref() == Some(cache_key) {
slot.inc_usage_count();
return Some(PageWriteGuard { inner, valid: true });
}
}
None
}
/// Return a write-locked buffer for given block.
///
/// Similar to lock_for_read(), but the returned buffer is write-locked and
/// may be modified by the caller even if it's already found in the cache.
async fn lock_for_write(&self, cache_key: &CacheKey) -> anyhow::Result<WriteBufResult> {
loop {
// First check if the key already exists in the cache.
if let Some(write_guard) = self.try_lock_for_write(cache_key).await {
return Ok(WriteBufResult::Found(write_guard));
}
// Not found. Find a victim buffer
let (slot_idx, mut inner) =
self.find_victim().context("Failed to find evict victim")?;
// Insert mapping for this. At this point, we may find that another
// thread did the same thing concurrently. In that case, we evicted
// our victim buffer unnecessarily. Put it into the free list and
// continue with the slot that the other thread chose.
if let Some(_existing_slot_idx) = self.try_insert_mapping(cache_key, slot_idx) {
// TODO: put to free list
// We now just loop back to start from beginning. This is not
// optimal, we'll perform the lookup in the mapping again, which
// is not really necessary because we already got
// 'existing_slot_idx'. But this shouldn't happen often enough
// to matter much.
continue;
}
// Make the slot ready
let slot = &self.slots[slot_idx];
inner.key = Some(cache_key.clone());
slot.set_usage_count(1);
return Ok(WriteBufResult::NotFound(PageWriteGuard {
inner,
valid: false,
}));
}
}
@@ -753,7 +663,7 @@ impl PageCache {
///
/// Like 'search_mapping, but performs an "exact" search. Used for
/// allocating a new buffer.
fn search_mapping_exact(&self, key: &CacheKey) -> Option<usize> {
fn search_mapping_for_write(&self, key: &CacheKey) -> Option<usize> {
match key {
CacheKey::MaterializedPage { hash_key, lsn } => {
let map = self.materialized_page_map.read().unwrap();
@@ -859,10 +769,7 @@ impl PageCache {
/// Find a slot to evict.
///
/// On return, the slot is empty and write-locked.
async fn find_victim(
&self,
_permit_witness: &PinnedSlotsPermit,
) -> anyhow::Result<(usize, tokio::sync::RwLockWriteGuard<SlotInner>)> {
fn find_victim(&self) -> anyhow::Result<(usize, tokio::sync::RwLockWriteGuard<SlotInner>)> {
let iter_limit = self.slots.len() * 10;
let mut iters = 0;
loop {
@@ -875,40 +782,13 @@ impl PageCache {
let mut inner = match slot.inner.try_write() {
Ok(inner) => inner,
Err(_err) => {
// If we have looped through the whole buffer pool 10 times
// and still haven't found a victim buffer, something's wrong.
// Maybe all the buffers were in locked. That could happen in
// theory, if you have more threads holding buffers locked than
// there are buffers in the pool. In practice, with a reasonably
// large buffer pool it really shouldn't happen.
if iters > iter_limit {
// NB: Even with the permits, there's no hard guarantee that we will find a slot with
// any particular number of iterations: other threads might race ahead and acquire and
// release pins just as we're scanning the array.
//
// Imagine that nslots is 2, and as starting point, usage_count==1 on all
// slots. There are two threads running concurrently, A and B. A has just
// acquired the permit from the semaphore.
//
// A: Look at slot 1. Its usage_count == 1, so decrement it to zero, and continue the search
// B: Acquire permit.
// B: Look at slot 2, decrement its usage_count to zero and continue the search
// B: Look at slot 1. Its usage_count is zero, so pin it and bump up its usage_count to 1.
// B: Release pin and permit again
// B: Acquire permit.
// B: Look at slot 2. Its usage_count is zero, so pin it and bump up its usage_count to 1.
// B: Release pin and permit again
//
// Now we're back in the starting situation that both slots have
// usage_count 1, but A has now been through one iteration of the
// find_victim() loop. This can repeat indefinitely and on each
// iteration, A's iteration count increases by one.
//
// So, even though the semaphore for the permits is fair, the victim search
// itself happens in parallel and is not fair.
// Hence even with a permit, a task can theoretically be starved.
// To avoid this, we'd need tokio to give priority to tasks that are holding
// permits for longer.
// Note that just yielding to tokio during iteration without such
// priority boosting is likely counter-productive. We'd just give more opportunities
// for B to bump usage count, further starving A.
crate::metrics::page_cache_errors_inc(
crate::metrics::PageCacheErrorKind::EvictIterLimit,
);
anyhow::bail!("exceeded evict iter limit");
}
continue;
@@ -919,7 +799,6 @@ impl PageCache {
self.remove_mapping(old_key);
inner.key = None;
}
crate::metrics::PAGE_CACHE_FIND_VICTIMS_ITERS_TOTAL.inc_by(iters as u64);
return Ok((slot_idx, inner));
}
}
@@ -947,11 +826,7 @@ impl PageCache {
let buf: &mut [u8; PAGE_SZ] = chunk.try_into().unwrap();
Slot {
inner: tokio::sync::RwLock::new(SlotInner {
key: None,
buf,
permit: std::sync::Mutex::new(Weak::new()),
}),
inner: tokio::sync::RwLock::new(SlotInner { key: None, buf }),
usage_count: AtomicU8::new(0),
}
})
@@ -963,7 +838,6 @@ impl PageCache {
slots,
next_evict_slot: AtomicUsize::new(0),
size_metrics,
pinned_slots: Arc::new(tokio::sync::Semaphore::new(num_pages)),
}
}
}

View File

@@ -35,7 +35,6 @@ use std::time::Duration;
use tokio::io::AsyncWriteExt;
use tokio::io::{AsyncRead, AsyncWrite};
use tokio_util::io::StreamReader;
use tokio_util::sync::CancellationToken;
use tracing::field;
use tracing::*;
use utils::id::ConnectionId;
@@ -65,6 +64,69 @@ use crate::trace::Tracer;
use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
use postgres_ffi::BLCKSZ;
fn copyin_stream<IO>(pgb: &mut PostgresBackend<IO>) -> impl Stream<Item = io::Result<Bytes>> + '_
where
IO: AsyncRead + AsyncWrite + Unpin,
{
async_stream::try_stream! {
loop {
let msg = tokio::select! {
biased;
_ = task_mgr::shutdown_watcher() => {
// We were requested to shut down.
let msg = "pageserver is shutting down";
let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, None));
Err(QueryError::Other(anyhow::anyhow!(msg)))
}
msg = pgb.read_message() => { msg.map_err(QueryError::from)}
};
match msg {
Ok(Some(message)) => {
let copy_data_bytes = match message {
FeMessage::CopyData(bytes) => bytes,
FeMessage::CopyDone => { break },
FeMessage::Sync => continue,
FeMessage::Terminate => {
let msg = "client terminated connection with Terminate message during COPY";
let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
// error can't happen here, ErrorResponse serialization should be always ok
pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
break;
}
m => {
let msg = format!("unexpected message {m:?}");
// error can't happen here, ErrorResponse serialization should be always ok
pgb.write_message_noflush(&BeMessage::ErrorResponse(&msg, None)).map_err(|e| e.into_io_error())?;
Err(io::Error::new(io::ErrorKind::Other, msg))?;
break;
}
};
yield copy_data_bytes;
}
Ok(None) => {
let msg = "client closed connection during COPY";
let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
// error can't happen here, ErrorResponse serialization should be always ok
pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
pgb.flush().await?;
Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
}
Err(QueryError::Disconnected(ConnectionError::Io(io_error))) => {
Err(io_error)?;
}
Err(other) => {
Err(io::Error::new(io::ErrorKind::Other, other.to_string()))?;
}
};
}
}
}
/// Read the end of a tar archive.
///
/// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each.
@@ -222,13 +284,7 @@ async fn page_service_conn_main(
// and create a child per-query context when it invokes process_query.
// But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
// and create the per-query context in process_query ourselves.
let mut conn_handler = PageServerHandler::new(
conf,
broker_client,
auth,
connection_ctx,
task_mgr::shutdown_token(),
);
let mut conn_handler = PageServerHandler::new(conf, broker_client, auth, connection_ctx);
let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;
match pgbackend
@@ -262,10 +318,6 @@ struct PageServerHandler {
/// For each query received over the connection,
/// `process_query` creates a child context from this one.
connection_ctx: RequestContext,
/// A token that should fire when the tenant transitions from
/// attached state, or when the pageserver is shutting down.
cancel: CancellationToken,
}
impl PageServerHandler {
@@ -274,7 +326,6 @@ impl PageServerHandler {
broker_client: storage_broker::BrokerClientChannel,
auth: Option<Arc<JwtAuth>>,
connection_ctx: RequestContext,
cancel: CancellationToken,
) -> Self {
PageServerHandler {
_conf: conf,
@@ -282,91 +333,6 @@ impl PageServerHandler {
auth,
claims: None,
connection_ctx,
cancel,
}
}
/// Wrap PostgresBackend::flush to respect our CancellationToken: it is important to use
/// this rather than naked flush() in order to shut down promptly. Without this, we would
/// block shutdown of a tenant if a postgres client was failing to consume bytes we send
/// in the flush.
async fn flush_cancellable<IO>(&self, pgb: &mut PostgresBackend<IO>) -> Result<(), QueryError>
where
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
{
tokio::select!(
flush_r = pgb.flush() => {
Ok(flush_r?)
},
_ = self.cancel.cancelled() => {
Err(QueryError::Other(anyhow::anyhow!("Shutting down")))
}
)
}
fn copyin_stream<'a, IO>(
&'a self,
pgb: &'a mut PostgresBackend<IO>,
) -> impl Stream<Item = io::Result<Bytes>> + 'a
where
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
{
async_stream::try_stream! {
loop {
let msg = tokio::select! {
biased;
_ = task_mgr::shutdown_watcher() => {
// We were requested to shut down.
let msg = "pageserver is shutting down";
let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, None));
Err(QueryError::Other(anyhow::anyhow!(msg)))
}
msg = pgb.read_message() => { msg.map_err(QueryError::from)}
};
match msg {
Ok(Some(message)) => {
let copy_data_bytes = match message {
FeMessage::CopyData(bytes) => bytes,
FeMessage::CopyDone => { break },
FeMessage::Sync => continue,
FeMessage::Terminate => {
let msg = "client terminated connection with Terminate message during COPY";
let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
// error can't happen here, ErrorResponse serialization should be always ok
pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
break;
}
m => {
let msg = format!("unexpected message {m:?}");
// error can't happen here, ErrorResponse serialization should be always ok
pgb.write_message_noflush(&BeMessage::ErrorResponse(&msg, None)).map_err(|e| e.into_io_error())?;
Err(io::Error::new(io::ErrorKind::Other, msg))?;
break;
}
};
yield copy_data_bytes;
}
Ok(None) => {
let msg = "client closed connection during COPY";
let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
// error can't happen here, ErrorResponse serialization should be always ok
pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
self.flush_cancellable(pgb).await.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
}
Err(QueryError::Disconnected(ConnectionError::Io(io_error))) => {
Err(io_error)?;
}
Err(other) => {
Err(io::Error::new(io::ErrorKind::Other, other.to_string()))?;
}
};
}
}
}
@@ -406,7 +372,7 @@ impl PageServerHandler {
// switch client to COPYBOTH
pgb.write_message_noflush(&BeMessage::CopyBothResponse)?;
self.flush_cancellable(pgb).await?;
pgb.flush().await?;
let metrics = metrics::SmgrQueryTimePerTimeline::new(&tenant_id, &timeline_id);
@@ -446,60 +412,38 @@ impl PageServerHandler {
// TODO: We could create a new per-request context here, with unique ID.
// Currently we use the same per-timeline context for all requests
let (response, span) = match neon_fe_msg {
let response = match neon_fe_msg {
PagestreamFeMessage::Exists(req) => {
let _timer = metrics.start_timer(metrics::SmgrQueryType::GetRelExists);
let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.lsn);
(
self.handle_get_rel_exists_request(&timeline, &req, &ctx)
.instrument(span.clone())
.await,
span,
)
self.handle_get_rel_exists_request(&timeline, &req, &ctx)
.await
}
PagestreamFeMessage::Nblocks(req) => {
let _timer = metrics.start_timer(metrics::SmgrQueryType::GetRelSize);
let span = tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.lsn);
(
self.handle_get_nblocks_request(&timeline, &req, &ctx)
.instrument(span.clone())
.await,
span,
)
self.handle_get_nblocks_request(&timeline, &req, &ctx).await
}
PagestreamFeMessage::GetPage(req) => {
let _timer = metrics.start_timer(metrics::SmgrQueryType::GetPageAtLsn);
let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn);
(
self.handle_get_page_at_lsn_request(&timeline, &req, &ctx)
.instrument(span.clone())
.await,
span,
)
self.handle_get_page_at_lsn_request(&timeline, &req, &ctx)
.await
}
PagestreamFeMessage::DbSize(req) => {
let _timer = metrics.start_timer(metrics::SmgrQueryType::GetDbSize);
let span = tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.lsn);
(
self.handle_db_size_request(&timeline, &req, &ctx)
.instrument(span.clone())
.await,
span,
)
self.handle_db_size_request(&timeline, &req, &ctx).await
}
};
let response = response.unwrap_or_else(|e| {
// print the all details to the log with {:#}, but for the client the
// error message is enough
span.in_scope(|| error!("error reading relation or page version: {:#}", e));
error!("error reading relation or page version: {:?}", e);
PagestreamBeMessage::Error(PagestreamErrorResponse {
message: e.to_string(),
})
});
pgb.write_message_noflush(&BeMessage::CopyData(&response.serialize()))?;
self.flush_cancellable(pgb).await?;
pgb.flush().await?;
}
Ok(())
}
@@ -542,9 +486,9 @@ impl PageServerHandler {
// Import basebackup provided via CopyData
info!("importing basebackup");
pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
self.flush_cancellable(pgb).await?;
pgb.flush().await?;
let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb)));
let mut copyin_reader = pin!(StreamReader::new(copyin_stream(pgb)));
timeline
.import_basebackup_from_tar(
&mut copyin_reader,
@@ -597,8 +541,8 @@ impl PageServerHandler {
// Import wal provided via CopyData
info!("importing wal");
pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
self.flush_cancellable(pgb).await?;
let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb)));
pgb.flush().await?;
let mut copyin_reader = pin!(StreamReader::new(copyin_stream(pgb)));
import_wal_from_tar(&timeline, &mut copyin_reader, start_lsn, end_lsn, &ctx).await?;
info!("wal import complete");
@@ -683,6 +627,7 @@ impl PageServerHandler {
Ok(lsn)
}
#[instrument(skip(self, timeline, req, ctx), fields(rel = %req.rel, req_lsn = %req.lsn))]
async fn handle_get_rel_exists_request(
&self,
timeline: &Timeline,
@@ -703,6 +648,7 @@ impl PageServerHandler {
}))
}
#[instrument(skip(self, timeline, req, ctx), fields(rel = %req.rel, req_lsn = %req.lsn))]
async fn handle_get_nblocks_request(
&self,
timeline: &Timeline,
@@ -721,6 +667,7 @@ impl PageServerHandler {
}))
}
#[instrument(skip(self, timeline, req, ctx), fields(dbnode = %req.dbnode, req_lsn = %req.lsn))]
async fn handle_db_size_request(
&self,
timeline: &Timeline,
@@ -742,6 +689,7 @@ impl PageServerHandler {
}))
}
#[instrument(skip(self, timeline, req, ctx), fields(rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn))]
async fn handle_get_page_at_lsn_request(
&self,
timeline: &Timeline,
@@ -806,7 +754,7 @@ impl PageServerHandler {
// switch client to COPYOUT
pgb.write_message_noflush(&BeMessage::CopyOutResponse)?;
self.flush_cancellable(pgb).await?;
pgb.flush().await?;
// Send a tarball of the latest layer on the timeline. Compress if not
// fullbackup. TODO Compress in that case too (tests need to be updated)
@@ -858,7 +806,7 @@ impl PageServerHandler {
}
pgb.write_message_noflush(&BeMessage::CopyDone)?;
self.flush_cancellable(pgb).await?;
pgb.flush().await?;
let basebackup_after = started
.elapsed()
@@ -1317,10 +1265,7 @@ async fn get_active_tenant_with_timeout(
Ok(tenant) => tenant,
Err(e @ GetTenantError::NotFound(_)) => return Err(GetActiveTenantError::NotFound(e)),
Err(GetTenantError::NotActive(_)) => {
unreachable!("we're calling get_tenant with active_only=false")
}
Err(GetTenantError::Broken(_)) => {
unreachable!("we're calling get_tenant with active_only=false")
unreachable!("we're calling get_tenant with active=false")
}
};
let wait_time = Duration::from_secs(30);

View File

@@ -37,7 +37,7 @@ impl Key {
| self.field6 as i128
}
pub const fn from_i128(x: i128) -> Self {
pub fn from_i128(x: i128) -> Self {
Key {
field1: ((x >> 120) & 0xf) as u8,
field2: ((x >> 104) & 0xFFFF) as u32,

View File

@@ -1,6 +1,6 @@
//! Wrapper around nix::sys::statvfs::Statvfs that allows for mocking.
use camino::Utf8Path;
use std::path::Path;
pub enum Statvfs {
Real(nix::sys::statvfs::Statvfs),
@@ -12,13 +12,11 @@ pub enum Statvfs {
// Sincce it should only be a problem on > 2TiB disks, let's ignore
// the problem for now and upcast to u64.
impl Statvfs {
pub fn get(tenants_dir: &Utf8Path, mocked: Option<&mock::Behavior>) -> nix::Result<Self> {
pub fn get(tenants_dir: &Path, mocked: Option<&mock::Behavior>) -> nix::Result<Self> {
if let Some(mocked) = mocked {
Ok(Statvfs::Mock(mock::get(tenants_dir, mocked)?))
} else {
Ok(Statvfs::Real(nix::sys::statvfs::statvfs(
tenants_dir.as_std_path(),
)?))
Ok(Statvfs::Real(nix::sys::statvfs::statvfs(tenants_dir)?))
}
}
@@ -57,8 +55,8 @@ impl Statvfs {
pub mod mock {
use anyhow::Context;
use camino::Utf8Path;
use regex::Regex;
use std::path::Path;
use tracing::log::info;
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -88,7 +86,7 @@ pub mod mock {
}
}
pub fn get(tenants_dir: &Utf8Path, behavior: &Behavior) -> nix::Result<Statvfs> {
pub fn get(tenants_dir: &Path, behavior: &Behavior) -> nix::Result<Statvfs> {
info!("running mocked statvfs");
match behavior {
@@ -121,7 +119,7 @@ pub mod mock {
}
}
fn walk_dir_disk_usage(path: &Utf8Path, name_filter: Option<&Regex>) -> anyhow::Result<u64> {
fn walk_dir_disk_usage(path: &Path, name_filter: Option<&Regex>) -> anyhow::Result<u64> {
let mut total = 0;
for entry in walkdir::WalkDir::new(path) {
let entry = entry?;

View File

@@ -456,7 +456,7 @@ async fn task_finish(
}
if shutdown_process {
shutdown_pageserver(None, 1).await;
shutdown_pageserver(1).await;
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -238,14 +238,14 @@ mod tests {
use rand::{Rng, SeedableRng};
async fn round_trip_test<const BUFFERED: bool>(blobs: &[Vec<u8>]) -> Result<(), Error> {
let temp_dir = camino_tempfile::tempdir()?;
let pathbuf = temp_dir.path().join("file");
let temp_dir = tempfile::tempdir()?;
let path = temp_dir.path().join("file");
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
// Write part (in block to drop the file)
let mut offsets = Vec::new();
{
let file = VirtualFile::create(pathbuf.as_path()).await?;
let file = VirtualFile::create(&path).await?;
let mut wtr = BlobWriter::<BUFFERED>::new(file, 0);
for blob in blobs.iter() {
let offs = wtr.write_blob(blob).await?;
@@ -258,7 +258,7 @@ mod tests {
wtr.flush_buffer().await?;
}
let file = VirtualFile::open(pathbuf.as_path()).await?;
let file = VirtualFile::open(&path).await?;
let rdr = BlockReaderRef::VirtualFile(&file);
let rdr = BlockCursor::new(rdr);
for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() {

View File

@@ -186,21 +186,26 @@ impl FileBlockReader {
ctx: &RequestContext,
) -> Result<BlockLease, std::io::Error> {
let cache = page_cache::get();
match cache
.read_immutable_buf(self.file_id, blknum, ctx)
.await
.map_err(|e| {
std::io::Error::new(
std::io::ErrorKind::Other,
format!("Failed to read immutable buf: {e:#}"),
)
})? {
ReadBufResult::Found(guard) => Ok(guard.into()),
ReadBufResult::NotFound(mut write_guard) => {
// Read the page from disk into the buffer
self.fill_buffer(write_guard.deref_mut(), blknum).await?;
Ok(write_guard.mark_valid().into())
}
loop {
match cache
.read_immutable_buf(self.file_id, blknum, ctx)
.await
.map_err(|e| {
std::io::Error::new(
std::io::ErrorKind::Other,
format!("Failed to read immutable buf: {e:#}"),
)
})? {
ReadBufResult::Found(guard) => break Ok(guard.into()),
ReadBufResult::NotFound(mut write_guard) => {
// Read the page from disk into the buffer
self.fill_buffer(write_guard.deref_mut(), blknum).await?;
write_guard.mark_valid();
// Swap for read lock
continue;
}
};
}
}
}

View File

@@ -13,7 +13,6 @@ use pageserver_api::models;
use serde::{Deserialize, Serialize};
use std::num::NonZeroU64;
use std::time::Duration;
use utils::generation::Generation;
pub mod defaults {
// FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB
@@ -45,211 +44,7 @@ pub mod defaults {
pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
}
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub(crate) enum AttachmentMode {
/// Our generation is current as far as we know, and as far as we know we are the only attached
/// pageserver. This is the "normal" attachment mode.
Single,
/// Our generation number is current as far as we know, but we are advised that another
/// pageserver is still attached, and therefore to avoid executing deletions. This is
/// the attachment mode of a pagesever that is the destination of a migration.
Multi,
/// Our generation number is superseded, or about to be superseded. We are advised
/// to avoid remote storage writes if possible, and to avoid sending billing data. This
/// is the attachment mode of a pageserver that is the origin of a migration.
Stale,
}
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub(crate) struct AttachedLocationConfig {
pub(crate) generation: Generation,
pub(crate) attach_mode: AttachmentMode,
// TODO: add a flag to override AttachmentMode's policies under
// disk pressure (i.e. unblock uploads under disk pressure in Stale
// state, unblock deletions after timeout in Multi state)
}
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub(crate) struct SecondaryLocationConfig {
/// If true, keep the local cache warm by polling remote storage
pub(crate) warm: bool,
}
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub(crate) enum LocationMode {
Attached(AttachedLocationConfig),
Secondary(SecondaryLocationConfig),
}
/// Per-tenant, per-pageserver configuration. All pageservers use the same TenantConf,
/// but have distinct LocationConf.
#[derive(Clone, PartialEq, Eq, Serialize, Deserialize)]
pub(crate) struct LocationConf {
/// The location-specific part of the configuration, describes the operating
/// mode of this pageserver for this tenant.
pub(crate) mode: LocationMode,
/// The pan-cluster tenant configuration, the same on all locations
pub(crate) tenant_conf: TenantConfOpt,
}
impl std::fmt::Debug for LocationConf {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match &self.mode {
LocationMode::Attached(conf) => {
write!(
f,
"Attached {:?}, gen={:?}",
conf.attach_mode, conf.generation
)
}
LocationMode::Secondary(conf) => {
write!(f, "Secondary, warm={}", conf.warm)
}
}
}
}
impl AttachedLocationConfig {
/// Consult attachment mode to determine whether we are currently permitted
/// to delete layers. This is only advisory, not required for data safety.
/// See [`AttachmentMode`] for more context.
pub(crate) fn may_delete_layers_hint(&self) -> bool {
// TODO: add an override for disk pressure in AttachedLocationConfig,
// and respect it here.
match &self.attach_mode {
AttachmentMode::Single => true,
AttachmentMode::Multi | AttachmentMode::Stale => {
// In Multi mode we avoid doing deletions because some other
// attached pageserver might get 404 while trying to read
// a layer we delete which is still referenced in their metadata.
//
// In Stale mode, we avoid doing deletions because we expect
// that they would ultimately fail validation in the deletion
// queue due to our stale generation.
false
}
}
}
/// Whether we are currently hinted that it is worthwhile to upload layers.
/// This is only advisory, not required for data safety.
/// See [`AttachmentMode`] for more context.
pub(crate) fn may_upload_layers_hint(&self) -> bool {
// TODO: add an override for disk pressure in AttachedLocationConfig,
// and respect it here.
match &self.attach_mode {
AttachmentMode::Single | AttachmentMode::Multi => true,
AttachmentMode::Stale => {
// In Stale mode, we avoid doing uploads because we expect that
// our replacement pageserver will already have started its own
// IndexPart that will never reference layers we upload: it is
// wasteful.
false
}
}
}
}
impl LocationConf {
/// For use when loading from a legacy configuration: presence of a tenant
/// implies it is in AttachmentMode::Single, which used to be the only
/// possible state. This function should eventually be removed.
pub(crate) fn attached_single(tenant_conf: TenantConfOpt, generation: Generation) -> Self {
Self {
mode: LocationMode::Attached(AttachedLocationConfig {
generation,
attach_mode: AttachmentMode::Single,
}),
tenant_conf,
}
}
/// For use when attaching/re-attaching: update the generation stored in this
/// structure. If we were in a secondary state, promote to attached (posession
/// of a fresh generation implies this).
pub(crate) fn attach_in_generation(&mut self, generation: Generation) {
match &mut self.mode {
LocationMode::Attached(attach_conf) => {
attach_conf.generation = generation;
}
LocationMode::Secondary(_) => {
// We are promoted to attached by the control plane's re-attach response
self.mode = LocationMode::Attached(AttachedLocationConfig {
generation,
attach_mode: AttachmentMode::Single,
})
}
}
}
pub(crate) fn try_from(conf: &'_ models::LocationConfig) -> anyhow::Result<Self> {
let tenant_conf = TenantConfOpt::try_from(&conf.tenant_conf)?;
fn get_generation(conf: &'_ models::LocationConfig) -> Result<Generation, anyhow::Error> {
conf.generation
.ok_or_else(|| anyhow::anyhow!("Generation must be set when attaching"))
}
let mode = match &conf.mode {
models::LocationConfigMode::AttachedMulti => {
LocationMode::Attached(AttachedLocationConfig {
generation: get_generation(conf)?,
attach_mode: AttachmentMode::Multi,
})
}
models::LocationConfigMode::AttachedSingle => {
LocationMode::Attached(AttachedLocationConfig {
generation: get_generation(conf)?,
attach_mode: AttachmentMode::Single,
})
}
models::LocationConfigMode::AttachedStale => {
LocationMode::Attached(AttachedLocationConfig {
generation: get_generation(conf)?,
attach_mode: AttachmentMode::Stale,
})
}
models::LocationConfigMode::Secondary => {
anyhow::ensure!(conf.generation.is_none());
let warm = conf
.secondary_conf
.as_ref()
.map(|c| c.warm)
.unwrap_or(false);
LocationMode::Secondary(SecondaryLocationConfig { warm })
}
models::LocationConfigMode::Detached => {
// Should not have been called: API code should translate this mode
// into a detach rather than trying to decode it as a LocationConf
return Err(anyhow::anyhow!("Cannot decode a Detached configuration"));
}
};
Ok(Self { mode, tenant_conf })
}
}
impl Default for LocationConf {
// TODO: this should be removed once tenant loading can guarantee that we are never
// loading from a directory without a configuration.
// => tech debt since https://github.com/neondatabase/neon/issues/1555
fn default() -> Self {
Self {
mode: LocationMode::Attached(AttachedLocationConfig {
generation: Generation::none(),
attach_mode: AttachmentMode::Single,
}),
tenant_conf: TenantConfOpt::default(),
}
}
}
/// A tenant's calcuated configuration, which is the result of merging a
/// tenant's TenantConfOpt with the global TenantConf from PageServerConf.
///
/// For storing and transmitting individual tenant's configuration, see
/// TenantConfOpt.
/// Per-tenant configuration options
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub struct TenantConf {
// Flush out an inmemory layer, if it's holding WAL older than this

View File

@@ -1,7 +1,9 @@
use std::sync::Arc;
use std::{
path::{Path, PathBuf},
sync::Arc,
};
use anyhow::Context;
use camino::{Utf8Path, Utf8PathBuf};
use pageserver_api::models::TenantState;
use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
use tokio::sync::OwnedMutexGuard;
@@ -60,7 +62,7 @@ fn remote_tenant_delete_mark_path(
.context("Failed to strip workdir prefix")
.and_then(RemotePath::new)
.context("tenant path")?;
Ok(tenant_remote_path.join(Utf8Path::new("deleted")))
Ok(tenant_remote_path.join(Path::new("deleted")))
}
async fn create_remote_delete_mark(
@@ -146,7 +148,7 @@ async fn schedule_ordered_timeline_deletions(
Ok(already_running_deletions)
}
async fn ensure_timelines_dir_empty(timelines_path: &Utf8Path) -> Result<(), DeleteTenantError> {
async fn ensure_timelines_dir_empty(timelines_path: &Path) -> Result<(), DeleteTenantError> {
// Assert timelines dir is empty.
if !fs_ext::is_directory_empty(timelines_path).await? {
// Display first 10 items in directory
@@ -186,18 +188,20 @@ async fn cleanup_remaining_fs_traces(
conf: &PageServerConf,
tenant_id: &TenantId,
) -> Result<(), DeleteTenantError> {
let rm = |p: Utf8PathBuf, is_dir: bool| async move {
let rm = |p: PathBuf, is_dir: bool| async move {
if is_dir {
tokio::fs::remove_dir(&p).await
} else {
tokio::fs::remove_file(&p).await
}
.or_else(fs_ext::ignore_not_found)
.with_context(|| format!("failed to delete {p}"))
.with_context(|| {
let to_display = p.display();
format!("failed to delete {to_display}")
})
};
rm(conf.tenant_config_path(tenant_id), false).await?;
rm(conf.tenant_location_config_path(tenant_id), false).await?;
fail::fail_point!("tenant-delete-before-remove-timelines-dir", |_| {
Err(anyhow::anyhow!(

View File

@@ -6,11 +6,11 @@ use crate::context::RequestContext;
use crate::page_cache::{self, PAGE_SZ};
use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader};
use crate::virtual_file::VirtualFile;
use camino::Utf8PathBuf;
use std::cmp::min;
use std::fs::OpenOptions;
use std::io::{self, ErrorKind};
use std::ops::DerefMut;
use std::path::PathBuf;
use std::sync::atomic::AtomicU64;
use tracing::*;
use utils::id::{TenantId, TimelineId};
@@ -40,9 +40,7 @@ impl EphemeralFile {
let filename = conf
.timeline_path(&tenant_id, &timeline_id)
.join(Utf8PathBuf::from(format!(
"ephemeral-{filename_disambiguator}"
)));
.join(PathBuf::from(format!("ephemeral-{filename_disambiguator}")));
let file = VirtualFile::open_with_options(
&filename,
@@ -72,32 +70,38 @@ impl EphemeralFile {
let flushed_blknums = 0..self.len / PAGE_SZ as u64;
if flushed_blknums.contains(&(blknum as u64)) {
let cache = page_cache::get();
match cache
.read_immutable_buf(self.page_cache_file_id, blknum, ctx)
.await
.map_err(|e| {
std::io::Error::new(
std::io::ErrorKind::Other,
// order path before error because error is anyhow::Error => might have many contexts
format!(
"ephemeral file: read immutable page #{}: {}: {:#}",
blknum, self.file.path, e,
),
)
})? {
page_cache::ReadBufResult::Found(guard) => {
return Ok(BlockLease::PageReadGuard(guard))
}
page_cache::ReadBufResult::NotFound(mut write_guard) => {
let buf: &mut [u8] = write_guard.deref_mut();
debug_assert_eq!(buf.len(), PAGE_SZ);
self.file
.read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64)
.await?;
let read_guard = write_guard.mark_valid();
return Ok(BlockLease::PageReadGuard(read_guard));
}
};
loop {
match cache
.read_immutable_buf(self.page_cache_file_id, blknum, ctx)
.await
.map_err(|e| {
std::io::Error::new(
std::io::ErrorKind::Other,
// order path before error because error is anyhow::Error => might have many contexts
format!(
"ephemeral file: read immutable page #{}: {}: {:#}",
blknum,
self.file.path.display(),
e,
),
)
})? {
page_cache::ReadBufResult::Found(guard) => {
return Ok(BlockLease::PageReadGuard(guard))
}
page_cache::ReadBufResult::NotFound(mut write_guard) => {
let buf: &mut [u8] = write_guard.deref_mut();
debug_assert_eq!(buf.len(), PAGE_SZ);
self.file
.read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64)
.await?;
write_guard.mark_valid();
// Swap for read lock
continue;
}
};
}
} else {
debug_assert_eq!(blknum as u64, self.len / PAGE_SZ as u64);
Ok(BlockLease::EphemeralFileMutableTail(&self.mutable_tail))
@@ -167,7 +171,7 @@ impl EphemeralFile {
let buf: &mut [u8] = write_guard.deref_mut();
debug_assert_eq!(buf.len(), PAGE_SZ);
buf.copy_from_slice(&self.ephemeral_file.mutable_tail);
let _ = write_guard.mark_valid();
write_guard.mark_valid();
// pre-warm successful
}
Err(e) => {
@@ -191,7 +195,7 @@ impl EphemeralFile {
"ephemeral_file: write_blob: write-back full tail blk #{}: {:#}: {}",
self.blknum,
e,
self.ephemeral_file.file.path,
self.ephemeral_file.file.path.display(),
),
));
}
@@ -254,7 +258,8 @@ impl Drop for EphemeralFile {
// not found files might also be related to https://github.com/neondatabase/neon/issues/2442
error!(
"could not remove ephemeral file '{}': {}",
self.file.path, e
self.file.path.display(),
e
);
}
}

View File

@@ -1,9 +1,10 @@
//! This module acts as a switchboard to access different repositories managed by this
//! page server.
use camino::{Utf8Path, Utf8PathBuf};
use rand::{distributions::Alphanumeric, Rng};
use std::collections::{hash_map, HashMap};
use std::ffi::OsStr;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use tokio::fs;
@@ -19,16 +20,11 @@ use utils::crashsafe;
use crate::config::PageServerConf;
use crate::context::{DownloadBehavior, RequestContext};
use crate::control_plane_client::{
ControlPlaneClient, ControlPlaneGenerationsApi, RetryForeverError,
};
use crate::deletion_queue::DeletionQueueClient;
use crate::control_plane_client::ControlPlaneClient;
use crate::task_mgr::{self, TaskKind};
use crate::tenant::config::{LocationConf, LocationMode, TenantConfOpt};
use crate::tenant::config::TenantConfOpt;
use crate::tenant::delete::DeleteTenantFlow;
use crate::tenant::{
create_tenant_files, AttachedTenantConf, CreateTenantFilesMode, Tenant, TenantState,
};
use crate::tenant::{create_tenant_files, CreateTenantFilesMode, Tenant, TenantState};
use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX};
use utils::crashsafe::path_with_suffix_extension;
@@ -40,39 +36,6 @@ use super::delete::DeleteTenantError;
use super::timeline::delete::DeleteTimelineFlow;
use super::TenantSharedResources;
/// For a tenant that appears in TenantsMap, it may either be
/// - `Attached`: has a full Tenant object, is elegible to service
/// reads and ingest WAL.
/// - `Secondary`: is only keeping a local cache warm.
///
/// Secondary is a totally distinct state rather than being a mode of a `Tenant`, because
/// that way we avoid having to carefully switch a tenant's ingestion etc on and off during
/// its lifetime, and we can preserve some important safety invariants like `Tenant` always
/// having a properly acquired generation (Secondary doesn't need a generation)
#[derive(Clone)]
pub enum TenantSlot {
Attached(Arc<Tenant>),
Secondary,
}
impl TenantSlot {
/// Return the `Tenant` in this slot if attached, else None
fn get_attached(&self) -> Option<&Arc<Tenant>> {
match self {
Self::Attached(t) => Some(t),
Self::Secondary => None,
}
}
/// Consume self and return the `Tenant` that was in this slot if attached, else None
fn into_attached(self) -> Option<Arc<Tenant>> {
match self {
Self::Attached(t) => Some(t),
Self::Secondary => None,
}
}
}
/// The tenants known to the pageserver.
/// The enum variants are used to distinguish the different states that the pageserver can be in.
pub(crate) enum TenantsMap {
@@ -80,27 +43,14 @@ pub(crate) enum TenantsMap {
Initializing,
/// [`init_tenant_mgr`] is done, all on-disk tenants have been loaded.
/// New tenants can be added using [`tenant_map_insert`].
Open(HashMap<TenantId, TenantSlot>),
Open(HashMap<TenantId, Arc<Tenant>>),
/// The pageserver has entered shutdown mode via [`shutdown_all_tenants`].
/// Existing tenants are still accessible, but no new tenants can be created.
ShuttingDown(HashMap<TenantId, TenantSlot>),
ShuttingDown(HashMap<TenantId, Arc<Tenant>>),
}
impl TenantsMap {
/// Convenience function for typical usage, where we want to get a `Tenant` object, for
/// working with attached tenants. If the TenantId is in the map but in Secondary state,
/// None is returned.
pub(crate) fn get(&self, tenant_id: &TenantId) -> Option<&Arc<Tenant>> {
match self {
TenantsMap::Initializing => None,
TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
m.get(tenant_id).and_then(TenantSlot::get_attached)
}
}
}
/// Get the contents of the map at this tenant ID, even if it is in secondary state.
pub(crate) fn get_slot(&self, tenant_id: &TenantId) -> Option<&TenantSlot> {
match self {
TenantsMap::Initializing => None,
TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m.get(tenant_id),
@@ -109,9 +59,7 @@ impl TenantsMap {
pub(crate) fn remove(&mut self, tenant_id: &TenantId) -> Option<Arc<Tenant>> {
match self {
TenantsMap::Initializing => None,
TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
m.remove(tenant_id).and_then(TenantSlot::into_attached)
}
TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m.remove(tenant_id),
}
}
}
@@ -122,12 +70,12 @@ impl TenantsMap {
///
/// This is pageserver-specific, as it relies on future processes after a crash to check
/// for TEMP_FILE_SUFFIX when loading things.
async fn safe_remove_tenant_dir_all(path: impl AsRef<Utf8Path>) -> std::io::Result<()> {
async fn safe_remove_tenant_dir_all(path: impl AsRef<Path>) -> std::io::Result<()> {
let tmp_path = safe_rename_tenant_dir(path).await?;
fs::remove_dir_all(tmp_path).await
}
async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<Utf8PathBuf> {
async fn safe_rename_tenant_dir(path: impl AsRef<Path>) -> std::io::Result<PathBuf> {
let parent = path
.as_ref()
.parent()
@@ -144,155 +92,13 @@ async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<U
.collect::<String>()
+ TEMP_FILE_SUFFIX;
let tmp_path = path_with_suffix_extension(&path, &rand_suffix);
fs::rename(path.as_ref(), &tmp_path).await?;
fs::rename(&path, &tmp_path).await?;
fs::File::open(parent).await?.sync_all().await?;
Ok(tmp_path)
}
static TENANTS: Lazy<RwLock<TenantsMap>> = Lazy::new(|| RwLock::new(TenantsMap::Initializing));
fn emergency_generations(
tenant_confs: &HashMap<TenantId, anyhow::Result<LocationConf>>,
) -> HashMap<TenantId, Generation> {
tenant_confs
.iter()
.filter_map(|(tid, lc)| {
let lc = match lc {
Ok(lc) => lc,
Err(_) => return None,
};
let gen = match &lc.mode {
LocationMode::Attached(alc) => Some(alc.generation),
LocationMode::Secondary(_) => None,
};
gen.map(|g| (*tid, g))
})
.collect()
}
async fn init_load_generations(
conf: &'static PageServerConf,
tenant_confs: &HashMap<TenantId, anyhow::Result<LocationConf>>,
resources: &TenantSharedResources,
cancel: &CancellationToken,
) -> anyhow::Result<Option<HashMap<TenantId, Generation>>> {
let generations = if conf.control_plane_emergency_mode {
error!(
"Emergency mode! Tenants will be attached unsafely using their last known generation"
);
emergency_generations(tenant_confs)
} else if let Some(client) = ControlPlaneClient::new(conf, cancel) {
info!("Calling control plane API to re-attach tenants");
// If we are configured to use the control plane API, then it is the source of truth for what tenants to load.
match client.re_attach().await {
Ok(tenants) => tenants,
Err(RetryForeverError::ShuttingDown) => {
anyhow::bail!("Shut down while waiting for control plane re-attach response")
}
}
} else {
info!("Control plane API not configured, tenant generations are disabled");
return Ok(None);
};
// The deletion queue needs to know about the startup attachment state to decide which (if any) stored
// deletion list entries may still be valid. We provide that by pushing a recovery operation into
// the queue. Sequential processing of te queue ensures that recovery is done before any new tenant deletions
// are processed, even though we don't block on recovery completing here.
//
// Must only do this if remote storage is enabled, otherwise deletion queue
// is not running and channel push will fail.
if resources.remote_storage.is_some() {
resources
.deletion_queue_client
.recover(generations.clone())
.await?;
}
Ok(Some(generations))
}
/// Initial stage of load: walk the local tenants directory, clean up any temp files,
/// and load configurations for the tenants we found.
async fn init_load_tenant_configs(
conf: &'static PageServerConf,
) -> anyhow::Result<HashMap<TenantId, anyhow::Result<LocationConf>>> {
let tenants_dir = conf.tenants_path();
let mut dir_entries = tenants_dir
.read_dir_utf8()
.with_context(|| format!("Failed to list tenants dir {tenants_dir:?}"))?;
let mut configs = HashMap::new();
loop {
match dir_entries.next() {
None => break,
Some(Ok(dentry)) => {
let tenant_dir_path = dentry.path().to_path_buf();
if crate::is_temporary(&tenant_dir_path) {
info!("Found temporary tenant directory, removing: {tenant_dir_path}");
// No need to use safe_remove_tenant_dir_all because this is already
// a temporary path
if let Err(e) = fs::remove_dir_all(&tenant_dir_path).await {
error!(
"Failed to remove temporary directory '{}': {:?}",
tenant_dir_path, e
);
}
continue;
}
// This case happens if we:
// * crash during attach before creating the attach marker file
// * crash during tenant delete before removing tenant directory
let is_empty = tenant_dir_path.is_empty_dir().with_context(|| {
format!("Failed to check whether {tenant_dir_path:?} is an empty dir")
})?;
if is_empty {
info!("removing empty tenant directory {tenant_dir_path:?}");
if let Err(e) = fs::remove_dir(&tenant_dir_path).await {
error!(
"Failed to remove empty tenant directory '{}': {e:#}",
tenant_dir_path
)
}
continue;
}
let tenant_ignore_mark_file = tenant_dir_path.join(IGNORED_TENANT_FILE_NAME);
if tenant_ignore_mark_file.exists() {
info!("Found an ignore mark file {tenant_ignore_mark_file:?}, skipping the tenant");
continue;
}
let tenant_id = match tenant_dir_path
.file_name()
.unwrap_or_default()
.parse::<TenantId>()
{
Ok(id) => id,
Err(_) => {
warn!(
"Invalid tenant path (garbage in our repo directory?): {tenant_dir_path}",
);
continue;
}
};
configs.insert(tenant_id, Tenant::load_tenant_config(conf, &tenant_id));
}
Some(Err(e)) => {
// An error listing the top level directory indicates serious problem
// with local filesystem: we will fail to load, and fail to start.
anyhow::bail!(e);
}
}
}
Ok(configs)
}
/// Initialize repositories with locally available timelines.
/// Timelines that are only partially available locally (remote storage has more data than this pageserver)
/// are scheduled for download and added to the tenant once download is completed.
@@ -303,96 +109,136 @@ pub async fn init_tenant_mgr(
init_order: InitializationOrder,
cancel: CancellationToken,
) -> anyhow::Result<()> {
// Scan local filesystem for attached tenants
let tenants_dir = conf.tenants_path();
let mut tenants = HashMap::new();
// If we are configured to use the control plane API, then it is the source of truth for what tenants to load.
let tenant_generations = if let Some(client) = ControlPlaneClient::new(conf, &cancel) {
Some(client.re_attach().await?)
} else {
info!("Control plane API not configured, tenant generations are disabled");
None
};
let mut dir_entries = fs::read_dir(&tenants_dir)
.await
.with_context(|| format!("Failed to list tenants dir {tenants_dir:?}"))?;
let ctx = RequestContext::todo_child(TaskKind::Startup, DownloadBehavior::Warn);
// Scan local filesystem for attached tenants
let tenant_configs = init_load_tenant_configs(conf).await?;
loop {
match dir_entries.next_entry().await {
Ok(None) => break,
Ok(Some(dir_entry)) => {
let tenant_dir_path = dir_entry.path();
if crate::is_temporary(&tenant_dir_path) {
info!(
"Found temporary tenant directory, removing: {}",
tenant_dir_path.display()
);
// No need to use safe_remove_tenant_dir_all because this is already
// a temporary path
if let Err(e) = fs::remove_dir_all(&tenant_dir_path).await {
error!(
"Failed to remove temporary directory '{}': {:?}",
tenant_dir_path.display(),
e
);
}
} else {
// This case happens if we:
// * crash during attach before creating the attach marker file
// * crash during tenant delete before removing tenant directory
let is_empty = tenant_dir_path.is_empty_dir().with_context(|| {
format!("Failed to check whether {tenant_dir_path:?} is an empty dir")
})?;
if is_empty {
info!("removing empty tenant directory {tenant_dir_path:?}");
if let Err(e) = fs::remove_dir(&tenant_dir_path).await {
error!(
"Failed to remove empty tenant directory '{}': {e:#}",
tenant_dir_path.display()
)
}
continue;
}
// Determine which tenants are to be attached
let tenant_generations =
init_load_generations(conf, &tenant_configs, &resources, &cancel).await?;
let tenant_ignore_mark_file = tenant_dir_path.join(IGNORED_TENANT_FILE_NAME);
if tenant_ignore_mark_file.exists() {
info!("Found an ignore mark file {tenant_ignore_mark_file:?}, skipping the tenant");
continue;
}
// Construct `Tenant` objects and start them running
for (tenant_id, location_conf) in tenant_configs {
let tenant_dir_path = conf.tenant_path(&tenant_id);
let tenant_id = match tenant_dir_path
.file_name()
.and_then(OsStr::to_str)
.unwrap_or_default()
.parse::<TenantId>()
{
Ok(id) => id,
Err(_) => {
warn!(
"Invalid tenant path (garbage in our repo directory?): {}",
tenant_dir_path.display()
);
continue;
}
};
let mut location_conf = match location_conf {
Ok(l) => l,
Err(e) => {
warn!(%tenant_id, "Marking tenant broken, failed to {e:#}");
let generation = if let Some(generations) = &tenant_generations {
// We have a generation map: treat it as the authority for whether
// this tenant is really attached.
if let Some(gen) = generations.get(&tenant_id) {
*gen
} else {
info!("Detaching tenant {tenant_id}, control plane omitted it in re-attach response");
if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await {
error!(
"Failed to remove detached tenant directory '{}': {:?}",
tenant_dir_path.display(),
e
);
}
continue;
}
} else {
// Legacy mode: no generation information, any tenant present
// on local disk may activate
info!(
"Starting tenant {} in legacy mode, no generation",
tenant_dir_path.display()
);
Generation::none()
};
tenants.insert(
tenant_id,
TenantSlot::Attached(Tenant::create_broken_tenant(
match schedule_local_tenant_processing(
conf,
tenant_id,
format!("{}", e),
)),
);
continue;
}
};
let generation = if let Some(generations) = &tenant_generations {
// We have a generation map: treat it as the authority for whether
// this tenant is really attached.
if let Some(gen) = generations.get(&tenant_id) {
*gen
} else {
match &location_conf.mode {
LocationMode::Secondary(_) => {
// We do not require the control plane's permission for secondary mode
// tenants, because they do no remote writes and hence require no
// generation number
info!(%tenant_id, "Loaded tenant in secondary mode");
tenants.insert(tenant_id, TenantSlot::Secondary);
}
LocationMode::Attached(_) => {
// TODO: augment re-attach API to enable the control plane to
// instruct us about secondary attachments. That way, instead of throwing
// away local state, we can gracefully fall back to secondary here, if the control
// plane tells us so.
// (https://github.com/neondatabase/neon/issues/5377)
info!(%tenant_id, "Detaching tenant, control plane omitted it in re-attach response");
if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await {
error!(%tenant_id,
"Failed to remove detached tenant directory '{tenant_dir_path}': {e:?}",
);
&tenant_dir_path,
generation,
resources.clone(),
Some(init_order.clone()),
&TENANTS,
&ctx,
) {
Ok(tenant) => {
tenants.insert(tenant.tenant_id(), tenant);
}
Err(e) => {
error!("Failed to collect tenant files from dir {tenants_dir:?} for entry {dir_entry:?}, reason: {e:#}");
}
}
};
continue;
}
} else {
// Legacy mode: no generation information, any tenant present
// on local disk may activate
info!(%tenant_id, "Starting tenant in legacy mode, no generation",);
Generation::none()
};
// Presence of a generation number implies attachment: attach the tenant
// if it wasn't already, and apply the generation number.
location_conf.attach_in_generation(generation);
Tenant::persist_tenant_config(conf, &tenant_id, &location_conf).await?;
match schedule_local_tenant_processing(
conf,
tenant_id,
&tenant_dir_path,
AttachedTenantConf::try_from(location_conf)?,
resources.clone(),
Some(init_order.clone()),
&TENANTS,
&ctx,
) {
Ok(tenant) => {
tenants.insert(tenant.tenant_id(), TenantSlot::Attached(tenant));
}
}
Err(e) => {
error!(%tenant_id, "Failed to start tenant: {e:#}");
// On error, print it, but continue with the other tenants. If we error out
// here, the pageserver startup fails altogether, causing outage for *all*
// tenants. That seems worse.
error!(
"Failed to list tenants dir entry in directory {tenants_dir:?}, reason: {e:?}"
);
}
}
}
@@ -409,8 +255,8 @@ pub async fn init_tenant_mgr(
pub(crate) fn schedule_local_tenant_processing(
conf: &'static PageServerConf,
tenant_id: TenantId,
tenant_path: &Utf8Path,
location_conf: AttachedTenantConf,
tenant_path: &Path,
generation: Generation,
resources: TenantSharedResources,
init_order: Option<InitializationOrder>,
tenants: &'static tokio::sync::RwLock<TenantsMap>,
@@ -439,33 +285,35 @@ pub(crate) fn schedule_local_tenant_processing(
let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() {
info!("tenant {tenant_id} has attaching mark file, resuming its attach operation");
if resources.remote_storage.is_none() {
warn!("tenant {tenant_id} has attaching mark file, but pageserver has no remote storage configured");
Tenant::create_broken_tenant(
if let Some(remote_storage) = resources.remote_storage {
match Tenant::spawn_attach(
conf,
tenant_id,
"attaching mark file present but no remote storage configured".to_string(),
)
} else {
match Tenant::spawn_attach(conf, tenant_id, resources, location_conf, tenants, ctx) {
generation,
resources.broker_client,
tenants,
remote_storage,
ctx,
) {
Ok(tenant) => tenant,
Err(e) => {
error!("Failed to spawn_attach tenant {tenant_id}, reason: {e:#}");
Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}"))
}
}
} else {
warn!("tenant {tenant_id} has attaching mark file, but pageserver has no remote storage configured");
Tenant::create_broken_tenant(
conf,
tenant_id,
"attaching mark file present but no remote storage configured".to_string(),
)
}
} else {
info!("tenant {tenant_id} is assumed to be loadable, starting load operation");
// Start loading the tenant into memory. It will initially be in Loading state.
Tenant::spawn_load(
conf,
tenant_id,
location_conf,
resources,
init_order,
tenants,
ctx,
conf, tenant_id, generation, resources, init_order, tenants, ctx,
)
};
Ok(tenant)
@@ -521,16 +369,7 @@ async fn shutdown_all_tenants0(tenants: &tokio::sync::RwLock<TenantsMap>) {
let res = {
let (_guard, shutdown_progress) = completion::channel();
match tenant {
TenantSlot::Attached(t) => {
t.shutdown(shutdown_progress, freeze_and_flush).await
}
TenantSlot::Secondary => {
// TODO: once secondary mode downloads are implemented,
// ensure they have all stopped before we reach this point.
Ok(())
}
}
tenant.shutdown(shutdown_progress, freeze_and_flush).await
};
if let Err(other_progress) = res {
@@ -599,23 +438,25 @@ pub async fn create_tenant(
tenant_conf: TenantConfOpt,
tenant_id: TenantId,
generation: Generation,
resources: TenantSharedResources,
broker_client: storage_broker::BrokerClientChannel,
remote_storage: Option<GenericRemoteStorage>,
ctx: &RequestContext,
) -> Result<Arc<Tenant>, TenantMapInsertError> {
tenant_map_insert(tenant_id, || async {
let location_conf = LocationConf::attached_single(tenant_conf, generation);
// We're holding the tenants lock in write mode while doing local IO.
// If this section ever becomes contentious, introduce a new `TenantState::Creating`
// and do the work in that state.
let tenant_directory = super::create_tenant_files(conf, &location_conf, &tenant_id, CreateTenantFilesMode::Create).await?;
let tenant_directory = super::create_tenant_files(conf, tenant_conf, &tenant_id, CreateTenantFilesMode::Create).await?;
// TODO: tenant directory remains on disk if we bail out from here on.
// See https://github.com/neondatabase/neon/issues/4233
let tenant_resources = TenantSharedResources {
broker_client,
remote_storage,
};
let created_tenant =
schedule_local_tenant_processing(conf, tenant_id, &tenant_directory,
AttachedTenantConf::try_from(location_conf)?, resources, None, &TENANTS, ctx)?;
generation, tenant_resources, None, &TENANTS, ctx)?;
// TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
// See https://github.com/neondatabase/neon/issues/4233
@@ -644,137 +485,20 @@ pub async fn set_new_tenant_config(
info!("configuring tenant {tenant_id}");
let tenant = get_tenant(tenant_id, true).await?;
// This is a legacy API that only operates on attached tenants: the preferred
// API to use is the location_config/ endpoint, which lets the caller provide
// the full LocationConf.
let location_conf = LocationConf::attached_single(new_tenant_conf, tenant.generation);
Tenant::persist_tenant_config(conf, &tenant_id, &location_conf)
let tenant_config_path = conf.tenant_config_path(&tenant_id);
Tenant::persist_tenant_config(&tenant_id, &tenant_config_path, new_tenant_conf)
.await
.map_err(SetNewTenantConfigError::Persist)?;
tenant.set_new_tenant_config(new_tenant_conf);
Ok(())
}
#[instrument(skip_all, fields(tenant_id, new_location_config))]
pub(crate) async fn upsert_location(
conf: &'static PageServerConf,
tenant_id: TenantId,
new_location_config: LocationConf,
broker_client: storage_broker::BrokerClientChannel,
remote_storage: Option<GenericRemoteStorage>,
deletion_queue_client: DeletionQueueClient,
ctx: &RequestContext,
) -> Result<(), anyhow::Error> {
info!("configuring tenant location {tenant_id} to state {new_location_config:?}");
let mut existing_tenant = match get_tenant(tenant_id, false).await {
Ok(t) => Some(t),
Err(GetTenantError::NotFound(_)) => None,
Err(e) => anyhow::bail!(e),
};
// If we need to shut down a Tenant, do that first
let shutdown_tenant = match (&new_location_config.mode, &existing_tenant) {
(LocationMode::Secondary(_), Some(t)) => Some(t),
(LocationMode::Attached(attach_conf), Some(t)) => {
if attach_conf.generation != t.generation {
Some(t)
} else {
None
}
}
_ => None,
};
// TODO: currently we risk concurrent operations interfering with the tenant
// while we await shutdown, but we also should not hold the TenantsMap lock
// across the whole operation. Before we start using this function in production,
// a follow-on change will revise how concurrency is handled in TenantsMap.
// (https://github.com/neondatabase/neon/issues/5378)
if let Some(tenant) = shutdown_tenant {
let (_guard, progress) = utils::completion::channel();
info!("Shutting down attached tenant");
match tenant.shutdown(progress, false).await {
Ok(()) => {}
Err(barrier) => {
info!("Shutdown already in progress, waiting for it to complete");
barrier.wait().await;
}
}
existing_tenant = None;
}
if let Some(tenant) = existing_tenant {
// Update the existing tenant
Tenant::persist_tenant_config(conf, &tenant_id, &new_location_config)
.await
.map_err(SetNewTenantConfigError::Persist)?;
tenant.set_new_location_config(AttachedTenantConf::try_from(new_location_config)?);
} else {
// Upsert a fresh TenantSlot into TenantsMap. Do it within the map write lock,
// and re-check that the state of anything we are replacing is as expected.
tenant_map_upsert_slot(tenant_id, |old_value| async move {
if let Some(TenantSlot::Attached(t)) = old_value {
if !matches!(t.current_state(), TenantState::Stopping { .. }) {
anyhow::bail!("Tenant state changed during location configuration update");
}
}
let new_slot = match &new_location_config.mode {
LocationMode::Secondary(_) => TenantSlot::Secondary,
LocationMode::Attached(_attach_config) => {
// Do a schedule_local_tenant_processing
// FIXME: should avoid doing this disk I/O inside the TenantsMap lock,
// we have the same problem in load_tenant/attach_tenant. Probably
// need a lock in TenantSlot to fix this.
Tenant::persist_tenant_config(conf, &tenant_id, &new_location_config)
.await
.map_err(SetNewTenantConfigError::Persist)?;
let tenant_path = conf.tenant_path(&tenant_id);
let resources = TenantSharedResources {
broker_client,
remote_storage,
deletion_queue_client,
};
let new_tenant = schedule_local_tenant_processing(
conf,
tenant_id,
&tenant_path,
AttachedTenantConf::try_from(new_location_config)?,
resources,
None,
&TENANTS,
ctx,
)
.with_context(|| {
format!("Failed to schedule tenant processing in path {tenant_path:?}")
})?;
TenantSlot::Attached(new_tenant)
}
};
Ok(new_slot)
})
.await?;
}
Ok(())
}
#[derive(Debug, thiserror::Error)]
pub enum GetTenantError {
#[error("Tenant {0} not found")]
NotFound(TenantId),
#[error("Tenant {0} is not active")]
NotActive(TenantId),
/// Broken is logically a subset of NotActive, but a distinct error is useful as
/// NotActive is usually a retryable state for API purposes, whereas Broken
/// is a stuck error state
#[error("Tenant is broken: {0}")]
Broken(String),
}
/// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query.
@@ -789,20 +513,10 @@ pub async fn get_tenant(
let tenant = m
.get(&tenant_id)
.ok_or(GetTenantError::NotFound(tenant_id))?;
match tenant.current_state() {
TenantState::Broken {
reason,
backtrace: _,
} if active_only => Err(GetTenantError::Broken(reason)),
TenantState::Active => Ok(Arc::clone(tenant)),
_ => {
if active_only {
Err(GetTenantError::NotActive(tenant_id))
} else {
Ok(Arc::clone(tenant))
}
}
if active_only && !tenant.is_active() {
Err(GetTenantError::NotActive(tenant_id))
} else {
Ok(Arc::clone(tenant))
}
}
@@ -875,7 +589,7 @@ async fn detach_tenant0(
tenants: &tokio::sync::RwLock<TenantsMap>,
tenant_id: TenantId,
detach_ignored: bool,
) -> Result<Utf8PathBuf, TenantStateError> {
) -> Result<PathBuf, TenantStateError> {
let tenant_dir_rename_operation = |tenant_id_to_clean| async move {
let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean);
safe_rename_tenant_dir(&local_tenant_directory)
@@ -908,7 +622,6 @@ pub async fn load_tenant(
generation: Generation,
broker_client: storage_broker::BrokerClientChannel,
remote_storage: Option<GenericRemoteStorage>,
deletion_queue_client: DeletionQueueClient,
ctx: &RequestContext,
) -> Result<(), TenantMapInsertError> {
tenant_map_insert(tenant_id, || async {
@@ -922,14 +635,8 @@ pub async fn load_tenant(
let resources = TenantSharedResources {
broker_client,
remote_storage,
deletion_queue_client
};
let mut location_conf = Tenant::load_tenant_config(conf, &tenant_id).map_err( TenantMapInsertError::Other)?;
location_conf.attach_in_generation(generation);
Tenant::persist_tenant_config(conf, &tenant_id, &location_conf).await?;
let new_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_path, AttachedTenantConf::try_from(location_conf)?, resources, None, &TENANTS, ctx)
let new_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_path, generation, resources, None, &TENANTS, ctx)
.with_context(|| {
format!("Failed to schedule tenant processing in path {tenant_path:?}")
})?;
@@ -982,10 +689,7 @@ pub async fn list_tenants() -> Result<Vec<(TenantId, TenantState)>, TenantMapLis
TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m,
};
Ok(m.iter()
.filter_map(|(id, tenant)| match tenant {
TenantSlot::Attached(tenant) => Some((*id, tenant.current_state())),
TenantSlot::Secondary => None,
})
.map(|(id, tenant)| (*id, tenant.current_state()))
.collect())
}
@@ -998,12 +702,12 @@ pub async fn attach_tenant(
tenant_id: TenantId,
generation: Generation,
tenant_conf: TenantConfOpt,
resources: TenantSharedResources,
broker_client: storage_broker::BrokerClientChannel,
remote_storage: GenericRemoteStorage,
ctx: &RequestContext,
) -> Result<(), TenantMapInsertError> {
tenant_map_insert(tenant_id, || async {
let location_conf = LocationConf::attached_single(tenant_conf, generation);
let tenant_dir = create_tenant_files(conf, &location_conf, &tenant_id, CreateTenantFilesMode::Attach).await?;
let tenant_dir = create_tenant_files(conf, tenant_conf, &tenant_id, CreateTenantFilesMode::Attach).await?;
// TODO: tenant directory remains on disk if we bail out from here on.
// See https://github.com/neondatabase/neon/issues/4233
@@ -1014,7 +718,11 @@ pub async fn attach_tenant(
.context("check for attach marker file existence")?;
anyhow::ensure!(marker_file_exists, "create_tenant_files should have created the attach marker file");
let attached_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_dir, AttachedTenantConf::try_from(location_conf)?, resources, None, &TENANTS, ctx)?;
let resources = TenantSharedResources {
broker_client,
remote_storage: Some(remote_storage),
};
let attached_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_dir, generation, resources, None, &TENANTS, ctx)?;
// TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
// See https://github.com/neondatabase/neon/issues/4233
@@ -1037,10 +745,8 @@ pub enum TenantMapInsertError {
ShuttingDown,
#[error("tenant {0} already exists, state: {1:?}")]
TenantAlreadyExists(TenantId, TenantState),
#[error("tenant {0} already exists in secondary state")]
TenantExistsSecondary(TenantId),
#[error(transparent)]
Other(#[from] anyhow::Error),
Closure(#[from] anyhow::Error),
}
/// Give the given closure access to the tenants map entry for the given `tenant_id`, iff that
@@ -1064,47 +770,20 @@ where
TenantsMap::Open(m) => m,
};
match m.entry(tenant_id) {
hash_map::Entry::Occupied(e) => match e.get() {
TenantSlot::Attached(t) => Err(TenantMapInsertError::TenantAlreadyExists(
tenant_id,
t.current_state(),
)),
TenantSlot::Secondary => Err(TenantMapInsertError::TenantExistsSecondary(tenant_id)),
},
hash_map::Entry::Occupied(e) => Err(TenantMapInsertError::TenantAlreadyExists(
tenant_id,
e.get().current_state(),
)),
hash_map::Entry::Vacant(v) => match insert_fn().await {
Ok(tenant) => {
v.insert(TenantSlot::Attached(tenant.clone()));
v.insert(tenant.clone());
Ok(tenant)
}
Err(e) => Err(TenantMapInsertError::Other(e)),
Err(e) => Err(TenantMapInsertError::Closure(e)),
},
}
}
async fn tenant_map_upsert_slot<'a, F, R>(
tenant_id: TenantId,
upsert_fn: F,
) -> Result<(), TenantMapInsertError>
where
F: FnOnce(Option<TenantSlot>) -> R,
R: std::future::Future<Output = anyhow::Result<TenantSlot>>,
{
let mut guard = TENANTS.write().await;
let m = match &mut *guard {
TenantsMap::Initializing => return Err(TenantMapInsertError::StillInitializing),
TenantsMap::ShuttingDown(_) => return Err(TenantMapInsertError::ShuttingDown),
TenantsMap::Open(m) => m,
};
match upsert_fn(m.remove(&tenant_id)).await {
Ok(upsert_val) => {
m.insert(tenant_id, upsert_val);
Ok(())
}
Err(e) => Err(TenantMapInsertError::Other(e)),
}
}
/// Stops and removes the tenant from memory, if it's not [`TenantState::Stopping`] already, bails otherwise.
/// Allows to remove other tenant resources manually, via `tenant_cleanup`.
/// If the cleanup fails, tenant will stay in memory in [`TenantState::Broken`] state, and another removal
@@ -1124,40 +803,28 @@ where
// tenant-wde cleanup operations may take some time (removing the entire tenant directory), we want to
// avoid holding the lock for the entire process.
let tenant = {
match tenants
tenants
.write()
.await
.get_slot(&tenant_id)
.get(&tenant_id)
.cloned()
.ok_or(TenantStateError::NotFound(tenant_id))?
{
TenantSlot::Attached(t) => Some(t.clone()),
TenantSlot::Secondary => None,
}
};
// allow pageserver shutdown to await for our completion
let (_guard, progress) = completion::channel();
// If the tenant was attached, shut it down gracefully. For secondary
// locations this part is not necessary
match tenant {
Some(attached_tenant) => {
// whenever we remove a tenant from memory, we don't want to flush and wait for upload
let freeze_and_flush = false;
// whenever we remove a tenant from memory, we don't want to flush and wait for upload
let freeze_and_flush = false;
// shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so
// that we can continue safely to cleanup.
match attached_tenant.shutdown(progress, freeze_and_flush).await {
Ok(()) => {}
Err(_other) => {
// if pageserver shutdown or other detach/ignore is already ongoing, we don't want to
// wait for it but return an error right away because these are distinct requests.
return Err(TenantStateError::IsStopping(tenant_id));
}
}
}
None => {
// Nothing to wait on when not attached, proceed.
// shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so
// that we can continue safely to cleanup.
match tenant.shutdown(progress, freeze_and_flush).await {
Ok(()) => {}
Err(_other) => {
// if pageserver shutdown or other detach/ignore is already ongoing, we don't want to
// wait for it but return an error right away because these are distinct requests.
return Err(TenantStateError::IsStopping(tenant_id));
}
}
@@ -1248,8 +915,6 @@ mod tests {
use std::sync::Arc;
use tracing::{info_span, Instrument};
use crate::tenant::mgr::TenantSlot;
use super::{super::harness::TenantHarness, TenantsMap};
#[tokio::test(start_paused = true)]
@@ -1271,7 +936,7 @@ mod tests {
// tenant harness configures the logging and we cannot escape it
let _e = info_span!("testing", tenant_id = %id).entered();
let tenants = HashMap::from([(id, TenantSlot::Attached(t.clone()))]);
let tenants = HashMap::from([(id, t.clone())]);
let tenants = Arc::new(tokio::sync::RwLock::new(TenantsMap::Open(tenants)));
let (until_cleanup_completed, can_complete_cleanup) = utils::completion::channel();

View File

@@ -1,17 +1,16 @@
use std::{
io,
path::{Path, PathBuf},
sync::atomic::{AtomicUsize, Ordering},
};
use camino::{Utf8Path, Utf8PathBuf};
fn fsync_path(path: &Utf8Path) -> io::Result<()> {
fn fsync_path(path: &Path) -> io::Result<()> {
// TODO use VirtualFile::fsync_all once we fully go async.
let file = std::fs::File::open(path)?;
file.sync_all()
}
fn parallel_worker(paths: &[Utf8PathBuf], next_path_idx: &AtomicUsize) -> io::Result<()> {
fn parallel_worker(paths: &[PathBuf], next_path_idx: &AtomicUsize) -> io::Result<()> {
while let Some(path) = paths.get(next_path_idx.fetch_add(1, Ordering::Relaxed)) {
fsync_path(path)?;
}
@@ -19,7 +18,7 @@ fn parallel_worker(paths: &[Utf8PathBuf], next_path_idx: &AtomicUsize) -> io::Re
Ok(())
}
fn fsync_in_thread_pool(paths: &[Utf8PathBuf]) -> io::Result<()> {
fn fsync_in_thread_pool(paths: &[PathBuf]) -> io::Result<()> {
// TODO: remove this function in favor of `par_fsync_async` once we asyncify everything.
/// Use at most this number of threads.
@@ -48,7 +47,7 @@ fn fsync_in_thread_pool(paths: &[Utf8PathBuf]) -> io::Result<()> {
}
/// Parallel fsync all files. Can be used in non-async context as it is using rayon thread pool.
pub fn par_fsync(paths: &[Utf8PathBuf]) -> io::Result<()> {
pub fn par_fsync(paths: &[PathBuf]) -> io::Result<()> {
if paths.len() == 1 {
fsync_path(&paths[0])?;
return Ok(());
@@ -59,7 +58,7 @@ pub fn par_fsync(paths: &[Utf8PathBuf]) -> io::Result<()> {
/// Parallel fsync asynchronously. If number of files are less than PARALLEL_PATH_THRESHOLD, fsync is done in the current
/// execution thread. Otherwise, we will spawn_blocking and run it in tokio.
pub async fn par_fsync_async(paths: &[Utf8PathBuf]) -> io::Result<()> {
pub async fn par_fsync_async(paths: &[PathBuf]) -> io::Result<()> {
const MAX_CONCURRENT_FSYNC: usize = 64;
let mut next = paths.iter().peekable();
let mut js = tokio::task::JoinSet::new();

View File

@@ -116,12 +116,8 @@
//! # Completion
//!
//! Once an operation has completed, we update
//! [`UploadQueueInitialized::projected_remote_consistent_lsn`] immediately,
//! and submit a request through the DeletionQueue to update
//! [`UploadQueueInitialized::visible_remote_consistent_lsn`] after it has
//! validated that our generation is not stale. It is this visible value
//! that is advertized to safekeepers as a signal that that they can
//! delete the WAL up to that LSN.
//! [`UploadQueueInitialized::last_uploaded_consistent_lsn`] which indicates
//! to safekeepers that they can delete the WAL up to that LSN.
//!
//! The [`RemoteTimelineClient::wait_completion`] method can be used to wait
//! for all pending operations to complete. It does not prevent more
@@ -204,12 +200,12 @@
//! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync
//! [`Timeline::load_layer_map`]: super::Timeline::load_layer_map
mod delete;
mod download;
pub mod index;
mod upload;
use anyhow::Context;
use camino::Utf8Path;
use chrono::{NaiveDateTime, Utc};
// re-export these
pub use download::{is_temp_download_file, list_remote_timelines};
@@ -220,6 +216,7 @@ use utils::backoff::{
};
use std::collections::{HashMap, VecDeque};
use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicU32, Ordering};
use std::sync::{Arc, Mutex};
@@ -229,7 +226,6 @@ use tracing::{debug, error, info, instrument, warn};
use tracing::{info_span, Instrument};
use utils::lsn::Lsn;
use crate::deletion_queue::DeletionQueueClient;
use crate::metrics::{
MeasureRemoteOp, RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics,
RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES,
@@ -328,8 +324,6 @@ pub struct RemoteTimelineClient {
metrics: Arc<RemoteTimelineClientMetrics>,
storage_impl: GenericRemoteStorage,
deletion_queue_client: DeletionQueueClient,
}
impl RemoteTimelineClient {
@@ -341,7 +335,6 @@ impl RemoteTimelineClient {
///
pub fn new(
remote_storage: GenericRemoteStorage,
deletion_queue_client: DeletionQueueClient,
conf: &'static PageServerConf,
tenant_id: TenantId,
timeline_id: TimelineId,
@@ -359,7 +352,6 @@ impl RemoteTimelineClient {
timeline_id,
generation,
storage_impl: remote_storage,
deletion_queue_client,
upload_queue: Mutex::new(UploadQueue::Uninitialized),
metrics: Arc::new(RemoteTimelineClientMetrics::new(&tenant_id, &timeline_id)),
}
@@ -421,24 +413,13 @@ impl RemoteTimelineClient {
Ok(())
}
pub fn remote_consistent_lsn_projected(&self) -> Option<Lsn> {
match &mut *self.upload_queue.lock().unwrap() {
pub fn last_uploaded_consistent_lsn(&self) -> Option<Lsn> {
match &*self.upload_queue.lock().unwrap() {
UploadQueue::Uninitialized => None,
UploadQueue::Initialized(q) => q.get_last_remote_consistent_lsn_projected(),
UploadQueue::Stopped(q) => q
.upload_queue_for_deletion
.get_last_remote_consistent_lsn_projected(),
}
}
pub fn remote_consistent_lsn_visible(&self) -> Option<Lsn> {
match &mut *self.upload_queue.lock().unwrap() {
UploadQueue::Uninitialized => None,
UploadQueue::Initialized(q) => Some(q.get_last_remote_consistent_lsn_visible()),
UploadQueue::Stopped(q) => Some(
q.upload_queue_for_deletion
.get_last_remote_consistent_lsn_visible(),
),
UploadQueue::Initialized(q) => Some(q.last_uploaded_consistent_lsn),
UploadQueue::Stopped(q) => {
Some(q.upload_queue_for_deletion.last_uploaded_consistent_lsn)
}
}
}
@@ -453,11 +434,11 @@ impl RemoteTimelineClient {
} else {
0
};
self.metrics.remote_physical_size_set(size);
self.metrics.remote_physical_size_gauge().set(size);
}
pub fn get_remote_physical_size(&self) -> u64 {
self.metrics.remote_physical_size_get()
self.metrics.remote_physical_size_gauge().get()
}
//
@@ -662,7 +643,7 @@ impl RemoteTimelineClient {
/// successfully.
pub fn schedule_layer_file_deletion(
self: &Arc<Self>,
names: Vec<LayerFileName>,
names: &[LayerFileName],
) -> anyhow::Result<()> {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
@@ -682,10 +663,10 @@ impl RemoteTimelineClient {
// Decorate our list of names with each name's generation, dropping
// makes that are unexpectedly missing from our metadata.
let with_generations: Vec<_> = names
.into_iter()
.iter()
.filter_map(|name| {
// Remove from latest_files, learning the file's remote generation in the process
let meta = upload_queue.latest_files.remove(&name);
let meta = upload_queue.latest_files.remove(name);
if let Some(meta) = meta {
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
@@ -707,16 +688,18 @@ impl RemoteTimelineClient {
self.schedule_index_upload(upload_queue, metadata);
}
for (name, gen) in &with_generations {
info!("scheduling deletion of layer {}{}", name, gen.get_suffix());
}
// schedule the actual deletions
let op = UploadOp::Delete(Delete {
layers: with_generations,
});
self.calls_unfinished_metric_begin(&op);
upload_queue.queued_operations.push_back(op);
for (name, generation) in with_generations {
let op = UploadOp::Delete(Delete {
file_kind: RemoteOpFileKind::Layer,
layer_file_name: name.clone(),
scheduled_from_timeline_delete: false,
generation,
});
self.calls_unfinished_metric_begin(&op);
upload_queue.queued_operations.push_back(op);
info!("scheduled layer file deletion {name}");
}
// Launch the tasks immediately, if possible
self.launch_queued_tasks(upload_queue);
@@ -850,7 +833,9 @@ impl RemoteTimelineClient {
pub(crate) async fn delete_all(self: &Arc<Self>) -> anyhow::Result<()> {
debug_assert_current_span_has_tenant_and_timeline_id();
let layers: Vec<RemotePath> = {
let (mut receiver, deletions_queued) = {
let mut deletions_queued = 0;
let mut locked = self.upload_queue.lock().unwrap();
let stopped = locked.stopped_mut()?;
@@ -862,30 +847,42 @@ impl RemoteTimelineClient {
stopped
.upload_queue_for_deletion
.latest_files
.drain()
.map(|(file_name, meta)| {
remote_layer_path(
&self.tenant_id,
&self.timeline_id,
&file_name,
meta.generation,
)
})
.collect()
.queued_operations
.reserve(stopped.upload_queue_for_deletion.latest_files.len());
// schedule the actual deletions
for (name, meta) in &stopped.upload_queue_for_deletion.latest_files {
let op = UploadOp::Delete(Delete {
file_kind: RemoteOpFileKind::Layer,
layer_file_name: name.clone(),
scheduled_from_timeline_delete: true,
generation: meta.generation,
});
self.calls_unfinished_metric_begin(&op);
stopped
.upload_queue_for_deletion
.queued_operations
.push_back(op);
info!("scheduled layer file deletion {name}");
deletions_queued += 1;
}
self.launch_queued_tasks(&mut stopped.upload_queue_for_deletion);
(
self.schedule_barrier(&mut stopped.upload_queue_for_deletion),
deletions_queued,
)
};
let layer_deletion_count = layers.len();
self.deletion_queue_client.push_immediate(layers).await?;
receiver.changed().await.context("upload queue shut down")?;
// Do not delete index part yet, it is needed for possible retry. If we remove it first
// and retry will arrive to different pageserver there wont be any traces of it on remote storage
let timeline_storage_path = remote_timeline_path(&self.tenant_id, &self.timeline_id);
// Execute all pending deletions, so that when we proceed to do a list_prefixes below, we aren't
// taking the burden of listing all the layers that we already know we should delete.
self.deletion_queue_client.flush_immediate().await?;
let remaining = backoff::retry(
|| async {
self.storage_impl
@@ -901,27 +898,9 @@ impl RemoteTimelineClient {
.await
.context("list prefixes")?;
// We will delete the current index_part object last, since it acts as a deletion
// marker via its deleted_at attribute
let latest_index = remaining
.iter()
.filter(|p| {
p.object_name()
.map(|n| n.starts_with(IndexPart::FILE_NAME))
.unwrap_or(false)
})
.filter_map(|path| parse_remote_index_path(path.clone()).map(|gen| (path, gen)))
.max_by_key(|i| i.1)
.map(|i| i.0.clone())
.unwrap_or(
// No generation-suffixed indices, assume we are dealing with
// a legacy index.
remote_index_path(&self.tenant_id, &self.timeline_id, Generation::none()),
);
let remaining_layers: Vec<RemotePath> = remaining
let remaining: Vec<RemotePath> = remaining
.into_iter()
.filter(|p| p!= &latest_index)
.filter(|p| p.object_name() != Some(IndexPart::FILE_NAME))
.inspect(|path| {
if let Some(name) = path.object_name() {
info!(%name, "deleting a file not referenced from index_part.json");
@@ -931,11 +910,17 @@ impl RemoteTimelineClient {
})
.collect();
let not_referenced_count = remaining_layers.len();
if !remaining_layers.is_empty() {
self.deletion_queue_client
.push_immediate(remaining_layers)
.await?;
if !remaining.is_empty() {
backoff::retry(
|| async { self.storage_impl.delete_objects(&remaining).await },
|_e| false,
FAILED_UPLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"delete_objects",
backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled!")),
)
.await
.context("delete_objects")?;
}
fail::fail_point!("timeline-delete-before-index-delete", |_| {
@@ -944,14 +929,20 @@ impl RemoteTimelineClient {
))?
});
debug!("enqueuing index part deletion");
self.deletion_queue_client
.push_immediate([latest_index].to_vec())
.await?;
let index_file_path = timeline_storage_path.join(Path::new(IndexPart::FILE_NAME));
// Timeline deletion is rare and we have probably emitted a reasonably number of objects: wait
// for a flush to a persistent deletion list so that we may be sure deletion will occur.
self.deletion_queue_client.flush_immediate().await?;
debug!("deleting index part");
backoff::retry(
|| async { self.storage_impl.delete(&index_file_path).await },
|_e| false,
FAILED_UPLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"delete_index",
backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled")),
)
.await
.context("delete_index")?;
fail::fail_point!("timeline-delete-after-index-delete", |_| {
Err(anyhow::anyhow!(
@@ -959,7 +950,7 @@ impl RemoteTimelineClient {
))?
});
info!(prefix=%timeline_storage_path, referenced=layer_deletion_count, not_referenced=%not_referenced_count, "done deleting in timeline prefix, including index_part.json");
info!(prefix=%timeline_storage_path, referenced=deletions_queued, not_referenced=%remaining.len(), "done deleting in timeline prefix, including index_part.json");
Ok(())
}
@@ -1149,16 +1140,21 @@ impl RemoteTimelineClient {
}
res
}
UploadOp::Delete(delete) => self
.deletion_queue_client
.push_layers(
self.tenant_id,
self.timeline_id,
self.generation,
delete.layers.clone(),
)
.await
.map_err(|e| anyhow::anyhow!(e)),
UploadOp::Delete(delete) => {
let path = &self
.conf
.timeline_path(&self.tenant_id, &self.timeline_id)
.join(delete.layer_file_name.file_name());
delete::delete_layer(self.conf, &self.storage_impl, path, delete.generation)
.measure_remote_op(
self.tenant_id,
self.timeline_id,
delete.file_kind,
RemoteOpKind::Delete,
Arc::clone(&self.metrics),
)
.await
}
UploadOp::Barrier(_) => {
// unreachable. Barrier operations are handled synchronously in
// launch_queued_tasks
@@ -1214,12 +1210,18 @@ impl RemoteTimelineClient {
}
// The task has completed successfully. Remove it from the in-progress list.
let lsn_update = {
{
let mut upload_queue_guard = self.upload_queue.lock().unwrap();
let upload_queue = match upload_queue_guard.deref_mut() {
UploadQueue::Uninitialized => panic!("callers are responsible for ensuring this is only called on an initialized queue"),
UploadQueue::Stopped(_stopped) => {
None
UploadQueue::Stopped(stopped) => {
// Special care is needed for deletions, if it was an earlier deletion (not scheduled from deletion)
// then stop() took care of it so we just return.
// For deletions that come from delete_all we still want to maintain metrics, launch following tasks, etc.
match &task.op {
UploadOp::Delete(delete) if delete.scheduled_from_timeline_delete => Some(&mut stopped.upload_queue_for_deletion),
_ => None
}
},
UploadQueue::Initialized(qi) => { Some(qi) }
};
@@ -1234,51 +1236,23 @@ impl RemoteTimelineClient {
upload_queue.inprogress_tasks.remove(&task.task_id);
let lsn_update = match task.op {
match task.op {
UploadOp::UploadLayer(_, _) => {
upload_queue.num_inprogress_layer_uploads -= 1;
None
}
UploadOp::UploadMetadata(_, lsn) => {
upload_queue.num_inprogress_metadata_uploads -= 1;
// XXX monotonicity check?
upload_queue.projected_remote_consistent_lsn = Some(lsn);
if self.generation.is_none() {
// Legacy mode: skip validating generation
upload_queue.visible_remote_consistent_lsn.store(lsn);
None
} else {
Some((lsn, upload_queue.visible_remote_consistent_lsn.clone()))
}
upload_queue.last_uploaded_consistent_lsn = lsn; // XXX monotonicity check?
}
UploadOp::Delete(_) => {
upload_queue.num_inprogress_deletions -= 1;
None
}
UploadOp::Barrier(_) => unreachable!(),
};
// Launch any queued tasks that were unblocked by this one.
self.launch_queued_tasks(upload_queue);
lsn_update
};
if let Some((lsn, slot)) = lsn_update {
// Updates to the remote_consistent_lsn we advertise to pageservers
// are all routed through the DeletionQueue, to enforce important
// data safety guarantees (see docs/rfcs/025-generation-numbers.md)
self.deletion_queue_client
.update_remote_consistent_lsn(
self.tenant_id,
self.timeline_id,
self.generation,
lsn,
slot,
)
.await;
}
self.calls_unfinished_metric_end(&task.op);
}
@@ -1304,8 +1278,8 @@ impl RemoteTimelineClient {
reason: "metadata uploads are tiny",
},
),
UploadOp::Delete(_delete) => (
RemoteOpFileKind::Layer,
UploadOp::Delete(delete) => (
delete.file_kind,
RemoteOpKind::Delete,
DontTrackSize {
reason: "should we track deletes? positive or negative sign?",
@@ -1367,10 +1341,7 @@ impl RemoteTimelineClient {
latest_files: initialized.latest_files.clone(),
latest_files_changes_since_metadata_upload_scheduled: 0,
latest_metadata: initialized.latest_metadata.clone(),
projected_remote_consistent_lsn: None,
visible_remote_consistent_lsn: initialized
.visible_remote_consistent_lsn
.clone(),
last_uploaded_consistent_lsn: initialized.last_uploaded_consistent_lsn,
num_inprogress_layer_uploads: 0,
num_inprogress_metadata_uploads: 0,
num_inprogress_deletions: 0,
@@ -1427,20 +1398,20 @@ pub fn remote_timelines_path(tenant_id: &TenantId) -> RemotePath {
}
pub fn remote_timeline_path(tenant_id: &TenantId, timeline_id: &TimelineId) -> RemotePath {
remote_timelines_path(tenant_id).join(Utf8Path::new(&timeline_id.to_string()))
remote_timelines_path(tenant_id).join(&PathBuf::from(timeline_id.to_string()))
}
pub fn remote_layer_path(
tenant_id: &TenantId,
timeline_id: &TimelineId,
layer_file_name: &LayerFileName,
generation: Generation,
layer_meta: &LayerFileMetadata,
) -> RemotePath {
// Generation-aware key format
let path = format!(
"tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{0}{1}",
layer_file_name.file_name(),
generation.get_suffix()
layer_meta.generation.get_suffix()
);
RemotePath::from_string(&path).expect("Failed to construct path")
@@ -1470,7 +1441,14 @@ pub(crate) fn parse_remote_index_path(path: RemotePath) -> Option<Generation> {
}
};
match file_name.split_once('-') {
let file_name_str = match file_name.to_str() {
Some(s) => s,
None => {
tracing::warn!("Malformed index key {:?}", path);
return None;
}
};
match file_name_str.split_once('-') {
Some((_, gen_suffix)) => Generation::parse_suffix(gen_suffix),
None => None,
}
@@ -1482,16 +1460,20 @@ pub(crate) fn parse_remote_index_path(path: RemotePath) -> Option<Generation> {
/// Errors if the path provided does not start from pageserver's workdir.
pub fn remote_path(
conf: &PageServerConf,
local_path: &Utf8Path,
local_path: &Path,
generation: Generation,
) -> anyhow::Result<RemotePath> {
let stripped = local_path
.strip_prefix(&conf.workdir)
.context("Failed to strip workdir prefix")?;
let suffixed = format!("{0}{1}", stripped, generation.get_suffix());
let suffixed = format!(
"{0}{1}",
stripped.to_string_lossy(),
generation.get_suffix()
);
RemotePath::new(Utf8Path::new(&suffixed)).with_context(|| {
RemotePath::new(&PathBuf::from(suffixed)).with_context(|| {
format!(
"to resolve remote part of path {:?} for base {:?}",
local_path, conf.workdir
@@ -1511,7 +1493,7 @@ mod tests {
DEFAULT_PG_VERSION,
};
use std::collections::HashSet;
use std::{collections::HashSet, path::Path};
use utils::lsn::Lsn;
pub(super) fn dummy_contents(name: &str) -> Vec<u8> {
@@ -1545,7 +1527,7 @@ mod tests {
assert_eq!(avec, bvec);
}
fn assert_remote_files(expected: &[&str], remote_path: &Utf8Path, generation: Generation) {
fn assert_remote_files(expected: &[&str], remote_path: &Path, generation: Generation) {
let mut expected: Vec<String> = expected
.iter()
.map(|x| format!("{}{}", x, generation.get_suffix()))
@@ -1572,6 +1554,7 @@ mod tests {
impl TestSetup {
async fn new(test_name: &str) -> anyhow::Result<Self> {
// Use a current-thread runtime in the test
let test_name = Box::leak(Box::new(format!("remote_timeline_client__{test_name}")));
let harness = TenantHarness::create(test_name)?;
let (tenant, ctx) = harness.load().await;
@@ -1597,7 +1580,6 @@ mod tests {
timeline_id: TIMELINE_ID,
generation,
storage_impl: self.harness.remote_storage.clone(),
deletion_queue_client: self.harness.deletion_queue.new_client(),
upload_queue: Mutex::new(UploadQueue::Uninitialized),
metrics: Arc::new(RemoteTimelineClientMetrics::new(
&self.harness.tenant_id,
@@ -1664,12 +1646,12 @@ mod tests {
let timeline_path = harness.timeline_path(&TIMELINE_ID);
println!("workdir: {}", harness.conf.workdir);
println!("workdir: {}", harness.conf.workdir.display());
let remote_timeline_dir = harness
.remote_fs_dir
.join(timeline_path.strip_prefix(&harness.conf.workdir).unwrap());
println!("remote_timeline_dir: {remote_timeline_dir}");
println!("remote_timeline_dir: {}", remote_timeline_dir.display());
let generation = harness.generation;
@@ -1767,7 +1749,7 @@ mod tests {
)
.unwrap();
client
.schedule_layer_file_deletion([layer_file_name_1.clone()].to_vec())
.schedule_layer_file_deletion(&[layer_file_name_1.clone()])
.unwrap();
{
let mut guard = client.upload_queue.lock().unwrap();
@@ -1793,7 +1775,6 @@ mod tests {
// Finish them
client.wait_completion().await.unwrap();
harness.deletion_queue.pump().await;
assert_remote_files(
&[
@@ -1916,7 +1897,7 @@ mod tests {
let index_path = test_state.harness.remote_fs_dir.join(
remote_index_path(&test_state.harness.tenant_id, &TIMELINE_ID, generation).get_path(),
);
eprintln!("Writing {index_path}");
eprintln!("Writing {}", index_path.display());
std::fs::write(&index_path, index_part_bytes).unwrap();
example_index_part
}

View File

@@ -0,0 +1,34 @@
//! Helper functions to delete files from remote storage with a RemoteStorage
use anyhow::Context;
use std::path::Path;
use tracing::debug;
use remote_storage::GenericRemoteStorage;
use crate::{
config::PageServerConf,
tenant::{remote_timeline_client::remote_path, Generation},
};
pub(super) async fn delete_layer<'a>(
conf: &'static PageServerConf,
storage: &'a GenericRemoteStorage,
local_layer_path: &'a Path,
generation: Generation,
) -> anyhow::Result<()> {
fail::fail_point!("before-delete-layer", |_| {
anyhow::bail!("failpoint before-delete-layer")
});
debug!("Deleting layer from remote storage: {local_layer_path:?}",);
let path_to_delete = remote_path(conf, local_layer_path, generation)?;
// We don't want to print an error if the delete failed if the file has
// already been deleted. Thankfully, in this situation S3 already
// does not yield an error. While OS-provided local file system APIs do yield
// errors, we avoid them in the `LocalFs` wrapper.
storage
.delete(&path_to_delete)
.await
.with_context(|| format!("delete remote layer from storage at {path_to_delete:?}"))
}

View File

@@ -5,10 +5,10 @@
use std::collections::HashSet;
use std::future::Future;
use std::path::Path;
use std::time::Duration;
use anyhow::{anyhow, Context};
use camino::Utf8Path;
use tokio::fs;
use tokio::io::AsyncWriteExt;
use tokio_util::sync::CancellationToken;
@@ -50,12 +50,7 @@ pub async fn download_layer_file<'a>(
.timeline_path(&tenant_id, &timeline_id)
.join(layer_file_name.file_name());
let remote_path = remote_layer_path(
&tenant_id,
&timeline_id,
layer_file_name,
layer_metadata.generation,
);
let remote_path = remote_layer_path(&tenant_id, &timeline_id, layer_file_name, layer_metadata);
// Perform a rename inspired by durable_rename from file_utils.c.
// The sequence:
@@ -74,7 +69,12 @@ pub async fn download_layer_file<'a>(
// TODO: this doesn't use the cached fd for some reason?
let mut destination_file = fs::File::create(&temp_file_path)
.await
.with_context(|| format!("create a destination file for layer '{temp_file_path}'"))
.with_context(|| {
format!(
"create a destination file for layer '{}'",
temp_file_path.display()
)
})
.map_err(DownloadError::Other)?;
let mut download = storage
.download(&remote_path)
@@ -116,7 +116,7 @@ pub async fn download_layer_file<'a>(
destination_file
.flush()
.await
.with_context(|| format!("flush source file at {temp_file_path}"))
.with_context(|| format!("flush source file at {}", temp_file_path.display()))
.map_err(DownloadError::Other)?;
let expected = layer_metadata.file_size();
@@ -130,7 +130,12 @@ pub async fn download_layer_file<'a>(
destination_file
.sync_all()
.await
.with_context(|| format!("failed to fsync source file at {temp_file_path}"))
.with_context(|| {
format!(
"failed to fsync source file at {}",
temp_file_path.display()
)
})
.map_err(DownloadError::Other)?;
drop(destination_file);
@@ -142,23 +147,27 @@ pub async fn download_layer_file<'a>(
fs::rename(&temp_file_path, &local_path)
.await
.with_context(|| format!("rename download layer file to {local_path}"))
.with_context(|| format!("rename download layer file to {}", local_path.display(),))
.map_err(DownloadError::Other)?;
crashsafe::fsync_async(&local_path)
.await
.with_context(|| format!("fsync layer file {local_path}"))
.with_context(|| format!("fsync layer file {}", local_path.display(),))
.map_err(DownloadError::Other)?;
tracing::debug!("download complete: {local_path}");
tracing::debug!("download complete: {}", local_path.display());
Ok(bytes_amount)
}
const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download";
pub fn is_temp_download_file(path: &Utf8Path) -> bool {
let extension = path.extension();
pub fn is_temp_download_file(path: &Path) -> bool {
let extension = path.extension().map(|pname| {
pname
.to_str()
.expect("paths passed to this function must be valid Rust strings")
});
match extension {
Some(TEMP_DOWNLOAD_EXTENSION) => true,
Some(_) => false,

View File

@@ -1,9 +1,8 @@
//! Helper functions to upload files to remote storage with a RemoteStorage
use anyhow::{bail, Context};
use camino::Utf8Path;
use fail::fail_point;
use std::io::ErrorKind;
use std::{io::ErrorKind, path::Path};
use tokio::fs;
use super::Generation;
@@ -31,7 +30,6 @@ pub(super) async fn upload_index_part<'a>(
fail_point!("before-upload-index", |_| {
bail!("failpoint before-upload-index")
});
pausable_failpoint!("before-upload-index-pausable");
let index_part_bytes =
serde_json::to_vec(&index_part).context("serialize index part file into bytes")?;
@@ -52,7 +50,7 @@ pub(super) async fn upload_index_part<'a>(
pub(super) async fn upload_timeline_layer<'a>(
conf: &'static PageServerConf,
storage: &'a GenericRemoteStorage,
source_path: &'a Utf8Path,
source_path: &'a Path,
known_metadata: &'a LayerFileMetadata,
generation: Generation,
) -> anyhow::Result<()> {
@@ -70,7 +68,7 @@ pub(super) async fn upload_timeline_layer<'a>(
// upload. However, a nonexistent file can also be indicative of
// something worse, like when a file is scheduled for upload before
// it has been written to disk yet.
info!(path = %source_path, "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more.");
info!(path = %source_path.display(), "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more.");
return Ok(());
}
Err(e) => {
@@ -95,7 +93,7 @@ pub(super) async fn upload_timeline_layer<'a>(
storage
.upload(source_file, fs_size, &storage_path, None)
.await
.with_context(|| format!("upload layer from local path '{source_path}'"))?;
.with_context(|| format!("upload layer from local path '{}'", source_path.display()))?;
Ok(())
}

View File

@@ -14,7 +14,6 @@ use crate::task_mgr::TaskKind;
use crate::walrecord::NeonWalRecord;
use anyhow::Result;
use bytes::Bytes;
use camino::Utf8PathBuf;
use enum_map::EnumMap;
use enumset::EnumSet;
use once_cell::sync::Lazy;
@@ -23,6 +22,7 @@ use pageserver_api::models::{
HistoricLayerInfo, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus,
};
use std::ops::Range;
use std::path::PathBuf;
use std::sync::{Arc, Mutex};
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use tracing::warn;
@@ -378,7 +378,7 @@ pub trait PersistentLayer: Layer + AsLayerDesc {
// Path to the layer file in the local filesystem.
// `None` for `RemoteLayer`.
fn local_path(&self) -> Option<Utf8PathBuf>;
fn local_path(&self) -> Option<PathBuf>;
/// Permanently remove this layer from disk.
fn delete_resident_layer_file(&self) -> Result<()>;
@@ -456,7 +456,7 @@ pub mod tests {
/// config. In that case, we use the Path variant to hold the full path to the file on
/// disk.
enum PathOrConf {
Path(Utf8PathBuf),
Path(PathBuf),
Conf(&'static PageServerConf),
}

View File

@@ -41,7 +41,6 @@ use crate::virtual_file::VirtualFile;
use crate::{walrecord, TEMP_FILE_SUFFIX};
use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
use anyhow::{bail, ensure, Context, Result};
use camino::{Utf8Path, Utf8PathBuf};
use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
use rand::{distributions::Alphanumeric, Rng};
use serde::{Deserialize, Serialize};
@@ -49,6 +48,7 @@ use std::fs::{self, File};
use std::io::SeekFrom;
use std::ops::Range;
use std::os::unix::fs::FileExt;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use tokio::sync::OnceCell;
use tracing::*;
@@ -267,7 +267,7 @@ impl PersistentLayer for DeltaLayer {
Some(self)
}
fn local_path(&self) -> Option<Utf8PathBuf> {
fn local_path(&self) -> Option<PathBuf> {
self.local_path()
}
@@ -374,7 +374,7 @@ impl DeltaLayer {
.await
}
pub(crate) fn local_path(&self) -> Option<Utf8PathBuf> {
pub(crate) fn local_path(&self) -> Option<PathBuf> {
Some(self.path())
}
@@ -409,7 +409,7 @@ impl DeltaLayer {
tenant_id: &TenantId,
timeline_id: &TimelineId,
fname: &DeltaFileName,
) -> Utf8PathBuf {
) -> PathBuf {
match path_or_conf {
PathOrConf::Path(path) => path.clone(),
PathOrConf::Conf(conf) => conf
@@ -424,7 +424,7 @@ impl DeltaLayer {
timeline_id: &TimelineId,
key_start: Key,
lsn_range: &Range<Lsn>,
) -> Utf8PathBuf {
) -> PathBuf {
let rand_string: String = rand::thread_rng()
.sample_iter(&Alphanumeric)
.take(8)
@@ -455,7 +455,7 @@ impl DeltaLayer {
self.inner
.get_or_try_init(|| self.load_inner(ctx))
.await
.with_context(|| format!("Failed to load delta layer {}", self.path()))
.with_context(|| format!("Failed to load delta layer {}", self.path().display()))
}
async fn load_inner(&self, ctx: &RequestContext) -> Result<Arc<DeltaLayerInner>> {
@@ -471,7 +471,7 @@ impl DeltaLayer {
if let PathOrConf::Path(ref path) = self.path_or_conf {
// not production code
let actual_filename = path.file_name().unwrap().to_owned();
let actual_filename = path.file_name().unwrap().to_str().unwrap().to_owned();
let expected_filename = self.filename().file_name();
if actual_filename != expected_filename {
@@ -510,8 +510,9 @@ impl DeltaLayer {
/// Create a DeltaLayer struct representing an existing file on disk.
///
/// This variant is only used for debugging purposes, by the 'pagectl' binary.
pub fn new_for_path(path: &Utf8Path, file: File) -> Result<Self> {
let mut summary_buf = vec![0; PAGE_SZ];
pub fn new_for_path(path: &Path, file: File) -> Result<Self> {
let mut summary_buf = Vec::new();
summary_buf.resize(PAGE_SZ, 0);
file.read_exact_at(&mut summary_buf, 0)?;
let summary = Summary::des_prefix(&summary_buf)?;
@@ -537,7 +538,7 @@ impl DeltaLayer {
self.desc.delta_file_name()
}
/// Path to the layer file in pageserver workdir.
pub fn path(&self) -> Utf8PathBuf {
pub fn path(&self) -> PathBuf {
Self::path_for(
&self.path_or_conf,
&self.desc.tenant_id,
@@ -572,7 +573,7 @@ impl DeltaLayer {
///
struct DeltaLayerWriterInner {
conf: &'static PageServerConf,
pub path: Utf8PathBuf,
pub path: PathBuf,
timeline_id: TimelineId,
tenant_id: TenantId,
@@ -710,7 +711,7 @@ impl DeltaLayerWriterInner {
ensure!(
metadata.len() <= S3_UPLOAD_LIMIT,
"Created delta layer file at {} of size {} above limit {S3_UPLOAD_LIMIT}!",
file.path,
file.path.display(),
metadata.len()
);
@@ -747,7 +748,7 @@ impl DeltaLayerWriterInner {
);
std::fs::rename(self.path, &final_path)?;
trace!("created delta layer {final_path}");
trace!("created delta layer {}", final_path.display());
Ok(layer)
}
@@ -846,13 +847,13 @@ impl Drop for DeltaLayerWriter {
impl DeltaLayerInner {
pub(super) async fn load(
path: &Utf8Path,
path: &std::path::Path,
summary: Option<Summary>,
ctx: &RequestContext,
) -> anyhow::Result<Self> {
let file = VirtualFile::open(path)
.await
.with_context(|| format!("Failed to open file '{path}'"))?;
.with_context(|| format!("Failed to open file '{}'", path.display()))?;
let file = FileBlockReader::new(file);
let summary_blk = file.read_blk(0, ctx).await?;
@@ -932,12 +933,15 @@ impl DeltaLayerInner {
.read_blob_into_buf(pos, &mut buf, ctx)
.await
.with_context(|| {
format!("Failed to read blob from virtual file {}", file.file.path)
format!(
"Failed to read blob from virtual file {}",
file.file.path.display()
)
})?;
let val = Value::des(&buf).with_context(|| {
format!(
"Failed to deserialize file blob from virtual file {}",
file.file.path
file.file.path.display()
)
})?;
match val {

View File

@@ -37,7 +37,6 @@ use crate::virtual_file::VirtualFile;
use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
use anyhow::{bail, ensure, Context, Result};
use bytes::Bytes;
use camino::{Utf8Path, Utf8PathBuf};
use hex;
use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
use rand::{distributions::Alphanumeric, Rng};
@@ -46,6 +45,7 @@ use std::fs::{self, File};
use std::io::SeekFrom;
use std::ops::Range;
use std::os::unix::prelude::FileExt;
use std::path::{Path, PathBuf};
use tokio::sync::OnceCell;
use tracing::*;
@@ -195,7 +195,7 @@ impl AsLayerDesc for ImageLayer {
}
impl PersistentLayer for ImageLayer {
fn local_path(&self) -> Option<Utf8PathBuf> {
fn local_path(&self) -> Option<PathBuf> {
self.local_path()
}
@@ -269,10 +269,10 @@ impl ImageLayer {
.get_value_reconstruct_data(key, reconstruct_state, ctx)
.await
// FIXME: makes no sense to dump paths
.with_context(|| format!("read {}", self.path()))
.with_context(|| format!("read {}", self.path().display()))
}
pub(crate) fn local_path(&self) -> Option<Utf8PathBuf> {
pub(crate) fn local_path(&self) -> Option<PathBuf> {
Some(self.path())
}
@@ -304,7 +304,7 @@ impl ImageLayer {
timeline_id: TimelineId,
tenant_id: TenantId,
fname: &ImageFileName,
) -> Utf8PathBuf {
) -> PathBuf {
match path_or_conf {
PathOrConf::Path(path) => path.to_path_buf(),
PathOrConf::Conf(conf) => conf
@@ -318,7 +318,7 @@ impl ImageLayer {
timeline_id: TimelineId,
tenant_id: TenantId,
fname: &ImageFileName,
) -> Utf8PathBuf {
) -> PathBuf {
let rand_string: String = rand::thread_rng()
.sample_iter(&Alphanumeric)
.take(8)
@@ -342,7 +342,7 @@ impl ImageLayer {
self.inner
.get_or_try_init(|| self.load_inner(ctx))
.await
.with_context(|| format!("Failed to load image layer {}", self.path()))
.with_context(|| format!("Failed to load image layer {}", self.path().display()))
}
async fn load_inner(&self, ctx: &RequestContext) -> Result<ImageLayerInner> {
@@ -359,7 +359,7 @@ impl ImageLayer {
if let PathOrConf::Path(ref path) = self.path_or_conf {
// not production code
let actual_filename = path.file_name().unwrap().to_owned();
let actual_filename = path.file_name().unwrap().to_str().unwrap().to_owned();
let expected_filename = self.filename().file_name();
if actual_filename != expected_filename {
@@ -399,8 +399,9 @@ impl ImageLayer {
/// Create an ImageLayer struct representing an existing file on disk.
///
/// This variant is only used for debugging purposes, by the 'pagectl' binary.
pub fn new_for_path(path: &Utf8Path, file: File) -> Result<ImageLayer> {
let mut summary_buf = vec![0; PAGE_SZ];
pub fn new_for_path(path: &Path, file: File) -> Result<ImageLayer> {
let mut summary_buf = Vec::new();
summary_buf.resize(PAGE_SZ, 0);
file.read_exact_at(&mut summary_buf, 0)?;
let summary = Summary::des_prefix(&summary_buf)?;
let metadata = file
@@ -426,7 +427,7 @@ impl ImageLayer {
}
/// Path to the layer file in pageserver workdir.
pub fn path(&self) -> Utf8PathBuf {
pub fn path(&self) -> PathBuf {
Self::path_for(
&self.path_or_conf,
self.desc.timeline_id,
@@ -438,14 +439,14 @@ impl ImageLayer {
impl ImageLayerInner {
pub(super) async fn load(
path: &Utf8Path,
path: &std::path::Path,
lsn: Lsn,
summary: Option<Summary>,
ctx: &RequestContext,
) -> anyhow::Result<Self> {
let file = VirtualFile::open(path)
.await
.with_context(|| format!("Failed to open file '{}'", path))?;
.with_context(|| format!("Failed to open file '{}'", path.display()))?;
let file = FileBlockReader::new(file);
let summary_blk = file.read_blk(0, ctx).await?;
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
@@ -525,7 +526,7 @@ impl ImageLayerInner {
///
struct ImageLayerWriterInner {
conf: &'static PageServerConf,
path: Utf8PathBuf,
path: PathBuf,
timeline_id: TimelineId,
tenant_id: TenantId,
key_range: Range<Key>,
@@ -557,7 +558,7 @@ impl ImageLayerWriterInner {
lsn,
},
);
info!("new image layer {path}");
info!("new image layer {}", path.display());
let mut file = VirtualFile::open_with_options(
&path,
std::fs::OpenOptions::new().write(true).create_new(true),
@@ -684,7 +685,7 @@ impl ImageLayerWriterInner {
);
std::fs::rename(self.path, final_path)?;
trace!("created image layer {}", layer.path());
trace!("created image layer {}", layer.path().display());
Ok(layer)
}

View File

@@ -8,9 +8,9 @@ use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
use crate::tenant::timeline::layer_manager::LayerManager;
use anyhow::{bail, Result};
use camino::Utf8PathBuf;
use pageserver_api::models::HistoricLayerInfo;
use std::ops::Range;
use std::path::PathBuf;
use std::sync::Arc;
use utils::{
@@ -92,7 +92,7 @@ impl AsLayerDesc for RemoteLayer {
}
impl PersistentLayer for RemoteLayer {
fn local_path(&self) -> Option<Utf8PathBuf> {
fn local_path(&self) -> Option<PathBuf> {
None
}

View File

@@ -9,7 +9,6 @@ mod walreceiver;
use anyhow::{anyhow, bail, ensure, Context, Result};
use bytes::Bytes;
use camino::{Utf8Path, Utf8PathBuf};
use fail::fail_point;
use futures::StreamExt;
use itertools::Itertools;
@@ -30,6 +29,7 @@ use utils::id::TenantTimelineId;
use std::cmp::{max, min, Ordering};
use std::collections::{BinaryHeap, HashMap, HashSet};
use std::ops::{Deref, Range};
use std::path::{Path, PathBuf};
use std::pin::pin;
use std::sync::atomic::Ordering as AtomicOrdering;
use std::sync::{Arc, Mutex, RwLock, Weak};
@@ -38,7 +38,6 @@ use std::time::{Duration, Instant, SystemTime};
use crate::context::{
AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder,
};
use crate::deletion_queue::DeletionQueueClient;
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
use crate::tenant::storage_layer::delta_layer::DeltaEntry;
use crate::tenant::storage_layer::{
@@ -56,7 +55,7 @@ use crate::config::PageServerConf;
use crate::keyspace::{KeyPartitioning, KeySpace, KeySpaceRandomAccum};
use crate::metrics::{
TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
UNEXPECTED_ONDEMAND_DOWNLOADS,
RECONSTRUCT_TIME, UNEXPECTED_ONDEMAND_DOWNLOADS,
};
use crate::pgdatadir_mapping::LsnForTimestamp;
use crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key};
@@ -91,12 +90,12 @@ use self::logical_size::LogicalSize;
use self::walreceiver::{WalReceiver, WalReceiverConf};
use super::config::TenantConf;
use super::debug_assert_current_span_has_tenant_and_timeline_id;
use super::remote_timeline_client::index::IndexPart;
use super::remote_timeline_client::RemoteTimelineClient;
use super::storage_layer::{
AsLayerDesc, DeltaLayer, ImageLayer, LayerAccessStatsReset, PersistentLayerDesc,
};
use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub(super) enum FlushLoopState {
@@ -144,12 +143,11 @@ fn drop_wlock<T>(rlock: tokio::sync::RwLockWriteGuard<'_, T>) {
/// The outward-facing resources required to build a Timeline
pub struct TimelineResources {
pub remote_client: Option<RemoteTimelineClient>,
pub deletion_queue_client: DeletionQueueClient,
}
pub struct Timeline {
conf: &'static PageServerConf,
tenant_conf: Arc<RwLock<AttachedTenantConf>>,
tenant_conf: Arc<RwLock<TenantConfOpt>>,
myself: Weak<Self>,
@@ -158,9 +156,6 @@ pub struct Timeline {
/// The generation of the tenant that instantiated us: this is used for safety when writing remote objects.
/// Never changes for the lifetime of this [`Timeline`] object.
///
/// This duplicates the generation stored in LocationConf, but that structure is mutable:
/// this copy enforces the invariant that generatio doesn't change during a Tenant's lifetime.
generation: Generation,
pub pg_version: u32,
@@ -499,39 +494,13 @@ impl Timeline {
};
let timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME.start_timer();
let path = self
.get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx)
self.get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx)
.await?;
timer.stop_and_record();
let start = Instant::now();
let res = self.reconstruct_value(key, lsn, reconstruct_state).await;
let elapsed = start.elapsed();
crate::metrics::RECONSTRUCT_TIME
.for_result(&res)
.observe(elapsed.as_secs_f64());
if cfg!(feature = "testing") && res.is_err() {
// it can only be walredo issue
use std::fmt::Write;
let mut msg = String::new();
path.into_iter().for_each(|(res, cont_lsn, layer)| {
writeln!(
msg,
"- layer traversal: result {res:?}, cont_lsn {cont_lsn}, layer: {}",
layer(),
)
.expect("string grows")
});
// this is to rule out or provide evidence that we could in some cases read a duplicate
// walrecord
tracing::info!("walredo failed, path:\n{msg}");
}
res
RECONSTRUCT_TIME
.observe_closure_duration(|| self.reconstruct_value(key, lsn, reconstruct_state))
.await
}
/// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
@@ -552,23 +521,9 @@ impl Timeline {
self.disk_consistent_lsn.load()
}
/// remote_consistent_lsn from the perspective of the tenant's current generation,
/// not validated with control plane yet.
/// See [`Self::get_remote_consistent_lsn_visible`].
pub fn get_remote_consistent_lsn_projected(&self) -> Option<Lsn> {
pub fn get_remote_consistent_lsn(&self) -> Option<Lsn> {
if let Some(remote_client) = &self.remote_client {
remote_client.remote_consistent_lsn_projected()
} else {
None
}
}
/// remote_consistent_lsn which the tenant is guaranteed not to go backward from,
/// i.e. a value of remote_consistent_lsn_projected which has undergone
/// generation validation in the deletion queue.
pub fn get_remote_consistent_lsn_visible(&self) -> Option<Lsn> {
if let Some(remote_client) = &self.remote_client {
remote_client.remote_consistent_lsn_visible()
remote_client.last_uploaded_consistent_lsn()
} else {
None
}
@@ -588,7 +543,7 @@ impl Timeline {
}
pub fn resident_physical_size(&self) -> u64 {
self.metrics.resident_physical_size_get()
self.metrics.resident_physical_size_gauge.get()
}
///
@@ -1338,7 +1293,10 @@ impl Timeline {
// will treat the file as a local layer again, count it towards resident size,
// and it'll be like the layer removal never happened.
// The bump in resident size is perhaps unexpected but overall a robust behavior.
self.metrics.resident_physical_size_sub(layer_file_size);
self.metrics
.resident_physical_size_gauge
.sub(layer_file_size);
self.metrics.evictions.inc();
if let Some(delta) = local_layer_residence_duration {
@@ -1381,42 +1339,42 @@ const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10;
// Private functions
impl Timeline {
fn get_checkpoint_distance(&self) -> u64 {
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
let tenant_conf = self.tenant_conf.read().unwrap();
tenant_conf
.checkpoint_distance
.unwrap_or(self.conf.default_tenant_conf.checkpoint_distance)
}
fn get_checkpoint_timeout(&self) -> Duration {
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
let tenant_conf = self.tenant_conf.read().unwrap();
tenant_conf
.checkpoint_timeout
.unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout)
}
fn get_compaction_target_size(&self) -> u64 {
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
let tenant_conf = self.tenant_conf.read().unwrap();
tenant_conf
.compaction_target_size
.unwrap_or(self.conf.default_tenant_conf.compaction_target_size)
}
fn get_compaction_threshold(&self) -> usize {
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
let tenant_conf = self.tenant_conf.read().unwrap();
tenant_conf
.compaction_threshold
.unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
}
fn get_image_creation_threshold(&self) -> usize {
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
let tenant_conf = self.tenant_conf.read().unwrap();
tenant_conf
.image_creation_threshold
.unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
}
fn get_eviction_policy(&self) -> EvictionPolicy {
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
let tenant_conf = self.tenant_conf.read().unwrap();
tenant_conf
.eviction_policy
.unwrap_or(self.conf.default_tenant_conf.eviction_policy)
@@ -1432,7 +1390,7 @@ impl Timeline {
}
fn get_gc_feedback(&self) -> bool {
let tenant_conf = &self.tenant_conf.read().unwrap().tenant_conf;
let tenant_conf = self.tenant_conf.read().unwrap();
tenant_conf
.gc_feedback
.unwrap_or(self.conf.default_tenant_conf.gc_feedback)
@@ -1445,7 +1403,7 @@ impl Timeline {
// The threshold is embedded in the metric. So, we need to update it.
{
let new_threshold = Self::get_evictions_low_residence_duration_metric_threshold(
&self.tenant_conf.read().unwrap().tenant_conf,
&self.tenant_conf.read().unwrap(),
&self.conf.default_tenant_conf,
);
let tenant_id_str = self.tenant_id.to_string();
@@ -1464,7 +1422,7 @@ impl Timeline {
#[allow(clippy::too_many_arguments)]
pub(super) fn new(
conf: &'static PageServerConf,
tenant_conf: Arc<RwLock<AttachedTenantConf>>,
tenant_conf: Arc<RwLock<TenantConfOpt>>,
metadata: &TimelineMetadata,
ancestor: Option<Arc<Timeline>>,
timeline_id: TimelineId,
@@ -1487,7 +1445,7 @@ impl Timeline {
let evictions_low_residence_duration_metric_threshold =
Self::get_evictions_low_residence_duration_metric_threshold(
&tenant_conf_guard.tenant_conf,
&tenant_conf_guard,
&conf.default_tenant_conf,
);
drop(tenant_conf_guard);
@@ -1652,15 +1610,12 @@ impl Timeline {
let tenant_conf_guard = self.tenant_conf.read().unwrap();
let wal_connect_timeout = tenant_conf_guard
.tenant_conf
.walreceiver_connect_timeout
.unwrap_or(self.conf.default_tenant_conf.walreceiver_connect_timeout);
let lagging_wal_timeout = tenant_conf_guard
.tenant_conf
.lagging_wal_timeout
.unwrap_or(self.conf.default_tenant_conf.lagging_wal_timeout);
let max_lsn_wal_lag = tenant_conf_guard
.tenant_conf
.max_lsn_wal_lag
.unwrap_or(self.conf.default_tenant_conf.max_lsn_wal_lag);
drop(tenant_conf_guard);
@@ -1742,7 +1697,7 @@ impl Timeline {
Discovered::Temporary(name) => (name, "temporary timeline file"),
Discovered::TemporaryDownload(name) => (name, "temporary download"),
};
path.push(Utf8Path::new(&name));
path.push(name);
init::cleanup(&path, kind)?;
path.pop();
}
@@ -1865,7 +1820,7 @@ impl Timeline {
for (layer, m) in needs_upload {
rtc.schedule_layer_file_upload(&layer.layer_desc().filename(), &m)?;
}
rtc.schedule_layer_file_deletion(needs_cleanup)?;
rtc.schedule_layer_file_deletion(&needs_cleanup)?;
rtc.schedule_index_upload_for_file_changes()?;
// Tenant::create_timeline will wait for these uploads to happen before returning, or
// on retry.
@@ -1875,7 +1830,9 @@ impl Timeline {
"loaded layer map with {} layers at {}, total physical size: {}",
num_layers, disk_consistent_lsn, total_physical_size
);
self.metrics.resident_physical_size_set(total_physical_size);
self.metrics
.resident_physical_size_gauge
.set(total_physical_size);
timer.stop_and_record();
Ok(())
@@ -2223,10 +2180,10 @@ impl TraversalLayerExt for Arc<dyn PersistentLayer> {
let timeline_id = self.layer_desc().timeline_id;
match self.local_path() {
Some(local_path) => {
debug_assert!(local_path.to_string().contains(&format!("{}", timeline_id)),
debug_assert!(local_path.to_str().unwrap().contains(&format!("{}", timeline_id)),
"need timeline ID to uniquely identify the layer when traversal crosses ancestor boundary",
);
format!("{local_path}")
format!("{}", local_path.display())
}
None => {
format!("remote {}/{self}", timeline_id)
@@ -2256,7 +2213,7 @@ impl Timeline {
request_lsn: Lsn,
reconstruct_state: &mut ValueReconstructState,
ctx: &RequestContext,
) -> Result<Vec<TraversalPathItem>, PageReconstructError> {
) -> Result<(), PageReconstructError> {
// Start from the current timeline.
let mut timeline_owned;
let mut timeline = self;
@@ -2287,12 +2244,12 @@ impl Timeline {
// The function should have updated 'state'
//info!("CALLED for {} at {}: {:?} with {} records, cached {}", key, cont_lsn, result, reconstruct_state.records.len(), cached_lsn);
match result {
ValueReconstructResult::Complete => return Ok(traversal_path),
ValueReconstructResult::Complete => return Ok(()),
ValueReconstructResult::Continue => {
// If we reached an earlier cached page image, we're done.
if cont_lsn == cached_lsn + 1 {
MATERIALIZED_PAGE_CACHE_HIT.inc_by(1);
return Ok(traversal_path);
return Ok(());
}
if prev_lsn <= cont_lsn {
// Didn't make any progress in last iteration. Error out to avoid
@@ -2363,7 +2320,7 @@ impl Timeline {
// during branch creation.
match ancestor.wait_to_become_active(ctx).await {
Ok(()) => {}
Err(TimelineState::Stopping) => {
Err(state) if state == TimelineState::Stopping => {
return Err(PageReconstructError::AncestorStopping(ancestor.timeline_id));
}
Err(state) => {
@@ -3728,11 +3685,6 @@ impl Timeline {
});
writer.as_mut().unwrap().put_value(key, lsn, value).await?;
if !new_layers.is_empty() {
fail_point!("after-timeline-compacted-first-L1");
}
prev_key = Some(key);
}
if let Some(writer) = writer {
@@ -3754,7 +3706,7 @@ impl Timeline {
);
}
}
let mut layer_paths: Vec<Utf8PathBuf> = new_layers.iter().map(|l| l.path()).collect();
let mut layer_paths: Vec<PathBuf> = new_layers.iter().map(|l| l.path()).collect();
// Fsync all the layer files and directory using multiple threads to
// minimize latency.
@@ -3864,7 +3816,10 @@ impl Timeline {
let new_delta_path = l.path();
let metadata = new_delta_path.metadata().with_context(|| {
format!("read file metadata for new created layer {new_delta_path}")
format!(
"read file metadata for new created layer {}",
new_delta_path.display()
)
})?;
if let Some(remote_client) = &self.remote_client {
@@ -3887,7 +3842,6 @@ impl Timeline {
);
let l = l as Arc<dyn PersistentLayer>;
if guard.contains(&l) {
tracing::error!(layer=%l, "duplicated L1 layer");
duplicated_layers.insert(l.layer_desc().key());
} else {
if LayerMap::is_l0(l.layer_desc()) {
@@ -3921,7 +3875,7 @@ impl Timeline {
// Also schedule the deletions in remote storage
if let Some(remote_client) = &self.remote_client {
remote_client.schedule_layer_file_deletion(layer_names_to_delete)?;
remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?;
}
Ok(())
@@ -4256,7 +4210,7 @@ impl Timeline {
}
if let Some(remote_client) = &self.remote_client {
remote_client.schedule_layer_file_deletion(layer_names_to_delete)?;
remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?;
}
apply.flush();
@@ -4428,7 +4382,7 @@ impl Timeline {
// XXX the temp file is still around in Err() case
// and consumes space until we clean up upon pageserver restart.
self_clone.metrics.resident_physical_size_add(*size);
self_clone.metrics.resident_physical_size_gauge.add(*size);
// Download complete. Replace the RemoteLayer with the corresponding
// Delta- or ImageLayer in the layer map.
@@ -4799,10 +4753,11 @@ fn is_send() {
/// Add a suffix to a layer file's name: .{num}.old
/// Uses the first available num (starts at 0)
fn rename_to_backup(path: &Utf8Path) -> anyhow::Result<()> {
fn rename_to_backup(path: &Path) -> anyhow::Result<()> {
let filename = path
.file_name()
.ok_or_else(|| anyhow!("Path {path} don't have a file name"))?;
.ok_or_else(|| anyhow!("Path {} don't have a file name", path.display()))?
.to_string_lossy();
let mut new_path = path.to_owned();
for i in 0u32.. {

View File

@@ -14,7 +14,6 @@ use utils::{
use crate::{
config::PageServerConf,
deletion_queue::DeletionQueueClient,
task_mgr::{self, TaskKind},
tenant::{
metadata::TimelineMetadata,
@@ -408,7 +407,6 @@ impl DeleteTimelineFlow {
timeline_id: TimelineId,
local_metadata: &TimelineMetadata,
remote_client: Option<RemoteTimelineClient>,
deletion_queue_client: DeletionQueueClient,
init_order: Option<&InitializationOrder>,
) -> anyhow::Result<()> {
// Note: here we even skip populating layer map. Timeline is essentially uninitialized.
@@ -418,10 +416,7 @@ impl DeleteTimelineFlow {
timeline_id,
local_metadata,
None, // Ancestor is not needed for deletion.
TimelineResources {
remote_client,
deletion_queue_client,
},
TimelineResources { remote_client },
init_order,
// Important. We dont pass ancestor above because it can be missing.
// Thus we need to skip the validation here.

View File

@@ -12,8 +12,7 @@ use crate::{
METADATA_FILE_NAME,
};
use anyhow::Context;
use camino::Utf8Path;
use std::{collections::HashMap, str::FromStr};
use std::{collections::HashMap, ffi::OsString, path::Path, str::FromStr};
use utils::lsn::Lsn;
/// Identified files in the timeline directory.
@@ -21,43 +20,46 @@ pub(super) enum Discovered {
/// The only one we care about
Layer(LayerFileName, u64),
/// Old ephmeral files from previous launches, should be removed
Ephemeral(String),
Ephemeral(OsString),
/// Old temporary timeline files, unsure what these really are, should be removed
Temporary(String),
Temporary(OsString),
/// Temporary on-demand download files, should be removed
TemporaryDownload(String),
TemporaryDownload(OsString),
/// "metadata" file we persist locally and include in `index_part.json`
Metadata,
/// Backup file from previously future layers
IgnoredBackup,
/// Unrecognized, warn about these
Unknown(String),
Unknown(OsString),
}
/// Scans the timeline directory for interesting files.
pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result<Vec<Discovered>> {
pub(super) fn scan_timeline_dir(path: &Path) -> anyhow::Result<Vec<Discovered>> {
let mut ret = Vec::new();
for direntry in path.read_dir_utf8()? {
for direntry in std::fs::read_dir(path)? {
let direntry = direntry?;
let file_name = direntry.file_name().to_string();
let direntry_path = direntry.path();
let file_name = direntry.file_name();
let discovered = match LayerFileName::from_str(&file_name) {
let fname = file_name.to_string_lossy();
let discovered = match LayerFileName::from_str(&fname) {
Ok(file_name) => {
let file_size = direntry.metadata()?.len();
Discovered::Layer(file_name, file_size)
}
Err(_) => {
if file_name == METADATA_FILE_NAME {
if fname == METADATA_FILE_NAME {
Discovered::Metadata
} else if file_name.ends_with(".old") {
} else if fname.ends_with(".old") {
// ignore these
Discovered::IgnoredBackup
} else if remote_timeline_client::is_temp_download_file(direntry.path()) {
} else if remote_timeline_client::is_temp_download_file(&direntry_path) {
Discovered::TemporaryDownload(file_name)
} else if is_ephemeral_file(&file_name) {
} else if is_ephemeral_file(&fname) {
Discovered::Ephemeral(file_name)
} else if is_temporary(direntry.path()) {
} else if is_temporary(&direntry_path) {
Discovered::Temporary(file_name)
} else {
Discovered::Unknown(file_name)
@@ -160,14 +162,15 @@ pub(super) fn reconcile(
.collect::<Vec<_>>()
}
pub(super) fn cleanup(path: &Utf8Path, kind: &str) -> anyhow::Result<()> {
pub(super) fn cleanup(path: &Path, kind: &str) -> anyhow::Result<()> {
let file_name = path.file_name().expect("must be file path");
tracing::debug!(kind, ?file_name, "cleaning up");
std::fs::remove_file(path).with_context(|| format!("failed to remove {kind} at {path}"))
std::fs::remove_file(path)
.with_context(|| format!("failed to remove {kind} at {}", path.display()))
}
pub(super) fn cleanup_local_file_for_remote(
path: &Utf8Path,
path: &Path,
local: &LayerFileMetadata,
remote: &LayerFileMetadata,
) -> anyhow::Result<()> {
@@ -179,7 +182,8 @@ pub(super) fn cleanup_local_file_for_remote(
if let Err(err) = crate::tenant::timeline::rename_to_backup(path) {
assert!(
path.exists(),
"we would leave the local_layer without a file if this does not hold: {path}",
"we would leave the local_layer without a file if this does not hold: {}",
path.display()
);
Err(err)
} else {
@@ -188,7 +192,7 @@ pub(super) fn cleanup_local_file_for_remote(
}
pub(super) fn cleanup_future_layer(
path: &Utf8Path,
path: &Path,
name: &LayerFileName,
disk_consistent_lsn: Lsn,
) -> anyhow::Result<()> {

View File

@@ -263,7 +263,7 @@ impl LayerManager {
let desc = layer.layer_desc();
if !layer.is_remote_layer() {
layer.delete_resident_layer_file()?;
metrics.resident_physical_size_sub(desc.file_size);
metrics.resident_physical_size_gauge.sub(desc.file_size);
}
// TODO Removing from the bottom of the layer map is expensive.

View File

@@ -1,7 +1,6 @@
use std::{collections::hash_map::Entry, fs, sync::Arc};
use std::{collections::hash_map::Entry, fs, path::PathBuf, sync::Arc};
use anyhow::Context;
use camino::Utf8PathBuf;
use tracing::{error, info, info_span, warn};
use utils::{crashsafe, fs_ext, id::TimelineId, lsn::Lsn};
@@ -156,12 +155,12 @@ pub(crate) fn cleanup_timeline_directory(uninit_mark: TimelineUninitMark) {
#[must_use]
pub(crate) struct TimelineUninitMark {
uninit_mark_deleted: bool,
uninit_mark_path: Utf8PathBuf,
pub(crate) timeline_path: Utf8PathBuf,
uninit_mark_path: PathBuf,
pub(crate) timeline_path: PathBuf,
}
impl TimelineUninitMark {
pub(crate) fn new(uninit_mark_path: Utf8PathBuf, timeline_path: Utf8PathBuf) -> Self {
pub(crate) fn new(uninit_mark_path: PathBuf, timeline_path: PathBuf) -> Self {
Self {
uninit_mark_deleted: false,
uninit_mark_path,
@@ -198,13 +197,14 @@ impl Drop for TimelineUninitMark {
if self.timeline_path.exists() {
error!(
"Uninit mark {} is not removed, timeline {} stays uninitialized",
self.uninit_mark_path, self.timeline_path
self.uninit_mark_path.display(),
self.timeline_path.display()
)
} else {
// unblock later timeline creation attempts
warn!(
"Removing intermediate uninit mark file {}",
self.uninit_mark_path
self.uninit_mark_path.display()
);
if let Err(e) = self.delete_mark_file_if_present() {
error!("Failed to remove the uninit mark file: {e}")

View File

@@ -370,9 +370,8 @@ pub(super) async fn handle_walreceiver_connection(
})?;
if let Some(last_lsn) = status_update {
let timeline_remote_consistent_lsn = timeline
.get_remote_consistent_lsn_visible()
.unwrap_or(Lsn(0));
let timeline_remote_consistent_lsn =
timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0));
// The last LSN we processed. It is not guaranteed to survive pageserver crash.
let last_received_lsn = last_lsn;

View File

@@ -1,3 +1,5 @@
use crate::metrics::RemoteOpFileKind;
use super::storage_layer::LayerFileName;
use super::Generation;
use crate::tenant::metadata::TimelineMetadata;
@@ -9,7 +11,6 @@ use std::fmt::Debug;
use chrono::NaiveDateTime;
use std::sync::Arc;
use tracing::info;
use utils::lsn::AtomicLsn;
use std::sync::atomic::AtomicU32;
use utils::lsn::Lsn;
@@ -57,12 +58,7 @@ pub(crate) struct UploadQueueInitialized {
/// uploaded. `Lsn(0)` if nothing was uploaded yet.
/// Unlike `latest_files` or `latest_metadata`, this value is never ahead.
/// Safekeeper can rely on it to make decisions for WAL storage.
///
/// visible_remote_consistent_lsn is only updated after our generation has been validated with
/// the control plane (unlesss a timeline's generation is None, in which case
/// we skip validation)
pub(crate) projected_remote_consistent_lsn: Option<Lsn>,
pub(crate) visible_remote_consistent_lsn: Arc<AtomicLsn>,
pub(crate) last_uploaded_consistent_lsn: Lsn,
// Breakdown of different kinds of tasks currently in-progress
pub(crate) num_inprogress_layer_uploads: usize,
@@ -85,14 +81,6 @@ impl UploadQueueInitialized {
pub(super) fn no_pending_work(&self) -> bool {
self.inprogress_tasks.is_empty() && self.queued_operations.is_empty()
}
pub(super) fn get_last_remote_consistent_lsn_visible(&self) -> Lsn {
self.visible_remote_consistent_lsn.load()
}
pub(super) fn get_last_remote_consistent_lsn_projected(&self) -> Option<Lsn> {
self.projected_remote_consistent_lsn
}
}
#[derive(Clone, Copy)]
@@ -126,8 +114,9 @@ impl UploadQueue {
latest_files: HashMap::new(),
latest_files_changes_since_metadata_upload_scheduled: 0,
latest_metadata: metadata.clone(),
projected_remote_consistent_lsn: None,
visible_remote_consistent_lsn: Arc::new(AtomicLsn::new(0)),
// We haven't uploaded anything yet, so, `last_uploaded_consistent_lsn` must be 0 to prevent
// safekeepers from garbage-collecting anything.
last_uploaded_consistent_lsn: Lsn(0),
// what follows are boring default initializations
task_counter: 0,
num_inprogress_layer_uploads: 0,
@@ -169,10 +158,7 @@ impl UploadQueue {
latest_files: files,
latest_files_changes_since_metadata_upload_scheduled: 0,
latest_metadata: index_part.metadata.clone(),
projected_remote_consistent_lsn: Some(index_part.metadata.disk_consistent_lsn()),
visible_remote_consistent_lsn: Arc::new(
index_part.metadata.disk_consistent_lsn().into(),
),
last_uploaded_consistent_lsn: index_part.metadata.disk_consistent_lsn(),
// what follows are boring default initializations
task_counter: 0,
num_inprogress_layer_uploads: 0,
@@ -215,11 +201,12 @@ pub(crate) struct UploadTask {
pub(crate) op: UploadOp,
}
/// A deletion of some layers within the lifetime of a timeline. This is not used
/// for timeline deletion, which skips this queue and goes directly to DeletionQueue.
#[derive(Debug)]
pub(crate) struct Delete {
pub(crate) layers: Vec<(LayerFileName, Generation)>,
pub(crate) file_kind: RemoteOpFileKind,
pub(crate) layer_file_name: LayerFileName,
pub(crate) scheduled_from_timeline_delete: bool,
pub(crate) generation: Generation,
}
#[derive(Debug)]
@@ -230,7 +217,7 @@ pub(crate) enum UploadOp {
/// Upload the metadata file
UploadMetadata(IndexPart, Lsn),
/// Delete layer files
/// Delete a layer file
Delete(Delete),
/// Barrier. When the barrier operation is reached,
@@ -252,9 +239,13 @@ impl std::fmt::Display for UploadOp {
UploadOp::UploadMetadata(_, lsn) => {
write!(f, "UploadMetadata(lsn: {})", lsn)
}
UploadOp::Delete(delete) => {
write!(f, "Delete({} layers)", delete.layers.len())
}
UploadOp::Delete(delete) => write!(
f,
"Delete(path: {}, scheduled_from_timeline_delete: {}, gen: {:?})",
delete.layer_file_name.file_name(),
delete.scheduled_from_timeline_delete,
delete.generation
),
UploadOp::Barrier(_) => write!(f, "Barrier"),
}
}

View File

@@ -1,8 +1,8 @@
use bytes::Bytes;
use camino::Utf8PathBuf;
use std::{
fs::{create_dir_all, File},
io::{BufWriter, Write},
path::PathBuf,
};
pub struct Tracer {
@@ -16,7 +16,7 @@ impl Drop for Tracer {
}
impl Tracer {
pub fn new(path: Utf8PathBuf) -> Self {
pub fn new(path: PathBuf) -> Self {
let parent = path.parent().expect("failed to parse parent path");
create_dir_all(parent).expect("failed to create trace dir");

View File

@@ -12,11 +12,11 @@
//!
use crate::metrics::{StorageIoOperation, STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC};
use crate::tenant::TENANTS_SEGMENT_NAME;
use camino::{Utf8Path, Utf8PathBuf};
use once_cell::sync::OnceCell;
use std::fs::{self, File, OpenOptions};
use std::io::{Error, ErrorKind, Seek, SeekFrom};
use std::os::unix::fs::FileExt;
use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use std::sync::{RwLock, RwLockWriteGuard};
@@ -51,7 +51,7 @@ pub struct VirtualFile {
/// if a new file is created, we only pass the create flag when it's initially
/// opened, in the VirtualFile::create() function, and strip the flag before
/// storing it here.
pub path: Utf8PathBuf,
pub path: PathBuf,
open_options: OpenOptions,
// These are strings becase we only use them for metrics, and those expect strings.
@@ -177,19 +177,19 @@ impl OpenFiles {
pub enum CrashsafeOverwriteError {
#[error("final path has no parent dir")]
FinalPathHasNoParentDir,
#[error("remove tempfile")]
#[error("remove tempfile: {0}")]
RemovePreviousTempfile(#[source] std::io::Error),
#[error("create tempfile")]
#[error("create tempfile: {0}")]
CreateTempfile(#[source] std::io::Error),
#[error("write tempfile")]
#[error("write tempfile: {0}")]
WriteContents(#[source] std::io::Error),
#[error("sync tempfile")]
#[error("sync tempfile: {0}")]
SyncTempfile(#[source] std::io::Error),
#[error("rename tempfile to final path")]
#[error("rename tempfile to final path: {0}")]
RenameTempfileToFinalPath(#[source] std::io::Error),
#[error("open final path parent dir")]
#[error("open final path parent dir: {0}")]
OpenFinalPathParentDir(#[source] std::io::Error),
#[error("sync final path parent dir")]
#[error("sync final path parent dir: {0}")]
SyncFinalPathParentDir(#[source] std::io::Error),
}
impl CrashsafeOverwriteError {
@@ -210,13 +210,13 @@ impl CrashsafeOverwriteError {
impl VirtualFile {
/// Open a file in read-only mode. Like File::open.
pub async fn open(path: &Utf8Path) -> Result<VirtualFile, std::io::Error> {
pub async fn open(path: &Path) -> Result<VirtualFile, std::io::Error> {
Self::open_with_options(path, OpenOptions::new().read(true)).await
}
/// Create a new file for writing. If the file exists, it will be truncated.
/// Like File::create.
pub async fn create(path: &Utf8Path) -> Result<VirtualFile, std::io::Error> {
pub async fn create(path: &Path) -> Result<VirtualFile, std::io::Error> {
Self::open_with_options(
path,
OpenOptions::new().write(true).create(true).truncate(true),
@@ -230,10 +230,10 @@ impl VirtualFile {
/// they will be applied also when the file is subsequently re-opened, not only
/// on the first time. Make sure that's sane!
pub async fn open_with_options(
path: &Utf8Path,
path: &Path,
open_options: &OpenOptions,
) -> Result<VirtualFile, std::io::Error> {
let path_str = path.to_string();
let path_str = path.to_string_lossy();
let parts = path_str.split('/').collect::<Vec<&str>>();
let tenant_id;
let timeline_id;
@@ -281,8 +281,8 @@ impl VirtualFile {
/// atomic, a crash during the write operation will never leave behind a
/// partially written file.
pub async fn crashsafe_overwrite(
final_path: &Utf8Path,
tmp_path: &Utf8Path,
final_path: &Path,
tmp_path: &Path,
content: &[u8],
) -> Result<(), CrashsafeOverwriteError> {
let Some(final_path_parent) = final_path.parent() else {
@@ -734,7 +734,7 @@ mod tests {
async fn test_files<OF, FT>(testname: &str, openfunc: OF) -> Result<(), Error>
where
OF: Fn(Utf8PathBuf, OpenOptions) -> FT,
OF: Fn(PathBuf, OpenOptions) -> FT,
FT: Future<Output = Result<MaybeVirtualFile, std::io::Error>>,
{
let testdir = crate::config::PageServerConf::test_repo_dir(testname);

View File

@@ -38,9 +38,6 @@ use tracing::*;
use utils::crashsafe::path_with_suffix_extension;
use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock};
#[cfg(feature = "testing")]
use std::sync::atomic::{AtomicUsize, Ordering};
use crate::metrics::{
WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME,
WAL_REDO_WAIT_TIME,
@@ -116,9 +113,6 @@ struct ProcessOutput {
pub struct PostgresRedoManager {
tenant_id: TenantId,
conf: &'static PageServerConf,
/// Counter to separate same sized walredo inputs failing at the same millisecond.
#[cfg(feature = "testing")]
dump_sequence: AtomicUsize,
stdout: Mutex<Option<ProcessOutput>>,
stdin: Mutex<Option<ProcessInput>>,
@@ -230,8 +224,6 @@ impl PostgresRedoManager {
PostgresRedoManager {
tenant_id,
conf,
#[cfg(feature = "testing")]
dump_sequence: AtomicUsize::default(),
stdin: Mutex::new(None),
stdout: Mutex::new(None),
stderr: Mutex::new(None),
@@ -298,27 +290,25 @@ impl PostgresRedoManager {
WAL_REDO_BYTES_HISTOGRAM.observe(nbytes as f64);
debug!(
"postgres applied {} WAL records ({} bytes) in {} us to reconstruct page image at LSN {}",
len,
nbytes,
duration.as_micros(),
lsn
);
"postgres applied {} WAL records ({} bytes) in {} us to reconstruct page image at LSN {}",
len,
nbytes,
duration.as_micros(),
lsn
);
// If something went wrong, don't try to reuse the process. Kill it, and
// next request will launch a new one.
if let Err(e) = result.as_ref() {
if result.is_err() {
error!(
n_attempts,
"error applying {} WAL records {}..{} ({} bytes) to base image with LSN {} to reconstruct page image at LSN {}: {}",
records.len(),
records.first().map(|p| p.0).unwrap_or(Lsn(0)),
records.last().map(|p| p.0).unwrap_or(Lsn(0)),
nbytes,
base_img_lsn,
lsn,
utils::error::report_compact_sources(e),
);
"error applying {} WAL records {}..{} ({} bytes) to base image with LSN {} to reconstruct page image at LSN {}",
records.len(),
records.first().map(|p| p.0).unwrap_or(Lsn(0)),
records.last().map(|p| p.0).unwrap_or(Lsn(0)),
nbytes,
base_img_lsn,
lsn
);
// self.stdin only holds stdin & stderr as_raw_fd().
// Dropping it as part of take() doesn't close them.
// The owning objects (ChildStdout and ChildStderr) are stored in
@@ -335,8 +325,6 @@ impl PostgresRedoManager {
if let Some(proc) = self.stdin.lock().unwrap().take() {
proc.child.kill_and_wait();
}
} else if n_attempts != 0 {
info!(n_attempts, "retried walredo succeeded");
}
n_attempts += 1;
if n_attempts > MAX_RETRY_ATTEMPTS || result.is_ok() {
@@ -754,7 +742,7 @@ impl PostgresRedoManager {
#[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%input.as_ref().unwrap().child.id()))]
fn apply_wal_records(
&self,
input: MutexGuard<Option<ProcessInput>>,
mut input: MutexGuard<Option<ProcessInput>>,
tag: BufferTag,
base_img: &Option<Bytes>,
records: &[(Lsn, NeonWalRecord)],
@@ -791,23 +779,6 @@ impl PostgresRedoManager {
build_get_page_msg(tag, &mut writebuf);
WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);
let res = self.apply_wal_records0(&writebuf, input, wal_redo_timeout);
if res.is_err() {
// not all of these can be caused by this particular input, however these are so rare
// in tests so capture all.
self.record_and_log(&writebuf);
}
res
}
fn apply_wal_records0(
&self,
writebuf: &[u8],
mut input: MutexGuard<Option<ProcessInput>>,
wal_redo_timeout: Duration,
) -> Result<Bytes, std::io::Error> {
let proc = input.as_mut().unwrap();
let mut nwrite = 0usize;
let stdout_fd = proc.stdout_fd;
@@ -825,7 +796,7 @@ impl PostgresRedoManager {
while nwrite < writebuf.len() {
let n = loop {
match nix::poll::poll(&mut pollfds[0..2], wal_redo_timeout.as_millis() as i32) {
Err(nix::errno::Errno::EINTR) => continue,
Err(e) if e == nix::errno::Errno::EINTR => continue,
res => break res,
}
}?;
@@ -917,7 +888,7 @@ impl PostgresRedoManager {
// and forward any logging information that the child writes to its stderr to the page server's log.
let n = loop {
match nix::poll::poll(&mut pollfds[1..3], wal_redo_timeout.as_millis() as i32) {
Err(nix::errno::Errno::EINTR) => continue,
Err(e) if e == nix::errno::Errno::EINTR => continue,
res => break res,
}
}?;
@@ -1013,38 +984,6 @@ impl PostgresRedoManager {
}
Ok(res)
}
#[cfg(feature = "testing")]
fn record_and_log(&self, writebuf: &[u8]) {
let millis = std::time::SystemTime::now()
.duration_since(std::time::SystemTime::UNIX_EPOCH)
.unwrap()
.as_millis();
let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed);
// these files will be collected to an allure report
let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());
let path = self.conf.tenant_path(&self.tenant_id).join(&filename);
let res = std::fs::OpenOptions::new()
.write(true)
.create_new(true)
.read(true)
.open(path)
.and_then(|mut f| f.write_all(writebuf));
// trip up allowed_errors
if let Err(e) = res {
tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}");
} else {
tracing::error!(filename, "erroring walredo input saved");
}
}
#[cfg(not(feature = "testing"))]
fn record_and_log(&self, _: &[u8]) {}
}
/// Wrapper type around `std::process::Child` which guarantees that the child
@@ -1278,13 +1217,13 @@ mod tests {
struct RedoHarness {
// underscored because unused, except for removal at drop
_repo_dir: camino_tempfile::Utf8TempDir,
_repo_dir: tempfile::TempDir,
manager: PostgresRedoManager,
}
impl RedoHarness {
fn new() -> anyhow::Result<Self> {
let repo_dir = camino_tempfile::tempdir()?;
let repo_dir = tempfile::tempdir()?;
let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
let conf = Box::leak(Box::new(conf));
let tenant_id = TenantId::generate();

View File

@@ -7,12 +7,12 @@ OBJS = \
extension_server.o \
file_cache.o \
libpagestore.o \
libpqwalproposer.o \
neon.o \
neon_utils.o \
pagestore_smgr.o \
relsize_cache.o \
walproposer.o \
walproposer_pg.o \
walproposer_utils.o \
control_plane_connector.o
PG_CPPFLAGS = -I$(libpq_srcdir)

View File

@@ -741,13 +741,6 @@ NeonProcessUtility(
break;
case T_DropdbStmt:
HandleDropDb(castNode(DropdbStmt, parseTree));
/*
* We do this here to hack around the fact that Postgres performs the drop
* INSIDE of standard_ProcessUtility, which means that if we try to
* abort the drop normally it'll be too late. DROP DATABASE can't be inside
* of a transaction block anyway, so this should be fine to do.
*/
NeonXactCallback(XACT_EVENT_PRE_COMMIT, NULL);
break;
case T_CreateRoleStmt:
HandleCreateRole(castNode(CreateRoleStmt, parseTree));

View File

@@ -14,6 +14,7 @@
*/
#include <sys/file.h>
#include <sys/statvfs.h>
#include <unistd.h>
#include <fcntl.h>
@@ -37,6 +38,9 @@
#include "storage/fd.h"
#include "storage/pg_shmem.h"
#include "storage/buf_internals.h"
#include "storage/procsignal.h"
#include "postmaster/bgworker.h"
#include "postmaster/interrupt.h"
/*
* Local file cache is used to temporary store relations pages in local file system.
@@ -62,6 +66,9 @@
#define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK))
#define MAX_MONITOR_INTERVAL_USEC 1000000 /* 1 second */
#define MAX_DISK_WRITE_RATE 1000 /* MB/sec */
typedef struct FileCacheEntry
{
BufferTag key;
@@ -84,12 +91,14 @@ static int lfc_desc = 0;
static LWLockId lfc_lock;
static int lfc_max_size;
static int lfc_size_limit;
static int lfc_free_space_watermark;
static char* lfc_path;
static FileCacheControl* lfc_ctl;
static shmem_startup_hook_type prev_shmem_startup_hook;
#if PG_VERSION_NUM>=150000
static shmem_request_hook_type prev_shmem_request_hook;
#endif
static int lfc_shrinking_factor; /* power of two by which local cache size will be shrinked when lfc_free_space_watermark is reached */
void FileCacheMonitorMain(Datum main_arg);
@@ -245,6 +254,80 @@ lfc_change_limit_hook(int newval, void *extra)
LWLockRelease(lfc_lock);
}
/*
* Local file system state monitor check available free space.
* If it is lower than lfc_free_space_watermark then we shrink size of local cache
* but throwing away least recently accessed chunks.
* First time low space watermark is reached cache size is divided by two,
* second time by four,... Finally we remove all chunks from local cache.
*
* Please notice that we are not changing lfc_cache_size: it is used to be adjusted by autoscaler.
* We only throw away cached chunks but do not prevent from filling cache by new chunks.
*
* Interval of poooling cache state is calculated as minimal time needed to consume lfc_free_space_watermark
* disk space with maximal possible disk write speed (1Gb/sec). But not larger than 1 second.
* Calling statvfs each second should not add any noticeable overhead.
*/
void
FileCacheMonitorMain(Datum main_arg)
{
/*
* Choose file system state monitor interval so that space can not be exosted
* during this period but not longer than MAX_MONITOR_INTERVAL (10 sec)
*/
uint64 monitor_interval = Min(MAX_MONITOR_INTERVAL_USEC, lfc_free_space_watermark*MB/MAX_DISK_WRITE_RATE);
/* Establish signal handlers. */
pqsignal(SIGUSR1, procsignal_sigusr1_handler);
pqsignal(SIGHUP, SignalHandlerForConfigReload);
pqsignal(SIGTERM, SignalHandlerForShutdownRequest);
BackgroundWorkerUnblockSignals();
/* Periodically dump buffers until terminated. */
while (!ShutdownRequestPending)
{
if (lfc_size_limit != 0)
{
struct statvfs sfs;
if (statvfs(lfc_path, &sfs) < 0)
{
elog(WARNING, "Failed to obtain status of %s: %m", lfc_path);
}
else
{
if (sfs.f_bavail*sfs.f_bsize < lfc_free_space_watermark*MB)
{
if (lfc_shrinking_factor < 31) {
lfc_shrinking_factor += 1;
}
lfc_change_limit_hook(lfc_size_limit >> lfc_shrinking_factor, NULL);
}
else
lfc_shrinking_factor = 0; /* reset to initial value */
}
}
pg_usleep(monitor_interval);
}
}
static void
lfc_register_free_space_monitor(void)
{
BackgroundWorker bgw;
memset(&bgw, 0, sizeof(bgw));
bgw.bgw_flags = BGWORKER_SHMEM_ACCESS;
bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
snprintf(bgw.bgw_library_name, BGW_MAXLEN, "neon");
snprintf(bgw.bgw_function_name, BGW_MAXLEN, "FileCacheMonitorMain");
snprintf(bgw.bgw_name, BGW_MAXLEN, "Local free space monitor");
snprintf(bgw.bgw_type, BGW_MAXLEN, "Local free space monitor");
bgw.bgw_restart_time = 5;
bgw.bgw_notify_pid = 0;
bgw.bgw_main_arg = (Datum) 0;
RegisterBackgroundWorker(&bgw);
}
void
lfc_init(void)
{
@@ -281,6 +364,19 @@ lfc_init(void)
lfc_change_limit_hook,
NULL);
DefineCustomIntVariable("neon.free_space_watermark",
"Minimal free space in local file system after reaching which local file cache will be truncated",
NULL,
&lfc_free_space_watermark,
1024, /* 1GB */
0,
INT_MAX,
PGC_SIGHUP,
GUC_UNIT_MB,
NULL,
NULL,
NULL);
DefineCustomStringVariable("neon.file_cache_path",
"Path to local file cache (can be raw device)",
NULL,
@@ -295,6 +391,9 @@ lfc_init(void)
if (lfc_max_size == 0)
return;
if (lfc_free_space_watermark != 0)
lfc_register_free_space_monitor();
prev_shmem_startup_hook = shmem_startup_hook;
shmem_startup_hook = lfc_shmem_startup;
#if PG_VERSION_NUM>=150000

View File

@@ -30,7 +30,7 @@
#include "neon.h"
#include "walproposer.h"
#include "neon_utils.h"
#include "walproposer_utils.h"
#define PageStoreTrace DEBUG5

View File

@@ -0,0 +1,424 @@
#include "postgres.h"
#include "libpq-fe.h"
#include "neon.h"
#include "walproposer.h"
/* Header in walproposer.h -- Wrapper struct to abstract away the libpq connection */
struct WalProposerConn
{
PGconn *pg_conn;
bool is_nonblocking; /* whether the connection is non-blocking */
char *recvbuf; /* last received data from
* walprop_async_read */
};
/* Helper function */
static bool
ensure_nonblocking_status(WalProposerConn *conn, bool is_nonblocking)
{
/* If we're already correctly blocking or nonblocking, all good */
if (is_nonblocking == conn->is_nonblocking)
return true;
/* Otherwise, set it appropriately */
if (PQsetnonblocking(conn->pg_conn, is_nonblocking) == -1)
return false;
conn->is_nonblocking = is_nonblocking;
return true;
}
/* Exported function definitions */
char *
walprop_error_message(WalProposerConn *conn)
{
return PQerrorMessage(conn->pg_conn);
}
WalProposerConnStatusType
walprop_status(WalProposerConn *conn)
{
switch (PQstatus(conn->pg_conn))
{
case CONNECTION_OK:
return WP_CONNECTION_OK;
case CONNECTION_BAD:
return WP_CONNECTION_BAD;
default:
return WP_CONNECTION_IN_PROGRESS;
}
}
WalProposerConn *
walprop_connect_start(char *conninfo, char *password)
{
WalProposerConn *conn;
PGconn *pg_conn;
const char *keywords[3];
const char *values[3];
int n;
/*
* Connect using the given connection string. If the
* NEON_AUTH_TOKEN environment variable was set, use that as
* the password.
*
* The connection options are parsed in the order they're given, so
* when we set the password before the connection string, the
* connection string can override the password from the env variable.
* Seems useful, although we don't currently use that capability
* anywhere.
*/
n = 0;
if (password)
{
keywords[n] = "password";
values[n] = password;
n++;
}
keywords[n] = "dbname";
values[n] = conninfo;
n++;
keywords[n] = NULL;
values[n] = NULL;
n++;
pg_conn = PQconnectStartParams(keywords, values, 1);
/*
* Allocation of a PQconn can fail, and will return NULL. We want to fully
* replicate the behavior of PQconnectStart here.
*/
if (!pg_conn)
return NULL;
/*
* And in theory this allocation can fail as well, but it's incredibly
* unlikely if we just successfully allocated a PGconn.
*
* palloc will exit on failure though, so there's not much we could do if
* it *did* fail.
*/
conn = palloc(sizeof(WalProposerConn));
conn->pg_conn = pg_conn;
conn->is_nonblocking = false; /* connections always start in blocking
* mode */
conn->recvbuf = NULL;
return conn;
}
WalProposerConnectPollStatusType
walprop_connect_poll(WalProposerConn *conn)
{
WalProposerConnectPollStatusType return_val;
switch (PQconnectPoll(conn->pg_conn))
{
case PGRES_POLLING_FAILED:
return_val = WP_CONN_POLLING_FAILED;
break;
case PGRES_POLLING_READING:
return_val = WP_CONN_POLLING_READING;
break;
case PGRES_POLLING_WRITING:
return_val = WP_CONN_POLLING_WRITING;
break;
case PGRES_POLLING_OK:
return_val = WP_CONN_POLLING_OK;
break;
/*
* There's a comment at its source about this constant being
* unused. We'll expect it's never returned.
*/
case PGRES_POLLING_ACTIVE:
elog(FATAL, "Unexpected PGRES_POLLING_ACTIVE returned from PQconnectPoll");
/*
* This return is never actually reached, but it's here to make
* the compiler happy
*/
return WP_CONN_POLLING_FAILED;
default:
Assert(false);
return_val = WP_CONN_POLLING_FAILED; /* keep the compiler quiet */
}
return return_val;
}
bool
walprop_send_query(WalProposerConn *conn, char *query)
{
/*
* We need to be in blocking mode for sending the query to run without
* requiring a call to PQflush
*/
if (!ensure_nonblocking_status(conn, false))
return false;
/* PQsendQuery returns 1 on success, 0 on failure */
if (!PQsendQuery(conn->pg_conn, query))
return false;
return true;
}
WalProposerExecStatusType
walprop_get_query_result(WalProposerConn *conn)
{
PGresult *result;
WalProposerExecStatusType return_val;
/* Marker variable if we need to log an unexpected success result */
char *unexpected_success = NULL;
/* Consume any input that we might be missing */
if (!PQconsumeInput(conn->pg_conn))
return WP_EXEC_FAILED;
if (PQisBusy(conn->pg_conn))
return WP_EXEC_NEEDS_INPUT;
result = PQgetResult(conn->pg_conn);
/*
* PQgetResult returns NULL only if getting the result was successful &
* there's no more of the result to get.
*/
if (!result)
{
elog(WARNING, "[libpqwalproposer] Unexpected successful end of command results");
return WP_EXEC_UNEXPECTED_SUCCESS;
}
/* Helper macro to reduce boilerplate */
#define UNEXPECTED_SUCCESS(msg) \
return_val = WP_EXEC_UNEXPECTED_SUCCESS; \
unexpected_success = msg; \
break;
switch (PQresultStatus(result))
{
/* "true" success case */
case PGRES_COPY_BOTH:
return_val = WP_EXEC_SUCCESS_COPYBOTH;
break;
/* Unexpected success case */
case PGRES_EMPTY_QUERY:
UNEXPECTED_SUCCESS("empty query return");
case PGRES_COMMAND_OK:
UNEXPECTED_SUCCESS("data-less command end");
case PGRES_TUPLES_OK:
UNEXPECTED_SUCCESS("tuples return");
case PGRES_COPY_OUT:
UNEXPECTED_SUCCESS("'Copy Out' response");
case PGRES_COPY_IN:
UNEXPECTED_SUCCESS("'Copy In' response");
case PGRES_SINGLE_TUPLE:
UNEXPECTED_SUCCESS("single tuple return");
case PGRES_PIPELINE_SYNC:
UNEXPECTED_SUCCESS("pipeline sync point");
/* Failure cases */
case PGRES_BAD_RESPONSE:
case PGRES_NONFATAL_ERROR:
case PGRES_FATAL_ERROR:
case PGRES_PIPELINE_ABORTED:
return_val = WP_EXEC_FAILED;
break;
default:
Assert(false);
return_val = WP_EXEC_FAILED; /* keep the compiler quiet */
}
if (unexpected_success)
elog(WARNING, "[libpqwalproposer] Unexpected successful %s", unexpected_success);
return return_val;
}
pgsocket
walprop_socket(WalProposerConn *conn)
{
return PQsocket(conn->pg_conn);
}
int
walprop_flush(WalProposerConn *conn)
{
return (PQflush(conn->pg_conn));
}
void
walprop_finish(WalProposerConn *conn)
{
if (conn->recvbuf != NULL)
PQfreemem(conn->recvbuf);
PQfinish(conn->pg_conn);
pfree(conn);
}
/*
* Receive a message from the safekeeper.
*
* On success, the data is placed in *buf. It is valid until the next call
* to this function.
*/
PGAsyncReadResult
walprop_async_read(WalProposerConn *conn, char **buf, int *amount)
{
int result;
if (conn->recvbuf != NULL)
{
PQfreemem(conn->recvbuf);
conn->recvbuf = NULL;
}
/* Call PQconsumeInput so that we have the data we need */
if (!PQconsumeInput(conn->pg_conn))
{
*amount = 0;
*buf = NULL;
return PG_ASYNC_READ_FAIL;
}
/*
* The docs for PQgetCopyData list the return values as: 0 if the copy is
* still in progress, but no "complete row" is available -1 if the copy is
* done -2 if an error occurred (> 0) if it was successful; that value is
* the amount transferred.
*
* The protocol we use between walproposer and safekeeper means that we
* *usually* wouldn't expect to see that the copy is done, but this can
* sometimes be triggered by the server returning an ErrorResponse (which
* also happens to have the effect that the copy is done).
*/
switch (result = PQgetCopyData(conn->pg_conn, &conn->recvbuf, true))
{
case 0:
*amount = 0;
*buf = NULL;
return PG_ASYNC_READ_TRY_AGAIN;
case -1:
{
/*
* If we get -1, it's probably because of a server error; the
* safekeeper won't normally send a CopyDone message.
*
* We can check PQgetResult to make sure that the server
* failed; it'll always result in PGRES_FATAL_ERROR
*/
ExecStatusType status = PQresultStatus(PQgetResult(conn->pg_conn));
if (status != PGRES_FATAL_ERROR)
elog(FATAL, "unexpected result status %d after failed PQgetCopyData", status);
/*
* If there was actually an error, it'll be properly reported
* by calls to PQerrorMessage -- we don't have to do anything
* else
*/
*amount = 0;
*buf = NULL;
return PG_ASYNC_READ_FAIL;
}
case -2:
*amount = 0;
*buf = NULL;
return PG_ASYNC_READ_FAIL;
default:
/* Positive values indicate the size of the returned result */
*amount = result;
*buf = conn->recvbuf;
return PG_ASYNC_READ_SUCCESS;
}
}
PGAsyncWriteResult
walprop_async_write(WalProposerConn *conn, void const *buf, size_t size)
{
int result;
/* If we aren't in non-blocking mode, switch to it. */
if (!ensure_nonblocking_status(conn, true))
return PG_ASYNC_WRITE_FAIL;
/*
* The docs for PQputcopyData list the return values as: 1 if the data was
* queued, 0 if it was not queued because of full buffers, or -1 if an
* error occurred
*/
result = PQputCopyData(conn->pg_conn, buf, size);
/*
* We won't get a result of zero because walproposer always empties the
* connection's buffers before sending more
*/
Assert(result != 0);
switch (result)
{
case 1:
/* good -- continue */
break;
case -1:
return PG_ASYNC_WRITE_FAIL;
default:
elog(FATAL, "invalid return %d from PQputCopyData", result);
}
/*
* After queueing the data, we still need to flush to get it to send. This
* might take multiple tries, but we don't want to wait around until it's
* done.
*
* PQflush has the following returns (directly quoting the docs): 0 if
* sucessful, 1 if it was unable to send all the data in the send queue
* yet -1 if it failed for some reason
*/
switch (result = PQflush(conn->pg_conn))
{
case 0:
return PG_ASYNC_WRITE_SUCCESS;
case 1:
return PG_ASYNC_WRITE_TRY_FLUSH;
case -1:
return PG_ASYNC_WRITE_FAIL;
default:
elog(FATAL, "invalid return %d from PQflush", result);
}
}
/*
* This function is very similar to walprop_async_write. For more
* information, refer to the comments there.
*/
bool
walprop_blocking_write(WalProposerConn *conn, void const *buf, size_t size)
{
int result;
/* If we are in non-blocking mode, switch out of it. */
if (!ensure_nonblocking_status(conn, false))
return false;
if ((result = PQputCopyData(conn->pg_conn, buf, size)) == -1)
return false;
Assert(result == 1);
/* Because the connection is non-blocking, flushing returns 0 or -1 */
if ((result = PQflush(conn->pg_conn)) == -1)
return false;
Assert(result == 0);
return true;
}

Some files were not shown because too many files have changed in this diff Show More