Compare commits

..

7 Commits

Author SHA1 Message Date
Arseny Sher
32d4e4914a Add wait events without query to metric. 2023-11-16 23:56:04 +01:00
Arseny Sher
d4d577e7ff Add query to pg_wait_sampling metric 2023-11-16 22:42:08 +01:00
Arseny Sher
f552aa05fa Add pg_wait_sampling metric for vms. 2023-11-16 22:04:29 +01:00
Arthur Petukhovsky
779badb7c5 Join postgres multiline logs 2023-11-16 20:54:02 +00:00
Arseny Sher
e6eb548491 create extension pg_wait_sampling in compute_ctl 2023-11-16 20:54:02 +00:00
Arseny Sher
16e9eb2832 Try to enable a custom postgres_exporter query. 2023-11-16 20:54:02 +00:00
Arseny Sher
042686183b Add pg_wait_sampling extension. 2023-11-16 20:54:02 +00:00
272 changed files with 8141 additions and 17414 deletions

View File

@@ -1,3 +1,17 @@
# The binaries are really slow, if you compile them in 'dev' mode with the defaults.
# Enable some optimizations even in 'dev' mode, to make tests faster. The basic
# optimizations enabled by "opt-level=1" don't affect debuggability too much.
#
# See https://www.reddit.com/r/rust/comments/gvrgca/this_is_a_neat_trick_for_getting_good_runtime/
#
[profile.dev.package."*"]
# Set the default for dependencies in Development mode.
opt-level = 3
[profile.dev]
# Turn on a small amount of optimization in Development mode.
opt-level = 1
[build]
# This is only present for local builds, as it will be overridden
# by the RUSTDOCFLAGS env var in CI.

View File

@@ -199,10 +199,6 @@ jobs:
#
git config --global --add safe.directory ${{ github.workspace }}
git config --global --add safe.directory ${GITHUB_WORKSPACE}
for r in 14 15 16; do
git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
done
- name: Checkout
uses: actions/checkout@v3
@@ -408,7 +404,7 @@ jobs:
uses: ./.github/actions/save-coverage-data
regress-tests:
needs: [ check-permissions, build-neon, tag ]
needs: [ check-permissions, build-neon ]
runs-on: [ self-hosted, gen3, large ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
@@ -440,7 +436,6 @@ jobs:
env:
TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
- name: Merge and upload coverage data
if: matrix.build_type == 'debug' && matrix.pg_version == 'v14'
@@ -1101,10 +1096,6 @@ jobs:
#
git config --global --add safe.directory ${{ github.workspace }}
git config --global --add safe.directory ${GITHUB_WORKSPACE}
for r in 14 15 16; do
git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
done
- name: Checkout
uses: actions/checkout@v3

View File

@@ -142,10 +142,6 @@ jobs:
#
git config --global --add safe.directory ${{ github.workspace }}
git config --global --add safe.directory ${GITHUB_WORKSPACE}
for r in 14 15 16; do
git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
done
- name: Checkout
uses: actions/checkout@v4
@@ -242,20 +238,6 @@ jobs:
options: --init
steps:
- name: Fix git ownership
run: |
# Workaround for `fatal: detected dubious ownership in repository at ...`
#
# Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
# Ref https://github.com/actions/checkout/issues/785
#
git config --global --add safe.directory ${{ github.workspace }}
git config --global --add safe.directory ${GITHUB_WORKSPACE}
for r in 14 15 16; do
git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
done
- name: Checkout
uses: actions/checkout@v4
with:

View File

@@ -2,7 +2,7 @@ name: Create Release Branch
on:
schedule:
- cron: '0 6 * * 1'
- cron: '0 7 * * 5'
workflow_dispatch:
jobs:

3
.gitignore vendored
View File

@@ -18,6 +18,3 @@ test_output/
*.o
*.so
*.Po
# pgindent typedef lists
*.list

View File

@@ -9,24 +9,6 @@ refactoring, additional comments, and so forth. Let's try to raise the
bar, and clean things up as we go. Try to leave code in a better shape
than it was before.
## Pre-commit hook
We have a sample pre-commit hook in `pre-commit.py`.
To set it up, run:
```bash
ln -s ../../pre-commit.py .git/hooks/pre-commit
```
This will run following checks on staged files before each commit:
- `rustfmt`
- checks for python files, see [obligatory checks](/docs/sourcetree.md#obligatory-checks).
There is also a separate script `./run_clippy.sh` that runs `cargo clippy` on the whole project
and `./scripts/reformat` that runs all formatting tools to ensure the project is up to date.
If you want to skip the hook, run `git commit` with `--no-verify` option.
## Submitting changes
1. Get at least one +1 on your PR before you push.

775
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -5,7 +5,6 @@ members = [
"control_plane",
"pageserver",
"pageserver/ctl",
"pageserver/client",
"proxy",
"safekeeper",
"storage_broker",
@@ -38,19 +37,20 @@ license = "Apache-2.0"
[workspace.dependencies]
anyhow = { version = "1.0", features = ["backtrace"] }
arc-swap = "1.6"
async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
azure_core = "0.18"
azure_identity = "0.18"
azure_storage = "0.18"
azure_storage_blobs = "0.18"
async-compression = { version = "0.4.0", features = ["tokio", "gzip"] }
azure_core = "0.16"
azure_identity = "0.16"
azure_storage = "0.16"
azure_storage_blobs = "0.16"
flate2 = "1.0.26"
async-stream = "0.3"
async-trait = "0.1"
aws-config = { version = "1.0", default-features = false, features=["rustls"] }
aws-sdk-s3 = "1.0"
aws-smithy-async = { version = "1.0", default-features = false, features=["rt-tokio"] }
aws-smithy-types = "1.0"
aws-credential-types = "1.0"
aws-config = { version = "0.56", default-features = false, features=["rustls"] }
aws-sdk-s3 = "0.29"
aws-smithy-http = "0.56"
aws-smithy-async = { version = "0.56", default-features = false, features=["rt-tokio"] }
aws-credential-types = "0.56"
aws-types = "0.56"
axum = { version = "0.6.20", features = ["ws"] }
base64 = "0.13.0"
bincode = "1.3"
@@ -89,7 +89,6 @@ humantime-serde = "1.1.1"
hyper = "0.14"
hyper-tungstenite = "0.11"
inotify = "0.10.2"
ipnet = "2.9.0"
itertools = "0.10"
jsonwebtoken = "8"
libc = "0.2"
@@ -110,7 +109,7 @@ pin-project-lite = "0.2"
prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
prost = "0.11"
rand = "0.8"
regex = "1.10.2"
regex = "1.4"
reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_19"] }
reqwest-middleware = "0.2.0"
@@ -123,17 +122,14 @@ rustls-pemfile = "1"
rustls-split = "0.3"
scopeguard = "1.1"
sysinfo = "0.29.2"
sd-notify = "0.4.1"
sentry = { version = "0.31", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1"
serde_path_to_error = "0.1"
serde_with = "2.0"
serde_assert = "0.5.0"
sha2 = "0.10.2"
signal-hook = "0.3"
smallvec = "1.11"
smol_str = { version = "0.2.0", features = ["serde"] }
socket2 = "0.5"
strum = "0.24"
strum_macros = "0.24"
@@ -150,7 +146,7 @@ tokio-postgres-rustls = "0.10.0"
tokio-rustls = "0.24"
tokio-stream = "0.1"
tokio-tar = "0.3"
tokio-util = { version = "0.7.10", features = ["io", "rt"] }
tokio-util = { version = "0.7", features = ["io"] }
toml = "0.7"
toml_edit = "0.19"
tonic = {version = "0.9", features = ["tls", "tls-roots"]}
@@ -169,11 +165,11 @@ env_logger = "0.10"
log = "0.4"
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="6ce32f791526e27533cab0232a6bb243b2c32584" }
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="6ce32f791526e27533cab0232a6bb243b2c32584" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="6ce32f791526e27533cab0232a6bb243b2c32584" }
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="6ce32f791526e27533cab0232a6bb243b2c32584" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="6ce32f791526e27533cab0232a6bb243b2c32584" }
## Other git libraries
heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
@@ -183,7 +179,6 @@ compute_api = { version = "0.1", path = "./libs/compute_api/" }
consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" }
metrics = { version = "0.1", path = "./libs/metrics/" }
pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
pageserver_client = { path = "./pageserver/client" }
postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" }
postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" }
postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" }
@@ -211,7 +206,7 @@ tonic-build = "0.9"
# This is only needed for proxy's tests.
# TODO: we should probably fork `tokio-postgres-rustls` instead.
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="6ce32f791526e27533cab0232a6bb243b2c32584" }
################# Binary contents sections

View File

@@ -393,9 +393,7 @@ RUN case "${PG_VERSION}" in \
export TIMESCALEDB_CHECKSUM=6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 \
;; \
*) \
export TIMESCALEDB_VERSION=2.13.0 \
export TIMESCALEDB_CHECKSUM=584a351c7775f0e067eaa0e7277ea88cab9077cc4c455cbbf09a5d9723dce95d \
;; \
echo "TimescaleDB not supported on this PostgreSQL version. See https://github.com/timescale/timescaledb/issues/5752" && exit 0;; \
esac && \
apt-get update && \
apt-get install -y cmake && \
@@ -718,20 +716,20 @@ RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.3.tar.gz -
#########################################################################################
#
# Layer "wal2json-build"
# Compile "wal2json" extension
# Layer "pg-wait-sampling-pg-build"
# compile pg_wait_sampling extension
#
#########################################################################################
FROM build-deps AS wal2json-pg-build
FROM build-deps AS pg-wait-sampling-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH "/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \
echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \
mkdir wal2json-src && cd wal2json-src && tar xvzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install
RUN wget https://github.com/postgrespro/pg_wait_sampling/archive/refs/tags/v1.1.5.tar.gz -O pg_wait_sampling.tar.gz && \
echo 'a03da6a413f5652ce470a3635ed6ebba528c74cb26aa4cfced8aff8a8441f81ec6dd657ff62cd6ce96a4e6ce02cad9f2519ae9525367ece60497aa20faafde5c pg_wait_sampling.tar.gz' | sha512sum -c && \
mkdir pg_wait_sampling-src && cd pg_wait_sampling-src && tar xvzf ../pg_wait_sampling.tar.gz --strip-components=1 -C . && \
make USE_PGXS=1 -j $(getconf _NPROCESSORS_ONLN) && \
make USE_PGXS=1 -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_wait_sampling.control
#########################################################################################
#
@@ -769,7 +767,7 @@ COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-roaringbitmap-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql
COPY --from=pg-wait-sampling-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY pgxn/ pgxn/
RUN make -j $(getconf _NPROCESSORS_ONLN) \

View File

@@ -260,44 +260,6 @@ distclean:
fmt:
./pre-commit.py --fix-inplace
postgres-%-pg-bsd-indent: postgres-%
+@echo "Compiling pg_bsd_indent"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/tools/pg_bsd_indent/
# Create typedef list for the core. Note that generally it should be combined with
# buildfarm one to cover platform specific stuff.
# https://wiki.postgresql.org/wiki/Running_pgindent_on_non-core_code_or_development_code
postgres-%-typedefs.list: postgres-%
$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/find_typedef $(POSTGRES_INSTALL_DIR)/$*/bin > $@
# Indent postgres. See src/tools/pgindent/README for details.
.PHONY: postgres-%-pgindent
postgres-%-pgindent: postgres-%-pg-bsd-indent postgres-%-typedefs.list
+@echo merge with buildfarm typedef to cover all platforms
+@echo note: I first tried to download from pgbuildfarm.org, but for unclear reason e.g. \
REL_16_STABLE list misses PGSemaphoreData
# wget -q -O - "http://www.pgbuildfarm.org/cgi-bin/typedefs.pl?branch=REL_16_STABLE" |\
# cat - postgres-$*-typedefs.list | sort | uniq > postgres-$*-typedefs-full.list
cat $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/typedefs.list |\
cat - postgres-$*-typedefs.list | sort | uniq > postgres-$*-typedefs-full.list
+@echo note: you might want to run it on selected files/dirs instead.
INDENT=$(POSTGRES_INSTALL_DIR)/build/$*/src/tools/pg_bsd_indent/pg_bsd_indent \
$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/pgindent --typedefs postgres-$*-typedefs-full.list \
$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/ \
--excludes $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/exclude_file_patterns
rm -f pg*.BAK
# Indent pxgn/neon.
.PHONY: pgindent
neon-pgindent: postgres-v16-pg-bsd-indent neon-pg-ext-v16
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v16/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
FIND_TYPEDEF=$(ROOT_PROJECT_DIR)/vendor/postgres-v16/src/tools/find_typedef \
INDENT=$(POSTGRES_INSTALL_DIR)/build/v16/src/tools/pg_bsd_indent/pg_bsd_indent \
PGINDENT_SCRIPT=$(ROOT_PROJECT_DIR)/vendor/postgres-v16/src/tools/pgindent/pgindent \
-C $(POSTGRES_INSTALL_DIR)/build/neon-v16 \
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile pgindent
.PHONY: setup-pre-commit-hook
setup-pre-commit-hook:
ln -s -f $(ROOT_PROJECT_DIR)/pre-commit.py .git/hooks/pre-commit

View File

@@ -149,9 +149,6 @@ tenant 9ef87a5bf0d92544f6fafeeb3239695c successfully created on the pageserver
Created an initial timeline 'de200bd42b49cc1814412c7e592dd6e9' at Lsn 0/16B5A50 for tenant: 9ef87a5bf0d92544f6fafeeb3239695c
Setting tenant 9ef87a5bf0d92544f6fafeeb3239695c as a default one
# create postgres compute node
> cargo neon endpoint create main
# start postgres compute node
> cargo neon endpoint start main
Starting new endpoint main (PostgreSQL v14) on timeline de200bd42b49cc1814412c7e592dd6e9 ...
@@ -188,11 +185,8 @@ Created timeline 'b3b863fa45fa9e57e615f9f2d944e601' at Lsn 0/16F9A00 for tenant:
(L) main [de200bd42b49cc1814412c7e592dd6e9]
(L) ┗━ @0/16F9A00: migration_check [b3b863fa45fa9e57e615f9f2d944e601]
# create postgres on that branch
> cargo neon endpoint create migration_check --branch-name migration_check
# start postgres on that branch
> cargo neon endpoint start migration_check
> cargo neon endpoint start migration_check --branch-name migration_check
Starting new endpoint migration_check (PostgreSQL v14) on timeline b3b863fa45fa9e57e615f9f2d944e601 ...
Starting postgres at 'postgresql://cloud_admin@127.0.0.1:55434/postgres'

View File

@@ -38,4 +38,3 @@ toml_edit.workspace = true
remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" }
zstd = "0.12.4"
bytes = "1.0"

View File

@@ -31,7 +31,7 @@
//! -C 'postgresql://cloud_admin@localhost/postgres' \
//! -S /var/db/postgres/specs/current.json \
//! -b /usr/local/bin/postgres \
//! -r http://pg-ext-s3-gateway
//! -r {"bucket": "neon-dev-extensions-eu-central-1", "region": "eu-central-1"}
//! ```
//!
use std::collections::HashMap;
@@ -51,7 +51,7 @@ use compute_api::responses::ComputeStatus;
use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec};
use compute_tools::configurator::launch_configurator;
use compute_tools::extension_server::get_pg_version;
use compute_tools::extension_server::{get_pg_version, init_remote_storage};
use compute_tools::http::api::launch_http_server;
use compute_tools::logger::*;
use compute_tools::monitor::launch_monitor;
@@ -60,7 +60,7 @@ use compute_tools::spec::*;
// this is an arbitrary build tag. Fine as a default / for testing purposes
// in-case of not-set environment var
const BUILD_TAG_DEFAULT: &str = "latest";
const BUILD_TAG_DEFAULT: &str = "5670669815";
fn main() -> Result<()> {
init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
@@ -74,18 +74,10 @@ fn main() -> Result<()> {
let pgbin_default = String::from("postgres");
let pgbin = matches.get_one::<String>("pgbin").unwrap_or(&pgbin_default);
let ext_remote_storage = matches
.get_one::<String>("remote-ext-config")
// Compatibility hack: if the control plane specified any remote-ext-config
// use the default value for extension storage proxy gateway.
// Remove this once the control plane is updated to pass the gateway URL
.map(|conf| {
if conf.starts_with("http") {
conf.trim_end_matches('/')
} else {
"http://pg-ext-s3-gateway"
}
});
let remote_ext_config = matches.get_one::<String>("remote-ext-config");
let ext_remote_storage = remote_ext_config.map(|x| {
init_remote_storage(x).expect("cannot initialize remote extension storage from config")
});
let http_port = *matches
.get_one::<u16>("http-port")
@@ -206,7 +198,7 @@ fn main() -> Result<()> {
live_config_allowed,
state: Mutex::new(new_state),
state_changed: Condvar::new(),
ext_remote_storage: ext_remote_storage.map(|s| s.to_string()),
ext_remote_storage,
ext_download_progress: RwLock::new(HashMap::new()),
build_tag,
};
@@ -274,13 +266,7 @@ fn main() -> Result<()> {
let mut state = compute.state.lock().unwrap();
state.error = Some(format!("{:?}", err));
state.status = ComputeStatus::Failed;
// Notify others that Postgres failed to start. In case of configuring the
// empty compute, it's likely that API handler is still waiting for compute
// state change. With this we will notify it that compute is in Failed state,
// so control plane will know about it earlier and record proper error instead
// of timeout.
compute.state_changed.notify_all();
drop(state); // unlock
drop(state);
delay_exit = true;
None
}
@@ -493,6 +479,13 @@ fn cli() -> clap::Command {
)
.value_name("FILECACHE_CONNSTR"),
)
.arg(
// DEPRECATED, NO LONGER DOES ANYTHING.
// See https://github.com/neondatabase/cloud/issues/7516
Arg::new("file-cache-on-disk")
.long("file-cache-on-disk")
.action(clap::ArgAction::SetTrue),
)
}
#[test]

View File

@@ -2,6 +2,7 @@ use std::collections::HashMap;
use std::env;
use std::fs;
use std::io::BufRead;
use std::io::Write;
use std::os::unix::fs::PermissionsExt;
use std::path::Path;
use std::process::{Command, Stdio};
@@ -14,6 +15,7 @@ use chrono::{DateTime, Utc};
use futures::future::join_all;
use futures::stream::FuturesUnordered;
use futures::StreamExt;
use notify::event;
use postgres::{Client, NoTls};
use tokio;
use tokio_postgres;
@@ -22,10 +24,10 @@ use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;
use compute_api::responses::{ComputeMetrics, ComputeStatus};
use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec};
use compute_api::spec::{ComputeMode, ComputeSpec};
use utils::measured_stream::MeasuredReader;
use remote_storage::{DownloadError, RemotePath};
use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
use crate::checker::create_availability_check_data;
use crate::pg_helpers::*;
@@ -59,8 +61,8 @@ pub struct ComputeNode {
pub state: Mutex<ComputeState>,
/// `Condvar` to allow notifying waiters about state changes.
pub state_changed: Condvar,
/// the address of extension storage proxy gateway
pub ext_remote_storage: Option<String>,
/// the S3 bucket that we search for extensions in
pub ext_remote_storage: Option<GenericRemoteStorage>,
// key: ext_archive_name, value: started download time, download_completed?
pub ext_download_progress: RwLock<HashMap<String, (DateTime<Utc>, bool)>>,
pub build_tag: String,
@@ -252,7 +254,7 @@ fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()>
IF NOT EXISTS (
SELECT FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser')
THEN
CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN REPLICATION BYPASSRLS IN ROLE pg_read_all_data, pg_write_all_data;
CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN REPLICATION IN ROLE pg_read_all_data, pg_write_all_data;
IF array_length(roles, 1) IS NOT NULL THEN
EXECUTE format('GRANT neon_superuser TO %s',
array_to_string(ARRAY(SELECT quote_ident(x) FROM unnest(roles) as x), ', '));
@@ -277,17 +279,6 @@ fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()>
}
impl ComputeNode {
/// Check that compute node has corresponding feature enabled.
pub fn has_feature(&self, feature: ComputeFeature) -> bool {
let state = self.state.lock().unwrap();
if let Some(s) = state.pspec.as_ref() {
s.spec.features.contains(&feature)
} else {
false
}
}
pub fn set_status(&self, status: ComputeStatus) {
let mut state = self.state.lock().unwrap();
state.status = status;
@@ -655,9 +646,30 @@ impl ComputeNode {
} else {
vec![]
})
.stderr(Stdio::piped())
.spawn()
.expect("cannot start postgres process");
let stderr = pg.stderr.take().unwrap();
std::thread::spawn(move || {
let reader = std::io::BufReader::new(stderr);
let mut last_lines = vec![];
for line in reader.lines() {
if let Ok(line) = line {
if line.starts_with("2023-") {
// print all lines from the previous postgres instance
let combined = format!("PG:{}\n", last_lines.join("\u{200B}"));
let res = std::io::stderr().lock().write_all(combined.as_bytes());
if let Err(e) = res {
error!("failed to write to stderr: {}", e);
}
last_lines.clear();
}
last_lines.push(line);
}
}
});
wait_for_postgres(&mut pg, pgdata_path)?;
Ok(pg)
@@ -709,7 +721,6 @@ impl ComputeNode {
handle_role_deletions(spec, self.connstr.as_str(), &mut client)?;
handle_grants(spec, &mut client, self.connstr.as_str())?;
handle_extensions(spec, &mut client)?;
handle_extension_neon(&mut client)?;
create_availability_check_data(&mut client)?;
// 'Close' connection
@@ -739,12 +750,7 @@ impl ComputeNode {
// Write new config
let pgdata_path = Path::new(&self.pgdata);
let postgresql_conf_path = pgdata_path.join("postgresql.conf");
config::write_postgres_conf(&postgresql_conf_path, &spec, None)?;
// temporarily reset max_cluster_size in config
// to avoid the possibility of hitting the limit, while we are reconfiguring:
// creating new extensions, roles, etc...
config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?;
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec, None)?;
self.pg_reload_conf()?;
let mut client = Client::connect(self.connstr.as_str(), NoTls)?;
@@ -759,16 +765,11 @@ impl ComputeNode {
handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?;
handle_grants(&spec, &mut client, self.connstr.as_str())?;
handle_extensions(&spec, &mut client)?;
handle_extension_neon(&mut client)?;
}
// 'Close' connection
drop(client);
// reset max_cluster_size in config back to original value and reload config
config::compute_ctl_temp_override_remove(pgdata_path)?;
self.pg_reload_conf()?;
let unknown_op = "unknown".to_string();
let op_id = spec.operation_uuid.as_ref().unwrap_or(&unknown_op);
info!(
@@ -829,17 +830,7 @@ impl ComputeNode {
let config_time = Utc::now();
if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates {
let pgdata_path = Path::new(&self.pgdata);
// temporarily reset max_cluster_size in config
// to avoid the possibility of hitting the limit, while we are applying config:
// creating new extensions, roles, etc...
config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?;
self.pg_reload_conf()?;
self.apply_config(&compute_state)?;
config::compute_ctl_temp_override_remove(pgdata_path)?;
self.pg_reload_conf()?;
}
let startup_end_time = Utc::now();
@@ -987,12 +978,12 @@ LIMIT 100",
real_ext_name: String,
ext_path: RemotePath,
) -> Result<u64, DownloadError> {
let ext_remote_storage =
self.ext_remote_storage
.as_ref()
.ok_or(DownloadError::BadInput(anyhow::anyhow!(
"Remote extensions storage is not configured",
)))?;
let remote_storage = self
.ext_remote_storage
.as_ref()
.ok_or(DownloadError::BadInput(anyhow::anyhow!(
"Remote extensions storage is not configured",
)))?;
let ext_archive_name = ext_path.object_name().expect("bad path");
@@ -1048,7 +1039,7 @@ LIMIT 100",
let download_size = extension_server::download_extension(
&real_ext_name,
&ext_path,
ext_remote_storage,
remote_storage,
&self.pgbin,
)
.await

View File

@@ -93,25 +93,5 @@ pub fn write_postgres_conf(
writeln!(file, "neon.extension_server_port={}", port)?;
}
// This is essential to keep this line at the end of the file,
// because it is intended to override any settings above.
writeln!(file, "include_if_exists = 'compute_ctl_temp_override.conf'")?;
Ok(())
}
/// create file compute_ctl_temp_override.conf in pgdata_dir
/// add provided options to this file
pub fn compute_ctl_temp_override_create(pgdata_path: &Path, options: &str) -> Result<()> {
let path = pgdata_path.join("compute_ctl_temp_override.conf");
let mut file = File::create(path)?;
write!(file, "{}", options)?;
Ok(())
}
/// remove file compute_ctl_temp_override.conf in pgdata_dir
pub fn compute_ctl_temp_override_remove(pgdata_path: &Path) -> Result<()> {
let path = pgdata_path.join("compute_ctl_temp_override.conf");
std::fs::remove_file(path)?;
Ok(())
}

View File

@@ -71,16 +71,18 @@ More specifically, here is an example ext_index.json
}
}
*/
use anyhow::Context;
use anyhow::{self, Result};
use anyhow::{bail, Context};
use bytes::Bytes;
use compute_api::spec::RemoteExtSpec;
use regex::Regex;
use remote_storage::*;
use reqwest::StatusCode;
use serde_json;
use std::io::Read;
use std::num::NonZeroUsize;
use std::path::Path;
use std::str;
use tar::Archive;
use tokio::io::AsyncReadExt;
use tracing::info;
use tracing::log::warn;
use zstd::stream::read::Decoder;
@@ -136,31 +138,23 @@ fn parse_pg_version(human_version: &str) -> &str {
pub async fn download_extension(
ext_name: &str,
ext_path: &RemotePath,
ext_remote_storage: &str,
remote_storage: &GenericRemoteStorage,
pgbin: &str,
) -> Result<u64> {
info!("Download extension {:?} from {:?}", ext_name, ext_path);
// TODO add retry logic
let download_buffer =
match download_extension_tar(ext_remote_storage, &ext_path.to_string()).await {
Ok(buffer) => buffer,
Err(error_message) => {
return Err(anyhow::anyhow!(
"error downloading extension {:?}: {:?}",
ext_name,
error_message
));
}
};
let mut download = remote_storage.download(ext_path).await?;
let mut download_buffer = Vec::new();
download
.download_stream
.read_to_end(&mut download_buffer)
.await?;
let download_size = download_buffer.len() as u64;
info!("Download size {:?}", download_size);
// it's unclear whether it is more performant to decompress into memory or not
// TODO: decompressing into memory can be avoided
let decoder = Decoder::new(download_buffer.as_ref())?;
let mut archive = Archive::new(decoder);
let mut decoder = Decoder::new(download_buffer.as_slice())?;
let mut decompress_buffer = Vec::new();
decoder.read_to_end(&mut decompress_buffer)?;
let mut archive = Archive::new(decompress_buffer.as_slice());
let unzip_dest = pgbin
.strip_suffix("/bin/postgres")
.expect("bad pgbin")
@@ -228,32 +222,29 @@ pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) {
}
}
// Do request to extension storage proxy, i.e.
// curl http://pg-ext-s3-gateway/latest/v15/extensions/anon.tar.zst
// using HHTP GET
// and return the response body as bytes
//
async fn download_extension_tar(ext_remote_storage: &str, ext_path: &str) -> Result<Bytes> {
let uri = format!("{}/{}", ext_remote_storage, ext_path);
info!("Download extension {:?} from uri {:?}", ext_path, uri);
let resp = reqwest::get(uri).await?;
match resp.status() {
StatusCode::OK => match resp.bytes().await {
Ok(resp) => {
info!("Download extension {:?} completed successfully", ext_path);
Ok(resp)
}
Err(e) => bail!("could not deserialize remote extension response: {}", e),
},
StatusCode::SERVICE_UNAVAILABLE => bail!("remote extension is temporarily unavailable"),
_ => bail!(
"unexpected remote extension response status code: {}",
resp.status()
),
// This function initializes the necessary structs to use remote storage
pub fn init_remote_storage(remote_ext_config: &str) -> anyhow::Result<GenericRemoteStorage> {
#[derive(Debug, serde::Deserialize)]
struct RemoteExtJson {
bucket: String,
region: String,
endpoint: Option<String>,
prefix: Option<String>,
}
let remote_ext_json = serde_json::from_str::<RemoteExtJson>(remote_ext_config)?;
let config = S3Config {
bucket_name: remote_ext_json.bucket,
bucket_region: remote_ext_json.region,
prefix_in_bucket: remote_ext_json.prefix,
endpoint: remote_ext_json.endpoint,
concurrency_limit: NonZeroUsize::new(100).expect("100 != 0"),
max_keys_per_list_response: None,
};
let config = RemoteStorageConfig {
storage: RemoteStorageKind::AwsS3(config),
};
GenericRemoteStorage::from_config(&config)
}
#[cfg(test)]

View File

@@ -123,7 +123,7 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
}
}
// download extension files from remote extension storage on demand
// download extension files from S3 on demand
(&Method::POST, route) if route.starts_with("/extension_server/") => {
info!("serving {:?} POST request", route);
info!("req.uri {:?}", req.uri());
@@ -227,7 +227,7 @@ async fn handle_configure_request(
let parsed_spec = match ParsedSpec::try_from(spec) {
Ok(ps) => ps,
Err(msg) => return Err((msg, StatusCode::BAD_REQUEST)),
Err(msg) => return Err((msg, StatusCode::PRECONDITION_FAILED)),
};
// XXX: wrap state update under lock in code blocks. Otherwise,

View File

@@ -156,17 +156,17 @@ paths:
description: Error text or 'OK' if download succeeded.
example: "OK"
400:
description: Request is invalid.
content:
application/json:
schema:
$ref: "#/components/schemas/GenericError"
description: Request is invalid.
content:
application/json:
schema:
$ref: "#/components/schemas/GenericError"
500:
description: Extension download request failed.
content:
application/json:
schema:
$ref: "#/components/schemas/GenericError"
description: Extension download request failed.
content:
application/json:
schema:
$ref: "#/components/schemas/GenericError"
components:
securitySchemes:

View File

@@ -193,11 +193,16 @@ impl Escaping for PgIdent {
/// Build a list of existing Postgres roles
pub fn get_existing_roles(xact: &mut Transaction<'_>) -> Result<Vec<Role>> {
let postgres_roles = xact
.query("SELECT rolname, rolpassword FROM pg_catalog.pg_authid", &[])?
.query(
"SELECT rolname, rolpassword, rolreplication, rolbypassrls FROM pg_catalog.pg_authid",
&[],
)?
.iter()
.map(|row| Role {
name: row.get("rolname"),
encrypted_password: row.get("rolpassword"),
replication: Some(row.get("rolreplication")),
bypassrls: Some(row.get("rolbypassrls")),
options: None,
})
.collect();

View File

@@ -118,6 +118,19 @@ pub fn get_spec_from_control_plane(
spec
}
/// It takes cluster specification and does the following:
/// - Serialize cluster config and put it into `postgresql.conf` completely rewriting the file.
/// - Update `pg_hba.conf` to allow external connections.
pub fn handle_configuration(spec: &ComputeSpec, pgdata_path: &Path) -> Result<()> {
// File `postgresql.conf` is no longer included into `basebackup`, so just
// always write all config into it creating new file.
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec, None)?;
update_pg_hba(pgdata_path)?;
Ok(())
}
/// Check `pg_hba.conf` and update if needed to allow external connections.
pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> {
// XXX: consider making it a part of spec.json
@@ -252,6 +265,8 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
let action = if let Some(r) = pg_role {
if (r.encrypted_password.is_none() && role.encrypted_password.is_some())
|| (r.encrypted_password.is_some() && role.encrypted_password.is_none())
|| !r.bypassrls.unwrap_or(false)
|| !r.replication.unwrap_or(false)
{
RoleAction::Update
} else if let Some(pg_pwd) = &r.encrypted_password {
@@ -283,22 +298,14 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
match action {
RoleAction::None => {}
RoleAction::Update => {
// This can be run on /every/ role! Not just ones created through the console.
// This means that if you add some funny ALTER here that adds a permission,
// this will get run even on user-created roles! This will result in different
// behavior before and after a spec gets reapplied. The below ALTER as it stands
// now only grants LOGIN and changes the password. Please do not allow this branch
// to do anything silly.
let mut query: String = format!("ALTER ROLE {} ", name.pg_quote());
let mut query: String =
format!("ALTER ROLE {} BYPASSRLS REPLICATION", name.pg_quote());
query.push_str(&role.to_pg_options());
xact.execute(query.as_str(), &[])?;
}
RoleAction::Create => {
// This branch only runs when roles are created through the console, so it is
// safe to add more permissions here. BYPASSRLS and REPLICATION are inherited
// from neon_superuser.
let mut query: String = format!(
"CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
"CREATE ROLE {} CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
name.pg_quote()
);
info!("role create query: '{}'", &query);
@@ -663,37 +670,13 @@ pub fn handle_extensions(spec: &ComputeSpec, client: &mut Client) -> Result<()>
info!("creating system extensions with query: {}", query);
client.simple_query(query)?;
}
if libs.contains("pg_wait_sampling") {
// Create extension only if this compute really needs it
let query = "CREATE EXTENSION IF NOT EXISTS pg_wait_sampling";
info!("creating system extensions with query: {}", query);
client.simple_query(query)?;
}
}
Ok(())
}
/// Run CREATE and ALTER EXTENSION neon UPDATE for postgres database
#[instrument(skip_all)]
pub fn handle_extension_neon(client: &mut Client) -> Result<()> {
info!("handle extension neon");
let mut query = "CREATE SCHEMA IF NOT EXISTS neon";
client.simple_query(query)?;
query = "CREATE EXTENSION IF NOT EXISTS neon WITH SCHEMA neon";
info!("create neon extension with query: {}", query);
client.simple_query(query)?;
query = "UPDATE pg_extension SET extrelocatable = true WHERE extname = 'neon'";
client.simple_query(query)?;
query = "ALTER EXTENSION neon SET SCHEMA neon";
info!("alter neon extension schema with query: {}", query);
client.simple_query(query)?;
// this will be a no-op if extension is already up to date,
// which may happen in two cases:
// - extension was just installed
// - extension was already installed and is up to date
let query = "ALTER EXTENSION neon UPDATE";
info!("update neon extension schema with query: {}", query);
client.simple_query(query)?;
Ok(())
}

View File

@@ -6,11 +6,9 @@ license.workspace = true
[dependencies]
anyhow.workspace = true
async-trait.workspace = true
camino.workspace = true
clap.workspace = true
comfy-table.workspace = true
futures.workspace = true
git-version.workspace = true
nix.workspace = true
once_cell.workspace = true
@@ -26,11 +24,10 @@ tar.workspace = true
thiserror.workspace = true
toml.workspace = true
tokio.workspace = true
tokio-postgres.workspace = true
tokio-util.workspace = true
url.workspace = true
# Note: Do not directly depend on pageserver or safekeeper; use pageserver_api or safekeeper_api
# instead, so that recompile times are better.
pageserver_api.workspace = true
pageserver_client.workspace = true
postgres_backend.workspace = true
safekeeper_api.workspace = true
postgres_connection.workspace = true

View File

@@ -9,7 +9,7 @@ pub struct AttachmentService {
env: LocalEnv,
listen: String,
path: PathBuf,
client: reqwest::Client,
client: reqwest::blocking::Client,
}
const COMMAND: &str = "attachment_service";
@@ -53,7 +53,7 @@ impl AttachmentService {
env: env.clone(),
path,
listen,
client: reqwest::ClientBuilder::new()
client: reqwest::blocking::ClientBuilder::new()
.build()
.expect("Failed to construct http client"),
}
@@ -64,7 +64,7 @@ impl AttachmentService {
.expect("non-Unicode path")
}
pub async fn start(&self) -> anyhow::Result<Child> {
pub fn start(&self) -> anyhow::Result<Child> {
let path_str = self.path.to_string_lossy();
background_process::start_process(
@@ -73,11 +73,10 @@ impl AttachmentService {
&self.env.attachment_service_bin(),
["-l", &self.listen, "-p", &path_str],
[],
background_process::InitialPidFile::Create(self.pid_file()),
background_process::InitialPidFile::Create(&self.pid_file()),
// TODO: a real status check
|| async move { anyhow::Ok(true) },
|| Ok(true),
)
.await
}
pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
@@ -85,7 +84,7 @@ impl AttachmentService {
}
/// Call into the attach_hook API, for use before handing out attachments to pageservers
pub async fn attach_hook(
pub fn attach_hook(
&self,
tenant_id: TenantId,
pageserver_id: NodeId,
@@ -105,16 +104,16 @@ impl AttachmentService {
node_id: Some(pageserver_id),
};
let response = self.client.post(url).json(&request).send().await?;
let response = self.client.post(url).json(&request).send()?;
if response.status() != StatusCode::OK {
return Err(anyhow!("Unexpected status {}", response.status()));
}
let response = response.json::<AttachHookResponse>().await?;
let response = response.json::<AttachHookResponse>()?;
Ok(response.gen)
}
pub async fn inspect(&self, tenant_id: TenantId) -> anyhow::Result<Option<(u32, NodeId)>> {
pub fn inspect(&self, tenant_id: TenantId) -> anyhow::Result<Option<(u32, NodeId)>> {
use hyper::StatusCode;
let url = self
@@ -127,12 +126,12 @@ impl AttachmentService {
let request = InspectRequest { tenant_id };
let response = self.client.post(url).json(&request).send().await?;
let response = self.client.post(url).json(&request).send()?;
if response.status() != StatusCode::OK {
return Err(anyhow!("Unexpected status {}", response.status()));
}
let response = response.json::<InspectResponse>().await?;
let response = response.json::<InspectResponse>()?;
Ok(response.attachment)
}
}

View File

@@ -44,15 +44,15 @@ const NOTICE_AFTER_RETRIES: u64 = 50;
/// Argument to `start_process`, to indicate whether it should create pidfile or if the process creates
/// it itself.
pub enum InitialPidFile {
pub enum InitialPidFile<'t> {
/// Create a pidfile, to allow future CLI invocations to manipulate the process.
Create(Utf8PathBuf),
Create(&'t Utf8Path),
/// The process will create the pidfile itself, need to wait for that event.
Expect(Utf8PathBuf),
Expect(&'t Utf8Path),
}
/// Start a background child process using the parameters given.
pub async fn start_process<F, Fut, AI, A, EI>(
pub fn start_process<F, AI, A, EI>(
process_name: &str,
datadir: &Path,
command: &Path,
@@ -62,8 +62,7 @@ pub async fn start_process<F, Fut, AI, A, EI>(
process_status_check: F,
) -> anyhow::Result<Child>
where
F: Fn() -> Fut,
Fut: std::future::Future<Output = anyhow::Result<bool>>,
F: Fn() -> anyhow::Result<bool>,
AI: IntoIterator<Item = A>,
A: AsRef<OsStr>,
// Not generic AsRef<OsStr>, otherwise empty `envs` prevents type inference
@@ -90,7 +89,7 @@ where
let filled_cmd = fill_remote_storage_secrets_vars(fill_rust_env_vars(background_command));
filled_cmd.envs(envs);
let pid_file_to_check = match &initial_pid_file {
let pid_file_to_check = match initial_pid_file {
InitialPidFile::Create(path) => {
pre_exec_create_pidfile(filled_cmd, path);
path
@@ -108,7 +107,7 @@ where
);
for retries in 0..RETRIES {
match process_started(pid, pid_file_to_check, &process_status_check).await {
match process_started(pid, Some(pid_file_to_check), &process_status_check) {
Ok(true) => {
println!("\n{process_name} started, pid: {pid}");
return Ok(spawned_process);
@@ -317,20 +316,22 @@ where
cmd
}
async fn process_started<F, Fut>(
fn process_started<F>(
pid: Pid,
pid_file_to_check: &Utf8Path,
pid_file_to_check: Option<&Utf8Path>,
status_check: &F,
) -> anyhow::Result<bool>
where
F: Fn() -> Fut,
Fut: std::future::Future<Output = anyhow::Result<bool>>,
F: Fn() -> anyhow::Result<bool>,
{
match status_check().await {
Ok(true) => match pid_file::read(pid_file_to_check)? {
PidFileRead::NotExist => Ok(false),
PidFileRead::LockedByOtherProcess(pid_in_file) => Ok(pid_in_file == pid),
PidFileRead::NotHeldByAnyProcess(_) => Ok(false),
match status_check() {
Ok(true) => match pid_file_to_check {
Some(pid_file_path) => match pid_file::read(pid_file_path)? {
PidFileRead::NotExist => Ok(false),
PidFileRead::LockedByOtherProcess(pid_in_file) => Ok(pid_in_file == pid),
PidFileRead::NotHeldByAnyProcess(_) => Ok(false),
},
None => Ok(true),
},
Ok(false) => Ok(false),
Err(e) => anyhow::bail!("process failed to start: {e}"),

View File

@@ -9,7 +9,6 @@ use clap::Parser;
use hex::FromHex;
use hyper::StatusCode;
use hyper::{Body, Request, Response};
use pageserver_api::shard::TenantShardId;
use serde::{Deserialize, Serialize};
use std::path::{Path, PathBuf};
use std::{collections::HashMap, sync::Arc};
@@ -174,8 +173,7 @@ async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiE
if state.pageserver == Some(reattach_req.node_id) {
state.generation += 1;
response.tenants.push(ReAttachResponseTenant {
// TODO(sharding): make this shard-aware
id: TenantShardId::unsharded(*t),
id: *t,
gen: state.generation,
});
}
@@ -198,15 +196,8 @@ async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiEr
};
for req_tenant in validate_req.tenants {
// TODO(sharding): make this shard-aware
if let Some(tenant_state) = locked.tenants.get(&req_tenant.id.tenant_id) {
if let Some(tenant_state) = locked.tenants.get(&req_tenant.id) {
let valid = tenant_state.generation == req_tenant.gen;
tracing::info!(
"handle_validate: {}(gen {}): valid={valid} (latest {})",
req_tenant.id,
req_tenant.gen,
tenant_state.generation
);
response.tenants.push(ValidateResponseTenant {
id: req_tenant.id,
valid,
@@ -256,13 +247,6 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
tenant_state.pageserver = attach_req.node_id;
let generation = tenant_state.generation;
tracing::info!(
"handle_attach_hook: tenant {} set generation {}, pageserver {}",
attach_req.tenant_id,
tenant_state.generation,
attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff))
);
locked.save().await.map_err(ApiError::InternalServerError)?;
json_response(
@@ -302,7 +286,6 @@ async fn main() -> anyhow::Result<()> {
logging::init(
LogFormat::Plain,
logging::TracingErrorLayerEnablement::Disabled,
logging::Output::Stdout,
)?;
let args = Cli::parse();

View File

@@ -120,20 +120,15 @@ fn main() -> Result<()> {
let mut env = LocalEnv::load_config().context("Error loading config")?;
let original_env = env.clone();
let rt = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()
.unwrap();
let subcommand_result = match sub_name {
"tenant" => rt.block_on(handle_tenant(sub_args, &mut env)),
"timeline" => rt.block_on(handle_timeline(sub_args, &mut env)),
"start" => rt.block_on(handle_start_all(sub_args, &env)),
"tenant" => handle_tenant(sub_args, &mut env),
"timeline" => handle_timeline(sub_args, &mut env),
"start" => handle_start_all(sub_args, &env),
"stop" => handle_stop_all(sub_args, &env),
"pageserver" => rt.block_on(handle_pageserver(sub_args, &env)),
"attachment_service" => rt.block_on(handle_attachment_service(sub_args, &env)),
"safekeeper" => rt.block_on(handle_safekeeper(sub_args, &env)),
"endpoint" => rt.block_on(handle_endpoint(sub_args, &env)),
"pageserver" => handle_pageserver(sub_args, &env),
"attachment_service" => handle_attachment_service(sub_args, &env),
"safekeeper" => handle_safekeeper(sub_args, &env),
"endpoint" => handle_endpoint(sub_args, &env),
"mappings" => handle_mappings(sub_args, &mut env),
"pg" => bail!("'pg' subcommand has been renamed to 'endpoint'"),
_ => bail!("unexpected subcommand {sub_name}"),
@@ -173,7 +168,7 @@ fn print_timelines_tree(
info: t.clone(),
children: BTreeSet::new(),
name: timeline_name_mappings
.remove(&TenantTimelineId::new(t.tenant_id.tenant_id, t.timeline_id)),
.remove(&TenantTimelineId::new(t.tenant_id, t.timeline_id)),
},
)
})
@@ -274,13 +269,12 @@ fn print_timeline(
/// Returns a map of timeline IDs to timeline_id@lsn strings.
/// Connects to the pageserver to query this information.
async fn get_timeline_infos(
fn get_timeline_infos(
env: &local_env::LocalEnv,
tenant_id: &TenantId,
) -> Result<HashMap<TimelineId, TimelineInfo>> {
Ok(get_default_pageserver(env)
.timeline_list(tenant_id)
.await?
.timeline_list(tenant_id)?
.into_iter()
.map(|timeline_info| (timeline_info.timeline_id, timeline_info))
.collect())
@@ -379,14 +373,11 @@ fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> {
.collect()
}
async fn handle_tenant(
tenant_match: &ArgMatches,
env: &mut local_env::LocalEnv,
) -> anyhow::Result<()> {
fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> anyhow::Result<()> {
let pageserver = get_default_pageserver(env);
match tenant_match.subcommand() {
Some(("list", _)) => {
for t in pageserver.tenant_list().await? {
for t in pageserver.tenant_list()? {
println!("{} {:?}", t.id, t.state);
}
}
@@ -403,16 +394,12 @@ async fn handle_tenant(
// We must register the tenant with the attachment service, so
// that when the pageserver restarts, it will be re-attached.
let attachment_service = AttachmentService::from_env(env);
attachment_service
.attach_hook(tenant_id, pageserver.conf.id)
.await?
attachment_service.attach_hook(tenant_id, pageserver.conf.id)?
} else {
None
};
pageserver
.tenant_create(tenant_id, generation, tenant_conf)
.await?;
pageserver.tenant_create(tenant_id, generation, tenant_conf)?;
println!("tenant {tenant_id} successfully created on the pageserver");
// Create an initial timeline for the new tenant
@@ -422,16 +409,13 @@ async fn handle_tenant(
.copied()
.context("Failed to parse postgres version from the argument string")?;
let timeline_info = pageserver
.timeline_create(
tenant_id,
new_timeline_id,
None,
None,
Some(pg_version),
None,
)
.await?;
let timeline_info = pageserver.timeline_create(
tenant_id,
new_timeline_id,
None,
None,
Some(pg_version),
)?;
let new_timeline_id = timeline_info.timeline_id;
let last_record_lsn = timeline_info.last_record_lsn;
@@ -465,7 +449,6 @@ async fn handle_tenant(
pageserver
.tenant_config(tenant_id, tenant_conf)
.await
.with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?;
println!("tenant {tenant_id} successfully configured on the pageserver");
}
@@ -474,7 +457,7 @@ async fn handle_tenant(
let new_pageserver = get_pageserver(env, matches)?;
let new_pageserver_id = new_pageserver.conf.id;
migrate_tenant(env, tenant_id, new_pageserver).await?;
migrate_tenant(env, tenant_id, new_pageserver)?;
println!("tenant {tenant_id} migrated to {}", new_pageserver_id);
}
@@ -484,13 +467,13 @@ async fn handle_tenant(
Ok(())
}
async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> {
fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> {
let pageserver = get_default_pageserver(env);
match timeline_match.subcommand() {
Some(("list", list_match)) => {
let tenant_id = get_tenant_id(list_match, env)?;
let timelines = pageserver.timeline_list(&tenant_id).await?;
let timelines = pageserver.timeline_list(&tenant_id)?;
print_timelines_tree(timelines, env.timeline_name_mappings())?;
}
Some(("create", create_match)) => {
@@ -504,18 +487,8 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
.copied()
.context("Failed to parse postgres version from the argument string")?;
let new_timeline_id_opt = parse_timeline_id(create_match)?;
let timeline_info = pageserver
.timeline_create(
tenant_id,
new_timeline_id_opt,
None,
None,
Some(pg_version),
None,
)
.await?;
let timeline_info =
pageserver.timeline_create(tenant_id, None, None, None, Some(pg_version))?;
let new_timeline_id = timeline_info.timeline_id;
let last_record_lsn = timeline_info.last_record_lsn;
@@ -560,9 +533,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
let mut cplane = ComputeControlPlane::load(env.clone())?;
println!("Importing timeline into pageserver ...");
pageserver
.timeline_import(tenant_id, timeline_id, base, pg_wal, pg_version)
.await?;
pageserver.timeline_import(tenant_id, timeline_id, base, pg_wal, pg_version)?;
env.register_branch_mapping(name.to_string(), tenant_id, timeline_id)?;
println!("Creating endpoint for imported timeline ...");
@@ -598,16 +569,13 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
.map(|lsn_str| Lsn::from_str(lsn_str))
.transpose()
.context("Failed to parse ancestor start Lsn from the request")?;
let timeline_info = pageserver
.timeline_create(
tenant_id,
None,
start_lsn,
Some(ancestor_timeline_id),
None,
None,
)
.await?;
let timeline_info = pageserver.timeline_create(
tenant_id,
None,
start_lsn,
Some(ancestor_timeline_id),
None,
)?;
let new_timeline_id = timeline_info.timeline_id;
let last_record_lsn = timeline_info.last_record_lsn;
@@ -626,22 +594,22 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
Ok(())
}
async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
let (sub_name, sub_args) = match ep_match.subcommand() {
Some(ep_subcommand_data) => ep_subcommand_data,
None => bail!("no endpoint subcommand provided"),
};
let mut cplane = ComputeControlPlane::load(env.clone())?;
// All subcommands take an optional --tenant-id option
let tenant_id = get_tenant_id(sub_args, env)?;
match sub_name {
"list" => {
let tenant_id = get_tenant_id(sub_args, env)?;
let timeline_infos = get_timeline_infos(env, &tenant_id)
.await
.unwrap_or_else(|e| {
eprintln!("Failed to load timeline info: {}", e);
HashMap::new()
});
let timeline_infos = get_timeline_infos(env, &tenant_id).unwrap_or_else(|e| {
eprintln!("Failed to load timeline info: {}", e);
HashMap::new()
});
let timeline_name_mappings = env.timeline_name_mappings();
@@ -697,7 +665,6 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
println!("{table}");
}
"create" => {
let tenant_id = get_tenant_id(sub_args, env)?;
let branch_name = sub_args
.get_one::<String>("branch-name")
.map(|s| s.as_str())
@@ -742,18 +709,6 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
(Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"),
};
match (mode, hot_standby) {
(ComputeMode::Static(_), true) => {
bail!("Cannot start a node in hot standby mode when it is already configured as a static replica")
}
(ComputeMode::Primary, true) => {
bail!("Cannot start a node as a hot standby replica, it is already configured as primary node")
}
_ => {}
}
cplane.check_conflicting_endpoints(mode, tenant_id, timeline_id)?;
cplane.new_endpoint(
&endpoint_id,
tenant_id,
@@ -766,6 +721,8 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
)?;
}
"start" => {
let pg_port: Option<u16> = sub_args.get_one::<u16>("pg-port").copied();
let http_port: Option<u16> = sub_args.get_one::<u16>("http-port").copied();
let endpoint_id = sub_args
.get_one::<String>("endpoint_id")
.ok_or_else(|| anyhow!("No endpoint ID was provided to start"))?;
@@ -794,30 +751,80 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
env.safekeepers.iter().map(|sk| sk.id).collect()
};
let endpoint = cplane
.endpoints
.get(endpoint_id.as_str())
.ok_or_else(|| anyhow::anyhow!("endpoint {endpoint_id} not found"))?;
cplane.check_conflicting_endpoints(
endpoint.mode,
endpoint.tenant_id,
endpoint.timeline_id,
)?;
let endpoint = cplane.endpoints.get(endpoint_id.as_str());
let ps_conf = env.get_pageserver_conf(pageserver_id)?;
let auth_token = if matches!(ps_conf.pg_auth_type, AuthType::NeonJWT) {
let claims = Claims::new(Some(endpoint.tenant_id), Scope::Tenant);
let claims = Claims::new(Some(tenant_id), Scope::Tenant);
Some(env.generate_auth_token(&claims)?)
} else {
None
};
println!("Starting existing endpoint {endpoint_id}...");
endpoint
.start(&auth_token, safekeepers, remote_ext_config)
.await?;
let hot_standby = sub_args
.get_one::<bool>("hot-standby")
.copied()
.unwrap_or(false);
if let Some(endpoint) = endpoint {
match (&endpoint.mode, hot_standby) {
(ComputeMode::Static(_), true) => {
bail!("Cannot start a node in hot standby mode when it is already configured as a static replica")
}
(ComputeMode::Primary, true) => {
bail!("Cannot start a node as a hot standby replica, it is already configured as primary node")
}
_ => {}
}
println!("Starting existing endpoint {endpoint_id}...");
endpoint.start(&auth_token, safekeepers, remote_ext_config)?;
} else {
let branch_name = sub_args
.get_one::<String>("branch-name")
.map(|s| s.as_str())
.unwrap_or(DEFAULT_BRANCH_NAME);
let timeline_id = env
.get_branch_timeline_id(branch_name, tenant_id)
.ok_or_else(|| {
anyhow!("Found no timeline id for branch name '{branch_name}'")
})?;
let lsn = sub_args
.get_one::<String>("lsn")
.map(|lsn_str| Lsn::from_str(lsn_str))
.transpose()
.context("Failed to parse Lsn from the request")?;
let pg_version = sub_args
.get_one::<u32>("pg-version")
.copied()
.context("Failed to `pg-version` from the argument string")?;
let mode = match (lsn, hot_standby) {
(Some(lsn), false) => ComputeMode::Static(lsn),
(None, true) => ComputeMode::Replica,
(None, false) => ComputeMode::Primary,
(Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"),
};
// when used with custom port this results in non obvious behaviour
// port is remembered from first start command, i e
// start --port X
// stop
// start <-- will also use port X even without explicit port argument
println!("Starting new endpoint {endpoint_id} (PostgreSQL v{pg_version}) on timeline {timeline_id} ...");
let ep = cplane.new_endpoint(
endpoint_id,
tenant_id,
timeline_id,
pg_port,
http_port,
pg_version,
mode,
pageserver_id,
)?;
ep.start(&auth_token, safekeepers, remote_ext_config)?;
}
}
"reconfigure" => {
let endpoint_id = sub_args
@@ -835,7 +842,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
} else {
None
};
endpoint.reconfigure(pageserver_id).await?;
endpoint.reconfigure(pageserver_id)?;
}
"stop" => {
let endpoint_id = sub_args
@@ -901,12 +908,11 @@ fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageSe
))
}
async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
match sub_match.subcommand() {
Some(("start", subcommand_args)) => {
if let Err(e) = get_pageserver(env, subcommand_args)?
.start(&pageserver_config_overrides(subcommand_args))
.await
{
eprintln!("pageserver start failed: {e}");
exit(1);
@@ -933,10 +939,7 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
exit(1);
}
if let Err(e) = pageserver
.start(&pageserver_config_overrides(subcommand_args))
.await
{
if let Err(e) = pageserver.start(&pageserver_config_overrides(subcommand_args)) {
eprintln!("pageserver start failed: {e}");
exit(1);
}
@@ -950,17 +953,14 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
exit(1);
}
if let Err(e) = pageserver
.start(&pageserver_config_overrides(subcommand_args))
.await
{
if let Err(e) = pageserver.start(&pageserver_config_overrides(subcommand_args)) {
eprintln!("pageserver start failed: {e}");
exit(1);
}
}
Some(("status", subcommand_args)) => {
match get_pageserver(env, subcommand_args)?.check_status().await {
match get_pageserver(env, subcommand_args)?.check_status() {
Ok(_) => println!("Page server is up and running"),
Err(err) => {
eprintln!("Page server is not available: {}", err);
@@ -975,14 +975,11 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
Ok(())
}
async fn handle_attachment_service(
sub_match: &ArgMatches,
env: &local_env::LocalEnv,
) -> Result<()> {
fn handle_attachment_service(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
let svc = AttachmentService::from_env(env);
match sub_match.subcommand() {
Some(("start", _start_match)) => {
if let Err(e) = svc.start().await {
if let Err(e) = svc.start() {
eprintln!("start failed: {e}");
exit(1);
}
@@ -1023,7 +1020,7 @@ fn safekeeper_extra_opts(init_match: &ArgMatches) -> Vec<String> {
.collect()
}
async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
let (sub_name, sub_args) = match sub_match.subcommand() {
Some(safekeeper_command_data) => safekeeper_command_data,
None => bail!("no safekeeper subcommand provided"),
@@ -1041,7 +1038,7 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
"start" => {
let extra_opts = safekeeper_extra_opts(sub_args);
if let Err(e) = safekeeper.start(extra_opts).await {
if let Err(e) = safekeeper.start(extra_opts) {
eprintln!("safekeeper start failed: {}", e);
exit(1);
}
@@ -1067,7 +1064,7 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
}
let extra_opts = safekeeper_extra_opts(sub_args);
if let Err(e) = safekeeper.start(extra_opts).await {
if let Err(e) = safekeeper.start(extra_opts) {
eprintln!("safekeeper start failed: {}", e);
exit(1);
}
@@ -1080,15 +1077,15 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
Ok(())
}
async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<()> {
fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<()> {
// Endpoints are not started automatically
broker::start_broker_process(env).await?;
broker::start_broker_process(env)?;
// Only start the attachment service if the pageserver is configured to need it
if env.control_plane_api.is_some() {
let attachment_service = AttachmentService::from_env(env);
if let Err(e) = attachment_service.start().await {
if let Err(e) = attachment_service.start() {
eprintln!("attachment_service start failed: {:#}", e);
try_stop_all(env, true);
exit(1);
@@ -1097,10 +1094,7 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
for ps_conf in &env.pageservers {
let pageserver = PageServerNode::from_env(env, ps_conf);
if let Err(e) = pageserver
.start(&pageserver_config_overrides(sub_match))
.await
{
if let Err(e) = pageserver.start(&pageserver_config_overrides(sub_match)) {
eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e);
try_stop_all(env, true);
exit(1);
@@ -1109,7 +1103,7 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
for node in env.safekeepers.iter() {
let safekeeper = SafekeeperNode::from_env(env, node);
if let Err(e) = safekeeper.start(vec![]).await {
if let Err(e) = safekeeper.start(vec![]) {
eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e);
try_stop_all(env, false);
exit(1);
@@ -1251,7 +1245,7 @@ fn cli() -> Command {
let remote_ext_config_args = Arg::new("remote-ext-config")
.long("remote-ext-config")
.num_args(1)
.help("Configure the remote extensions storage proxy gateway to request for extensions.")
.help("Configure the S3 bucket that we search for extensions in.")
.required(false);
let lsn_arg = Arg::new("lsn")
@@ -1314,7 +1308,6 @@ fn cli() -> Command {
.subcommand(Command::new("create")
.about("Create a new blank timeline")
.arg(tenant_id_arg.clone())
.arg(timeline_id_arg.clone())
.arg(branch_name_arg.clone())
.arg(pg_version_arg.clone())
)
@@ -1436,7 +1429,15 @@ fn cli() -> Command {
.subcommand(Command::new("start")
.about("Start postgres.\n If the endpoint doesn't exist yet, it is created.")
.arg(endpoint_id_arg.clone())
.arg(tenant_id_arg.clone())
.arg(branch_name_arg.clone())
.arg(timeline_id_arg.clone())
.arg(lsn_arg)
.arg(pg_port_arg)
.arg(http_port_arg)
.arg(endpoint_pageserver_id_arg.clone())
.arg(pg_version_arg)
.arg(hot_standby_arg)
.arg(safekeepers_arg)
.arg(remote_ext_config_args)
)
@@ -1449,6 +1450,7 @@ fn cli() -> Command {
.subcommand(
Command::new("stop")
.arg(endpoint_id_arg)
.arg(tenant_id_arg.clone())
.arg(
Arg::new("destroy")
.help("Also delete data directory (now optional, should be default in future)")

View File

@@ -11,7 +11,7 @@ use camino::Utf8PathBuf;
use crate::{background_process, local_env};
pub async fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
pub fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
let broker = &env.broker;
let listen_addr = &broker.listen_addr;
@@ -19,15 +19,15 @@ pub async fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<(
let args = [format!("--listen-addr={listen_addr}")];
let client = reqwest::Client::new();
let client = reqwest::blocking::Client::new();
background_process::start_process(
"storage_broker",
&env.base_data_dir,
&env.storage_broker_bin(),
args,
[],
background_process::InitialPidFile::Create(storage_broker_pid_file_path(env)),
|| async {
background_process::InitialPidFile::Create(&storage_broker_pid_file_path(env)),
|| {
let url = broker.client_url();
let status_url = url.join("status").with_context(|| {
format!("Failed to append /status path to broker endpoint {url}")
@@ -36,13 +36,12 @@ pub async fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<(
.get(status_url)
.build()
.with_context(|| format!("Failed to construct request to broker endpoint {url}"))?;
match client.execute(request).await {
match client.execute(request) {
Ok(resp) => Ok(resp.status().is_success()),
Err(_) => Ok(false),
}
},
)
.await
.context("Failed to spawn storage_broker subprocess")?;
Ok(())
}

View File

@@ -45,7 +45,6 @@ use std::sync::Arc;
use std::time::Duration;
use anyhow::{anyhow, bail, Context, Result};
use compute_api::spec::RemoteExtSpec;
use serde::{Deserialize, Serialize};
use utils::id::{NodeId, TenantId, TimelineId};
@@ -125,7 +124,6 @@ impl ComputeControlPlane {
let http_port = http_port.unwrap_or_else(|| self.get_port() + 1);
let pageserver =
PageServerNode::from_env(&self.env, self.env.get_pageserver_conf(pageserver_id)?);
let ep = Arc::new(Endpoint {
endpoint_id: endpoint_id.to_owned(),
pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), pg_port),
@@ -170,30 +168,6 @@ impl ComputeControlPlane {
Ok(ep)
}
pub fn check_conflicting_endpoints(
&self,
mode: ComputeMode,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> Result<()> {
if matches!(mode, ComputeMode::Primary) {
// this check is not complete, as you could have a concurrent attempt at
// creating another primary, both reading the state before checking it here,
// but it's better than nothing.
let mut duplicates = self.endpoints.iter().filter(|(_k, v)| {
v.tenant_id == tenant_id
&& v.timeline_id == timeline_id
&& v.mode == mode
&& v.status() != "stopped"
});
if let Some((key, _)) = duplicates.next() {
bail!("attempting to create a duplicate primary endpoint on tenant {tenant_id}, timeline {timeline_id}: endpoint {key:?} exists already. please don't do this, it is not supported.");
}
}
Ok(())
}
}
///////////////////////////////////////////////////////////////////////////////
@@ -464,7 +438,7 @@ impl Endpoint {
}
}
pub async fn start(
pub fn start(
&self,
auth_token: &Option<String>,
safekeepers: Vec<NodeId>,
@@ -502,24 +476,11 @@ impl Endpoint {
}
}
// check for file remote_extensions_spec.json
// if it is present, read it and pass to compute_ctl
let remote_extensions_spec_path = self.endpoint_path().join("remote_extensions_spec.json");
let remote_extensions_spec = std::fs::File::open(remote_extensions_spec_path);
let remote_extensions: Option<RemoteExtSpec>;
if let Ok(spec_file) = remote_extensions_spec {
remote_extensions = serde_json::from_reader(spec_file).ok();
} else {
remote_extensions = None;
};
// Create spec file
let spec = ComputeSpec {
skip_pg_catalog_updates: self.skip_pg_catalog_updates,
format_version: 1.0,
operation_uuid: None,
features: vec![],
cluster: Cluster {
cluster_id: None, // project ID: not used
name: None, // project name: not used
@@ -536,7 +497,7 @@ impl Endpoint {
pageserver_connstring: Some(pageserver_connstring),
safekeeper_connstrings,
storage_auth_token: auth_token.clone(),
remote_extensions,
remote_extensions: None,
};
let spec_path = self.endpoint_path().join("spec.json");
std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
@@ -587,7 +548,7 @@ impl Endpoint {
const MAX_ATTEMPTS: u32 = 10 * 30; // Wait up to 30 s
loop {
attempt += 1;
match self.get_status().await {
match self.get_status() {
Ok(state) => {
match state.status {
ComputeStatus::Init => {
@@ -629,8 +590,8 @@ impl Endpoint {
}
// Call the /status HTTP API
pub async fn get_status(&self) -> Result<ComputeState> {
let client = reqwest::Client::new();
pub fn get_status(&self) -> Result<ComputeState> {
let client = reqwest::blocking::Client::new();
let response = client
.request(
@@ -641,17 +602,16 @@ impl Endpoint {
self.http_address.port()
),
)
.send()
.await?;
.send()?;
// Interpret the response
let status = response.status();
if !(status.is_client_error() || status.is_server_error()) {
Ok(response.json().await?)
Ok(response.json()?)
} else {
// reqwest does not export its error construction utility functions, so let's craft the message ourselves
let url = response.url().to_owned();
let msg = match response.text().await {
let msg = match response.text() {
Ok(err_body) => format!("Error: {}", err_body),
Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
};
@@ -659,7 +619,7 @@ impl Endpoint {
}
}
pub async fn reconfigure(&self, pageserver_id: Option<NodeId>) -> Result<()> {
pub fn reconfigure(&self, pageserver_id: Option<NodeId>) -> Result<()> {
let mut spec: ComputeSpec = {
let spec_path = self.endpoint_path().join("spec.json");
let file = std::fs::File::open(spec_path)?;
@@ -688,7 +648,7 @@ impl Endpoint {
spec.pageserver_connstring = Some(format!("postgresql://no_user@{host}:{port}"));
}
let client = reqwest::Client::new();
let client = reqwest::blocking::Client::new();
let response = client
.post(format!(
"http://{}:{}/configure",
@@ -699,15 +659,14 @@ impl Endpoint {
"{{\"spec\":{}}}",
serde_json::to_string_pretty(&spec)?
))
.send()
.await?;
.send()?;
let status = response.status();
if !(status.is_client_error() || status.is_server_error()) {
Ok(())
} else {
let url = response.url().to_owned();
let msg = match response.text().await {
let msg = match response.text() {
Ok(err_body) => format!("Error: {}", err_body),
Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
};

View File

@@ -6,24 +6,27 @@
//!
use std::borrow::Cow;
use std::collections::HashMap;
use std::io;
use std::io::Write;
use std::fs::File;
use std::io::{BufReader, Write};
use std::num::NonZeroU64;
use std::path::PathBuf;
use std::process::{Child, Command};
use std::time::Duration;
use std::{io, result};
use anyhow::{bail, Context};
use camino::Utf8PathBuf;
use futures::SinkExt;
use pageserver_api::models::{self, LocationConfig, TenantInfo, TimelineInfo};
use pageserver_api::models::{
self, LocationConfig, TenantInfo, TenantLocationConfigRequest, TimelineInfo,
};
use pageserver_api::shard::TenantShardId;
use pageserver_client::mgmt_api;
use postgres_backend::AuthType;
use postgres_connection::{parse_host_port, PgConnectionConfig};
use reqwest::blocking::{Client, RequestBuilder, Response};
use reqwest::{IntoUrl, Method};
use thiserror::Error;
use utils::auth::{Claims, Scope};
use utils::{
http::error::HttpErrorBody,
id::{TenantId, TimelineId},
lsn::Lsn,
};
@@ -34,6 +37,45 @@ use crate::{background_process, local_env::LocalEnv};
/// Directory within .neon which will be used by default for LocalFs remote storage.
pub const PAGESERVER_REMOTE_STORAGE_DIR: &str = "local_fs_remote_storage/pageserver";
#[derive(Error, Debug)]
pub enum PageserverHttpError {
#[error("Reqwest error: {0}")]
Transport(#[from] reqwest::Error),
#[error("Error: {0}")]
Response(String),
}
impl From<anyhow::Error> for PageserverHttpError {
fn from(e: anyhow::Error) -> Self {
Self::Response(e.to_string())
}
}
type Result<T> = result::Result<T, PageserverHttpError>;
pub trait ResponseErrorMessageExt: Sized {
fn error_from_body(self) -> Result<Self>;
}
impl ResponseErrorMessageExt for Response {
fn error_from_body(self) -> Result<Self> {
let status = self.status();
if !(status.is_client_error() || status.is_server_error()) {
return Ok(self);
}
// reqwest does not export its error construction utility functions, so let's craft the message ourselves
let url = self.url().to_owned();
Err(PageserverHttpError::Response(
match self.json::<HttpErrorBody>() {
Ok(err_body) => format!("Error: {}", err_body.msg),
Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
},
))
}
}
//
// Control routines for pageserver.
//
@@ -44,7 +86,8 @@ pub struct PageServerNode {
pub pg_connection_config: PgConnectionConfig,
pub conf: PageServerConf,
pub env: LocalEnv,
pub http_client: mgmt_api::Client,
pub http_client: Client,
pub http_base_url: String,
}
impl PageServerNode {
@@ -56,19 +99,8 @@ impl PageServerNode {
pg_connection_config: PgConnectionConfig::new_host_port(host, port),
conf: conf.clone(),
env: env.clone(),
http_client: mgmt_api::Client::new(
format!("http://{}", conf.listen_http_addr),
{
match conf.http_auth_type {
AuthType::Trust => None,
AuthType::NeonJWT => Some(
env.generate_auth_token(&Claims::new(None, Scope::PageServerApi))
.unwrap(),
),
}
}
.as_deref(),
),
http_client: Client::new(),
http_base_url: format!("http://{}/v1", conf.listen_http_addr),
}
}
@@ -149,8 +181,8 @@ impl PageServerNode {
.expect("non-Unicode path")
}
pub async fn start(&self, config_overrides: &[&str]) -> anyhow::Result<Child> {
self.start_node(config_overrides, false).await
pub fn start(&self, config_overrides: &[&str]) -> anyhow::Result<Child> {
self.start_node(config_overrides, false)
}
fn pageserver_init(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
@@ -191,12 +223,7 @@ impl PageServerNode {
Ok(())
}
async fn start_node(
&self,
config_overrides: &[&str],
update_config: bool,
) -> anyhow::Result<Child> {
// TODO: using a thread here because start_process() is not async but we need to call check_status()
fn start_node(&self, config_overrides: &[&str], update_config: bool) -> anyhow::Result<Child> {
let datadir = self.repo_path();
print!(
"Starting pageserver node {} at '{}' in {:?}",
@@ -204,7 +231,7 @@ impl PageServerNode {
self.pg_connection_config.raw_address(),
datadir
);
io::stdout().flush().context("flush stdout")?;
io::stdout().flush()?;
let datadir_path_str = datadir.to_str().with_context(|| {
format!(
@@ -216,23 +243,20 @@ impl PageServerNode {
if update_config {
args.push(Cow::Borrowed("--update-config"));
}
background_process::start_process(
"pageserver",
&datadir,
&self.env.pageserver_bin(),
args.iter().map(Cow::as_ref),
self.pageserver_env_variables()?,
background_process::InitialPidFile::Expect(self.pid_file()),
|| async {
let st = self.check_status().await;
match st {
Ok(()) => Ok(true),
Err(mgmt_api::Error::ReceiveBody(_)) => Ok(false),
Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")),
}
background_process::InitialPidFile::Expect(&self.pid_file()),
|| match self.check_status() {
Ok(()) => Ok(true),
Err(PageserverHttpError::Transport(_)) => Ok(false),
Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")),
},
)
.await
}
fn pageserver_basic_args<'a>(
@@ -278,12 +302,7 @@ impl PageServerNode {
background_process::stop_process(immediate, "pageserver", &self.pid_file())
}
pub async fn page_server_psql_client(
&self,
) -> anyhow::Result<(
tokio_postgres::Client,
tokio_postgres::Connection<tokio_postgres::Socket, tokio_postgres::tls::NoTlsStream>,
)> {
pub fn page_server_psql_client(&self) -> anyhow::Result<postgres::Client> {
let mut config = self.pg_connection_config.clone();
if self.conf.pg_auth_type == AuthType::NeonJWT {
let token = self
@@ -291,18 +310,36 @@ impl PageServerNode {
.generate_auth_token(&Claims::new(None, Scope::PageServerApi))?;
config = config.set_password(Some(token));
}
Ok(config.connect_no_tls().await?)
Ok(config.connect_no_tls()?)
}
pub async fn check_status(&self) -> mgmt_api::Result<()> {
self.http_client.status().await
fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> anyhow::Result<RequestBuilder> {
let mut builder = self.http_client.request(method, url);
if self.conf.http_auth_type == AuthType::NeonJWT {
let token = self
.env
.generate_auth_token(&Claims::new(None, Scope::PageServerApi))?;
builder = builder.bearer_auth(token)
}
Ok(builder)
}
pub async fn tenant_list(&self) -> mgmt_api::Result<Vec<TenantInfo>> {
self.http_client.list_tenants().await
pub fn check_status(&self) -> Result<()> {
self.http_request(Method::GET, format!("{}/status", self.http_base_url))?
.send()?
.error_from_body()?;
Ok(())
}
pub async fn tenant_create(
pub fn tenant_list(&self) -> Result<Vec<TenantInfo>> {
Ok(self
.http_request(Method::GET, format!("{}/tenant", self.http_base_url))?
.send()?
.error_from_body()?
.json()?)
}
pub fn tenant_create(
&self,
new_tenant_id: TenantId,
generation: Option<u32>,
@@ -369,7 +406,6 @@ impl PageServerNode {
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'gc_feedback' as bool")?,
heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
};
let request = models::TenantCreateRequest {
@@ -380,10 +416,23 @@ impl PageServerNode {
if !settings.is_empty() {
bail!("Unrecognized tenant settings: {settings:?}")
}
Ok(self.http_client.tenant_create(&request).await?)
self.http_request(Method::POST, format!("{}/tenant", self.http_base_url))?
.json(&request)
.send()?
.error_from_body()?
.json::<Option<String>>()
.with_context(|| {
format!("Failed to parse tenant creation response for tenant id: {new_tenant_id:?}")
})?
.context("No tenant id was found in the tenant creation response")
.and_then(|tenant_id_string| {
tenant_id_string.parse().with_context(|| {
format!("Failed to parse response string as tenant id: '{tenant_id_string}'")
})
})
}
pub async fn tenant_config(
pub fn tenant_config(
&self,
tenant_id: TenantId,
mut settings: HashMap<&str, &str>,
@@ -454,7 +503,6 @@ impl PageServerNode {
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'gc_feedback' as bool")?,
heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
}
};
@@ -462,48 +510,80 @@ impl PageServerNode {
bail!("Unrecognized tenant settings: {settings:?}")
}
self.http_client
.tenant_config(&models::TenantConfigRequest { tenant_id, config })
.await?;
self.http_request(Method::PUT, format!("{}/tenant/config", self.http_base_url))?
.json(&models::TenantConfigRequest { tenant_id, config })
.send()?
.error_from_body()?;
Ok(())
}
pub async fn location_config(
pub fn location_config(
&self,
tenant_id: TenantId,
config: LocationConfig,
flush_ms: Option<Duration>,
) -> anyhow::Result<()> {
Ok(self
.http_client
.location_config(tenant_id, config, flush_ms)
.await?)
let req_body = TenantLocationConfigRequest { tenant_id, config };
self.http_request(
Method::PUT,
format!(
"{}/tenant/{}/location_config",
self.http_base_url, tenant_id
),
)?
.json(&req_body)
.send()?
.error_from_body()?;
Ok(())
}
pub async fn timeline_list(&self, tenant_id: &TenantId) -> anyhow::Result<Vec<TimelineInfo>> {
Ok(self.http_client.list_timelines(*tenant_id).await?)
pub fn timeline_list(&self, tenant_id: &TenantId) -> anyhow::Result<Vec<TimelineInfo>> {
let timeline_infos: Vec<TimelineInfo> = self
.http_request(
Method::GET,
format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id),
)?
.send()?
.error_from_body()?
.json()?;
Ok(timeline_infos)
}
pub async fn timeline_create(
pub fn timeline_create(
&self,
tenant_id: TenantId,
new_timeline_id: Option<TimelineId>,
ancestor_start_lsn: Option<Lsn>,
ancestor_timeline_id: Option<TimelineId>,
pg_version: Option<u32>,
existing_initdb_timeline_id: Option<TimelineId>,
) -> anyhow::Result<TimelineInfo> {
// If timeline ID was not specified, generate one
let new_timeline_id = new_timeline_id.unwrap_or(TimelineId::generate());
let req = models::TimelineCreateRequest {
self.http_request(
Method::POST,
format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id),
)?
.json(&models::TimelineCreateRequest {
new_timeline_id,
ancestor_start_lsn,
ancestor_timeline_id,
pg_version,
existing_initdb_timeline_id,
};
Ok(self.http_client.timeline_create(tenant_id, &req).await?)
})
.send()?
.error_from_body()?
.json::<Option<TimelineInfo>>()
.with_context(|| {
format!("Failed to parse timeline creation response for tenant id: {tenant_id}")
})?
.with_context(|| {
format!(
"No timeline id was found in the timeline creation response for tenant {tenant_id}"
)
})
}
/// Import a basebackup prepared using either:
@@ -515,7 +595,7 @@ impl PageServerNode {
/// * `timeline_id` - id to assign to imported timeline
/// * `base` - (start lsn of basebackup, path to `base.tar` file)
/// * `pg_wal` - if there's any wal to import: (end lsn, path to `pg_wal.tar`)
pub async fn timeline_import(
pub fn timeline_import(
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
@@ -523,60 +603,36 @@ impl PageServerNode {
pg_wal: Option<(Lsn, PathBuf)>,
pg_version: u32,
) -> anyhow::Result<()> {
let (client, conn) = self.page_server_psql_client().await?;
// The connection object performs the actual communication with the database,
// so spawn it off to run on its own.
tokio::spawn(async move {
if let Err(e) = conn.await {
eprintln!("connection error: {}", e);
}
});
tokio::pin!(client);
let mut client = self.page_server_psql_client()?;
// Init base reader
let (start_lsn, base_tarfile_path) = base;
let base_tarfile = tokio::fs::File::open(base_tarfile_path).await?;
let base_tarfile = tokio_util::io::ReaderStream::new(base_tarfile);
let base_tarfile = File::open(base_tarfile_path)?;
let mut base_reader = BufReader::new(base_tarfile);
// Init wal reader if necessary
let (end_lsn, wal_reader) = if let Some((end_lsn, wal_tarfile_path)) = pg_wal {
let wal_tarfile = tokio::fs::File::open(wal_tarfile_path).await?;
let wal_reader = tokio_util::io::ReaderStream::new(wal_tarfile);
let wal_tarfile = File::open(wal_tarfile_path)?;
let wal_reader = BufReader::new(wal_tarfile);
(end_lsn, Some(wal_reader))
} else {
(start_lsn, None)
};
let copy_in = |reader, cmd| {
let client = &client;
async move {
let writer = client.copy_in(&cmd).await?;
let writer = std::pin::pin!(writer);
let mut writer = writer.sink_map_err(|e| {
std::io::Error::new(std::io::ErrorKind::Other, format!("{e}"))
});
let mut reader = std::pin::pin!(reader);
writer.send_all(&mut reader).await?;
writer.into_inner().finish().await?;
anyhow::Ok(())
}
};
// Import base
copy_in(
base_tarfile,
format!(
"import basebackup {tenant_id} {timeline_id} {start_lsn} {end_lsn} {pg_version}"
),
)
.await?;
let import_cmd = format!(
"import basebackup {tenant_id} {timeline_id} {start_lsn} {end_lsn} {pg_version}"
);
let mut writer = client.copy_in(&import_cmd)?;
io::copy(&mut base_reader, &mut writer)?;
writer.finish()?;
// Import wal if necessary
if let Some(wal_reader) = wal_reader {
copy_in(
wal_reader,
format!("import wal {tenant_id} {timeline_id} {start_lsn} {end_lsn}"),
)
.await?;
if let Some(mut wal_reader) = wal_reader {
let import_cmd = format!("import wal {tenant_id} {timeline_id} {start_lsn} {end_lsn}");
let mut writer = client.copy_in(&import_cmd)?;
io::copy(&mut wal_reader, &mut writer)?;
writer.finish()?;
}
Ok(())

View File

@@ -13,6 +13,7 @@ use std::{io, result};
use anyhow::Context;
use camino::Utf8PathBuf;
use postgres_connection::PgConnectionConfig;
use reqwest::blocking::{Client, RequestBuilder, Response};
use reqwest::{IntoUrl, Method};
use thiserror::Error;
use utils::{http::error::HttpErrorBody, id::NodeId};
@@ -33,14 +34,12 @@ pub enum SafekeeperHttpError {
type Result<T> = result::Result<T, SafekeeperHttpError>;
#[async_trait::async_trait]
pub trait ResponseErrorMessageExt: Sized {
async fn error_from_body(self) -> Result<Self>;
fn error_from_body(self) -> Result<Self>;
}
#[async_trait::async_trait]
impl ResponseErrorMessageExt for reqwest::Response {
async fn error_from_body(self) -> Result<Self> {
impl ResponseErrorMessageExt for Response {
fn error_from_body(self) -> Result<Self> {
let status = self.status();
if !(status.is_client_error() || status.is_server_error()) {
return Ok(self);
@@ -49,7 +48,7 @@ impl ResponseErrorMessageExt for reqwest::Response {
// reqwest does not export its error construction utility functions, so let's craft the message ourselves
let url = self.url().to_owned();
Err(SafekeeperHttpError::Response(
match self.json::<HttpErrorBody>().await {
match self.json::<HttpErrorBody>() {
Ok(err_body) => format!("Error: {}", err_body.msg),
Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
},
@@ -70,7 +69,7 @@ pub struct SafekeeperNode {
pub pg_connection_config: PgConnectionConfig,
pub env: LocalEnv,
pub http_client: reqwest::Client,
pub http_client: Client,
pub http_base_url: String,
}
@@ -81,7 +80,7 @@ impl SafekeeperNode {
conf: conf.clone(),
pg_connection_config: Self::safekeeper_connection_config(conf.pg_port),
env: env.clone(),
http_client: reqwest::Client::new(),
http_client: Client::new(),
http_base_url: format!("http://127.0.0.1:{}/v1", conf.http_port),
}
}
@@ -104,7 +103,7 @@ impl SafekeeperNode {
.expect("non-Unicode path")
}
pub async fn start(&self, extra_opts: Vec<String>) -> anyhow::Result<Child> {
pub fn start(&self, extra_opts: Vec<String>) -> anyhow::Result<Child> {
print!(
"Starting safekeeper at '{}' in '{}'",
self.pg_connection_config.raw_address(),
@@ -192,16 +191,13 @@ impl SafekeeperNode {
&self.env.safekeeper_bin(),
&args,
[],
background_process::InitialPidFile::Expect(self.pid_file()),
|| async {
match self.check_status().await {
Ok(()) => Ok(true),
Err(SafekeeperHttpError::Transport(_)) => Ok(false),
Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")),
}
background_process::InitialPidFile::Expect(&self.pid_file()),
|| match self.check_status() {
Ok(()) => Ok(true),
Err(SafekeeperHttpError::Transport(_)) => Ok(false),
Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")),
},
)
.await
}
///
@@ -220,7 +216,7 @@ impl SafekeeperNode {
)
}
fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> reqwest::RequestBuilder {
fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> RequestBuilder {
// TODO: authentication
//if self.env.auth_type == AuthType::NeonJWT {
// builder = builder.bearer_auth(&self.env.safekeeper_auth_token)
@@ -228,12 +224,10 @@ impl SafekeeperNode {
self.http_client.request(method, url)
}
pub async fn check_status(&self) -> Result<()> {
pub fn check_status(&self) -> Result<()> {
self.http_request(Method::GET, format!("{}/{}", self.http_base_url, "status"))
.send()
.await?
.error_from_body()
.await?;
.send()?
.error_from_body()?;
Ok(())
}
}

View File

@@ -14,16 +14,17 @@ use pageserver_api::models::{
use std::collections::HashMap;
use std::time::Duration;
use utils::{
generation::Generation,
id::{TenantId, TimelineId},
lsn::Lsn,
};
/// Given an attached pageserver, retrieve the LSN for all timelines
async fn get_lsns(
fn get_lsns(
tenant_id: TenantId,
pageserver: &PageServerNode,
) -> anyhow::Result<HashMap<TimelineId, Lsn>> {
let timelines = pageserver.timeline_list(&tenant_id).await?;
let timelines = pageserver.timeline_list(&tenant_id)?;
Ok(timelines
.into_iter()
.map(|t| (t.timeline_id, t.last_record_lsn))
@@ -32,13 +33,13 @@ async fn get_lsns(
/// Wait for the timeline LSNs on `pageserver` to catch up with or overtake
/// `baseline`.
async fn await_lsn(
fn await_lsn(
tenant_id: TenantId,
pageserver: &PageServerNode,
baseline: HashMap<TimelineId, Lsn>,
) -> anyhow::Result<()> {
loop {
let latest = match get_lsns(tenant_id, pageserver).await {
let latest = match get_lsns(tenant_id, pageserver) {
Ok(l) => l,
Err(e) => {
println!(
@@ -84,7 +85,7 @@ async fn await_lsn(
/// - Coordinate attach/secondary/detach on pageservers
/// - call into attachment_service for generations
/// - reconfigure compute endpoints to point to new attached pageserver
pub async fn migrate_tenant(
pub fn migrate_tenant(
env: &LocalEnv,
tenant_id: TenantId,
dest_ps: PageServerNode,
@@ -92,60 +93,52 @@ pub async fn migrate_tenant(
// Get a new generation
let attachment_service = AttachmentService::from_env(env);
fn build_location_config(
mode: LocationConfigMode,
generation: Option<u32>,
secondary_conf: Option<LocationConfigSecondary>,
) -> LocationConfig {
LocationConfig {
mode,
generation,
secondary_conf,
tenant_conf: TenantConfig::default(),
shard_number: 0,
shard_count: 0,
shard_stripe_size: 0,
}
}
let previous = attachment_service.inspect(tenant_id).await?;
let previous = attachment_service.inspect(tenant_id)?;
let mut baseline_lsns = None;
if let Some((generation, origin_ps_id)) = &previous {
let origin_ps = PageServerNode::from_env(env, env.get_pageserver_conf(*origin_ps_id)?);
if origin_ps_id == &dest_ps.conf.id {
println!("🔁 Already attached to {origin_ps_id}, freshening...");
let gen = attachment_service
.attach_hook(tenant_id, dest_ps.conf.id)
.await?;
let dest_conf = build_location_config(LocationConfigMode::AttachedSingle, gen, None);
dest_ps.location_config(tenant_id, dest_conf, None).await?;
let gen = attachment_service.attach_hook(tenant_id, dest_ps.conf.id)?;
let dest_conf = LocationConfig {
mode: LocationConfigMode::AttachedSingle,
generation: gen.map(Generation::new),
secondary_conf: None,
tenant_conf: TenantConfig::default(),
};
dest_ps.location_config(tenant_id, dest_conf)?;
println!("✅ Migration complete");
return Ok(());
}
println!("🔁 Switching origin pageserver {origin_ps_id} to stale mode");
let stale_conf =
build_location_config(LocationConfigMode::AttachedStale, Some(*generation), None);
origin_ps
.location_config(tenant_id, stale_conf, Some(Duration::from_secs(10)))
.await?;
let stale_conf = LocationConfig {
mode: LocationConfigMode::AttachedStale,
generation: Some(Generation::new(*generation)),
secondary_conf: None,
tenant_conf: TenantConfig::default(),
};
origin_ps.location_config(tenant_id, stale_conf)?;
baseline_lsns = Some(get_lsns(tenant_id, &origin_ps).await?);
baseline_lsns = Some(get_lsns(tenant_id, &origin_ps)?);
}
let gen = attachment_service
.attach_hook(tenant_id, dest_ps.conf.id)
.await?;
let dest_conf = build_location_config(LocationConfigMode::AttachedMulti, gen, None);
let gen = attachment_service.attach_hook(tenant_id, dest_ps.conf.id)?;
let dest_conf = LocationConfig {
mode: LocationConfigMode::AttachedMulti,
generation: gen.map(Generation::new),
secondary_conf: None,
tenant_conf: TenantConfig::default(),
};
println!("🔁 Attaching to pageserver {}", dest_ps.conf.id);
dest_ps.location_config(tenant_id, dest_conf, None).await?;
dest_ps.location_config(tenant_id, dest_conf)?;
if let Some(baseline) = baseline_lsns {
println!("🕑 Waiting for LSN to catch up...");
await_lsn(tenant_id, &dest_ps, baseline).await?;
await_lsn(tenant_id, &dest_ps, baseline)?;
}
let cplane = ComputeControlPlane::load(env.clone())?;
@@ -155,7 +148,7 @@ pub async fn migrate_tenant(
"🔁 Reconfiguring endpoint {} to use pageserver {}",
endpoint_name, dest_ps.conf.id
);
endpoint.reconfigure(Some(dest_ps.conf.id)).await?;
endpoint.reconfigure(Some(dest_ps.conf.id))?;
}
}
@@ -165,39 +158,43 @@ pub async fn migrate_tenant(
}
let other_ps = PageServerNode::from_env(env, other_ps_conf);
let other_ps_tenants = other_ps.tenant_list().await?;
let other_ps_tenants = other_ps.tenant_list()?;
// Check if this tenant is attached
let found = other_ps_tenants
.into_iter()
.map(|t| t.id)
.any(|i| i.tenant_id == tenant_id);
.any(|i| i == tenant_id);
if !found {
continue;
}
// Downgrade to a secondary location
let secondary_conf = build_location_config(
LocationConfigMode::Secondary,
None,
Some(LocationConfigSecondary { warm: true }),
);
let secondary_conf = LocationConfig {
mode: LocationConfigMode::Secondary,
generation: None,
secondary_conf: Some(LocationConfigSecondary { warm: true }),
tenant_conf: TenantConfig::default(),
};
println!(
"💤 Switching to secondary mode on pageserver {}",
other_ps.conf.id
);
other_ps
.location_config(tenant_id, secondary_conf, None)
.await?;
other_ps.location_config(tenant_id, secondary_conf)?;
}
println!(
"🔁 Switching to AttachedSingle mode on pageserver {}",
dest_ps.conf.id
);
let dest_conf = build_location_config(LocationConfigMode::AttachedSingle, gen, None);
dest_ps.location_config(tenant_id, dest_conf, None).await?;
let dest_conf = LocationConfig {
mode: LocationConfigMode::AttachedSingle,
generation: gen.map(Generation::new),
secondary_conf: None,
tenant_conf: TenantConfig::default(),
};
dest_ps.location_config(tenant_id, dest_conf)?;
println!("✅ Migration complete");

View File

@@ -1,205 +0,0 @@
# Name
Created on: 2023-09-08
Author: Arpad Müller
## Summary
Enable the pageserver to recover from data corruption events by implementing
a feature to re-apply historic WAL records in parallel to the already occurring
WAL replay.
The feature is outside of the user-visible backup and history story, and only
serves as a second-level backup for the case that there is a bug in the
pageservers that corrupted the served pages.
The RFC proposes the addition of two new features:
* recover a broken branch from WAL (downtime is allowed)
* a test recovery system to recover random branches to make sure recovery works
## Motivation
The historic WAL is currently stored in S3 even after it has been replayed by
the pageserver and thus been integrated into the pageserver's storage system.
This is done to defend from data corruption failures inside the pageservers.
However, application of this WAL in the disaster recovery setting is currently
very manual and we want to automate this to make it easier.
### Use cases
There are various use cases for this feature, like:
* The main motivation is replaying in the instance of pageservers corrupting
data.
* We might want to, beyond the user-visible history features, through our
support channels and upon customer request, in select instances, recover
historic versions beyond the range of history that we officially support.
* Running the recovery process in the background for random tenant timelines
to figure out if there was a corruption of data (we would compare with what
the pageserver stores for the "official" timeline).
* Using the WAL to arrive at historic pages we can then back up to S3 so that
WAL itself can be discarded, or at least not used for future replays.
Again, this sounds a lot like what the pageserver is already doing, but the
point is to provide a fallback to the service provided by the pageserver.
## Design
### Design constraints
The main design constraint is that the feature needs to be *simple* enough that
the number of bugs are as low, and reliability as high as possible: the main
goal of this endeavour is to achieve higher correctness than the pageserver.
For the background process, we cannot afford a downtime of the timeline that is
being cloned, as we don't want to restrict ourselves to offline tenants only.
In the scenario where we want to recover from disasters or roll back to a
historic lsn through support staff, downtimes are more affordable, and
inevitable if the original had been subject to the corruption. Ideally, the
two code paths would share code, so the solution would be designed for not
requiring downtimes.
### API endpoint changes
This RFC proposes two API endpoint changes in the safekeeper and the
pageserver.
Remember, the pageserver timeline API creation endpoint is to this URL:
```
/v1/tenant/{tenant_id}/timeline/
```
Where `{tenant_id}` is the ID of the tenant the timeline is created for,
and specified as part of the URL. The timeline ID is passed via the POST
request body as the only required parameter `new_timeline_id`.
This proposal adds one optional parameter called
`existing_initdb_timeline_id` to the request's json body. If the parameter
is not specified, behaviour should be as existing, so the pageserver runs
initdb.
If the parameter is specified, it is expected to point to a timeline ID.
In fact that ID might match `new_timeline_id`, what's important is that
S3 storage contains a matching initdb under the URL matching the given
tenant and timeline.
Having both `ancestor_timeline_id` and `existing_initdb_timeline_id`
specified is illegal and will yield in an HTTP error. This feature is
only meant for the "main" branch that doesn't have any ancestors
of its own, as only here initdb is relevant.
For the safekeeper, we propose the addition of the following copy endpoint:
```
/v1/tenant/{tenant_id}/timeline/{source_timeline_id}/copy
```
it is meant for POST requests with json, and the two URL parameters
`tenant_id` and `source_timeline_id`. The json request body contains
the two required parameters `target_timeline_id` and `until_lsn`.
After invoking, the copy endpoint starts a copy process of the WAL from
the source ID to the target ID. The lsn is updated according to the
progress of the API call.
### Higher level features
We want the API changes to support the following higher level features:
* recovery-after-corruption DR of the main timeline of a tenant. This
feature allows for downtime.
* test DR of the main timeline into a special copy timeline. this feature
is meant to run against selected production tenants in the background,
without the user noticing, so it does not allow for downtime.
The recovery-after-corruption DR only needs the pageserver changes.
It works as follows:
* delete the timeline from the pageservers via timeline deletion API
* re-create it via timeline creation API (same ID as before) and set
`existing_initdb_timeline_id` to the same timeline ID
The test DR requires also the copy primitive and works as follows:
* copy the WAL of the timeline to a new place
* create a new timeline for the tenant
## Non Goals
At the danger of being repetitive, the main goal of this feature is to be a
backup method, so reliability is very important. This implies that other
aspects like performance or space reduction are less important.
### Corrupt WAL
The process suggested by this RFC assumes that the WAL is free of corruption.
In some instances, corruption can make it into WAL, like for example when
higher level components like postgres or the application first read corrupt
data, and then execute a write with data derived from that earlier read. That
written data might then contain the corruption.
Common use cases can hit this quite easily. For example, an application reads
some counter, increments it, and then writes the new counter value to the
database.
On a lower level, the compute might put FPIs (Full Page Images) into the WAL,
which have corrupt data for rows unrelated to the write operation at hand.
Separating corrupt writes from non-corrupt ones is a hard problem in general,
and if the application was involved in making the corrupt write, a recovery
would also involve the application. Therefore, corruption that has made it into
the WAL is outside of the scope of this feature. However, the WAL replay can be
issued to right before the point in time where the corruption occured. Then the
data loss is isolated to post-corruption writes only.
## Impacted components (e.g. pageserver, safekeeper, console, etc)
Most changes would happen to the pageservers.
For the higher level features, maybe other components like the console would
be involved.
We need to make sure that the shadow timelines are not subject to the usual
limits and billing we apply to existing timelines.
## Proposed implementation
The first problem to keep in mind is the reproducability of `initdb`.
So an initial step would be to upload `initdb` snapshots to S3.
After that, we'd have the endpoint spawn a background process which
performs the replay of the WAL to that new timeline. This process should
follow the existing workflows as closely as possible, just using the
WAL records of a different timeline.
The timeline created will be in a special state that solely looks for WAL
entries of the timeline it is trying to copy. Once the target LSN is reached,
it turns into a normal timeline that also accepts writes to its own
timeline ID.
### Scalability
For now we want to run this entire process on a single node, and as
it is by nature linear, it's hard to parallelize. However, for the
verification workloads, we can easily start the WAL replay in parallel
for different points in time. This is valuable especially for tenants
with large WAL records.
Compare this with the tricks to make addition circuits execute with
lower latency by making them perform the addition for both possible
values of the carry bit, and then, in a second step, taking the
result for the carry bit that was actually obtained.
The other scalability dimension to consider is the WAL length, which
is a growing question as tenants accumulate changes. There are
possible approaches to this, including creating snapshots of the
page files and uploading them to S3, but if we do this for every single
branch, we lose the cheap branching property.
### Implementation by component
The proposed changes for the various components of the neon architecture
are written up in this notion page:
https://www.notion.so/neondatabase/Pageserver-disaster-recovery-one-pager-4ecfb5df16ce4f6bbfc3817ed1a6cbb2
### Unresolved questions
none known (outside of the mentioned ones).

View File

@@ -26,13 +26,6 @@ pub struct ComputeSpec {
// but we don't use it for anything. Serde will ignore missing fields when
// deserializing it.
pub operation_uuid: Option<String>,
/// Compute features to enable. These feature flags are provided, when we
/// know all the details about client's compute, so they cannot be used
/// to change `Empty` compute behavior.
#[serde(default)]
pub features: Vec<ComputeFeature>,
/// Expected cluster state at the end of transition process.
pub cluster: Cluster,
pub delta_operations: Option<Vec<DeltaOp>>,
@@ -75,19 +68,6 @@ pub struct ComputeSpec {
pub remote_extensions: Option<RemoteExtSpec>,
}
/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
#[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum ComputeFeature {
// XXX: Add more feature flags here.
// This is a special feature flag that is used to represent unknown feature flags.
// Basically all unknown to enum flags are represented as this one. See unit test
// `parse_unknown_features()` for more details.
#[serde(other)]
UnknownFeature,
}
#[derive(Clone, Debug, Default, Deserialize, Serialize)]
pub struct RemoteExtSpec {
pub public_extensions: Option<Vec<String>>,
@@ -207,6 +187,8 @@ pub struct DeltaOp {
pub struct Role {
pub name: PgIdent,
pub encrypted_password: Option<String>,
pub replication: Option<bool>,
pub bypassrls: Option<bool>,
pub options: GenericOptions,
}
@@ -247,10 +229,7 @@ mod tests {
#[test]
fn parse_spec_file() {
let file = File::open("tests/cluster_spec.json").unwrap();
let spec: ComputeSpec = serde_json::from_reader(file).unwrap();
// Features list defaults to empty vector.
assert!(spec.features.is_empty());
let _spec: ComputeSpec = serde_json::from_reader(file).unwrap();
}
#[test]
@@ -262,22 +241,4 @@ mod tests {
ob.insert("unknown_field_123123123".into(), "hello".into());
let _spec: ComputeSpec = serde_json::from_value(json).unwrap();
}
#[test]
fn parse_unknown_features() {
// Test that unknown feature flags do not cause any errors.
let file = File::open("tests/cluster_spec.json").unwrap();
let mut json: serde_json::Value = serde_json::from_reader(file).unwrap();
let ob = json.as_object_mut().unwrap();
// Add unknown feature flags.
let features = vec!["foo_bar_feature", "baz_feature"];
ob.insert("features".into(), features.into());
let spec: ComputeSpec = serde_json::from_value(json).unwrap();
assert!(spec.features.len() == 2);
assert!(spec.features.contains(&ComputeFeature::UnknownFeature));
assert_eq!(spec.features, vec![ComputeFeature::UnknownFeature; 2]);
}
}

View File

@@ -3,11 +3,8 @@
//! Otherwise, we might not see all metrics registered via
//! a default registry.
#![deny(clippy::undocumented_unsafe_blocks)]
use once_cell::sync::Lazy;
use prometheus::core::{
Atomic, AtomicU64, Collector, GenericCounter, GenericCounterVec, GenericGauge, GenericGaugeVec,
};
use prometheus::core::{AtomicU64, Collector, GenericGauge, GenericGaugeVec};
pub use prometheus::opts;
pub use prometheus::register;
pub use prometheus::Error;
@@ -135,137 +132,3 @@ fn get_rusage_stats() -> libc::rusage {
rusage.assume_init()
}
}
/// Create an [`IntCounterPairVec`] and registers to default registry.
#[macro_export(local_inner_macros)]
macro_rules! register_int_counter_pair_vec {
($NAME1:expr, $HELP1:expr, $NAME2:expr, $HELP2:expr, $LABELS_NAMES:expr $(,)?) => {{
match (
$crate::register_int_counter_vec!($NAME1, $HELP1, $LABELS_NAMES),
$crate::register_int_counter_vec!($NAME2, $HELP2, $LABELS_NAMES),
) {
(Ok(inc), Ok(dec)) => Ok($crate::IntCounterPairVec::new(inc, dec)),
(Err(e), _) | (_, Err(e)) => Err(e),
}
}};
}
/// Create an [`IntCounterPair`] and registers to default registry.
#[macro_export(local_inner_macros)]
macro_rules! register_int_counter_pair {
($NAME1:expr, $HELP1:expr, $NAME2:expr, $HELP2:expr $(,)?) => {{
match (
$crate::register_int_counter!($NAME1, $HELP1),
$crate::register_int_counter!($NAME2, $HELP2),
) {
(Ok(inc), Ok(dec)) => Ok($crate::IntCounterPair::new(inc, dec)),
(Err(e), _) | (_, Err(e)) => Err(e),
}
}};
}
/// A Pair of [`GenericCounterVec`]s. Like an [`GenericGaugeVec`] but will always observe changes
pub struct GenericCounterPairVec<P: Atomic> {
inc: GenericCounterVec<P>,
dec: GenericCounterVec<P>,
}
/// A Pair of [`GenericCounter`]s. Like an [`GenericGauge`] but will always observe changes
pub struct GenericCounterPair<P: Atomic> {
inc: GenericCounter<P>,
dec: GenericCounter<P>,
}
impl<P: Atomic> GenericCounterPairVec<P> {
pub fn new(inc: GenericCounterVec<P>, dec: GenericCounterVec<P>) -> Self {
Self { inc, dec }
}
/// `get_metric_with_label_values` returns the [`GenericCounterPair<P>`] for the given slice
/// of label values (same order as the VariableLabels in Desc). If that combination of
/// label values is accessed for the first time, a new [`GenericCounterPair<P>`] is created.
///
/// An error is returned if the number of label values is not the same as the
/// number of VariableLabels in Desc.
pub fn get_metric_with_label_values(&self, vals: &[&str]) -> Result<GenericCounterPair<P>> {
Ok(GenericCounterPair {
inc: self.inc.get_metric_with_label_values(vals)?,
dec: self.dec.get_metric_with_label_values(vals)?,
})
}
/// `with_label_values` works as `get_metric_with_label_values`, but panics if an error
/// occurs.
pub fn with_label_values(&self, vals: &[&str]) -> GenericCounterPair<P> {
self.get_metric_with_label_values(vals).unwrap()
}
}
impl<P: Atomic> GenericCounterPair<P> {
pub fn new(inc: GenericCounter<P>, dec: GenericCounter<P>) -> Self {
Self { inc, dec }
}
/// Increment the gauge by 1, returning a guard that decrements by 1 on drop.
pub fn guard(&self) -> GenericCounterPairGuard<P> {
self.inc.inc();
GenericCounterPairGuard(self.dec.clone())
}
/// Increment the gauge by n, returning a guard that decrements by n on drop.
pub fn guard_by(&self, n: P::T) -> GenericCounterPairGuardBy<P> {
self.inc.inc_by(n);
GenericCounterPairGuardBy(self.dec.clone(), n)
}
/// Increase the gauge by 1.
#[inline]
pub fn inc(&self) {
self.inc.inc();
}
/// Decrease the gauge by 1.
#[inline]
pub fn dec(&self) {
self.dec.inc();
}
/// Add the given value to the gauge. (The value can be
/// negative, resulting in a decrement of the gauge.)
#[inline]
pub fn inc_by(&self, v: P::T) {
self.inc.inc_by(v);
}
/// Subtract the given value from the gauge. (The value can be
/// negative, resulting in an increment of the gauge.)
#[inline]
pub fn dec_by(&self, v: P::T) {
self.dec.inc_by(v);
}
}
/// Guard returned by [`GenericCounterPair::guard`]
pub struct GenericCounterPairGuard<P: Atomic>(GenericCounter<P>);
impl<P: Atomic> Drop for GenericCounterPairGuard<P> {
fn drop(&mut self) {
self.0.inc();
}
}
/// Guard returned by [`GenericCounterPair::guard_by`]
pub struct GenericCounterPairGuardBy<P: Atomic>(GenericCounter<P>, P::T);
impl<P: Atomic> Drop for GenericCounterPairGuardBy<P> {
fn drop(&mut self) {
self.0.inc_by(self.1);
}
}
/// A Pair of [`IntCounterVec`]s. Like an [`IntGaugeVec`] but will always observe changes
pub type IntCounterPairVec = GenericCounterPairVec<AtomicU64>;
/// A Pair of [`IntCounter`]s. Like an [`IntGauge`] but will always observe changes
pub type IntCounterPair = GenericCounterPair<AtomicU64>;
/// A guard for [`IntCounterPair`] that will decrement the gauge on drop
pub type IntCounterPairGuard = GenericCounterPairGuard<AtomicU64>;

View File

@@ -18,10 +18,8 @@ enum-map.workspace = true
strum.workspace = true
strum_macros.workspace = true
hex.workspace = true
thiserror.workspace = true
workspace_hack.workspace = true
[dev-dependencies]
bincode.workspace = true
rand.workspace = true

View File

@@ -4,9 +4,7 @@
//! See docs/rfcs/025-generation-numbers.md
use serde::{Deserialize, Serialize};
use utils::id::NodeId;
use crate::shard::TenantShardId;
use utils::id::{NodeId, TenantId};
#[derive(Serialize, Deserialize)]
pub struct ReAttachRequest {
@@ -15,7 +13,7 @@ pub struct ReAttachRequest {
#[derive(Serialize, Deserialize)]
pub struct ReAttachResponseTenant {
pub id: TenantShardId,
pub id: TenantId,
pub gen: u32,
}
@@ -26,7 +24,7 @@ pub struct ReAttachResponse {
#[derive(Serialize, Deserialize)]
pub struct ValidateRequestTenant {
pub id: TenantShardId,
pub id: TenantId,
pub gen: u32,
}
@@ -42,6 +40,6 @@ pub struct ValidateResponse {
#[derive(Serialize, Deserialize)]
pub struct ValidateResponseTenant {
pub id: TenantShardId,
pub id: TenantId,
pub valid: bool,
}

View File

@@ -140,41 +140,3 @@ impl Key {
})
}
}
pub fn is_rel_block_key(key: &Key) -> bool {
key.field1 == 0x00 && key.field4 != 0
}
impl std::str::FromStr for Key {
type Err = anyhow::Error;
fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
Self::from_hex(s)
}
}
#[cfg(test)]
mod tests {
use std::str::FromStr;
use crate::key::Key;
use rand::Rng;
use rand::SeedableRng;
#[test]
fn display_fromstr_bijection() {
let mut rng = rand::rngs::StdRng::seed_from_u64(42);
let key = Key {
field1: rng.gen(),
field2: rng.gen(),
field3: rng.gen(),
field4: rng.gen(),
field5: rng.gen(),
field6: rng.gen(),
};
assert_eq!(key, Key::from_str(&format!("{key}")).unwrap());
}
}

View File

@@ -5,7 +5,6 @@ use const_format::formatcp;
/// Public API types
pub mod control_api;
pub mod key;
pub mod keyspace;
pub mod models;
pub mod reltag;
pub mod shard;

View File

@@ -1,5 +1,3 @@
pub mod partitioning;
use std::{
collections::HashMap,
num::{NonZeroU64, NonZeroUsize},
@@ -12,6 +10,7 @@ use serde_with::serde_as;
use strum_macros;
use utils::{
completion,
generation::Generation,
history_buffer::HistoryBufferWithDropCounter,
id::{NodeId, TenantId, TimelineId},
lsn::Lsn,
@@ -181,8 +180,6 @@ pub struct TimelineCreateRequest {
#[serde(default)]
pub ancestor_timeline_id: Option<TimelineId>,
#[serde(default)]
pub existing_initdb_timeline_id: Option<TimelineId>,
#[serde(default)]
pub ancestor_start_lsn: Option<Lsn>,
pub pg_version: Option<u32>,
}
@@ -239,7 +236,6 @@ pub struct TenantConfig {
pub min_resident_size_override: Option<u64>,
pub evictions_low_residence_duration_metric_threshold: Option<String>,
pub gc_feedback: Option<bool>,
pub heatmap_period: Option<String>,
}
/// A flattened analog of a `pagesever::tenant::LocationMode`, which
@@ -266,19 +262,10 @@ pub struct LocationConfig {
pub mode: LocationConfigMode,
/// If attaching, in what generation?
#[serde(default)]
pub generation: Option<u32>,
pub generation: Option<Generation>,
#[serde(default)]
pub secondary_conf: Option<LocationConfigSecondary>,
// Shard parameters: if shard_count is nonzero, then other shard_* fields
// must be set accurately.
#[serde(default)]
pub shard_number: u8,
#[serde(default)]
pub shard_count: u8,
#[serde(default)]
pub shard_stripe_size: u32,
// If requesting mode `Secondary`, configuration for that.
// Custom storage configuration for the tenant, if any
pub tenant_conf: TenantConfig,
@@ -319,14 +306,31 @@ impl std::ops::Deref for TenantConfigRequest {
impl TenantConfigRequest {
pub fn new(tenant_id: TenantId) -> TenantConfigRequest {
let config = TenantConfig::default();
let config = TenantConfig {
checkpoint_distance: None,
checkpoint_timeout: None,
compaction_target_size: None,
compaction_period: None,
compaction_threshold: None,
gc_horizon: None,
gc_period: None,
image_creation_threshold: None,
pitr_interval: None,
walreceiver_connect_timeout: None,
lagging_wal_timeout: None,
max_lsn_wal_lag: None,
trace_read_requests: None,
eviction_policy: None,
min_resident_size_override: None,
evictions_low_residence_duration_metric_threshold: None,
gc_feedback: None,
};
TenantConfigRequest { tenant_id, config }
}
}
#[derive(Debug, Deserialize)]
pub struct TenantAttachRequest {
#[serde(default)]
pub config: TenantAttachConfig,
#[serde(default)]
pub generation: Option<u32>,
@@ -334,7 +338,7 @@ pub struct TenantAttachRequest {
/// Newtype to enforce deny_unknown_fields on TenantConfig for
/// its usage inside `TenantAttachRequest`.
#[derive(Debug, Serialize, Deserialize, Default)]
#[derive(Debug, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct TenantAttachConfig {
#[serde(flatten)]
@@ -360,7 +364,7 @@ pub enum TenantAttachmentStatus {
#[derive(Serialize, Deserialize, Clone)]
pub struct TenantInfo {
pub id: TenantShardId,
pub id: TenantId,
// NB: intentionally not part of OpenAPI, we don't want to commit to a specific set of TenantState's
pub state: TenantState,
/// Sum of the size of all layer files.
@@ -372,7 +376,7 @@ pub struct TenantInfo {
/// This represents the output of the "timeline_detail" and "timeline_list" API calls.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct TimelineInfo {
pub tenant_id: TenantShardId,
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
pub ancestor_timeline_id: Option<TimelineId>,
@@ -388,12 +392,7 @@ pub struct TimelineInfo {
/// The LSN that we are advertizing to safekeepers
pub remote_consistent_lsn_visible: Lsn,
/// The LSN from the start of the root timeline (never changes)
pub initdb_lsn: Lsn,
pub current_logical_size: u64,
pub current_logical_size_is_accurate: bool,
pub current_logical_size: Option<u64>, // is None when timeline is Unloaded
/// Sum of the size of all layer files.
/// If a layer is present in both local FS and S3, it counts only once.
pub current_physical_size: Option<u64>, // is None when timeline is Unloaded
@@ -829,7 +828,7 @@ mod tests {
fn test_tenantinfo_serde() {
// Test serialization/deserialization of TenantInfo
let original_active = TenantInfo {
id: TenantShardId::unsharded(TenantId::generate()),
id: TenantId::generate(),
state: TenantState::Active,
current_physical_size: Some(42),
attachment_status: TenantAttachmentStatus::Attached,
@@ -846,7 +845,7 @@ mod tests {
});
let original_broken = TenantInfo {
id: TenantShardId::unsharded(TenantId::generate()),
id: TenantId::generate(),
state: TenantState::Broken {
reason: "reason".into(),
backtrace: "backtrace info".into(),

View File

@@ -1,151 +0,0 @@
use utils::lsn::Lsn;
#[derive(Debug, PartialEq, Eq)]
pub struct Partitioning {
pub keys: crate::keyspace::KeySpace,
pub at_lsn: Lsn,
}
impl serde::Serialize for Partitioning {
fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
pub struct KeySpace<'a>(&'a crate::keyspace::KeySpace);
impl<'a> serde::Serialize for KeySpace<'a> {
fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
use serde::ser::SerializeSeq;
let mut seq = serializer.serialize_seq(Some(self.0.ranges.len()))?;
for kr in &self.0.ranges {
seq.serialize_element(&KeyRange(kr))?;
}
seq.end()
}
}
use serde::ser::SerializeMap;
let mut map = serializer.serialize_map(Some(2))?;
map.serialize_key("keys")?;
map.serialize_value(&KeySpace(&self.keys))?;
map.serialize_key("at_lsn")?;
map.serialize_value(&WithDisplay(&self.at_lsn))?;
map.end()
}
}
pub struct WithDisplay<'a, T>(&'a T);
impl<'a, T: std::fmt::Display> serde::Serialize for WithDisplay<'a, T> {
fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
serializer.collect_str(&self.0)
}
}
pub struct KeyRange<'a>(&'a std::ops::Range<crate::key::Key>);
impl<'a> serde::Serialize for KeyRange<'a> {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
use serde::ser::SerializeTuple;
let mut t = serializer.serialize_tuple(2)?;
t.serialize_element(&WithDisplay(&self.0.start))?;
t.serialize_element(&WithDisplay(&self.0.end))?;
t.end()
}
}
impl<'a> serde::Deserialize<'a> for Partitioning {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'a>,
{
pub struct KeySpace(crate::keyspace::KeySpace);
impl<'de> serde::Deserialize<'de> for KeySpace {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
#[serde_with::serde_as]
#[derive(serde::Deserialize)]
#[serde(transparent)]
struct Key(#[serde_as(as = "serde_with::DisplayFromStr")] crate::key::Key);
#[serde_with::serde_as]
#[derive(serde::Deserialize)]
struct Range(Key, Key);
let ranges: Vec<Range> = serde::Deserialize::deserialize(deserializer)?;
Ok(Self(crate::keyspace::KeySpace {
ranges: ranges
.into_iter()
.map(|Range(start, end)| (start.0..end.0))
.collect(),
}))
}
}
#[serde_with::serde_as]
#[derive(serde::Deserialize)]
struct De {
keys: KeySpace,
#[serde_as(as = "serde_with::DisplayFromStr")]
at_lsn: Lsn,
}
let de: De = serde::Deserialize::deserialize(deserializer)?;
Ok(Self {
at_lsn: de.at_lsn,
keys: de.keys.0,
})
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_serialization_roundtrip() {
let reference = r#"
{
"keys": [
[
"000000000000000000000000000000000000",
"000000000000000000000000000000000001"
],
[
"000000067F00000001000000000000000000",
"000000067F00000001000000000000000002"
],
[
"030000000000000000000000000000000000",
"030000000000000000000000000000000003"
]
],
"at_lsn": "0/2240160"
}
"#;
let de: Partitioning = serde_json::from_str(reference).unwrap();
let ser = serde_json::to_string(&de).unwrap();
let ser_de: serde_json::Value = serde_json::from_str(&ser).unwrap();
assert_eq!(
ser_de,
serde_json::from_str::<'_, serde_json::Value>(reference).unwrap()
);
}
}

View File

@@ -1,15 +1,13 @@
use std::{ops::RangeInclusive, str::FromStr};
use crate::key::{is_rel_block_key, Key};
use hex::FromHex;
use serde::{Deserialize, Serialize};
use thiserror;
use utils::id::TenantId;
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug)]
pub struct ShardNumber(pub u8);
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug)]
pub struct ShardCount(pub u8);
impl ShardCount {
@@ -40,7 +38,7 @@ impl ShardNumber {
/// Note that the binary encoding is _not_ backward compatible, because
/// at the time sharding is introduced, there are no existing binary structures
/// containing TenantId that we need to handle.
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy)]
pub struct TenantShardId {
pub tenant_id: TenantId,
pub shard_number: ShardNumber,
@@ -73,33 +71,19 @@ impl TenantShardId {
)
}
pub fn shard_slug(&self) -> impl std::fmt::Display + '_ {
ShardSlug(self)
}
/// Convenience for code that has special behavior on the 0th shard.
pub fn is_zero(&self) -> bool {
self.shard_number == ShardNumber(0)
}
}
/// Formatting helper
struct ShardSlug<'a>(&'a TenantShardId);
impl<'a> std::fmt::Display for ShardSlug<'a> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"{:02x}{:02x}",
self.0.shard_number.0, self.0.shard_count.0
)
pub fn shard_slug(&self) -> String {
format!("{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
}
}
impl std::fmt::Display for TenantShardId {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
if self.shard_count != ShardCount(0) {
write!(f, "{}-{}", self.tenant_id, self.shard_slug())
write!(
f,
"{}-{:02x}{:02x}",
self.tenant_id, self.shard_number.0, self.shard_count.0
)
} else {
// Legacy case (shard_count == 0) -- format as just the tenant id. Note that this
// is distinct from the normal single shard case (shard count == 1).
@@ -155,89 +139,6 @@ impl From<[u8; 18]> for TenantShardId {
}
}
/// For use within the context of a particular tenant, when we need to know which
/// shard we're dealing with, but do not need to know the full ShardIdentity (because
/// we won't be doing any page->shard mapping), and do not need to know the fully qualified
/// TenantShardId.
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy)]
pub struct ShardIndex {
pub shard_number: ShardNumber,
pub shard_count: ShardCount,
}
impl ShardIndex {
pub fn new(number: ShardNumber, count: ShardCount) -> Self {
Self {
shard_number: number,
shard_count: count,
}
}
pub fn unsharded() -> Self {
Self {
shard_number: ShardNumber(0),
shard_count: ShardCount(0),
}
}
pub fn is_unsharded(&self) -> bool {
self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
}
/// For use in constructing remote storage paths: concatenate this with a TenantId
/// to get a fully qualified TenantShardId.
///
/// Backward compat: this function returns an empty string if Self::is_unsharded, such
/// that the legacy pre-sharding remote key format is preserved.
pub fn get_suffix(&self) -> String {
if self.is_unsharded() {
"".to_string()
} else {
format!("-{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
}
}
}
impl std::fmt::Display for ShardIndex {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
}
}
impl std::fmt::Debug for ShardIndex {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
// Debug is the same as Display: the compact hex representation
write!(f, "{}", self)
}
}
impl std::str::FromStr for ShardIndex {
type Err = hex::FromHexError;
fn from_str(s: &str) -> Result<Self, Self::Err> {
// Expect format: 1 byte shard number, 1 byte shard count
if s.len() == 4 {
let bytes = s.as_bytes();
let mut shard_parts: [u8; 2] = [0u8; 2];
hex::decode_to_slice(bytes, &mut shard_parts)?;
Ok(Self {
shard_number: ShardNumber(shard_parts[0]),
shard_count: ShardCount(shard_parts[1]),
})
} else {
Err(hex::FromHexError::InvalidStringLength)
}
}
}
impl From<[u8; 2]> for ShardIndex {
fn from(b: [u8; 2]) -> Self {
Self {
shard_number: ShardNumber(b[0]),
shard_count: ShardCount(b[1]),
}
}
}
impl Serialize for TenantShardId {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
@@ -308,261 +209,6 @@ impl<'de> Deserialize<'de> for TenantShardId {
}
}
/// Stripe size in number of pages
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
pub struct ShardStripeSize(pub u32);
/// Layout version: for future upgrades where we might change how the key->shard mapping works
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
pub struct ShardLayout(u8);
const LAYOUT_V1: ShardLayout = ShardLayout(1);
/// ShardIdentity uses a magic layout value to indicate if it is unusable
const LAYOUT_BROKEN: ShardLayout = ShardLayout(255);
/// Default stripe size in pages: 256MiB divided by 8kiB page size.
const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
/// The ShardIdentity contains the information needed for one member of map
/// to resolve a key to a shard, and then check whether that shard is ==self.
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
pub struct ShardIdentity {
pub number: ShardNumber,
pub count: ShardCount,
stripe_size: ShardStripeSize,
layout: ShardLayout,
}
#[derive(thiserror::Error, Debug, PartialEq, Eq)]
pub enum ShardConfigError {
#[error("Invalid shard count")]
InvalidCount,
#[error("Invalid shard number")]
InvalidNumber,
#[error("Invalid stripe size")]
InvalidStripeSize,
}
impl ShardIdentity {
/// An identity with number=0 count=0 is a "none" identity, which represents legacy
/// tenants. Modern single-shard tenants should not use this: they should
/// have number=0 count=1.
pub fn unsharded() -> Self {
Self {
number: ShardNumber(0),
count: ShardCount(0),
layout: LAYOUT_V1,
stripe_size: DEFAULT_STRIPE_SIZE,
}
}
/// A broken instance of this type is only used for `TenantState::Broken` tenants,
/// which are constructed in code paths that don't have access to proper configuration.
///
/// A ShardIdentity in this state may not be used for anything, and should not be persisted.
/// Enforcement is via assertions, to avoid making our interface fallible for this
/// edge case: it is the Tenant's responsibility to avoid trying to do any I/O when in a broken
/// state, and by extension to avoid trying to do any page->shard resolution.
pub fn broken(number: ShardNumber, count: ShardCount) -> Self {
Self {
number,
count,
layout: LAYOUT_BROKEN,
stripe_size: DEFAULT_STRIPE_SIZE,
}
}
pub fn is_unsharded(&self) -> bool {
self.number == ShardNumber(0) && self.count == ShardCount(0)
}
/// Count must be nonzero, and number must be < count. To construct
/// the legacy case (count==0), use Self::unsharded instead.
pub fn new(
number: ShardNumber,
count: ShardCount,
stripe_size: ShardStripeSize,
) -> Result<Self, ShardConfigError> {
if count.0 == 0 {
Err(ShardConfigError::InvalidCount)
} else if number.0 > count.0 - 1 {
Err(ShardConfigError::InvalidNumber)
} else if stripe_size.0 == 0 {
Err(ShardConfigError::InvalidStripeSize)
} else {
Ok(Self {
number,
count,
layout: LAYOUT_V1,
stripe_size,
})
}
}
fn is_broken(&self) -> bool {
self.layout == LAYOUT_BROKEN
}
pub fn get_shard_number(&self, key: &Key) -> ShardNumber {
assert!(!self.is_broken());
key_to_shard_number(self.count, self.stripe_size, key)
}
/// Return true if the key should be ingested by this shard
pub fn is_key_local(&self, key: &Key) -> bool {
assert!(!self.is_broken());
if self.count < ShardCount(2) || (key_is_shard0(key) && self.number == ShardNumber(0)) {
true
} else {
key_to_shard_number(self.count, self.stripe_size, key) == self.number
}
}
pub fn shard_slug(&self) -> String {
if self.count > ShardCount(0) {
format!("-{:02x}{:02x}", self.number.0, self.count.0)
} else {
String::new()
}
}
/// Convenience for checking if this identity is the 0th shard in a tenant,
/// for special cases on shard 0 such as ingesting relation sizes.
pub fn is_zero(&self) -> bool {
self.number == ShardNumber(0)
}
}
impl Serialize for ShardIndex {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
if serializer.is_human_readable() {
serializer.collect_str(self)
} else {
// Binary encoding is not used in index_part.json, but is included in anticipation of
// switching various structures (e.g. inter-process communication, remote metadata) to more
// compact binary encodings in future.
let mut packed: [u8; 2] = [0; 2];
packed[0] = self.shard_number.0;
packed[1] = self.shard_count.0;
packed.serialize(serializer)
}
}
}
impl<'de> Deserialize<'de> for ShardIndex {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
struct IdVisitor {
is_human_readable_deserializer: bool,
}
impl<'de> serde::de::Visitor<'de> for IdVisitor {
type Value = ShardIndex;
fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
if self.is_human_readable_deserializer {
formatter.write_str("value in form of hex string")
} else {
formatter.write_str("value in form of integer array([u8; 2])")
}
}
fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
where
A: serde::de::SeqAccess<'de>,
{
let s = serde::de::value::SeqAccessDeserializer::new(seq);
let id: [u8; 2] = Deserialize::deserialize(s)?;
Ok(ShardIndex::from(id))
}
fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
where
E: serde::de::Error,
{
ShardIndex::from_str(v).map_err(E::custom)
}
}
if deserializer.is_human_readable() {
deserializer.deserialize_str(IdVisitor {
is_human_readable_deserializer: true,
})
} else {
deserializer.deserialize_tuple(
2,
IdVisitor {
is_human_readable_deserializer: false,
},
)
}
}
}
/// Whether this key is always held on shard 0 (e.g. shard 0 holds all SLRU keys
/// in order to be able to serve basebackup requests without peer communication).
fn key_is_shard0(key: &Key) -> bool {
// To decide what to shard out to shards >0, we apply a simple rule that only
// relation pages are distributed to shards other than shard zero. Everything else gets
// stored on shard 0. This guarantees that shard 0 can independently serve basebackup
// requests, and any request other than those for particular blocks in relations.
//
// In this condition:
// - is_rel_block_key includes only relations, i.e. excludes SLRU data and
// all metadata.
// - field6 is set to -1 for relation size pages.
!(is_rel_block_key(key) && key.field6 != 0xffffffff)
}
/// Provide the same result as the function in postgres `hashfn.h` with the same name
fn murmurhash32(mut h: u32) -> u32 {
h ^= h >> 16;
h = h.wrapping_mul(0x85ebca6b);
h ^= h >> 13;
h = h.wrapping_mul(0xc2b2ae35);
h ^= h >> 16;
h
}
/// Provide the same result as the function in postgres `hashfn.h` with the same name
fn hash_combine(mut a: u32, mut b: u32) -> u32 {
b = b.wrapping_add(0x9e3779b9);
b = b.wrapping_add(a << 6);
b = b.wrapping_add(a >> 2);
a ^= b;
a
}
/// Where a Key is to be distributed across shards, select the shard. This function
/// does not account for keys that should be broadcast across shards.
///
/// The hashing in this function must exactly match what we do in postgres smgr
/// code. The resulting distribution of pages is intended to preserve locality within
/// `stripe_size` ranges of contiguous block numbers in the same relation, while otherwise
/// distributing data pseudo-randomly.
///
/// The mapping of key to shard is not stable across changes to ShardCount: this is intentional
/// and will be handled at higher levels when shards are split.
fn key_to_shard_number(count: ShardCount, stripe_size: ShardStripeSize, key: &Key) -> ShardNumber {
// Fast path for un-sharded tenants or broadcast keys
if count < ShardCount(2) || key_is_shard0(key) {
return ShardNumber(0);
}
// relNode
let mut hash = murmurhash32(key.field4);
// blockNum/stripe size
hash = hash_combine(hash, murmurhash32(key.field6 / stripe_size.0));
ShardNumber((hash % count.0 as u32) as u8)
}
#[cfg(test)]
mod tests {
use std::str::FromStr;
@@ -672,91 +318,4 @@ mod tests {
Ok(())
}
#[test]
fn shard_identity_validation() -> Result<(), ShardConfigError> {
// Happy cases
ShardIdentity::new(ShardNumber(0), ShardCount(1), DEFAULT_STRIPE_SIZE)?;
ShardIdentity::new(ShardNumber(0), ShardCount(1), ShardStripeSize(1))?;
ShardIdentity::new(ShardNumber(254), ShardCount(255), ShardStripeSize(1))?;
assert_eq!(
ShardIdentity::new(ShardNumber(0), ShardCount(0), DEFAULT_STRIPE_SIZE),
Err(ShardConfigError::InvalidCount)
);
assert_eq!(
ShardIdentity::new(ShardNumber(10), ShardCount(10), DEFAULT_STRIPE_SIZE),
Err(ShardConfigError::InvalidNumber)
);
assert_eq!(
ShardIdentity::new(ShardNumber(11), ShardCount(10), DEFAULT_STRIPE_SIZE),
Err(ShardConfigError::InvalidNumber)
);
assert_eq!(
ShardIdentity::new(ShardNumber(255), ShardCount(255), DEFAULT_STRIPE_SIZE),
Err(ShardConfigError::InvalidNumber)
);
assert_eq!(
ShardIdentity::new(ShardNumber(0), ShardCount(1), ShardStripeSize(0)),
Err(ShardConfigError::InvalidStripeSize)
);
Ok(())
}
#[test]
fn shard_index_human_encoding() -> Result<(), hex::FromHexError> {
let example = ShardIndex {
shard_number: ShardNumber(13),
shard_count: ShardCount(17),
};
let expected: String = "0d11".to_string();
let encoded = format!("{example}");
assert_eq!(&encoded, &expected);
let decoded = ShardIndex::from_str(&encoded)?;
assert_eq!(example, decoded);
Ok(())
}
#[test]
fn shard_index_binary_encoding() -> Result<(), hex::FromHexError> {
let example = ShardIndex {
shard_number: ShardNumber(13),
shard_count: ShardCount(17),
};
let expected: [u8; 2] = [0x0d, 0x11];
let encoded = bincode::serialize(&example).unwrap();
assert_eq!(Hex(&encoded), Hex(&expected));
let decoded = bincode::deserialize(&encoded).unwrap();
assert_eq!(example, decoded);
Ok(())
}
// These are only smoke tests to spot check that our implementation doesn't
// deviate from a few examples values: not aiming to validate the overall
// hashing algorithm.
#[test]
fn murmur_hash() {
assert_eq!(murmurhash32(0), 0);
assert_eq!(hash_combine(0xb1ff3b40, 0), 0xfb7923c9);
}
#[test]
fn shard_mapping() {
let key = Key {
field1: 0x00,
field2: 0x67f,
field3: 0x5,
field4: 0x400c,
field5: 0x00,
field6: 0x7d06,
};
let shard = key_to_shard_number(ShardCount(10), DEFAULT_STRIPE_SIZE, &key);
assert_eq!(shard, ShardNumber(8));
}
}

View File

@@ -163,18 +163,8 @@ impl PgConnectionConfig {
}
/// Connect using postgres protocol with TLS disabled.
pub async fn connect_no_tls(
&self,
) -> Result<
(
tokio_postgres::Client,
tokio_postgres::Connection<tokio_postgres::Socket, tokio_postgres::tls::NoTlsStream>,
),
postgres::Error,
> {
self.to_tokio_postgres_config()
.connect(postgres::NoTls)
.await
pub fn connect_no_tls(&self) -> Result<postgres::Client, postgres::Error> {
postgres::Config::from(self.to_tokio_postgres_config()).connect(postgres::NoTls)
}
}

View File

@@ -289,10 +289,10 @@ impl FeStartupPacket {
// We shouldn't advance `buf` as probably full message is not there yet,
// so can't directly use Bytes::get_u32 etc.
let len = (&buf[0..4]).read_u32::<BigEndian>().unwrap() as usize;
// The proposed replacement is `!(8..=MAX_STARTUP_PACKET_LENGTH).contains(&len)`
// The proposed replacement is `!(4..=MAX_STARTUP_PACKET_LENGTH).contains(&len)`
// which is less readable
#[allow(clippy::manual_range_contains)]
if len < 8 || len > MAX_STARTUP_PACKET_LENGTH {
if len < 4 || len > MAX_STARTUP_PACKET_LENGTH {
return Err(ProtocolError::Protocol(format!(
"invalid startup packet message length {}",
len
@@ -975,10 +975,4 @@ mod tests {
let params = make_params("foo\\ bar \\ \\\\ baz\\ lol");
assert_eq!(split_options(&params), ["foo bar", " \\", "baz ", "lol"]);
}
#[test]
fn parse_fe_startup_packet_regression() {
let data = [0, 0, 0, 7, 0, 0, 0, 0];
FeStartupPacket::parse(&mut BytesMut::from_iter(data)).unwrap_err();
}
}

View File

@@ -9,18 +9,18 @@ anyhow.workspace = true
async-trait.workspace = true
once_cell.workspace = true
aws-smithy-async.workspace = true
aws-smithy-types.workspace = true
aws-smithy-http.workspace = true
aws-types.workspace = true
aws-config.workspace = true
aws-sdk-s3.workspace = true
aws-credential-types.workspace = true
bytes.workspace = true
camino.workspace = true
hyper = { workspace = true, features = ["stream"] }
futures.workspace = true
serde.workspace = true
serde_json.workspace = true
tokio = { workspace = true, features = ["sync", "fs", "io-util"] }
tokio-util = { workspace = true, features = ["compat"] }
tokio-util.workspace = true
toml_edit.workspace = true
tracing.workspace = true
scopeguard.workspace = true

View File

@@ -1,24 +1,21 @@
//! Azure Blob Storage wrapper
use std::borrow::Cow;
use std::collections::HashMap;
use std::env;
use std::num::NonZeroU32;
use std::pin::Pin;
use std::sync::Arc;
use std::{borrow::Cow, io::Cursor};
use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
use anyhow::Result;
use azure_core::request_options::{MaxResults, Metadata, Range};
use azure_core::RetryOptions;
use azure_identity::DefaultAzureCredential;
use azure_storage::StorageCredentials;
use azure_storage_blobs::prelude::ClientBuilder;
use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient};
use bytes::Bytes;
use futures::stream::Stream;
use futures_util::StreamExt;
use http_types::StatusCode;
use tokio::io::AsyncRead;
use tracing::debug;
use crate::s3_bucket::RequestKind;
@@ -52,8 +49,7 @@ impl AzureBlobStorage {
StorageCredentials::token_credential(Arc::new(token_credential))
};
// we have an outer retry
let builder = ClientBuilder::new(account, credentials).retry(RetryOptions::none());
let builder = ClientBuilder::new(account, credentials);
let client = builder.container_client(azure_config.container_name.to_owned());
@@ -120,8 +116,7 @@ impl AzureBlobStorage {
let mut metadata = HashMap::new();
// TODO give proper streaming response instead of buffering into RAM
// https://github.com/neondatabase/neon/issues/5563
let mut bufs = Vec::new();
let mut buf = Vec::new();
while let Some(part) = response.next().await {
let part = part.map_err(to_download_error)?;
if let Some(blob_meta) = part.blob.metadata {
@@ -132,10 +127,10 @@ impl AzureBlobStorage {
.collect()
.await
.map_err(|e| DownloadError::Other(e.into()))?;
bufs.push(data);
buf.extend_from_slice(&data.slice(..));
}
Ok(Download {
download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))),
download_stream: Box::pin(Cursor::new(buf)),
metadata: Some(StorageMetadata(metadata)),
})
}
@@ -222,10 +217,9 @@ impl RemoteStorage for AzureBlobStorage {
}
Ok(res)
}
async fn upload(
&self,
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
mut from: impl AsyncRead + Unpin + Send + Sync + 'static,
data_size_bytes: usize,
to: &RemotePath,
metadata: Option<StorageMetadata>,
@@ -233,12 +227,13 @@ impl RemoteStorage for AzureBlobStorage {
let _permit = self.permit(RequestKind::Put).await;
let blob_client = self.client.blob_client(self.relative_path_to_name(to));
let from: Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static>> =
Box::pin(from);
let from = NonSeekableStream::new(from, data_size_bytes);
let body = azure_core::Body::SeekableStream(Box::new(from));
// TODO FIX THIS UGLY HACK and don't buffer the entire object
// into RAM here, but use the streaming interface. For that,
// we'd have to change the interface though...
// https://github.com/neondatabase/neon/issues/5563
let mut buf = Vec::with_capacity(data_size_bytes);
tokio::io::copy(&mut from, &mut buf).await?;
let body = azure_core::Body::Bytes(buf.into());
let mut builder = blob_client.put_block_blob(body);
@@ -271,12 +266,17 @@ impl RemoteStorage for AzureBlobStorage {
let mut builder = blob_client.get();
let range: Range = if let Some(end_exclusive) = end_exclusive {
(start_inclusive..end_exclusive).into()
if let Some(end_exclusive) = end_exclusive {
builder = builder.range(Range::new(start_inclusive, end_exclusive));
} else {
(start_inclusive..).into()
};
builder = builder.range(range);
// Open ranges are not supported by the SDK so we work around
// by setting the upper limit extremely high (but high enough
// to still be representable by signed 64 bit integers).
// TODO remove workaround once the SDK adds open range support
// https://github.com/Azure/azure-sdk-for-rust/issues/1438
let end_exclusive = u64::MAX / 4;
builder = builder.range(Range::new(start_inclusive, end_exclusive));
}
self.download_for_builder(builder).await
}
@@ -312,153 +312,3 @@ impl RemoteStorage for AzureBlobStorage {
Ok(())
}
}
pin_project_lite::pin_project! {
/// Hack to work around not being able to stream once with azure sdk.
///
/// Azure sdk clones streams around with the assumption that they are like
/// `Arc<tokio::fs::File>` (except not supporting tokio), however our streams are not like
/// that. For example for an `index_part.json` we just have a single chunk of [`Bytes`]
/// representing the whole serialized vec. It could be trivially cloneable and "semi-trivially"
/// seekable, but we can also just re-try the request easier.
#[project = NonSeekableStreamProj]
enum NonSeekableStream<S> {
/// A stream wrappers initial form.
///
/// Mutex exists to allow moving when cloning. If the sdk changes to do less than 1
/// clone before first request, then this must be changed.
Initial {
inner: std::sync::Mutex<Option<tokio_util::compat::Compat<tokio_util::io::StreamReader<S, Bytes>>>>,
len: usize,
},
/// The actually readable variant, produced by cloning the Initial variant.
///
/// The sdk currently always clones once, even without retry policy.
Actual {
#[pin]
inner: tokio_util::compat::Compat<tokio_util::io::StreamReader<S, Bytes>>,
len: usize,
read_any: bool,
},
/// Most likely unneeded, but left to make life easier, in case more clones are added.
Cloned {
len_was: usize,
}
}
}
impl<S> NonSeekableStream<S>
where
S: Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
{
fn new(inner: S, len: usize) -> NonSeekableStream<S> {
use tokio_util::compat::TokioAsyncReadCompatExt;
let inner = tokio_util::io::StreamReader::new(inner).compat();
let inner = Some(inner);
let inner = std::sync::Mutex::new(inner);
NonSeekableStream::Initial { inner, len }
}
}
impl<S> std::fmt::Debug for NonSeekableStream<S> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::Initial { len, .. } => f.debug_struct("Initial").field("len", len).finish(),
Self::Actual { len, .. } => f.debug_struct("Actual").field("len", len).finish(),
Self::Cloned { len_was, .. } => f.debug_struct("Cloned").field("len", len_was).finish(),
}
}
}
impl<S> futures::io::AsyncRead for NonSeekableStream<S>
where
S: Stream<Item = std::io::Result<Bytes>>,
{
fn poll_read(
self: std::pin::Pin<&mut Self>,
cx: &mut std::task::Context<'_>,
buf: &mut [u8],
) -> std::task::Poll<std::io::Result<usize>> {
match self.project() {
NonSeekableStreamProj::Actual {
inner, read_any, ..
} => {
*read_any = true;
inner.poll_read(cx, buf)
}
// NonSeekableStream::Initial does not support reading because it is just much easier
// to have the mutex in place where one does not poll the contents, or that's how it
// seemed originally. If there is a version upgrade which changes the cloning, then
// that support needs to be hacked in.
//
// including {self:?} into the message would be useful, but unsure how to unproject.
_ => std::task::Poll::Ready(Err(std::io::Error::new(
std::io::ErrorKind::Other,
"cloned or initial values cannot be read",
))),
}
}
}
impl<S> Clone for NonSeekableStream<S> {
/// Weird clone implementation exists to support the sdk doing cloning before issuing the first
/// request, see type documentation.
fn clone(&self) -> Self {
use NonSeekableStream::*;
match self {
Initial { inner, len } => {
if let Some(inner) = inner.lock().unwrap().take() {
Actual {
inner,
len: *len,
read_any: false,
}
} else {
Self::Cloned { len_was: *len }
}
}
Actual { len, .. } => Cloned { len_was: *len },
Cloned { len_was } => Cloned { len_was: *len_was },
}
}
}
#[async_trait::async_trait]
impl<S> azure_core::SeekableStream for NonSeekableStream<S>
where
S: Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync + 'static,
{
async fn reset(&mut self) -> azure_core::error::Result<()> {
use NonSeekableStream::*;
let msg = match self {
Initial { inner, .. } => {
if inner.get_mut().unwrap().is_some() {
return Ok(());
} else {
"reset after first clone is not supported"
}
}
Actual { read_any, .. } if !*read_any => return Ok(()),
Actual { .. } => "reset after reading is not supported",
Cloned { .. } => "reset after second clone is not supported",
};
Err(azure_core::error::Error::new(
azure_core::error::ErrorKind::Io,
std::io::Error::new(std::io::ErrorKind::Other, msg),
))
}
// Note: it is not documented if this should be the total or remaining length, total passes the
// tests.
fn len(&self) -> usize {
use NonSeekableStream::*;
match self {
Initial { len, .. } => *len,
Actual { len, .. } => *len,
Cloned { len_was, .. } => *len_was,
}
}
}

View File

@@ -19,10 +19,8 @@ use std::{collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::A
use anyhow::{bail, Context};
use camino::{Utf8Path, Utf8PathBuf};
use bytes::Bytes;
use futures::stream::Stream;
use serde::{Deserialize, Serialize};
use tokio::sync::Semaphore;
use tokio::{io, sync::Semaphore};
use toml_edit::Item;
use tracing::info;
@@ -181,7 +179,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
/// Streams the local file contents into remote into the remote storage entry.
async fn upload(
&self,
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
// S3 PUT request requires the content length to be specified,
// otherwise it starts to fail with the concurrent connection count increasing.
data_size_bytes: usize,
@@ -208,7 +206,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
}
pub struct Download {
pub download_stream: Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync>>,
pub download_stream: Pin<Box<dyn io::AsyncRead + Unpin + Send + Sync>>,
/// Extra key-value data, associated with the current remote file.
pub metadata: Option<StorageMetadata>,
}
@@ -302,7 +300,7 @@ impl GenericRemoteStorage {
pub async fn upload(
&self,
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
data_size_bytes: usize,
to: &RemotePath,
metadata: Option<StorageMetadata>,
@@ -400,7 +398,7 @@ impl GenericRemoteStorage {
/// this path is used for the remote object id conversion only.
pub async fn upload_storage_object(
&self,
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
from: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static,
from_size_bytes: usize,
to: &RemotePath,
) -> anyhow::Result<()> {

View File

@@ -7,14 +7,11 @@
use std::{borrow::Cow, future::Future, io::ErrorKind, pin::Pin};
use anyhow::{bail, ensure, Context};
use bytes::Bytes;
use camino::{Utf8Path, Utf8PathBuf};
use futures::stream::Stream;
use tokio::{
fs,
io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
};
use tokio_util::io::ReaderStream;
use tracing::*;
use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};
@@ -102,35 +99,27 @@ impl LocalFs {
};
// If we were given a directory, we may use it as our starting point.
// Otherwise, we must go up to the first ancestor dir that exists. This is because
// Otherwise, we must go up to the parent directory. This is because
// S3 object list prefixes can be arbitrary strings, but when reading
// the local filesystem we need a directory to start calling read_dir on.
let mut initial_dir = full_path.clone();
loop {
// Did we make it to the root?
if initial_dir.parent().is_none() {
anyhow::bail!("list_files: failed to find valid ancestor dir for {full_path}");
}
match fs::metadata(initial_dir.clone()).await {
Ok(meta) if meta.is_dir() => {
// We found a directory, break
break;
}
Ok(_meta) => {
match fs::metadata(full_path.clone()).await {
Ok(meta) => {
if !meta.is_dir() {
// It's not a directory: strip back to the parent
initial_dir.pop();
}
Err(e) if e.kind() == ErrorKind::NotFound => {
// It's not a file that exists: strip the prefix back to the parent directory
initial_dir.pop();
}
Err(e) => {
// Unexpected I/O error
anyhow::bail!(e)
}
}
Err(e) if e.kind() == ErrorKind::NotFound => {
// It's not a file that exists: strip the prefix back to the parent directory
initial_dir.pop();
}
Err(e) => {
// Unexpected I/O error
anyhow::bail!(e)
}
}
// Note that Utf8PathBuf starts_with only considers full path segments, but
// object prefixes are arbitrary strings, so we need the strings for doing
// starts_with later.
@@ -222,7 +211,7 @@ impl RemoteStorage for LocalFs {
async fn upload(
&self,
data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync,
data: impl io::AsyncRead + Unpin + Send + Sync + 'static,
data_size_bytes: usize,
to: &RemotePath,
metadata: Option<StorageMetadata>,
@@ -255,12 +244,9 @@ impl RemoteStorage for LocalFs {
);
let from_size_bytes = data_size_bytes as u64;
let data = tokio_util::io::StreamReader::new(data);
let data = std::pin::pin!(data);
let mut buffer_to_read = data.take(from_size_bytes);
// alternatively we could just write the bytes to a file, but local_fs is a testing utility
let bytes_read = io::copy_buf(&mut buffer_to_read, &mut destination)
let bytes_read = io::copy(&mut buffer_to_read, &mut destination)
.await
.with_context(|| {
format!(
@@ -314,7 +300,7 @@ impl RemoteStorage for LocalFs {
async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
let target_path = from.with_base(&self.storage_root);
if file_exists(&target_path).map_err(DownloadError::BadInput)? {
let source = ReaderStream::new(
let source = io::BufReader::new(
fs::OpenOptions::new()
.read(true)
.open(&target_path)
@@ -354,14 +340,16 @@ impl RemoteStorage for LocalFs {
}
let target_path = from.with_base(&self.storage_root);
if file_exists(&target_path).map_err(DownloadError::BadInput)? {
let mut source = tokio::fs::OpenOptions::new()
.read(true)
.open(&target_path)
.await
.with_context(|| {
format!("Failed to open source file {target_path:?} to use in the download")
})
.map_err(DownloadError::Other)?;
let mut source = io::BufReader::new(
fs::OpenOptions::new()
.read(true)
.open(&target_path)
.await
.with_context(|| {
format!("Failed to open source file {target_path:?} to use in the download")
})
.map_err(DownloadError::Other)?,
);
source
.seek(io::SeekFrom::Start(start_inclusive))
.await
@@ -375,13 +363,11 @@ impl RemoteStorage for LocalFs {
Ok(match end_exclusive {
Some(end_exclusive) => Download {
metadata,
download_stream: Box::pin(ReaderStream::new(
source.take(end_exclusive - start_inclusive),
)),
download_stream: Box::pin(source.take(end_exclusive - start_inclusive)),
},
None => Download {
metadata,
download_stream: Box::pin(ReaderStream::new(source)),
download_stream: Box::pin(source),
},
})
} else {
@@ -481,9 +467,7 @@ fn file_exists(file_path: &Utf8Path) -> anyhow::Result<bool> {
mod fs_tests {
use super::*;
use bytes::Bytes;
use camino_tempfile::tempdir;
use futures_util::Stream;
use std::{collections::HashMap, io::Write};
async fn read_and_assert_remote_file_contents(
@@ -493,7 +477,7 @@ mod fs_tests {
remote_storage_path: &RemotePath,
expected_metadata: Option<&StorageMetadata>,
) -> anyhow::Result<String> {
let download = storage
let mut download = storage
.download(remote_storage_path)
.await
.map_err(|e| anyhow::anyhow!("Download failed: {e}"))?;
@@ -502,9 +486,13 @@ mod fs_tests {
"Unexpected metadata returned for the downloaded file"
);
let contents = aggregate(download.download_stream).await?;
String::from_utf8(contents).map_err(anyhow::Error::new)
let mut contents = String::new();
download
.download_stream
.read_to_string(&mut contents)
.await
.context("Failed to read remote file contents into string")?;
Ok(contents)
}
#[tokio::test]
@@ -533,26 +521,25 @@ mod fs_tests {
let storage = create_storage()?;
let id = RemotePath::new(Utf8Path::new("dummy"))?;
let content = Bytes::from_static(b"12345");
let content = move || futures::stream::once(futures::future::ready(Ok(content.clone())));
let content = std::io::Cursor::new(b"12345");
// Check that you get an error if the size parameter doesn't match the actual
// size of the stream.
storage
.upload(content(), 0, &id, None)
.upload(Box::new(content.clone()), 0, &id, None)
.await
.expect_err("upload with zero size succeeded");
storage
.upload(content(), 4, &id, None)
.upload(Box::new(content.clone()), 4, &id, None)
.await
.expect_err("upload with too short size succeeded");
storage
.upload(content(), 6, &id, None)
.upload(Box::new(content.clone()), 6, &id, None)
.await
.expect_err("upload with too large size succeeded");
// Correct size is 5, this should succeed.
storage.upload(content(), 5, &id, None).await?;
storage.upload(Box::new(content), 5, &id, None).await?;
Ok(())
}
@@ -600,7 +587,7 @@ mod fs_tests {
let uploaded_bytes = dummy_contents(upload_name).into_bytes();
let (first_part_local, second_part_local) = uploaded_bytes.split_at(3);
let first_part_download = storage
let mut first_part_download = storage
.download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64))
.await?;
assert!(
@@ -608,13 +595,21 @@ mod fs_tests {
"No metadata should be returned for no metadata upload"
);
let first_part_remote = aggregate(first_part_download.download_stream).await?;
let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
io::copy(
&mut first_part_download.download_stream,
&mut first_part_remote,
)
.await?;
first_part_remote.flush().await?;
let first_part_remote = first_part_remote.into_inner().into_inner();
assert_eq!(
first_part_local, first_part_remote,
first_part_local,
first_part_remote.as_slice(),
"First part bytes should be returned when requested"
);
let second_part_download = storage
let mut second_part_download = storage
.download_byte_range(
&upload_target,
first_part_local.len() as u64,
@@ -626,9 +621,17 @@ mod fs_tests {
"No metadata should be returned for no metadata upload"
);
let second_part_remote = aggregate(second_part_download.download_stream).await?;
let mut second_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
io::copy(
&mut second_part_download.download_stream,
&mut second_part_remote,
)
.await?;
second_part_remote.flush().await?;
let second_part_remote = second_part_remote.into_inner().into_inner();
assert_eq!(
second_part_local, second_part_remote,
second_part_local,
second_part_remote.as_slice(),
"Second part bytes should be returned when requested"
);
@@ -718,10 +721,17 @@ mod fs_tests {
let uploaded_bytes = dummy_contents(upload_name).into_bytes();
let (first_part_local, _) = uploaded_bytes.split_at(3);
let partial_download_with_metadata = storage
let mut partial_download_with_metadata = storage
.download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64))
.await?;
let first_part_remote = aggregate(partial_download_with_metadata.download_stream).await?;
let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
io::copy(
&mut partial_download_with_metadata.download_stream,
&mut first_part_remote,
)
.await?;
first_part_remote.flush().await?;
let first_part_remote = first_part_remote.into_inner().into_inner();
assert_eq!(
first_part_local,
first_part_remote.as_slice(),
@@ -797,16 +807,16 @@ mod fs_tests {
)
})?;
let file = tokio_util::io::ReaderStream::new(file);
storage.upload(file, size, &relative_path, metadata).await?;
storage
.upload(Box::new(file), size, &relative_path, metadata)
.await?;
Ok(relative_path)
}
async fn create_file_for_upload(
path: &Utf8Path,
contents: &str,
) -> anyhow::Result<(fs::File, usize)> {
) -> anyhow::Result<(io::BufReader<fs::File>, usize)> {
std::fs::create_dir_all(path.parent().unwrap())?;
let mut file_for_writing = std::fs::OpenOptions::new()
.write(true)
@@ -816,7 +826,7 @@ mod fs_tests {
drop(file_for_writing);
let file_size = path.metadata()?.len() as usize;
Ok((
fs::OpenOptions::new().read(true).open(&path).await?,
io::BufReader::new(fs::OpenOptions::new().read(true).open(&path).await?),
file_size,
))
}
@@ -830,16 +840,4 @@ mod fs_tests {
files.sort_by(|a, b| a.0.cmp(&b.0));
Ok(files)
}
async fn aggregate(
stream: impl Stream<Item = std::io::Result<Bytes>>,
) -> anyhow::Result<Vec<u8>> {
use futures::stream::StreamExt;
let mut out = Vec::new();
let mut stream = std::pin::pin!(stream);
while let Some(res) = stream.next().await {
out.extend_from_slice(&res?[..]);
}
Ok(out)
}
}

View File

@@ -4,14 +4,9 @@
//! allowing multiple api users to independently work with the same S3 bucket, if
//! their bucket prefixes are both specified and different.
use std::{
borrow::Cow,
pin::Pin,
sync::Arc,
task::{Context, Poll},
};
use std::{borrow::Cow, sync::Arc};
use anyhow::Context as _;
use anyhow::Context;
use aws_config::{
environment::credentials::EnvironmentVariableCredentialsProvider,
imds::credentials::ImdsCredentialsProvider,
@@ -19,24 +14,23 @@ use aws_config::{
provider_config::ProviderConfig,
retry::{RetryConfigBuilder, RetryMode},
web_identity_token::WebIdentityTokenCredentialsProvider,
BehaviorVersion,
};
use aws_credential_types::provider::SharedCredentialsProvider;
use aws_credential_types::cache::CredentialsCache;
use aws_sdk_s3::{
config::{AsyncSleep, Builder, IdentityCache, Region, SharedAsyncSleep},
config::{AsyncSleep, Config, Region, SharedAsyncSleep},
error::SdkError,
operation::get_object::GetObjectError,
primitives::ByteStream,
types::{Delete, ObjectIdentifier},
Client,
};
use aws_smithy_async::rt::sleep::TokioSleep;
use aws_smithy_types::body::SdkBody;
use aws_smithy_types::byte_stream::ByteStream;
use bytes::Bytes;
use futures::stream::Stream;
use aws_smithy_http::body::SdkBody;
use hyper::Body;
use scopeguard::ScopeGuard;
use tokio::io::{self, AsyncRead};
use tokio_util::io::ReaderStream;
use tracing::debug;
use super::StorageMetadata;
use crate::{
@@ -67,7 +61,7 @@ struct GetObjectRequest {
impl S3Bucket {
/// Creates the S3 storage, errors if incorrect AWS S3 configuration provided.
pub fn new(aws_config: &S3Config) -> anyhow::Result<Self> {
tracing::debug!(
debug!(
"Creating s3 remote storage for S3 bucket {}",
aws_config.bucket_name
);
@@ -84,6 +78,7 @@ impl S3Bucket {
// needed to access remote extensions bucket
.or_else("token", {
let provider_conf = ProviderConfig::without_region().with_region(region.clone());
WebIdentityTokenCredentialsProvider::builder()
.configure(&provider_conf)
.build()
@@ -103,20 +98,18 @@ impl S3Bucket {
.set_max_attempts(Some(1))
.set_mode(Some(RetryMode::Adaptive));
let mut config_builder = Builder::default()
.behavior_version(BehaviorVersion::v2023_11_09())
let mut config_builder = Config::builder()
.region(region)
.identity_cache(IdentityCache::lazy().build())
.credentials_provider(SharedCredentialsProvider::new(credentials_provider))
.retry_config(retry_config.build())
.sleep_impl(SharedAsyncSleep::from(sleep_impl));
.credentials_cache(CredentialsCache::lazy())
.credentials_provider(credentials_provider)
.sleep_impl(SharedAsyncSleep::from(sleep_impl))
.retry_config(retry_config.build());
if let Some(custom_endpoint) = aws_config.endpoint.clone() {
config_builder = config_builder
.endpoint_url(custom_endpoint)
.force_path_style(true);
}
let client = Client::from_conf(config_builder.build());
let prefix_in_bucket = aws_config.prefix_in_bucket.as_deref().map(|prefix| {
@@ -229,15 +222,12 @@ impl S3Bucket {
match get_object {
Ok(object_output) => {
let metadata = object_output.metadata().cloned().map(StorageMetadata);
let body = object_output.body;
let body = ByteStreamAsStream::from(body);
let body = PermitCarrying::new(permit, body);
let body = TimedDownload::new(started_at, body);
Ok(Download {
metadata,
download_stream: Box::pin(body),
download_stream: Box::pin(io::BufReader::new(TimedDownload::new(
started_at,
RatelimitedAsyncRead::new(permit, object_output.body.into_async_read()),
))),
})
}
Err(SdkError::ServiceError(e)) if matches!(e.err(), GetObjectError::NoSuchKey(_)) => {
@@ -250,55 +240,29 @@ impl S3Bucket {
}
}
pin_project_lite::pin_project! {
struct ByteStreamAsStream {
#[pin]
inner: aws_smithy_types::byte_stream::ByteStream
}
}
impl From<aws_smithy_types::byte_stream::ByteStream> for ByteStreamAsStream {
fn from(inner: aws_smithy_types::byte_stream::ByteStream) -> Self {
ByteStreamAsStream { inner }
}
}
impl Stream for ByteStreamAsStream {
type Item = std::io::Result<Bytes>;
fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
// this does the std::io::ErrorKind::Other conversion
self.project().inner.poll_next(cx).map_err(|x| x.into())
}
// cannot implement size_hint because inner.size_hint is remaining size in bytes, which makes
// sense and Stream::size_hint does not really
}
pin_project_lite::pin_project! {
/// An `AsyncRead` adapter which carries a permit for the lifetime of the value.
struct PermitCarrying<S> {
struct RatelimitedAsyncRead<S> {
permit: tokio::sync::OwnedSemaphorePermit,
#[pin]
inner: S,
}
}
impl<S> PermitCarrying<S> {
impl<S: AsyncRead> RatelimitedAsyncRead<S> {
fn new(permit: tokio::sync::OwnedSemaphorePermit, inner: S) -> Self {
Self { permit, inner }
RatelimitedAsyncRead { permit, inner }
}
}
impl<S: Stream<Item = std::io::Result<Bytes>>> Stream for PermitCarrying<S> {
type Item = <S as Stream>::Item;
fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
self.project().inner.poll_next(cx)
}
fn size_hint(&self) -> (usize, Option<usize>) {
self.inner.size_hint()
impl<S: AsyncRead> AsyncRead for RatelimitedAsyncRead<S> {
fn poll_read(
self: std::pin::Pin<&mut Self>,
cx: &mut std::task::Context<'_>,
buf: &mut io::ReadBuf<'_>,
) -> std::task::Poll<std::io::Result<()>> {
let this = self.project();
this.inner.poll_read(cx, buf)
}
}
@@ -318,7 +282,7 @@ pin_project_lite::pin_project! {
}
}
impl<S> TimedDownload<S> {
impl<S: AsyncRead> TimedDownload<S> {
fn new(started_at: std::time::Instant, inner: S) -> Self {
TimedDownload {
started_at,
@@ -328,26 +292,25 @@ impl<S> TimedDownload<S> {
}
}
impl<S: Stream<Item = std::io::Result<Bytes>>> Stream for TimedDownload<S> {
type Item = <S as Stream>::Item;
fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
use std::task::ready;
impl<S: AsyncRead> AsyncRead for TimedDownload<S> {
fn poll_read(
self: std::pin::Pin<&mut Self>,
cx: &mut std::task::Context<'_>,
buf: &mut io::ReadBuf<'_>,
) -> std::task::Poll<std::io::Result<()>> {
let this = self.project();
let before = buf.filled().len();
let read = std::task::ready!(this.inner.poll_read(cx, buf));
let res = ready!(this.inner.poll_next(cx));
match &res {
Some(Ok(_)) => {}
Some(Err(_)) => *this.outcome = metrics::AttemptOutcome::Err,
None => *this.outcome = metrics::AttemptOutcome::Ok,
let read_eof = buf.filled().len() == before;
match read {
Ok(()) if read_eof => *this.outcome = AttemptOutcome::Ok,
Ok(()) => { /* still in progress */ }
Err(_) => *this.outcome = AttemptOutcome::Err,
}
Poll::Ready(res)
}
fn size_hint(&self) -> (usize, Option<usize>) {
self.inner.size_hint()
std::task::Poll::Ready(read)
}
}
@@ -408,11 +371,11 @@ impl RemoteStorage for S3Bucket {
let response = response?;
let keys = response.contents();
let keys = response.contents().unwrap_or_default();
let empty = Vec::new();
let prefixes = response.common_prefixes.as_ref().unwrap_or(&empty);
tracing::debug!("list: {} prefixes, {} keys", prefixes.len(), keys.len());
tracing::info!("list: {} prefixes, {} keys", prefixes.len(), keys.len());
for object in keys {
let object_path = object.key().expect("response does not contain a key");
@@ -437,7 +400,7 @@ impl RemoteStorage for S3Bucket {
async fn upload(
&self,
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
from_size_bytes: usize,
to: &RemotePath,
metadata: Option<StorageMetadata>,
@@ -447,8 +410,8 @@ impl RemoteStorage for S3Bucket {
let started_at = start_measuring_requests(kind);
let body = Body::wrap_stream(from);
let bytes_stream = ByteStream::new(SdkBody::from_body_0_4(body));
let body = Body::wrap_stream(ReaderStream::new(from));
let bytes_stream = ByteStream::new(SdkBody::from(body));
let res = self
.client
@@ -511,7 +474,7 @@ impl RemoteStorage for S3Bucket {
for path in paths {
let obj_id = ObjectIdentifier::builder()
.set_key(Some(self.relative_path_to_s3_object(path)))
.build()?;
.build();
delete_objects.push(obj_id);
}
@@ -522,11 +485,7 @@ impl RemoteStorage for S3Bucket {
.client
.delete_objects()
.bucket(self.bucket_name.clone())
.delete(
Delete::builder()
.set_objects(Some(chunk.to_vec()))
.build()?,
)
.delete(Delete::builder().set_objects(Some(chunk.to_vec())).build())
.send()
.await;

View File

@@ -1,8 +1,6 @@
//! This module provides a wrapper around a real RemoteStorage implementation that
//! causes the first N attempts at each upload or download operatio to fail. For
//! testing purposes.
use bytes::Bytes;
use futures::stream::Stream;
use std::collections::hash_map::Entry;
use std::collections::HashMap;
use std::sync::Mutex;
@@ -110,7 +108,7 @@ impl RemoteStorage for UnreliableWrapper {
async fn upload(
&self,
data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
data: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static,
// S3 PUT request requires the content length to be specified,
// otherwise it starts to fail with the concurrent connection count increasing.
data_size_bytes: usize,

View File

@@ -7,9 +7,7 @@ use std::sync::Arc;
use std::time::UNIX_EPOCH;
use anyhow::Context;
use bytes::Bytes;
use camino::Utf8Path;
use futures::stream::Stream;
use once_cell::sync::OnceCell;
use remote_storage::{
AzureConfig, Download, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind,
@@ -182,14 +180,23 @@ async fn azure_delete_objects_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Resu
let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
.with_context(|| "RemotePath conversion")?;
let (data, len) = upload_stream("remote blob data1".as_bytes().into());
ctx.client.upload(data, len, &path1, None).await?;
let data1 = "remote blob data1".as_bytes();
let data1_len = data1.len();
let data2 = "remote blob data2".as_bytes();
let data2_len = data2.len();
let data3 = "remote blob data3".as_bytes();
let data3_len = data3.len();
ctx.client
.upload(std::io::Cursor::new(data1), data1_len, &path1, None)
.await?;
let (data, len) = upload_stream("remote blob data2".as_bytes().into());
ctx.client.upload(data, len, &path2, None).await?;
ctx.client
.upload(std::io::Cursor::new(data2), data2_len, &path2, None)
.await?;
let (data, len) = upload_stream("remote blob data3".as_bytes().into());
ctx.client.upload(data, len, &path3, None).await?;
ctx.client
.upload(std::io::Cursor::new(data3), data3_len, &path3, None)
.await?;
ctx.client.delete_objects(&[path1, path2]).await?;
@@ -212,56 +219,53 @@ async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Res
let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str()))
.with_context(|| "RemotePath conversion")?;
let orig = bytes::Bytes::from_static("remote blob data here".as_bytes());
let data = "remote blob data here".as_bytes();
let data_len = data.len() as u64;
let (data, len) = wrap_stream(orig.clone());
ctx.client.upload(data, len, &path, None).await?;
async fn download_and_compare(dl: Download) -> anyhow::Result<Vec<u8>> {
let mut buf = Vec::new();
tokio::io::copy_buf(
&mut tokio_util::io::StreamReader::new(dl.download_stream),
&mut buf,
)
ctx.client
.upload(std::io::Cursor::new(data), data.len(), &path, None)
.await?;
async fn download_and_compare(mut dl: Download) -> anyhow::Result<Vec<u8>> {
let mut buf = Vec::new();
tokio::io::copy(&mut dl.download_stream, &mut buf).await?;
Ok(buf)
}
// Normal download request
let dl = ctx.client.download(&path).await?;
let buf = download_and_compare(dl).await?;
assert_eq!(&buf, &orig);
assert_eq!(buf, data);
// Full range (end specified)
let dl = ctx
.client
.download_byte_range(&path, 0, Some(len as u64))
.download_byte_range(&path, 0, Some(data_len))
.await?;
let buf = download_and_compare(dl).await?;
assert_eq!(&buf, &orig);
assert_eq!(buf, data);
// partial range (end specified)
let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?;
let buf = download_and_compare(dl).await?;
assert_eq!(&buf, &orig[4..10]);
assert_eq!(buf, data[4..10]);
// partial range (end beyond real end)
let dl = ctx
.client
.download_byte_range(&path, 8, Some(len as u64 * 100))
.download_byte_range(&path, 8, Some(data_len * 100))
.await?;
let buf = download_and_compare(dl).await?;
assert_eq!(&buf, &orig[8..]);
assert_eq!(buf, data[8..]);
// Partial range (end unspecified)
let dl = ctx.client.download_byte_range(&path, 4, None).await?;
let buf = download_and_compare(dl).await?;
assert_eq!(&buf, &orig[4..]);
assert_eq!(buf, data[4..]);
// Full range (end unspecified)
let dl = ctx.client.download_byte_range(&path, 0, None).await?;
let buf = download_and_compare(dl).await?;
assert_eq!(&buf, &orig);
assert_eq!(buf, data);
debug!("Cleanup: deleting file at path {path:?}");
ctx.client
@@ -277,7 +281,6 @@ fn ensure_logging_ready() {
utils::logging::init(
utils::logging::LogFormat::Test,
utils::logging::TracingErrorLayerEnablement::Disabled,
utils::logging::Output::Stdout,
)
.expect("logging init failed");
});
@@ -500,8 +503,11 @@ async fn upload_azure_data(
let blob_path = blob_prefix.join(Utf8Path::new(&format!("blob_{i}")));
debug!("Creating remote item {i} at path {blob_path:?}");
let (data, len) = upload_stream(format!("remote blob data {i}").into_bytes().into());
task_client.upload(data, len, &blob_path, None).await?;
let data = format!("remote blob data {i}").into_bytes();
let data_len = data.len();
task_client
.upload(std::io::Cursor::new(data), data_len, &blob_path, None)
.await?;
Ok::<_, anyhow::Error>((blob_prefix, blob_path))
});
@@ -582,8 +588,11 @@ async fn upload_simple_azure_data(
.with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
debug!("Creating remote item {i} at path {blob_path:?}");
let (data, len) = upload_stream(format!("remote blob data {i}").into_bytes().into());
task_client.upload(data, len, &blob_path, None).await?;
let data = format!("remote blob data {i}").into_bytes();
let data_len = data.len();
task_client
.upload(std::io::Cursor::new(data), data_len, &blob_path, None)
.await?;
Ok::<_, anyhow::Error>(blob_path)
});
@@ -612,32 +621,3 @@ async fn upload_simple_azure_data(
ControlFlow::Continue(uploaded_blobs)
}
}
// FIXME: copypasted from test_real_s3, can't remember how to share a module which is not compiled
// to binary
fn upload_stream(
content: std::borrow::Cow<'static, [u8]>,
) -> (
impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
usize,
) {
use std::borrow::Cow;
let content = match content {
Cow::Borrowed(x) => Bytes::from_static(x),
Cow::Owned(vec) => Bytes::from(vec),
};
wrap_stream(content)
}
fn wrap_stream(
content: bytes::Bytes,
) -> (
impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
usize,
) {
let len = content.len();
let content = futures::future::ready(Ok(content));
(futures::stream::once(content), len)
}

View File

@@ -7,9 +7,7 @@ use std::sync::Arc;
use std::time::UNIX_EPOCH;
use anyhow::Context;
use bytes::Bytes;
use camino::Utf8Path;
use futures::stream::Stream;
use once_cell::sync::OnceCell;
use remote_storage::{
GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
@@ -178,14 +176,23 @@ async fn s3_delete_objects_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()>
let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
.with_context(|| "RemotePath conversion")?;
let (data, len) = upload_stream("remote blob data1".as_bytes().into());
ctx.client.upload(data, len, &path1, None).await?;
let data1 = "remote blob data1".as_bytes();
let data1_len = data1.len();
let data2 = "remote blob data2".as_bytes();
let data2_len = data2.len();
let data3 = "remote blob data3".as_bytes();
let data3_len = data3.len();
ctx.client
.upload(std::io::Cursor::new(data1), data1_len, &path1, None)
.await?;
let (data, len) = upload_stream("remote blob data2".as_bytes().into());
ctx.client.upload(data, len, &path2, None).await?;
ctx.client
.upload(std::io::Cursor::new(data2), data2_len, &path2, None)
.await?;
let (data, len) = upload_stream("remote blob data3".as_bytes().into());
ctx.client.upload(data, len, &path3, None).await?;
ctx.client
.upload(std::io::Cursor::new(data3), data3_len, &path3, None)
.await?;
ctx.client.delete_objects(&[path1, path2]).await?;
@@ -203,7 +210,6 @@ fn ensure_logging_ready() {
utils::logging::init(
utils::logging::LogFormat::Test,
utils::logging::TracingErrorLayerEnablement::Disabled,
utils::logging::Output::Stdout,
)
.expect("logging init failed");
});
@@ -425,9 +431,11 @@ async fn upload_s3_data(
let blob_path = blob_prefix.join(Utf8Path::new(&format!("blob_{i}")));
debug!("Creating remote item {i} at path {blob_path:?}");
let (data, data_len) =
upload_stream(format!("remote blob data {i}").into_bytes().into());
task_client.upload(data, data_len, &blob_path, None).await?;
let data = format!("remote blob data {i}").into_bytes();
let data_len = data.len();
task_client
.upload(std::io::Cursor::new(data), data_len, &blob_path, None)
.await?;
Ok::<_, anyhow::Error>((blob_prefix, blob_path))
});
@@ -508,9 +516,11 @@ async fn upload_simple_s3_data(
.with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
debug!("Creating remote item {i} at path {blob_path:?}");
let (data, data_len) =
upload_stream(format!("remote blob data {i}").into_bytes().into());
task_client.upload(data, data_len, &blob_path, None).await?;
let data = format!("remote blob data {i}").into_bytes();
let data_len = data.len();
task_client
.upload(std::io::Cursor::new(data), data_len, &blob_path, None)
.await?;
Ok::<_, anyhow::Error>(blob_path)
});
@@ -539,30 +549,3 @@ async fn upload_simple_s3_data(
ControlFlow::Continue(uploaded_blobs)
}
}
fn upload_stream(
content: std::borrow::Cow<'static, [u8]>,
) -> (
impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
usize,
) {
use std::borrow::Cow;
let content = match content {
Cow::Borrowed(x) => Bytes::from_static(x),
Cow::Owned(vec) => Bytes::from(vec),
};
wrap_stream(content)
}
fn wrap_stream(
content: bytes::Bytes,
) -> (
impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
usize,
) {
let len = content.len();
let content = futures::future::ready(Ok(content));
(futures::stream::once(content), len)
}

View File

@@ -50,8 +50,6 @@ const_format.workspace = true
# why is it only here? no other crate should use it, streams are rarely needed.
tokio-stream = { version = "0.1.14" }
serde_path_to_error.workspace = true
[dev-dependencies]
byteorder.workspace = true
bytes.workspace = true

View File

@@ -1,21 +0,0 @@
#!/bin/bash
# like restore_from_wal.sh, but takes existing initdb.tar.zst
set -euxo pipefail
PG_BIN=$1
WAL_PATH=$2
DATA_DIR=$3
PORT=$4
echo "port=$PORT" >> "$DATA_DIR"/postgresql.conf
echo "shared_preload_libraries='\$libdir/neon_rmgr.so'" >> "$DATA_DIR"/postgresql.conf
REDO_POS=0x$("$PG_BIN"/pg_controldata -D "$DATA_DIR" | grep -F "REDO location"| cut -c 42-)
declare -i WAL_SIZE=$REDO_POS+114
"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l "$DATA_DIR/logfile.log" start
"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l "$DATA_DIR/logfile.log" stop -m immediate
cp "$DATA_DIR"/pg_wal/000000010000000000000001 .
cp "$WAL_PATH"/* "$DATA_DIR"/pg_wal/
for partial in "$DATA_DIR"/pg_wal/*.partial ; do mv "$partial" "${partial%.partial}" ; done
dd if=000000010000000000000001 of="$DATA_DIR"/pg_wal/000000010000000000000001 bs=$WAL_SIZE count=1 conv=notrunc
rm -f 000000010000000000000001

View File

@@ -1,14 +1,16 @@
use tokio_util::task::{task_tracker::TaskTrackerToken, TaskTracker};
use std::sync::Arc;
use tokio::sync::{mpsc, Mutex};
/// While a reference is kept around, the associated [`Barrier::wait`] will wait.
///
/// Can be cloned, moved and kept around in futures as "guard objects".
#[derive(Clone)]
pub struct Completion(TaskTrackerToken);
pub struct Completion(mpsc::Sender<()>);
/// Barrier will wait until all clones of [`Completion`] have been dropped.
#[derive(Clone)]
pub struct Barrier(TaskTracker);
pub struct Barrier(Arc<Mutex<mpsc::Receiver<()>>>);
impl Default for Barrier {
fn default() -> Self {
@@ -19,7 +21,7 @@ impl Default for Barrier {
impl Barrier {
pub async fn wait(self) {
self.0.wait().await;
self.0.lock().await.recv().await;
}
pub async fn maybe_wait(barrier: Option<Barrier>) {
@@ -31,7 +33,8 @@ impl Barrier {
impl PartialEq for Barrier {
fn eq(&self, other: &Self) -> bool {
TaskTracker::ptr_eq(&self.0, &other.0)
// we don't use dyn so this is good
Arc::ptr_eq(&self.0, &other.0)
}
}
@@ -39,10 +42,8 @@ impl Eq for Barrier {}
/// Create new Guard and Barrier pair.
pub fn channel() -> (Completion, Barrier) {
let tracker = TaskTracker::new();
// otherwise wait never exits
tracker.close();
let token = tracker.token();
(Completion(token), Barrier(tracker))
let (tx, rx) = mpsc::channel::<()>(1);
let rx = Mutex::new(rx);
let rx = Arc::new(rx);
(Completion(tx), Barrier(rx))
}

View File

@@ -152,16 +152,3 @@ impl Debug for Generation {
}
}
}
#[cfg(test)]
mod test {
use super::*;
#[test]
fn generation_gt() {
// Important that a None generation compares less than a valid one, during upgrades from
// pre-generation systems.
assert!(Generation::none() < Generation::new(0));
assert!(Generation::none() < Generation::new(1));
}
}

View File

@@ -25,12 +25,8 @@ pub async fn json_request_or_empty_body<T: for<'de> Deserialize<'de>>(
if body.remaining() == 0 {
return Ok(None);
}
let mut deser = serde_json::de::Deserializer::from_reader(body.reader());
serde_path_to_error::deserialize(&mut deser)
// intentionally stringify because the debug version is not helpful in python logs
.map_err(|e| anyhow::anyhow!("Failed to parse json request: {e}"))
serde_json::from_reader(body.reader())
.context("Failed to parse json request")
.map(Some)
.map_err(ApiError::BadRequest)
}

View File

@@ -1,7 +1,6 @@
use std::str::FromStr;
use anyhow::Context;
use metrics::{IntCounter, IntCounterVec};
use once_cell::sync::Lazy;
use strum_macros::{EnumString, EnumVariantNames};
@@ -25,48 +24,16 @@ impl LogFormat {
}
}
struct TracingEventCountMetric {
error: IntCounter,
warn: IntCounter,
info: IntCounter,
debug: IntCounter,
trace: IntCounter,
}
static TRACING_EVENT_COUNT_METRIC: Lazy<TracingEventCountMetric> = Lazy::new(|| {
let vec = metrics::register_int_counter_vec!(
static TRACING_EVENT_COUNT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
metrics::register_int_counter_vec!(
"libmetrics_tracing_event_count",
"Number of tracing events, by level",
&["level"]
)
.expect("failed to define metric");
TracingEventCountMetric::new(vec)
.expect("failed to define metric")
});
impl TracingEventCountMetric {
fn new(vec: IntCounterVec) -> Self {
Self {
error: vec.with_label_values(&["error"]),
warn: vec.with_label_values(&["warn"]),
info: vec.with_label_values(&["info"]),
debug: vec.with_label_values(&["debug"]),
trace: vec.with_label_values(&["trace"]),
}
}
fn inc_for_level(&self, level: tracing::Level) {
let counter = match level {
tracing::Level::ERROR => &self.error,
tracing::Level::WARN => &self.warn,
tracing::Level::INFO => &self.info,
tracing::Level::DEBUG => &self.debug,
tracing::Level::TRACE => &self.trace,
};
counter.inc();
}
}
struct TracingEventCountLayer(&'static TracingEventCountMetric);
struct TracingEventCountLayer(&'static metrics::IntCounterVec);
impl<S> tracing_subscriber::layer::Layer<S> for TracingEventCountLayer
where
@@ -77,7 +44,15 @@ where
event: &tracing::Event<'_>,
_ctx: tracing_subscriber::layer::Context<'_, S>,
) {
self.0.inc_for_level(*event.metadata().level());
let level = event.metadata().level();
let level = match *level {
tracing::Level::ERROR => "error",
tracing::Level::WARN => "warn",
tracing::Level::INFO => "info",
tracing::Level::DEBUG => "debug",
tracing::Level::TRACE => "trace",
};
self.0.with_label_values(&[level]).inc();
}
}
@@ -91,17 +66,9 @@ pub enum TracingErrorLayerEnablement {
EnableWithRustLogFilter,
}
/// Where the logging should output to.
#[derive(Clone, Copy)]
pub enum Output {
Stdout,
Stderr,
}
pub fn init(
log_format: LogFormat,
tracing_error_layer_enablement: TracingErrorLayerEnablement,
output: Output,
) -> anyhow::Result<()> {
// We fall back to printing all spans at info-level or above if
// the RUST_LOG environment variable is not set.
@@ -118,12 +85,7 @@ pub fn init(
let log_layer = tracing_subscriber::fmt::layer()
.with_target(false)
.with_ansi(false)
.with_writer(move || -> Box<dyn std::io::Write> {
match output {
Output::Stdout => Box::new(std::io::stdout()),
Output::Stderr => Box::new(std::io::stderr()),
}
});
.with_writer(std::io::stdout);
let log_layer = match log_format {
LogFormat::Json => log_layer.json().boxed(),
LogFormat::Plain => log_layer.boxed(),
@@ -131,9 +93,7 @@ pub fn init(
};
log_layer.with_filter(rust_log_env_filter())
});
let r = r.with(
TracingEventCountLayer(&TRACING_EVENT_COUNT_METRIC).with_filter(rust_log_env_filter()),
);
let r = r.with(TracingEventCountLayer(&TRACING_EVENT_COUNT).with_filter(rust_log_env_filter()));
match tracing_error_layer_enablement {
TracingErrorLayerEnablement::EnableWithRustLogFilter => r
.with(tracing_error::ErrorLayer::default().with_filter(rust_log_env_filter()))
@@ -284,14 +244,14 @@ impl std::fmt::Debug for SecretString {
mod tests {
use metrics::{core::Opts, IntCounterVec};
use crate::logging::{TracingEventCountLayer, TracingEventCountMetric};
use super::TracingEventCountLayer;
#[test]
fn tracing_event_count_metric() {
let counter_vec =
IntCounterVec::new(Opts::new("testmetric", "testhelp"), &["level"]).unwrap();
let metric = Box::leak(Box::new(TracingEventCountMetric::new(counter_vec.clone())));
let layer = TracingEventCountLayer(metric);
let counter_vec = Box::leak(Box::new(counter_vec)); // make it 'static
let layer = TracingEventCountLayer(counter_vec);
use tracing_subscriber::prelude::*;
tracing::subscriber::with_default(tracing_subscriber::registry().with(layer), || {

View File

@@ -1,10 +1,10 @@
//!
//! RCU stands for Read-Copy-Update. It's a synchronization mechanism somewhat
//! similar to a lock, but it allows readers to "hold on" to an old value of RCU
//! without blocking writers, and allows writing a new value without blocking
//! readers. When you update the value, the new value is immediately visible
//! without blocking writers, and allows writing a new values without blocking
//! readers. When you update the new value, the new value is immediately visible
//! to new readers, but the update waits until all existing readers have
//! finished, so that on return, no one sees the old value anymore.
//! finishe, so that no one sees the old value anymore.
//!
//! This implementation isn't wait-free; it uses an RwLock that is held for a
//! short duration when the value is read or updated.
@@ -26,7 +26,6 @@
//! Increment the value by one, and wait for old readers to finish:
//!
//! ```
//! # async fn dox() {
//! # let rcu = utils::simple_rcu::Rcu::new(1);
//! let write_guard = rcu.lock_for_write();
//!
@@ -37,17 +36,15 @@
//!
//! // Concurrent reads and writes are now possible again. Wait for all the readers
//! // that still observe the old value to finish.
//! waitlist.wait().await;
//! # }
//! waitlist.wait();
//! ```
//!
#![warn(missing_docs)]
use std::ops::Deref;
use std::sync::mpsc::{sync_channel, Receiver, SyncSender};
use std::sync::{Arc, Weak};
use std::sync::{RwLock, RwLockWriteGuard};
use tokio::sync::watch;
use std::sync::{Mutex, RwLock, RwLockWriteGuard};
///
/// Rcu allows multiple readers to read and hold onto a value without blocking
@@ -71,21 +68,22 @@ struct RcuCell<V> {
value: V,
/// A dummy channel. We never send anything to this channel. The point is
/// that when the RcuCell is dropped, any subscribed Receivers will be notified
/// that when the RcuCell is dropped, any cloned Senders will be notified
/// that the channel is closed. Updaters can use this to wait out until the
/// RcuCell has been dropped, i.e. until the old value is no longer in use.
///
/// We never send anything to this, we just need to hold onto it so that the
/// Receivers will be notified when it's dropped.
watch: watch::Sender<()>,
/// We never do anything with the receiver, we just need to hold onto it so
/// that the Senders will be notified when it's dropped. But because it's
/// not Sync, we need a Mutex on it.
watch: (SyncSender<()>, Mutex<Receiver<()>>),
}
impl<V> RcuCell<V> {
fn new(value: V) -> Self {
let (watch_sender, _) = watch::channel(());
let (watch_sender, watch_receiver) = sync_channel(0);
RcuCell {
value,
watch: watch_sender,
watch: (watch_sender, Mutex::new(watch_receiver)),
}
}
}
@@ -143,10 +141,10 @@ impl<V> Deref for RcuReadGuard<V> {
///
/// Write guard returned by `write`
///
/// NB: Holding this guard blocks all concurrent `read` and `write` calls, so it should only be
/// held for a short duration!
/// NB: Holding this guard blocks all concurrent `read` and `write` calls, so
/// it should only be held for a short duration!
///
/// Calling [`Self::store_and_unlock`] consumes the guard, making new reads and new writes possible
/// Calling `store` consumes the guard, making new reads and new writes possible
/// again.
///
pub struct RcuWriteGuard<'a, V> {
@@ -181,7 +179,7 @@ impl<'a, V> RcuWriteGuard<'a, V> {
// the watches for any that do.
self.inner.old_cells.retain(|weak| {
if let Some(cell) = weak.upgrade() {
watches.push(cell.watch.subscribe());
watches.push(cell.watch.0.clone());
true
} else {
false
@@ -195,20 +193,20 @@ impl<'a, V> RcuWriteGuard<'a, V> {
///
/// List of readers who can still see old values.
///
pub struct RcuWaitList(Vec<watch::Receiver<()>>);
pub struct RcuWaitList(Vec<SyncSender<()>>);
impl RcuWaitList {
///
/// Wait for old readers to finish.
///
pub async fn wait(mut self) {
pub fn wait(mut self) {
// after all the old_cells are no longer in use, we're done
for w in self.0.iter_mut() {
// This will block until the Receiver is closed. That happens when
// the RcuCell is dropped.
#[allow(clippy::single_match)]
match w.changed().await {
Ok(_) => panic!("changed() unexpectedly succeeded on dummy channel"),
match w.send(()) {
Ok(_) => panic!("send() unexpectedly succeeded on dummy channel"),
Err(_) => {
// closed, which means that the cell has been dropped, and
// its value is no longer in use
@@ -222,10 +220,11 @@ impl RcuWaitList {
mod tests {
use super::*;
use std::sync::{Arc, Mutex};
use std::thread::{sleep, spawn};
use std::time::Duration;
#[tokio::test]
async fn two_writers() {
#[test]
fn two_writers() {
let rcu = Rcu::new(1);
let read1 = rcu.read();
@@ -249,35 +248,33 @@ mod tests {
assert_eq!(*read1, 1);
let log = Arc::new(Mutex::new(Vec::new()));
// Wait for the old readers to finish in separate tasks.
// Wait for the old readers to finish in separate threads.
let log_clone = Arc::clone(&log);
let task2 = tokio::spawn(async move {
wait2.wait().await;
let thread2 = spawn(move || {
wait2.wait();
log_clone.lock().unwrap().push("wait2 done");
});
let log_clone = Arc::clone(&log);
let task3 = tokio::spawn(async move {
wait3.wait().await;
let thread3 = spawn(move || {
wait3.wait();
log_clone.lock().unwrap().push("wait3 done");
});
// without this sleep the test can pass on accident if the writer is slow
tokio::time::sleep(Duration::from_millis(100)).await;
sleep(Duration::from_millis(500));
// Release first reader. This allows first write to finish, but calling
// wait() on the 'task3' would still block.
// wait() on the second one would still block.
log.lock().unwrap().push("dropping read1");
drop(read1);
task2.await.unwrap();
thread2.join().unwrap();
assert!(!task3.is_finished());
tokio::time::sleep(Duration::from_millis(100)).await;
sleep(Duration::from_millis(500));
// Release second reader, and finish second writer.
log.lock().unwrap().push("dropping read2");
drop(read2);
task3.await.unwrap();
thread3.join().unwrap();
assert_eq!(
log.lock().unwrap().as_slice(),

View File

@@ -30,32 +30,18 @@ async fn warn_if_stuck<Fut: std::future::Future>(
let mut fut = std::pin::pin!(fut);
let mut warned = false;
let ret = loop {
loop {
match tokio::time::timeout(warn_period, &mut fut).await {
Ok(ret) => break ret,
Ok(ret) => return ret,
Err(_) => {
tracing::warn!(
gate = name,
elapsed_ms = started.elapsed().as_millis(),
"still waiting, taking longer than expected..."
);
warned = true;
}
}
};
// If we emitted a warning for slowness, also emit a message when we complete, so that
// someone debugging a shutdown can know for sure whether we have moved past this operation.
if warned {
tracing::info!(
gate = name,
elapsed_ms = started.elapsed().as_millis(),
"completed, after taking longer than expected"
)
}
ret
}
#[derive(Debug)]

View File

@@ -2,11 +2,8 @@ use std::time::Duration;
use tokio_util::sync::CancellationToken;
#[derive(thiserror::Error, Debug)]
pub enum TimeoutCancellableError {
#[error("Timed out")]
Timeout,
#[error("Cancelled")]
Cancelled,
}

View File

@@ -1,6 +1,3 @@
//! Links with walproposer, pgcommon, pgport and runs bindgen on walproposer.h
//! to generate Rust bindings for it.
use std::{env, path::PathBuf, process::Command};
use anyhow::{anyhow, Context};

View File

@@ -1,6 +1,3 @@
//! A C-Rust shim: defines implementation of C walproposer API, assuming wp
//! callback_data stores Box to some Rust implementation.
#![allow(dead_code)]
use std::ffi::CStr;

View File

@@ -436,9 +436,9 @@ mod tests {
event_mask: 0,
}),
expected_messages: vec![
// Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160001, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
// Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160000, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
vec![
103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158, 76, 143, 54, 6, 60, 108, 110,
147, 188, 32, 214, 90, 130, 15, 61, 158, 76, 143, 54, 6, 60, 108, 110, 147,
188, 32, 214, 90, 130, 15, 61, 1, 0, 0, 0, 0, 0, 0, 1,
@@ -478,7 +478,7 @@ mod tests {
// walproposer will panic when it finishes sync_safekeepers
std::panic::catch_unwind(|| wp.start()).unwrap_err();
// validate the resulting LSN
assert_eq!(receiver.try_recv(), Ok(1337));
assert_eq!(receiver.recv()?, 1337);
Ok(())
// drop() will free up resources here
}

View File

@@ -36,7 +36,6 @@ humantime.workspace = true
humantime-serde.workspace = true
hyper.workspace = true
itertools.workspace = true
md5.workspace = true
nix.workspace = true
# hack to get the number of worker threads tokio uses
num_cpus = { version = "1.15" }
@@ -52,7 +51,6 @@ regex.workspace = true
scopeguard.workspace = true
serde.workspace = true
serde_json = { workspace = true, features = ["raw_value"] }
serde_path_to_error.workspace = true
serde_with.workspace = true
signal-hook.workspace = true
smallvec = { workspace = true, features = ["write"] }

View File

@@ -3,7 +3,6 @@ use pageserver::repository::Key;
use pageserver::tenant::layer_map::LayerMap;
use pageserver::tenant::storage_layer::LayerFileName;
use pageserver::tenant::storage_layer::PersistentLayerDesc;
use pageserver_api::shard::TenantShardId;
use rand::prelude::{SeedableRng, SliceRandom, StdRng};
use std::cmp::{max, min};
use std::fs::File;
@@ -212,7 +211,7 @@ fn bench_sequential(c: &mut Criterion) {
let i32 = (i as u32) % 100;
let zero = Key::from_hex("000000000000000000000000000000000000").unwrap();
let layer = PersistentLayerDesc::new_img(
TenantShardId::unsharded(TenantId::generate()),
TenantId::generate(),
TimelineId::generate(),
zero.add(10 * i32)..zero.add(10 * i32 + 1),
Lsn(i),

View File

@@ -1,14 +0,0 @@
[package]
name = "pageserver_client"
version = "0.1.0"
edition.workspace = true
license.workspace = true
[dependencies]
pageserver_api.workspace = true
thiserror.workspace = true
async-trait.workspace = true
reqwest.workspace = true
utils.workspace = true
serde.workspace = true
workspace_hack = { version = "0.1", path = "../../workspace_hack" }

View File

@@ -1 +0,0 @@
pub mod mgmt_api;

View File

@@ -1,188 +0,0 @@
use pageserver_api::models::*;
use reqwest::{IntoUrl, Method};
use utils::{
http::error::HttpErrorBody,
id::{TenantId, TimelineId},
};
#[derive(Debug)]
pub struct Client {
mgmt_api_endpoint: String,
authorization_header: Option<String>,
client: reqwest::Client,
}
#[derive(thiserror::Error, Debug)]
pub enum Error {
#[error("receive body: {0}")]
ReceiveBody(reqwest::Error),
#[error("receive error body: {0}")]
ReceiveErrorBody(String),
#[error("pageserver API: {0}")]
ApiError(String),
}
pub type Result<T> = std::result::Result<T, Error>;
#[async_trait::async_trait]
pub trait ResponseErrorMessageExt: Sized {
async fn error_from_body(self) -> Result<Self>;
}
#[async_trait::async_trait]
impl ResponseErrorMessageExt for reqwest::Response {
async fn error_from_body(mut self) -> Result<Self> {
let status = self.status();
if !(status.is_client_error() || status.is_server_error()) {
return Ok(self);
}
let url = self.url().to_owned();
Err(match self.json::<HttpErrorBody>().await {
Ok(HttpErrorBody { msg }) => Error::ApiError(msg),
Err(_) => {
Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), url))
}
})
}
}
impl Client {
pub fn new(mgmt_api_endpoint: String, jwt: Option<&str>) -> Self {
Self {
mgmt_api_endpoint,
authorization_header: jwt.map(|jwt| format!("Bearer {jwt}")),
client: reqwest::Client::new(),
}
}
pub async fn list_tenants(&self) -> Result<Vec<pageserver_api::models::TenantInfo>> {
let uri = format!("{}/v1/tenant", self.mgmt_api_endpoint);
let resp = self.get(&uri).await?;
resp.json().await.map_err(Error::ReceiveBody)
}
pub async fn list_timelines(
&self,
tenant_id: TenantId,
) -> Result<Vec<pageserver_api::models::TimelineInfo>> {
let uri = format!("{}/v1/tenant/{tenant_id}/timeline", self.mgmt_api_endpoint);
self.get(&uri)
.await?
.json()
.await
.map_err(Error::ReceiveBody)
}
pub async fn timeline_info(
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> Result<pageserver_api::models::TimelineInfo> {
let uri = format!(
"{}/v1/tenant/{tenant_id}/timeline/{timeline_id}",
self.mgmt_api_endpoint
);
self.get(&uri)
.await?
.json()
.await
.map_err(Error::ReceiveBody)
}
pub async fn keyspace(
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> Result<pageserver_api::models::partitioning::Partitioning> {
let uri = format!(
"{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/keyspace",
self.mgmt_api_endpoint
);
self.get(&uri)
.await?
.json()
.await
.map_err(Error::ReceiveBody)
}
async fn get<U: IntoUrl>(&self, uri: U) -> Result<reqwest::Response> {
self.request(Method::GET, uri, ()).await
}
async fn request<B: serde::Serialize, U: reqwest::IntoUrl>(
&self,
method: Method,
uri: U,
body: B,
) -> Result<reqwest::Response> {
let req = self.client.request(method, uri);
let req = if let Some(value) = &self.authorization_header {
req.header(reqwest::header::AUTHORIZATION, value)
} else {
req
};
let res = req.json(&body).send().await.map_err(Error::ReceiveBody)?;
let response = res.error_from_body().await?;
Ok(response)
}
pub async fn status(&self) -> Result<()> {
let uri = format!("{}/v1/status", self.mgmt_api_endpoint);
self.get(&uri).await?;
Ok(())
}
pub async fn tenant_create(&self, req: &TenantCreateRequest) -> Result<TenantId> {
let uri = format!("{}/v1/tenant", self.mgmt_api_endpoint);
self.request(Method::POST, &uri, req)
.await?
.json()
.await
.map_err(Error::ReceiveBody)
}
pub async fn tenant_config(&self, req: &TenantConfigRequest) -> Result<()> {
let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint);
self.request(Method::PUT, &uri, req).await?;
Ok(())
}
pub async fn location_config(
&self,
tenant_id: TenantId,
config: LocationConfig,
flush_ms: Option<std::time::Duration>,
) -> Result<()> {
let req_body = TenantLocationConfigRequest { tenant_id, config };
let path = format!(
"{}/v1/tenant/{}/location_config",
self.mgmt_api_endpoint, tenant_id
);
let path = if let Some(flush_ms) = flush_ms {
format!("{}?flush_ms={}", path, flush_ms.as_millis())
} else {
path
};
self.request(Method::PUT, &path, &req_body).await?;
Ok(())
}
pub async fn timeline_create(
&self,
tenant_id: TenantId,
req: &TimelineCreateRequest,
) -> Result<TimelineInfo> {
let uri = format!(
"{}/v1/tenant/{}/timeline",
self.mgmt_api_endpoint, tenant_id
);
self.request(Method::POST, &uri, req)
.await?
.json()
.await
.map_err(Error::ReceiveBody)
}
}

View File

@@ -18,5 +18,3 @@ tokio.workspace = true
utils.workspace = true
svg_fmt.workspace = true
workspace_hack.workspace = true
serde.workspace = true
serde_json.workspace = true

View File

@@ -1,38 +0,0 @@
use std::collections::HashMap;
use anyhow::Context;
use camino::Utf8PathBuf;
use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata;
use pageserver::tenant::storage_layer::LayerFileName;
use pageserver::tenant::{metadata::TimelineMetadata, IndexPart};
use utils::lsn::Lsn;
#[derive(clap::Subcommand)]
pub(crate) enum IndexPartCmd {
Dump { path: Utf8PathBuf },
}
pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> {
match cmd {
IndexPartCmd::Dump { path } => {
let bytes = tokio::fs::read(path).await.context("read file")?;
let des: IndexPart = IndexPart::from_s3_bytes(&bytes).context("deserialize")?;
#[derive(serde::Serialize)]
struct Output<'a> {
layer_metadata: &'a HashMap<LayerFileName, IndexLayerMetadata>,
disk_consistent_lsn: Lsn,
timeline_metadata: &'a TimelineMetadata,
}
let output = Output {
layer_metadata: &des.layer_metadata,
disk_consistent_lsn: des.get_disk_consistent_lsn(),
timeline_metadata: &des.metadata,
};
let output = serde_json::to_string_pretty(&output).context("serialize output")?;
println!("{output}");
Ok(())
}
}
}

View File

@@ -1,15 +1,13 @@
use std::path::{Path, PathBuf};
use anyhow::Result;
use camino::{Utf8Path, Utf8PathBuf};
use camino::Utf8Path;
use clap::Subcommand;
use pageserver::context::{DownloadBehavior, RequestContext};
use pageserver::task_mgr::TaskKind;
use pageserver::tenant::block_io::BlockCursor;
use pageserver::tenant::disk_btree::DiskBtreeReader;
use pageserver::tenant::storage_layer::delta_layer::{BlobRef, Summary};
use pageserver::tenant::storage_layer::{delta_layer, image_layer};
use pageserver::tenant::storage_layer::{DeltaLayer, ImageLayer};
use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
use pageserver::{page_cache, virtual_file};
use pageserver::{
@@ -22,7 +20,6 @@ use pageserver::{
};
use std::fs;
use utils::bin_ser::BeSer;
use utils::id::{TenantId, TimelineId};
use crate::layer_map_analyzer::parse_filename;
@@ -48,13 +45,6 @@ pub(crate) enum LayerCmd {
/// The id from list-layer command
id: usize,
},
RewriteSummary {
layer_file_path: Utf8PathBuf,
#[clap(long)]
new_tenant_id: Option<TenantId>,
#[clap(long)]
new_timeline_id: Option<TimelineId>,
},
}
async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result<()> {
@@ -110,7 +100,6 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
println!("- timeline {}", timeline.file_name().to_string_lossy());
}
}
Ok(())
}
LayerCmd::ListLayer {
path,
@@ -139,7 +128,6 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
idx += 1;
}
}
Ok(())
}
LayerCmd::DumpLayer {
path,
@@ -180,63 +168,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
idx += 1;
}
}
Ok(())
}
LayerCmd::RewriteSummary {
layer_file_path,
new_tenant_id,
new_timeline_id,
} => {
pageserver::virtual_file::init(10);
pageserver::page_cache::init(100);
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
macro_rules! rewrite_closure {
($($summary_ty:tt)*) => {{
|summary| $($summary_ty)* {
tenant_id: new_tenant_id.unwrap_or(summary.tenant_id),
timeline_id: new_timeline_id.unwrap_or(summary.timeline_id),
..summary
}
}};
}
let res = ImageLayer::rewrite_summary(
layer_file_path,
rewrite_closure!(image_layer::Summary),
&ctx,
)
.await;
match res {
Ok(()) => {
println!("Successfully rewrote summary of image layer {layer_file_path}");
return Ok(());
}
Err(image_layer::RewriteSummaryError::MagicMismatch) => (), // fallthrough
Err(image_layer::RewriteSummaryError::Other(e)) => {
return Err(e);
}
}
let res = DeltaLayer::rewrite_summary(
layer_file_path,
rewrite_closure!(delta_layer::Summary),
&ctx,
)
.await;
match res {
Ok(()) => {
println!("Successfully rewrote summary of delta layer {layer_file_path}");
return Ok(());
}
Err(delta_layer::RewriteSummaryError::MagicMismatch) => (), // fallthrough
Err(delta_layer::RewriteSummaryError::Other(e)) => {
return Err(e);
}
}
anyhow::bail!("not an image or delta layer: {layer_file_path}");
}
}
Ok(())
}

View File

@@ -5,13 +5,11 @@
//! Separate, `metadata` subcommand allows to print and update pageserver's metadata file.
mod draw_timeline_dir;
mod index_part;
mod layer_map_analyzer;
mod layers;
use camino::{Utf8Path, Utf8PathBuf};
use clap::{Parser, Subcommand};
use index_part::IndexPartCmd;
use layers::LayerCmd;
use pageserver::{
context::{DownloadBehavior, RequestContext},
@@ -40,8 +38,6 @@ struct CliOpts {
#[derive(Subcommand)]
enum Commands {
Metadata(MetadataCmd),
#[command(subcommand)]
IndexPart(IndexPartCmd),
PrintLayerFile(PrintLayerFileCmd),
DrawTimeline {},
AnalyzeLayerMap(AnalyzeLayerMapCmd),
@@ -87,9 +83,6 @@ async fn main() -> anyhow::Result<()> {
Commands::Metadata(cmd) => {
handle_metadata(&cmd)?;
}
Commands::IndexPart(cmd) => {
index_part::main(&cmd).await?;
}
Commands::DrawTimeline {} => {
draw_timeline_dir::main()?;
}

View File

@@ -14,7 +14,7 @@ use pageserver::control_plane_client::ControlPlaneClient;
use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
use pageserver::task_mgr::WALRECEIVER_RUNTIME;
use pageserver::tenant::{secondary, TenantSharedResources};
use pageserver::tenant::TenantSharedResources;
use remote_storage::GenericRemoteStorage;
use tokio::time::Instant;
use tracing::*;
@@ -103,11 +103,7 @@ fn main() -> anyhow::Result<()> {
} else {
TracingErrorLayerEnablement::Disabled
};
logging::init(
conf.log_format,
tracing_error_layer_enablement,
logging::Output::Stdout,
)?;
logging::init(conf.log_format, tracing_error_layer_enablement)?;
// mind the order required here: 1. logging, 2. panic_hook, 3. sentry.
// disarming this hook on pageserver, because we never tear down tracing.
@@ -402,11 +398,15 @@ fn start_pageserver(
let (init_remote_done_tx, init_remote_done_rx) = utils::completion::channel();
let (init_done_tx, init_done_rx) = utils::completion::channel();
let (init_logical_size_done_tx, init_logical_size_done_rx) = utils::completion::channel();
let (background_jobs_can_start, background_jobs_barrier) = utils::completion::channel();
let order = pageserver::InitializationOrder {
initial_tenant_load_remote: Some(init_done_tx),
initial_tenant_load: Some(init_remote_done_tx),
initial_logical_size_can_start: init_done_rx.clone(),
initial_logical_size_attempt: Some(init_logical_size_done_tx),
background_jobs_can_start: background_jobs_barrier.clone(),
};
@@ -425,6 +425,7 @@ fn start_pageserver(
let tenant_manager = Arc::new(tenant_manager);
BACKGROUND_RUNTIME.spawn({
let init_done_rx = init_done_rx;
let shutdown_pageserver = shutdown_pageserver.clone();
let drive_init = async move {
// NOTE: unlike many futures in pageserver, this one is cancellation-safe
@@ -459,7 +460,7 @@ fn start_pageserver(
});
let WaitForPhaseResult {
timeout_remaining: _timeout,
timeout_remaining: timeout,
skipped: init_load_skipped,
} = wait_for_phase("initial_tenant_load", init_load_done, timeout).await;
@@ -467,6 +468,26 @@ fn start_pageserver(
scopeguard::ScopeGuard::into_inner(guard);
let guard = scopeguard::guard_on_success((), |_| {
tracing::info!("Cancelled before initial logical sizes completed")
});
let logical_sizes_done = std::pin::pin!(async {
init_logical_size_done_rx.wait().await;
startup_checkpoint(
started_startup_at,
"initial_logical_sizes",
"Initial logical sizes completed",
);
});
let WaitForPhaseResult {
timeout_remaining: _,
skipped: logical_sizes_skipped,
} = wait_for_phase("initial_logical_sizes", logical_sizes_done, timeout).await;
scopeguard::ScopeGuard::into_inner(guard);
// allow background jobs to start: we either completed prior stages, or they reached timeout
// and were skipped. It is important that we do not let them block background jobs indefinitely,
// because things like consumption metrics for billing are blocked by this barrier.
@@ -489,6 +510,9 @@ fn start_pageserver(
if let Some(f) = init_load_skipped {
f.await;
}
if let Some(f) = logical_sizes_skipped {
f.await;
}
scopeguard::ScopeGuard::into_inner(guard);
startup_checkpoint(started_startup_at, "complete", "Startup complete");
@@ -504,17 +528,6 @@ fn start_pageserver(
}
});
let secondary_controller = if let Some(remote_storage) = &remote_storage {
secondary::spawn_tasks(
tenant_manager.clone(),
remote_storage.clone(),
background_jobs_barrier.clone(),
shutdown_pageserver.clone(),
)
} else {
secondary::null_controller()
};
// shared state between the disk-usage backed eviction background task and the http endpoint
// that allows triggering disk-usage based eviction manually. note that the http endpoint
// is still accessible even if background task is not configured as long as remote storage has
@@ -544,7 +557,6 @@ fn start_pageserver(
broker_client.clone(),
disk_usage_eviction_state,
deletion_queue.new_client(),
secondary_controller,
)
.context("Failed to initialize router state")?,
);
@@ -571,6 +583,7 @@ fn start_pageserver(
}
if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint {
let background_jobs_barrier = background_jobs_barrier;
let metrics_ctx = RequestContext::todo_child(
TaskKind::MetricsCollection,
// This task itself shouldn't download anything.
@@ -608,7 +621,6 @@ fn start_pageserver(
conf.synthetic_size_calculation_interval,
conf.id,
local_disk_storage,
cancel,
metrics_ctx,
)
.instrument(info_span!("metrics_collection"))

View File

@@ -5,7 +5,6 @@
//! See also `settings.md` for better description on every parameter.
use anyhow::{anyhow, bail, ensure, Context, Result};
use pageserver_api::shard::TenantShardId;
use remote_storage::{RemotePath, RemoteStorageConfig};
use serde::de::IntoDeserializer;
use std::env;
@@ -26,7 +25,7 @@ use toml_edit::{Document, Item};
use camino::{Utf8Path, Utf8PathBuf};
use postgres_backend::AuthType;
use utils::{
id::{NodeId, TimelineId},
id::{NodeId, TenantId, TimelineId},
logging::LogFormat,
};
@@ -41,8 +40,6 @@ use crate::{
TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX,
};
use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP;
pub mod defaults {
use crate::tenant::config::defaults::*;
use const_format::formatcp;
@@ -63,8 +60,6 @@ pub mod defaults {
pub const DEFAULT_LOG_FORMAT: &str = "plain";
pub const DEFAULT_CONCURRENT_TENANT_WARMUP: usize = 8;
pub const DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES: usize =
super::ConfigurableSemaphore::DEFAULT_INITIAL.get();
@@ -74,8 +69,6 @@ pub mod defaults {
pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min";
pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s";
pub const DEFAULT_HEATMAP_UPLOAD_CONCURRENCY: usize = 8;
///
/// Default built-in configuration file.
///
@@ -98,7 +91,6 @@ pub mod defaults {
#log_format = '{DEFAULT_LOG_FORMAT}'
#concurrent_tenant_size_logical_size_queries = '{DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES}'
#concurrent_tenant_warmup = '{DEFAULT_CONCURRENT_TENANT_WARMUP}'
#metric_collection_interval = '{DEFAULT_METRIC_COLLECTION_INTERVAL}'
#cached_metric_collection_interval = '{DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL}'
@@ -124,8 +116,6 @@ pub mod defaults {
#evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}'
#gc_feedback = false
#heatmap_upload_concurrency = {DEFAULT_HEATMAP_UPLOAD_CONCURRENCY}
[remote_storage]
"#
@@ -185,11 +175,6 @@ pub struct PageServerConf {
pub log_format: LogFormat,
/// Number of tenants which will be concurrently loaded from remote storage proactively on startup,
/// does not limit tenants loaded in response to client I/O. A lower value implicitly deprioritizes
/// loading such tenants, vs. other work in the system.
pub concurrent_tenant_warmup: ConfigurableSemaphore,
/// Number of concurrent [`Tenant::gather_size_inputs`](crate::tenant::Tenant::gather_size_inputs) allowed.
pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore,
/// Limit of concurrent [`Tenant::gather_size_inputs`] issued by module `eviction_task`.
@@ -229,10 +214,6 @@ pub struct PageServerConf {
/// If true, pageserver will make best-effort to operate without a control plane: only
/// for use in major incidents.
pub control_plane_emergency_mode: bool,
/// How many heatmap uploads may be done concurrency: lower values implicitly deprioritize
/// heatmap uploads vs. other remote storage operations.
pub heatmap_upload_concurrency: usize,
}
/// We do not want to store this in a PageServerConf because the latter may be logged
@@ -293,7 +274,6 @@ struct PageServerConfigBuilder {
log_format: BuilderValue<LogFormat>,
concurrent_tenant_warmup: BuilderValue<NonZeroUsize>,
concurrent_tenant_size_logical_size_queries: BuilderValue<NonZeroUsize>,
metric_collection_interval: BuilderValue<Duration>,
@@ -312,8 +292,6 @@ struct PageServerConfigBuilder {
control_plane_api: BuilderValue<Option<Url>>,
control_plane_api_token: BuilderValue<Option<SecretString>>,
control_plane_emergency_mode: BuilderValue<bool>,
heatmap_upload_concurrency: BuilderValue<usize>,
}
impl Default for PageServerConfigBuilder {
@@ -351,8 +329,6 @@ impl Default for PageServerConfigBuilder {
.expect("cannot parse default keepalive interval")),
log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()),
concurrent_tenant_warmup: Set(NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP)
.expect("Invalid default constant")),
concurrent_tenant_size_logical_size_queries: Set(
ConfigurableSemaphore::DEFAULT_INITIAL,
),
@@ -384,8 +360,6 @@ impl Default for PageServerConfigBuilder {
control_plane_api: Set(None),
control_plane_api_token: Set(None),
control_plane_emergency_mode: Set(false),
heatmap_upload_concurrency: Set(DEFAULT_HEATMAP_UPLOAD_CONCURRENCY),
}
}
}
@@ -466,10 +440,6 @@ impl PageServerConfigBuilder {
self.log_format = BuilderValue::Set(log_format)
}
pub fn concurrent_tenant_warmup(&mut self, u: NonZeroUsize) {
self.concurrent_tenant_warmup = BuilderValue::Set(u);
}
pub fn concurrent_tenant_size_logical_size_queries(&mut self, u: NonZeroUsize) {
self.concurrent_tenant_size_logical_size_queries = BuilderValue::Set(u);
}
@@ -530,14 +500,7 @@ impl PageServerConfigBuilder {
self.control_plane_emergency_mode = BuilderValue::Set(enabled)
}
pub fn heatmap_upload_concurrency(&mut self, value: usize) {
self.heatmap_upload_concurrency = BuilderValue::Set(value)
}
pub fn build(self) -> anyhow::Result<PageServerConf> {
let concurrent_tenant_warmup = self
.concurrent_tenant_warmup
.ok_or(anyhow!("missing concurrent_tenant_warmup"))?;
let concurrent_tenant_size_logical_size_queries = self
.concurrent_tenant_size_logical_size_queries
.ok_or(anyhow!(
@@ -590,7 +553,6 @@ impl PageServerConfigBuilder {
.broker_keepalive_interval
.ok_or(anyhow!("No broker keepalive interval provided"))?,
log_format: self.log_format.ok_or(anyhow!("missing log_format"))?,
concurrent_tenant_warmup: ConfigurableSemaphore::new(concurrent_tenant_warmup),
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::new(
concurrent_tenant_size_logical_size_queries,
),
@@ -632,10 +594,6 @@ impl PageServerConfigBuilder {
control_plane_emergency_mode: self
.control_plane_emergency_mode
.ok_or(anyhow!("missing control_plane_emergency_mode"))?,
heatmap_upload_concurrency: self
.heatmap_upload_concurrency
.ok_or(anyhow!("missing heatmap_upload_concurrency"))?,
})
}
}
@@ -670,13 +628,12 @@ impl PageServerConf {
self.deletion_prefix().join(format!("header-{VERSION:02x}"))
}
pub fn tenant_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
self.tenants_path().join(tenant_shard_id.to_string())
pub fn tenant_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
self.tenants_path().join(tenant_id.to_string())
}
pub fn tenant_ignore_mark_file_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
self.tenant_path(tenant_shard_id)
.join(IGNORED_TENANT_FILE_NAME)
pub fn tenant_ignore_mark_file_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
self.tenant_path(tenant_id).join(IGNORED_TENANT_FILE_NAME)
}
/// Points to a place in pageserver's local directory,
@@ -684,53 +641,47 @@ impl PageServerConf {
///
/// Legacy: superseded by tenant_location_config_path. Eventually
/// remove this function.
pub fn tenant_config_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
self.tenant_path(tenant_shard_id).join(TENANT_CONFIG_NAME)
pub fn tenant_config_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
self.tenant_path(tenant_id).join(TENANT_CONFIG_NAME)
}
pub fn tenant_location_config_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
self.tenant_path(tenant_shard_id)
pub fn tenant_location_config_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
self.tenant_path(tenant_id)
.join(TENANT_LOCATION_CONFIG_NAME)
}
pub fn timelines_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
self.tenant_path(tenant_shard_id)
.join(TIMELINES_SEGMENT_NAME)
pub fn timelines_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
self.tenant_path(tenant_id).join(TIMELINES_SEGMENT_NAME)
}
pub fn timeline_path(
&self,
tenant_shard_id: &TenantShardId,
timeline_id: &TimelineId,
) -> Utf8PathBuf {
self.timelines_path(tenant_shard_id)
.join(timeline_id.to_string())
pub fn timeline_path(&self, tenant_id: &TenantId, timeline_id: &TimelineId) -> Utf8PathBuf {
self.timelines_path(tenant_id).join(timeline_id.to_string())
}
pub fn timeline_uninit_mark_file_path(
&self,
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> Utf8PathBuf {
path_with_suffix_extension(
self.timeline_path(&tenant_shard_id, &timeline_id),
self.timeline_path(&tenant_id, &timeline_id),
TIMELINE_UNINIT_MARK_SUFFIX,
)
}
pub fn timeline_delete_mark_file_path(
&self,
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> Utf8PathBuf {
path_with_suffix_extension(
self.timeline_path(&tenant_shard_id, &timeline_id),
self.timeline_path(&tenant_id, &timeline_id),
TIMELINE_DELETE_MARK_SUFFIX,
)
}
pub fn tenant_deleted_mark_file_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
self.tenant_path(tenant_shard_id)
pub fn tenant_deleted_mark_file_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
self.tenant_path(tenant_id)
.join(TENANT_DELETED_MARKER_FILE_NAME)
}
@@ -740,24 +691,20 @@ impl PageServerConf {
pub fn trace_path(
&self,
tenant_shard_id: &TenantShardId,
tenant_id: &TenantId,
timeline_id: &TimelineId,
connection_id: &ConnectionId,
) -> Utf8PathBuf {
self.traces_path()
.join(tenant_shard_id.to_string())
.join(tenant_id.to_string())
.join(timeline_id.to_string())
.join(connection_id.to_string())
}
/// Points to a place in pageserver's local directory,
/// where certain timeline's metadata file should be located.
pub fn metadata_path(
&self,
tenant_shard_id: &TenantShardId,
timeline_id: &TimelineId,
) -> Utf8PathBuf {
self.timeline_path(tenant_shard_id, timeline_id)
pub fn metadata_path(&self, tenant_id: &TenantId, timeline_id: &TimelineId) -> Utf8PathBuf {
self.timeline_path(tenant_id, timeline_id)
.join(METADATA_FILE_NAME)
}
@@ -820,7 +767,7 @@ impl PageServerConf {
builder.remote_storage_config(RemoteStorageConfig::from_toml(item)?)
}
"tenant_config" => {
t_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("failed to parse: '{key}'"))?;
t_conf = Self::parse_toml_tenant_conf(item)?;
}
"id" => builder.id(NodeId(parse_toml_u64(key, item)?)),
"broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?),
@@ -828,11 +775,6 @@ impl PageServerConf {
"log_format" => builder.log_format(
LogFormat::from_config(&parse_toml_string(key, item)?)?
),
"concurrent_tenant_warmup" => builder.concurrent_tenant_warmup({
let input = parse_toml_string(key, item)?;
let permits = input.parse::<usize>().context("expected a number of initial permits, not {s:?}")?;
NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")?
}),
"concurrent_tenant_size_logical_size_queries" => builder.concurrent_tenant_size_logical_size_queries({
let input = parse_toml_string(key, item)?;
let permits = input.parse::<usize>().context("expected a number of initial permits, not {s:?}")?;
@@ -874,9 +816,7 @@ impl PageServerConf {
},
"control_plane_emergency_mode" => {
builder.control_plane_emergency_mode(parse_toml_bool(key, item)?)
},
"heatmap_upload_concurrency" => {
builder.heatmap_upload_concurrency(parse_toml_u64(key, item)? as usize)
},
_ => bail!("unrecognized pageserver option '{key}'"),
}
@@ -901,10 +841,114 @@ impl PageServerConf {
Ok(conf)
}
// subroutine of parse_and_validate to parse `[tenant_conf]` section
pub fn parse_toml_tenant_conf(item: &toml_edit::Item) -> Result<TenantConfOpt> {
let mut t_conf: TenantConfOpt = Default::default();
if let Some(checkpoint_distance) = item.get("checkpoint_distance") {
t_conf.checkpoint_distance =
Some(parse_toml_u64("checkpoint_distance", checkpoint_distance)?);
}
if let Some(checkpoint_timeout) = item.get("checkpoint_timeout") {
t_conf.checkpoint_timeout = Some(parse_toml_duration(
"checkpoint_timeout",
checkpoint_timeout,
)?);
}
if let Some(compaction_target_size) = item.get("compaction_target_size") {
t_conf.compaction_target_size = Some(parse_toml_u64(
"compaction_target_size",
compaction_target_size,
)?);
}
if let Some(compaction_period) = item.get("compaction_period") {
t_conf.compaction_period =
Some(parse_toml_duration("compaction_period", compaction_period)?);
}
if let Some(compaction_threshold) = item.get("compaction_threshold") {
t_conf.compaction_threshold =
Some(parse_toml_u64("compaction_threshold", compaction_threshold)?.try_into()?);
}
if let Some(image_creation_threshold) = item.get("image_creation_threshold") {
t_conf.image_creation_threshold = Some(
parse_toml_u64("image_creation_threshold", image_creation_threshold)?.try_into()?,
);
}
if let Some(gc_horizon) = item.get("gc_horizon") {
t_conf.gc_horizon = Some(parse_toml_u64("gc_horizon", gc_horizon)?);
}
if let Some(gc_period) = item.get("gc_period") {
t_conf.gc_period = Some(parse_toml_duration("gc_period", gc_period)?);
}
if let Some(pitr_interval) = item.get("pitr_interval") {
t_conf.pitr_interval = Some(parse_toml_duration("pitr_interval", pitr_interval)?);
}
if let Some(walreceiver_connect_timeout) = item.get("walreceiver_connect_timeout") {
t_conf.walreceiver_connect_timeout = Some(parse_toml_duration(
"walreceiver_connect_timeout",
walreceiver_connect_timeout,
)?);
}
if let Some(lagging_wal_timeout) = item.get("lagging_wal_timeout") {
t_conf.lagging_wal_timeout = Some(parse_toml_duration(
"lagging_wal_timeout",
lagging_wal_timeout,
)?);
}
if let Some(max_lsn_wal_lag) = item.get("max_lsn_wal_lag") {
t_conf.max_lsn_wal_lag =
Some(deserialize_from_item("max_lsn_wal_lag", max_lsn_wal_lag)?);
}
if let Some(trace_read_requests) = item.get("trace_read_requests") {
t_conf.trace_read_requests =
Some(trace_read_requests.as_bool().with_context(|| {
"configure option trace_read_requests is not a bool".to_string()
})?);
}
if let Some(eviction_policy) = item.get("eviction_policy") {
t_conf.eviction_policy = Some(
deserialize_from_item("eviction_policy", eviction_policy)
.context("parse eviction_policy")?,
);
}
if let Some(item) = item.get("min_resident_size_override") {
t_conf.min_resident_size_override = Some(
deserialize_from_item("min_resident_size_override", item)
.context("parse min_resident_size_override")?,
);
}
if let Some(item) = item.get("evictions_low_residence_duration_metric_threshold") {
t_conf.evictions_low_residence_duration_metric_threshold = Some(parse_toml_duration(
"evictions_low_residence_duration_metric_threshold",
item,
)?);
}
if let Some(gc_feedback) = item.get("gc_feedback") {
t_conf.gc_feedback = Some(
gc_feedback
.as_bool()
.with_context(|| "configure option gc_feedback is not a bool".to_string())?,
);
}
Ok(t_conf)
}
#[cfg(test)]
pub fn test_repo_dir(test_name: &str) -> Utf8PathBuf {
let test_output_dir = std::env::var("TEST_OUTPUT").unwrap_or("../tmp_check".into());
Utf8PathBuf::from(format!("{test_output_dir}/test_{test_name}"))
Utf8PathBuf::from(format!("../tmp_check/test_{test_name}"))
}
pub fn dummy_conf(repo_dir: Utf8PathBuf) -> Self {
@@ -930,10 +974,6 @@ impl PageServerConf {
broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(),
broker_keepalive_interval: Duration::from_secs(5000),
log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
concurrent_tenant_warmup: ConfigurableSemaphore::new(
NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP)
.expect("Invalid default constant"),
),
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::default(
),
@@ -948,7 +988,6 @@ impl PageServerConf {
control_plane_api: None,
control_plane_api_token: None,
control_plane_emergency_mode: false,
heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
}
}
}
@@ -1152,9 +1191,6 @@ background_task_maximum_delay = '334 s'
storage_broker::DEFAULT_KEEPALIVE_INTERVAL
)?,
log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
concurrent_tenant_warmup: ConfigurableSemaphore::new(
NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP).unwrap()
),
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
eviction_task_immitated_concurrent_logical_size_queries:
ConfigurableSemaphore::default(),
@@ -1176,8 +1212,7 @@ background_task_maximum_delay = '334 s'
)?,
control_plane_api: None,
control_plane_api_token: None,
control_plane_emergency_mode: false,
heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY
control_plane_emergency_mode: false
},
"Correct defaults should be used when no config values are provided"
);
@@ -1221,9 +1256,6 @@ background_task_maximum_delay = '334 s'
broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(),
broker_keepalive_interval: Duration::from_secs(5),
log_format: LogFormat::Json,
concurrent_tenant_warmup: ConfigurableSemaphore::new(
NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP).unwrap()
),
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
eviction_task_immitated_concurrent_logical_size_queries:
ConfigurableSemaphore::default(),
@@ -1237,8 +1269,7 @@ background_task_maximum_delay = '334 s'
background_task_maximum_delay: Duration::from_secs(334),
control_plane_api: None,
control_plane_api_token: None,
control_plane_emergency_mode: false,
heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY
control_plane_emergency_mode: false
},
"Should be able to parse all basic config values correctly"
);
@@ -1386,37 +1417,6 @@ trace_read_requests = {trace_read_requests}"#,
Ok(())
}
#[test]
fn parse_incorrect_tenant_config() -> anyhow::Result<()> {
let config_string = r#"
[tenant_config]
checkpoint_distance = -1 # supposed to be an u64
"#
.to_string();
let toml: Document = config_string.parse()?;
let item = toml.get("tenant_config").unwrap();
let error = TenantConfOpt::try_from(item.to_owned()).unwrap_err();
let expected_error_str = "checkpoint_distance: invalid value: integer `-1`, expected u64";
assert_eq!(error.to_string(), expected_error_str);
Ok(())
}
#[test]
fn parse_override_tenant_config() -> anyhow::Result<()> {
let config_string = r#"tenant_config={ min_resident_size_override = 400 }"#.to_string();
let toml: Document = config_string.parse()?;
let item = toml.get("tenant_config").unwrap();
let conf = TenantConfOpt::try_from(item.to_owned()).unwrap();
assert_eq!(conf.min_resident_size_override, Some(400));
Ok(())
}
#[test]
fn eviction_pageserver_config_parse() -> anyhow::Result<()> {
let tempdir = tempdir()?;

View File

@@ -3,7 +3,7 @@
use crate::context::{DownloadBehavior, RequestContext};
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
use crate::tenant::tasks::BackgroundLoopKind;
use crate::tenant::{mgr, LogicalSizeCalculationCause, PageReconstructError, Tenant};
use crate::tenant::{mgr, LogicalSizeCalculationCause};
use camino::Utf8PathBuf;
use consumption_metrics::EventType;
use pageserver_api::models::TenantState;
@@ -12,7 +12,6 @@ use std::collections::HashMap;
use std::sync::Arc;
use std::time::{Duration, SystemTime};
use tokio::time::Instant;
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::id::NodeId;
@@ -38,7 +37,6 @@ type RawMetric = (MetricsKey, (EventType, u64));
type Cache = HashMap<MetricsKey, (EventType, u64)>;
/// Main thread that serves metrics collection
#[allow(clippy::too_many_arguments)]
pub async fn collect_metrics(
metric_collection_endpoint: &Url,
metric_collection_interval: Duration,
@@ -46,7 +44,6 @@ pub async fn collect_metrics(
synthetic_size_calculation_interval: Duration,
node_id: NodeId,
local_disk_storage: Utf8PathBuf,
cancel: CancellationToken,
ctx: RequestContext,
) -> anyhow::Result<()> {
if _cached_metric_collection_interval != Duration::ZERO {
@@ -66,13 +63,9 @@ pub async fn collect_metrics(
"synthetic size calculation",
false,
async move {
calculate_synthetic_size_worker(
synthetic_size_calculation_interval,
&cancel,
&worker_ctx,
)
.instrument(info_span!("synthetic_size_worker"))
.await?;
calculate_synthetic_size_worker(synthetic_size_calculation_interval, &worker_ctx)
.instrument(info_span!("synthetic_size_worker"))
.await?;
Ok(())
},
);
@@ -248,7 +241,6 @@ async fn reschedule(
/// Caclculate synthetic size for each active tenant
async fn calculate_synthetic_size_worker(
synthetic_size_calculation_interval: Duration,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> anyhow::Result<()> {
info!("starting calculate_synthetic_size_worker");
@@ -256,6 +248,8 @@ async fn calculate_synthetic_size_worker(
info!("calculate_synthetic_size_worker stopped");
};
let cause = LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize;
loop {
let started_at = Instant::now();
@@ -267,25 +261,21 @@ async fn calculate_synthetic_size_worker(
}
};
for (tenant_shard_id, tenant_state) in tenants {
for (tenant_id, tenant_state) in tenants {
if tenant_state != TenantState::Active {
continue;
}
if !tenant_shard_id.is_zero() {
// We only send consumption metrics from shard 0, so don't waste time calculating
// synthetic size on other shards.
continue;
if let Ok(tenant) = mgr::get_tenant(tenant_id, true) {
// TODO should we use concurrent_background_tasks_rate_limit() here, like the other background tasks?
// We can put in some prioritization for consumption metrics.
// Same for the loop that fetches computed metrics.
// By using the same limiter, we centralize metrics collection for "start" and "finished" counters,
// which turns out is really handy to understand the system.
if let Err(e) = tenant.calculate_synthetic_size(cause, ctx).await {
error!("failed to calculate synthetic size for tenant {tenant_id}: {e:#}");
}
}
let Ok(tenant) = mgr::get_tenant(tenant_shard_id, true) else {
continue;
};
// there is never any reason to exit calculate_synthetic_size_worker following any
// return value -- we don't need to care about shutdown because no tenant is found when
// pageserver is shut down.
calculate_and_log(&tenant, cancel, ctx).await;
}
crate::tenant::tasks::warn_when_period_overrun(
@@ -296,7 +286,7 @@ async fn calculate_synthetic_size_worker(
let res = tokio::time::timeout_at(
started_at + synthetic_size_calculation_interval,
cancel.cancelled(),
task_mgr::shutdown_token().cancelled(),
)
.await;
if res.is_ok() {
@@ -304,31 +294,3 @@ async fn calculate_synthetic_size_worker(
}
}
}
async fn calculate_and_log(tenant: &Tenant, cancel: &CancellationToken, ctx: &RequestContext) {
const CAUSE: LogicalSizeCalculationCause =
LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize;
// TODO should we use concurrent_background_tasks_rate_limit() here, like the other background tasks?
// We can put in some prioritization for consumption metrics.
// Same for the loop that fetches computed metrics.
// By using the same limiter, we centralize metrics collection for "start" and "finished" counters,
// which turns out is really handy to understand the system.
let Err(e) = tenant.calculate_synthetic_size(CAUSE, cancel, ctx).await else {
return;
};
// this error can be returned if timeline is shutting down, but it does not
// mean the synthetic size worker should terminate. we do not need any checks
// in this function because `mgr::get_tenant` will error out after shutdown has
// progressed to shutting down tenants.
let shutting_down = matches!(
e.downcast_ref::<PageReconstructError>(),
Some(PageReconstructError::Cancelled | PageReconstructError::AncestorStopping(_))
);
if !shutting_down {
let tenant_shard_id = tenant.tenant_shard_id();
error!("failed to calculate synthetic size for tenant {tenant_shard_id}: {e:#}");
}
}

View File

@@ -1,4 +1,5 @@
use crate::{context::RequestContext, tenant::timeline::logical_size::CurrentLogicalSize};
use crate::context::RequestContext;
use anyhow::Context;
use chrono::{DateTime, Utc};
use consumption_metrics::EventType;
use futures::stream::StreamExt;
@@ -197,12 +198,12 @@ pub(super) async fn collect_all_metrics(
};
let tenants = futures::stream::iter(tenants).filter_map(|(id, state)| async move {
if state != TenantState::Active || !id.is_zero() {
if state != TenantState::Active {
None
} else {
crate::tenant::mgr::get_tenant(id, true)
.ok()
.map(|tenant| (id.tenant_id, tenant))
.map(|tenant| (id, tenant))
}
});
@@ -350,17 +351,14 @@ impl TimelineSnapshot {
let last_record_lsn = t.get_last_record_lsn();
let current_exact_logical_size = {
let span = tracing::info_span!("collect_metrics_iteration", tenant_id = %t.tenant_shard_id.tenant_id, timeline_id = %t.timeline_id);
let size = span.in_scope(|| {
t.get_current_logical_size(
crate::tenant::timeline::GetLogicalSizePriority::Background,
ctx,
)
});
match size {
let span = tracing::info_span!("collect_metrics_iteration", tenant_id = %t.tenant_id, timeline_id = %t.timeline_id);
let res = span
.in_scope(|| t.get_current_logical_size(ctx))
.context("get_current_logical_size");
match res? {
// Only send timeline logical size when it is fully calculated.
CurrentLogicalSize::Exact(ref size) => Some(size.into()),
CurrentLogicalSize::Approximate(_) => None,
(size, is_exact) if is_exact => Some(size),
(_, _) => None,
}
};

View File

@@ -1,15 +1,16 @@
use std::collections::HashMap;
use pageserver_api::{
control_api::{
ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse,
},
shard::TenantShardId,
use pageserver_api::control_api::{
ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse,
};
use serde::{de::DeserializeOwned, Serialize};
use tokio_util::sync::CancellationToken;
use url::Url;
use utils::{backoff, generation::Generation, id::NodeId};
use utils::{
backoff,
generation::Generation,
id::{NodeId, TenantId},
};
use crate::config::PageServerConf;
@@ -30,11 +31,11 @@ pub enum RetryForeverError {
#[async_trait::async_trait]
pub trait ControlPlaneGenerationsApi {
async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError>;
async fn re_attach(&self) -> Result<HashMap<TenantId, Generation>, RetryForeverError>;
async fn validate(
&self,
tenants: Vec<(TenantShardId, Generation)>,
) -> Result<HashMap<TenantShardId, bool>, RetryForeverError>;
tenants: Vec<(TenantId, Generation)>,
) -> Result<HashMap<TenantId, bool>, RetryForeverError>;
}
impl ControlPlaneClient {
@@ -126,7 +127,7 @@ impl ControlPlaneClient {
#[async_trait::async_trait]
impl ControlPlaneGenerationsApi for ControlPlaneClient {
/// Block until we get a successful response, or error out if we are shut down
async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
async fn re_attach(&self) -> Result<HashMap<TenantId, Generation>, RetryForeverError> {
let re_attach_path = self
.base_url
.join("re-attach")
@@ -153,8 +154,8 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
/// Block until we get a successful response, or error out if we are shut down
async fn validate(
&self,
tenants: Vec<(TenantShardId, Generation)>,
) -> Result<HashMap<TenantShardId, bool>, RetryForeverError> {
tenants: Vec<(TenantId, Generation)>,
) -> Result<HashMap<TenantId, bool>, RetryForeverError> {
let re_attach_path = self
.base_url
.join("validate")

View File

@@ -10,12 +10,11 @@ use crate::control_plane_client::ControlPlaneGenerationsApi;
use crate::metrics;
use crate::tenant::remote_timeline_client::remote_layer_path;
use crate::tenant::remote_timeline_client::remote_timeline_path;
use crate::tenant::remote_timeline_client::LayerFileMetadata;
use crate::virtual_file::MaybeFatalIo;
use crate::virtual_file::VirtualFile;
use anyhow::Context;
use camino::Utf8PathBuf;
use pageserver_api::shard::TenantShardId;
use hex::FromHex;
use remote_storage::{GenericRemoteStorage, RemotePath};
use serde::Deserialize;
use serde::Serialize;
@@ -26,7 +25,7 @@ use tracing::Instrument;
use tracing::{self, debug, error};
use utils::crashsafe::path_with_suffix_extension;
use utils::generation::Generation;
use utils::id::TimelineId;
use utils::id::{TenantId, TimelineId};
use utils::lsn::AtomicLsn;
use utils::lsn::Lsn;
@@ -160,10 +159,11 @@ pub struct DeletionQueueClient {
lsn_table: Arc<std::sync::RwLock<VisibleLsnUpdates>>,
}
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
#[derive(Debug, Serialize, Deserialize)]
struct TenantDeletionList {
/// For each Timeline, a list of key fragments to append to the timeline remote path
/// when reconstructing a full key
#[serde(serialize_with = "to_hex_map", deserialize_with = "from_hex_map")]
timelines: HashMap<TimelineId, Vec<String>>,
/// The generation in which this deletion was emitted: note that this may not be the
@@ -178,11 +178,43 @@ impl TenantDeletionList {
}
}
/// For HashMaps using a `hex` compatible key, where we would like to encode the key as a string
fn to_hex_map<S, V, I>(input: &HashMap<I, V>, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
V: Serialize,
I: AsRef<[u8]>,
{
let transformed = input.iter().map(|(k, v)| (hex::encode(k), v));
transformed
.collect::<HashMap<String, &V>>()
.serialize(serializer)
}
/// For HashMaps using a FromHex key, where we would like to decode the key
fn from_hex_map<'de, D, V, I>(deserializer: D) -> Result<HashMap<I, V>, D::Error>
where
D: serde::de::Deserializer<'de>,
V: Deserialize<'de>,
I: FromHex + std::hash::Hash + Eq,
{
let hex_map = HashMap::<String, V>::deserialize(deserializer)?;
hex_map
.into_iter()
.map(|(k, v)| {
I::from_hex(k)
.map(|k| (k, v))
.map_err(|_| serde::de::Error::custom("Invalid hex ID"))
})
.collect()
}
/// Files ending with this suffix will be ignored and erased
/// during recovery as startup.
const TEMP_SUFFIX: &str = "tmp";
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
#[derive(Debug, Serialize, Deserialize)]
struct DeletionList {
/// Serialization version, for future use
version: u8,
@@ -194,7 +226,8 @@ struct DeletionList {
/// nested HashMaps by TenantTimelineID. Each Tenant only appears once
/// with one unique generation ID: if someone tries to push a second generation
/// ID for the same tenant, we will start a new DeletionList.
tenants: HashMap<TenantShardId, TenantDeletionList>,
#[serde(serialize_with = "to_hex_map", deserialize_with = "from_hex_map")]
tenants: HashMap<TenantId, TenantDeletionList>,
/// Avoid having to walk `tenants` to calculate the number of keys in
/// the nested deletion lists
@@ -266,7 +299,7 @@ impl DeletionList {
/// deletion list.
fn push(
&mut self,
tenant: &TenantShardId,
tenant: &TenantId,
timeline: &TimelineId,
generation: Generation,
objects: &mut Vec<RemotePath>,
@@ -358,7 +391,7 @@ struct TenantLsnState {
#[derive(Default)]
struct VisibleLsnUpdates {
tenants: HashMap<TenantShardId, TenantLsnState>,
tenants: HashMap<TenantId, TenantLsnState>,
}
impl VisibleLsnUpdates {
@@ -415,7 +448,7 @@ impl DeletionQueueClient {
pub(crate) fn recover(
&self,
attached_tenants: HashMap<TenantShardId, Generation>,
attached_tenants: HashMap<TenantId, Generation>,
) -> Result<(), DeletionQueueError> {
self.do_push(
&self.tx,
@@ -432,7 +465,7 @@ impl DeletionQueueClient {
/// backend will later wake up and notice that the tenant's generation requires validation.
pub(crate) async fn update_remote_consistent_lsn(
&self,
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
timeline_id: TimelineId,
current_generation: Generation,
lsn: Lsn,
@@ -443,13 +476,10 @@ impl DeletionQueueClient {
.write()
.expect("Lock should never be poisoned");
let tenant_entry = locked
.tenants
.entry(tenant_shard_id)
.or_insert(TenantLsnState {
timelines: HashMap::new(),
generation: current_generation,
});
let tenant_entry = locked.tenants.entry(tenant_id).or_insert(TenantLsnState {
timelines: HashMap::new(),
generation: current_generation,
});
if tenant_entry.generation != current_generation {
// Generation might have changed if we were detached and then re-attached: in this case,
@@ -476,29 +506,27 @@ impl DeletionQueueClient {
/// generations in `layers` are the generations in which those layers were written.
pub(crate) async fn push_layers(
&self,
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
timeline_id: TimelineId,
current_generation: Generation,
layers: Vec<(LayerFileName, LayerFileMetadata)>,
layers: Vec<(LayerFileName, Generation)>,
) -> Result<(), DeletionQueueError> {
if current_generation.is_none() {
debug!("Enqueuing deletions in legacy mode, skipping queue");
let mut layer_paths = Vec::new();
for (layer, meta) in layers {
for (layer, generation) in layers {
layer_paths.push(remote_layer_path(
&tenant_shard_id.tenant_id,
&tenant_id,
&timeline_id,
meta.shard,
&layer,
meta.generation,
generation,
));
}
self.push_immediate(layer_paths).await?;
return self.flush_immediate().await;
}
self.push_layers_sync(tenant_shard_id, timeline_id, current_generation, layers)
self.push_layers_sync(tenant_id, timeline_id, current_generation, layers)
}
/// When a Tenant has a generation, push_layers is always synchronous because
@@ -508,10 +536,10 @@ impl DeletionQueueClient {
/// support (`<https://github.com/neondatabase/neon/issues/5395>`)
pub(crate) fn push_layers_sync(
&self,
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
timeline_id: TimelineId,
current_generation: Generation,
layers: Vec<(LayerFileName, LayerFileMetadata)>,
layers: Vec<(LayerFileName, Generation)>,
) -> Result<(), DeletionQueueError> {
metrics::DELETION_QUEUE
.keys_submitted
@@ -519,7 +547,7 @@ impl DeletionQueueClient {
self.do_push(
&self.tx,
ListWriterQueueMessage::Delete(DeletionOp {
tenant_shard_id,
tenant_id,
timeline_id,
layers,
generation: current_generation,
@@ -722,7 +750,6 @@ impl DeletionQueue {
mod test {
use camino::Utf8Path;
use hex_literal::hex;
use pageserver_api::shard::ShardIndex;
use std::{io::ErrorKind, time::Duration};
use tracing::info;
@@ -787,12 +814,12 @@ mod test {
}
fn set_latest_generation(&self, gen: Generation) {
let tenant_shard_id = self.harness.tenant_shard_id;
let tenant_id = self.harness.tenant_id;
self.mock_control_plane
.latest_generation
.lock()
.unwrap()
.insert(tenant_shard_id, gen);
.insert(tenant_id, gen);
}
/// Returns remote layer file name, suitable for use in assert_remote_files
@@ -801,8 +828,8 @@ mod test {
file_name: LayerFileName,
gen: Generation,
) -> anyhow::Result<String> {
let tenant_shard_id = self.harness.tenant_shard_id;
let relative_remote_path = remote_timeline_path(&tenant_shard_id, &TIMELINE_ID);
let tenant_id = self.harness.tenant_id;
let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID);
let remote_timeline_path = self.remote_fs_dir.join(relative_remote_path.get_path());
std::fs::create_dir_all(&remote_timeline_path)?;
let remote_layer_file_name = format!("{}{}", file_name, gen.get_suffix());
@@ -820,7 +847,7 @@ mod test {
#[derive(Debug, Clone)]
struct MockControlPlane {
pub latest_generation: std::sync::Arc<std::sync::Mutex<HashMap<TenantShardId, Generation>>>,
pub latest_generation: std::sync::Arc<std::sync::Mutex<HashMap<TenantId, Generation>>>,
}
impl MockControlPlane {
@@ -834,20 +861,20 @@ mod test {
#[async_trait::async_trait]
impl ControlPlaneGenerationsApi for MockControlPlane {
#[allow(clippy::diverging_sub_expression)] // False positive via async_trait
async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
async fn re_attach(&self) -> Result<HashMap<TenantId, Generation>, RetryForeverError> {
unimplemented!()
}
async fn validate(
&self,
tenants: Vec<(TenantShardId, Generation)>,
) -> Result<HashMap<TenantShardId, bool>, RetryForeverError> {
tenants: Vec<(TenantId, Generation)>,
) -> Result<HashMap<TenantId, bool>, RetryForeverError> {
let mut result = HashMap::new();
let latest_generation = self.latest_generation.lock().unwrap();
for (tenant_shard_id, generation) in tenants {
if let Some(latest) = latest_generation.get(&tenant_shard_id) {
result.insert(tenant_shard_id, *latest == generation);
for (tenant_id, generation) in tenants {
if let Some(latest) = latest_generation.get(&tenant_id) {
result.insert(tenant_id, *latest == generation);
}
}
@@ -951,10 +978,10 @@ mod test {
client.recover(HashMap::new())?;
let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
let tenant_shard_id = ctx.harness.tenant_shard_id;
let tenant_id = ctx.harness.tenant_id;
let content: Vec<u8> = "victim1 contents".into();
let relative_remote_path = remote_timeline_path(&tenant_shard_id, &TIMELINE_ID);
let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID);
let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path());
let deletion_prefix = ctx.harness.conf.deletion_prefix();
@@ -962,8 +989,6 @@ mod test {
// we delete, and the generation of the running Tenant.
let layer_generation = Generation::new(0xdeadbeef);
let now_generation = Generation::new(0xfeedbeef);
let layer_metadata =
LayerFileMetadata::new(0xf00, layer_generation, ShardIndex::unsharded());
let remote_layer_file_name_1 =
format!("{}{}", layer_file_name_1, layer_generation.get_suffix());
@@ -984,10 +1009,10 @@ mod test {
info!("Pushing");
client
.push_layers(
tenant_shard_id,
tenant_id,
TIMELINE_ID,
now_generation,
[(layer_file_name_1.clone(), layer_metadata)].to_vec(),
[(layer_file_name_1.clone(), layer_generation)].to_vec(),
)
.await?;
assert_remote_files(&[&remote_layer_file_name_1], &remote_timeline_path);
@@ -1026,13 +1051,11 @@ mod test {
let stale_generation = latest_generation.previous();
// Generation that our example layer file was written with
let layer_generation = stale_generation.previous();
let layer_metadata =
LayerFileMetadata::new(0xf00, layer_generation, ShardIndex::unsharded());
ctx.set_latest_generation(latest_generation);
let tenant_shard_id = ctx.harness.tenant_shard_id;
let relative_remote_path = remote_timeline_path(&tenant_shard_id, &TIMELINE_ID);
let tenant_id = ctx.harness.tenant_id;
let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID);
let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path());
// Initial state: a remote layer exists
@@ -1042,10 +1065,10 @@ mod test {
tracing::debug!("Pushing...");
client
.push_layers(
tenant_shard_id,
tenant_id,
TIMELINE_ID,
stale_generation,
[(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
[(EXAMPLE_LAYER_NAME.clone(), layer_generation)].to_vec(),
)
.await?;
@@ -1057,10 +1080,10 @@ mod test {
tracing::debug!("Pushing...");
client
.push_layers(
tenant_shard_id,
tenant_id,
TIMELINE_ID,
latest_generation,
[(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
[(EXAMPLE_LAYER_NAME.clone(), layer_generation)].to_vec(),
)
.await?;
@@ -1079,16 +1102,14 @@ mod test {
let client = ctx.deletion_queue.new_client();
client.recover(HashMap::new())?;
let tenant_shard_id = ctx.harness.tenant_shard_id;
let tenant_id = ctx.harness.tenant_id;
let relative_remote_path = remote_timeline_path(&tenant_shard_id, &TIMELINE_ID);
let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID);
let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path());
let deletion_prefix = ctx.harness.conf.deletion_prefix();
let layer_generation = Generation::new(0xdeadbeef);
let now_generation = Generation::new(0xfeedbeef);
let layer_metadata =
LayerFileMetadata::new(0xf00, layer_generation, ShardIndex::unsharded());
// Inject a deletion in the generation before generation_now: after restart,
// this deletion should _not_ get executed (only the immediately previous
@@ -1097,10 +1118,10 @@ mod test {
ctx.write_remote_layer(EXAMPLE_LAYER_NAME, layer_generation)?;
client
.push_layers(
tenant_shard_id,
tenant_id,
TIMELINE_ID,
now_generation.previous(),
[(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
[(EXAMPLE_LAYER_NAME.clone(), layer_generation)].to_vec(),
)
.await?;
@@ -1111,10 +1132,10 @@ mod test {
ctx.write_remote_layer(EXAMPLE_LAYER_NAME_ALT, layer_generation)?;
client
.push_layers(
tenant_shard_id,
tenant_id,
TIMELINE_ID,
now_generation,
[(EXAMPLE_LAYER_NAME_ALT.clone(), layer_metadata.clone())].to_vec(),
[(EXAMPLE_LAYER_NAME_ALT.clone(), layer_generation)].to_vec(),
)
.await?;
@@ -1142,7 +1163,7 @@ mod test {
drop(client);
ctx.restart().await;
let client = ctx.deletion_queue.new_client();
client.recover(HashMap::from([(tenant_shard_id, now_generation)]))?;
client.recover(HashMap::from([(tenant_id, now_generation)]))?;
info!("Flush-executing");
client.flush_execute().await?;
@@ -1204,13 +1225,12 @@ pub(crate) mod mock {
match msg {
ListWriterQueueMessage::Delete(op) => {
let mut objects = op.objects;
for (layer, meta) in op.layers {
for (layer, generation) in op.layers {
objects.push(remote_layer_path(
&op.tenant_shard_id.tenant_id,
&op.tenant_id,
&op.timeline_id,
meta.shard,
&layer,
meta.generation,
generation,
));
}
@@ -1290,34 +1310,4 @@ pub(crate) mod mock {
}
}
}
/// Test round-trip serialization/deserialization, and test stability of the format
/// vs. a static expected string for the serialized version.
#[test]
fn deletion_list_serialization() -> anyhow::Result<()> {
let tenant_id = "ad6c1a56f5680419d3a16ff55d97ec3c"
.to_string()
.parse::<TenantShardId>()?;
let timeline_id = "be322c834ed9e709e63b5c9698691910"
.to_string()
.parse::<TimelineId>()?;
let generation = Generation::new(123);
let object =
RemotePath::from_string(&format!("tenants/{tenant_id}/timelines/{timeline_id}/foo"))?;
let mut objects = [object].to_vec();
let mut example = DeletionList::new(1);
example.push(&tenant_id, &timeline_id, generation, &mut objects);
let encoded = serde_json::to_string(&example)?;
let expected = "{\"version\":1,\"sequence\":1,\"tenants\":{\"ad6c1a56f5680419d3a16ff55d97ec3c\":{\"timelines\":{\"be322c834ed9e709e63b5c9698691910\":[\"foo\"]},\"generation\":123}},\"size\":1}".to_string();
assert_eq!(encoded, expected);
let decoded = serde_json::from_str::<DeletionList>(&encoded)?;
assert_eq!(example, decoded);
Ok(())
}
}

View File

@@ -19,7 +19,6 @@ use std::collections::HashMap;
use std::fs::create_dir_all;
use std::time::Duration;
use pageserver_api::shard::TenantShardId;
use regex::Regex;
use remote_storage::RemotePath;
use tokio_util::sync::CancellationToken;
@@ -27,13 +26,13 @@ use tracing::debug;
use tracing::info;
use tracing::warn;
use utils::generation::Generation;
use utils::id::TenantId;
use utils::id::TimelineId;
use crate::config::PageServerConf;
use crate::deletion_queue::TEMP_SUFFIX;
use crate::metrics;
use crate::tenant::remote_timeline_client::remote_layer_path;
use crate::tenant::remote_timeline_client::LayerFileMetadata;
use crate::tenant::storage_layer::LayerFileName;
use crate::virtual_file::on_fatal_io_error;
use crate::virtual_file::MaybeFatalIo;
@@ -54,22 +53,22 @@ const FRONTEND_FLUSHING_TIMEOUT: Duration = Duration::from_millis(100);
#[derive(Debug)]
pub(super) struct DeletionOp {
pub(super) tenant_shard_id: TenantShardId,
pub(super) tenant_id: TenantId,
pub(super) timeline_id: TimelineId,
// `layers` and `objects` are both just lists of objects. `layers` is used if you do not
// have a config object handy to project it to a remote key, and need the consuming worker
// to do it for you.
pub(super) layers: Vec<(LayerFileName, LayerFileMetadata)>,
pub(super) layers: Vec<(LayerFileName, Generation)>,
pub(super) objects: Vec<RemotePath>,
/// The _current_ generation of the Tenant shard attachment in which we are enqueuing
/// The _current_ generation of the Tenant attachment in which we are enqueuing
/// this deletion.
pub(super) generation: Generation,
}
#[derive(Debug)]
pub(super) struct RecoverOp {
pub(super) attached_tenants: HashMap<TenantShardId, Generation>,
pub(super) attached_tenants: HashMap<TenantId, Generation>,
}
#[derive(Debug)]
@@ -206,7 +205,7 @@ impl ListWriter {
async fn recover(
&mut self,
attached_tenants: HashMap<TenantShardId, Generation>,
attached_tenants: HashMap<TenantId, Generation>,
) -> Result<(), anyhow::Error> {
debug!(
"recovering with {} attached tenants",
@@ -309,21 +308,10 @@ impl ListWriter {
// generation was issued to another node in the interval while we restarted,
// then we may treat deletion lists from the previous generation as if they
// belong to our currently attached generation, and proceed to validate & execute.
for (tenant_shard_id, tenant_list) in &mut deletion_list.tenants {
if let Some(attached_gen) = attached_tenants.get(tenant_shard_id) {
for (tenant_id, tenant_list) in &mut deletion_list.tenants {
if let Some(attached_gen) = attached_tenants.get(tenant_id) {
if attached_gen.previous() == tenant_list.generation {
info!(
seq=%s, tenant_id=%tenant_shard_id.tenant_id,
shard_id=%tenant_shard_id.shard_slug(),
old_gen=?tenant_list.generation, new_gen=?attached_gen,
"Updating gen on recovered list");
tenant_list.generation = *attached_gen;
} else {
info!(
seq=%s, tenant_id=%tenant_shard_id.tenant_id,
shard_id=%tenant_shard_id.shard_slug(),
old_gen=?tenant_list.generation, new_gen=?attached_gen,
"Encountered stale generation on recovered list");
}
}
}
@@ -399,26 +387,25 @@ impl ListWriter {
);
let mut layer_paths = Vec::new();
for (layer, meta) in op.layers {
for (layer, generation) in op.layers {
layer_paths.push(remote_layer_path(
&op.tenant_shard_id.tenant_id,
&op.tenant_id,
&op.timeline_id,
meta.shard,
&layer,
meta.generation,
generation,
));
}
layer_paths.extend(op.objects);
if !self.pending.push(
&op.tenant_shard_id,
&op.tenant_id,
&op.timeline_id,
op.generation,
&mut layer_paths,
) {
self.flush().await;
let retry_succeeded = self.pending.push(
&op.tenant_shard_id,
&op.tenant_id,
&op.timeline_id,
op.generation,
&mut layer_paths,

View File

@@ -178,14 +178,7 @@ where
.unwrap_or(false);
if valid && *validated_generation == tenant_lsn_state.generation {
for (timeline_id, pending_lsn) in tenant_lsn_state.timelines {
tracing::debug!(
%tenant_id,
%timeline_id,
current = %pending_lsn.result_slot.load(),
projected = %pending_lsn.projected,
"advancing validated remote_consistent_lsn",
);
for (_timeline_id, pending_lsn) in tenant_lsn_state.timelines {
pending_lsn.result_slot.store(pending_lsn.projected);
}
} else {

View File

@@ -42,6 +42,7 @@
// reading these fields. We use the Debug impl for semi-structured logging, though.
use std::{
collections::HashMap,
sync::Arc,
time::{Duration, SystemTime},
};
@@ -124,7 +125,7 @@ pub fn launch_disk_usage_global_eviction_task(
async fn disk_usage_eviction_task(
state: &State,
task_config: &DiskUsageEvictionTaskConfig,
storage: &GenericRemoteStorage,
_storage: &GenericRemoteStorage,
tenants_dir: &Utf8Path,
cancel: CancellationToken,
) {
@@ -148,14 +149,8 @@ async fn disk_usage_eviction_task(
let start = Instant::now();
async {
let res = disk_usage_eviction_task_iteration(
state,
task_config,
storage,
tenants_dir,
&cancel,
)
.await;
let res =
disk_usage_eviction_task_iteration(state, task_config, tenants_dir, &cancel).await;
match res {
Ok(()) => {}
@@ -186,13 +181,12 @@ pub trait Usage: Clone + Copy + std::fmt::Debug {
async fn disk_usage_eviction_task_iteration(
state: &State,
task_config: &DiskUsageEvictionTaskConfig,
storage: &GenericRemoteStorage,
tenants_dir: &Utf8Path,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
let usage_pre = filesystem_level_usage::get(tenants_dir, task_config)
.context("get filesystem-level disk usage before evictions")?;
let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await;
let res = disk_usage_eviction_task_iteration_impl(state, usage_pre, cancel).await;
match res {
Ok(outcome) => {
debug!(?outcome, "disk_usage_eviction_iteration finished");
@@ -274,9 +268,8 @@ struct LayerCount {
count: usize,
}
pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
state: &State,
_storage: &GenericRemoteStorage,
usage_pre: U,
cancel: &CancellationToken,
) -> anyhow::Result<IterationOutcome<U>> {
@@ -317,7 +310,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
.unwrap()
.as_micros(),
partition,
desc.tenant_shard_id,
desc.tenant_id,
desc.timeline_id,
candidate.layer,
);
@@ -328,16 +321,16 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
// Walk through the list of candidates, until we have accumulated enough layers to get
// us back under the pressure threshold. 'usage_planned' is updated so that it tracks
// how much disk space would be used after evicting all the layers up to the current
// point in the list.
// point in the list. The layers are collected in 'batched', grouped per timeline.
//
// If we get far enough in the list that we start to evict layers that are below
// the tenant's min-resident-size threshold, print a warning, and memorize the disk
// usage at that point, in 'usage_planned_min_resident_size_respecting'.
let mut batched: HashMap<_, Vec<_>> = HashMap::new();
let mut warned = None;
let mut usage_planned = usage_pre;
let mut evicted_amount = 0;
for (i, (partition, candidate)) in candidates.iter().enumerate() {
let mut max_batch_size = 0;
for (i, (partition, candidate)) in candidates.into_iter().enumerate() {
if !usage_planned.has_pressure() {
debug!(
no_candidates_evicted = i,
@@ -346,13 +339,25 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
break;
}
if partition == &MinResidentSizePartition::Below && warned.is_none() {
if partition == MinResidentSizePartition::Below && warned.is_none() {
warn!(?usage_pre, ?usage_planned, candidate_no=i, "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy");
warned = Some(usage_planned);
}
usage_planned.add_available_bytes(candidate.layer.layer_desc().file_size);
evicted_amount += 1;
// FIXME: batching makes no sense anymore because of no layermap locking, should just spawn
// tasks to evict all seen layers until we have evicted enough
let batch = batched.entry(TimelineKey(candidate.timeline)).or_default();
// semaphore will later be used to limit eviction concurrency, and we can express at
// most u32 number of permits. unlikely we would have u32::MAX layers to be evicted,
// but fail gracefully by not making batches larger.
if batch.len() < u32::MAX as usize {
batch.push(candidate.layer);
max_batch_size = max_batch_size.max(batch.len());
}
}
let usage_planned = match warned {
@@ -367,79 +372,100 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
};
debug!(?usage_planned, "usage planned");
// phase2: evict layers
// phase2: evict victims batched by timeline
let mut js = tokio::task::JoinSet::new();
let limit = 1000;
let mut evicted = candidates.into_iter().take(evicted_amount).fuse();
let mut consumed_all = false;
// ratelimit to 1k files or any higher max batch size
let limit = Arc::new(tokio::sync::Semaphore::new(1000.max(max_batch_size)));
// After the evictions, `usage_assumed` is the post-eviction usage,
// according to internal accounting.
let mut usage_assumed = usage_pre;
let mut evictions_failed = LayerCount::default();
for (timeline, batch) in batched {
let tenant_id = timeline.tenant_id;
let timeline_id = timeline.timeline_id;
let batch_size =
u32::try_from(batch.len()).expect("batch size limited to u32::MAX during partitioning");
let evict_layers = async move {
loop {
let next = if js.len() >= limit || consumed_all {
js.join_next().await
} else if !js.is_empty() {
// opportunistically consume ready result, one per each new evicted
futures::future::FutureExt::now_or_never(js.join_next()).and_then(|x| x)
} else {
None
};
// I dislike naming of `available_permits` but it means current total amount of permits
// because permits can be added
assert!(batch_size as usize <= limit.available_permits());
if let Some(next) = next {
match next {
Ok(Ok(file_size)) => {
usage_assumed.add_available_bytes(file_size);
debug!(%timeline_id, "evicting batch for timeline");
let evict = {
let limit = limit.clone();
let cancel = cancel.clone();
async move {
let mut evicted_bytes = 0;
let mut evictions_failed = LayerCount::default();
let Ok(_permit) = limit.acquire_many_owned(batch_size).await else {
// semaphore closing means cancelled
return (evicted_bytes, evictions_failed);
};
let results = timeline.evict_layers(&batch).await;
match results {
Ok(results) => {
assert_eq!(results.len(), batch.len());
for (result, layer) in results.into_iter().zip(batch.iter()) {
let file_size = layer.layer_desc().file_size;
match result {
Some(Ok(())) => {
evicted_bytes += file_size;
}
Some(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
evictions_failed.file_sizes += file_size;
evictions_failed.count += 1;
}
None => {
assert!(cancel.is_cancelled());
}
}
}
}
Ok(Err((file_size, EvictionError::NotFound | EvictionError::Downloaded))) => {
evictions_failed.file_sizes += file_size;
evictions_failed.count += 1;
Err(e) => {
warn!("failed to evict batch: {:#}", e);
}
Err(je) if je.is_cancelled() => unreachable!("not used"),
Err(je) if je.is_panic() => { /* already logged */ }
Err(je) => tracing::error!("unknown JoinError: {je:?}"),
}
(evicted_bytes, evictions_failed)
}
if consumed_all && js.is_empty() {
break;
}
// calling again when consumed_all is fine as evicted is fused.
let Some((_partition, candidate)) = evicted.next() else {
consumed_all = true;
continue;
};
js.spawn(async move {
let rtc = candidate.timeline.remote_client.as_ref().expect(
"holding the witness, all timelines must have a remote timeline client",
);
let file_size = candidate.layer.layer_desc().file_size;
candidate
.layer
.evict_and_wait(rtc)
.await
.map(|()| file_size)
.map_err(|e| (file_size, e))
});
tokio::task::yield_now().await;
}
.instrument(tracing::info_span!("evict_batch", %tenant_id, %timeline_id, batch_size));
js.spawn(evict);
// spwaning multiple thousands of these is essentially blocking, so give already spawned a
// chance of making progress
tokio::task::yield_now().await;
}
let join_all = async move {
// After the evictions, `usage_assumed` is the post-eviction usage,
// according to internal accounting.
let mut usage_assumed = usage_pre;
let mut evictions_failed = LayerCount::default();
while let Some(res) = js.join_next().await {
match res {
Ok((evicted_bytes, failed)) => {
usage_assumed.add_available_bytes(evicted_bytes);
evictions_failed.file_sizes += failed.file_sizes;
evictions_failed.count += failed.count;
}
Err(je) if je.is_cancelled() => unreachable!("not used"),
Err(je) if je.is_panic() => { /* already logged */ }
Err(je) => tracing::error!("unknown JoinError: {je:?}"),
}
}
(usage_assumed, evictions_failed)
};
let (usage_assumed, evictions_failed) = tokio::select! {
tuple = evict_layers => { tuple },
tuple = join_all => { tuple },
_ = cancel.cancelled() => {
// dropping joinset will abort all pending evict_and_waits and that is fine, our
// requests will still stand
// close the semaphore to stop any pending acquires
limit.close();
return Ok(IterationOutcome::Cancelled);
}
};
@@ -546,7 +572,7 @@ async fn collect_eviction_candidates(
continue;
}
let info = tl.get_local_layers_for_disk_usage_eviction().await;
debug!(tenant_id=%tl.tenant_shard_id.tenant_id, shard_id=%tl.tenant_shard_id.shard_slug(), timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len());
debug!(tenant_id=%tl.tenant_id, timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len());
tenant_candidates.extend(
info.resident_layers
.into_iter()

View File

@@ -1,2 +1,4 @@
pub mod routes;
pub use routes::make_router;
pub use pageserver_api::models;

View File

@@ -84,6 +84,7 @@ paths:
required: true
schema:
type: string
format: hex
get:
description: Get tenant status
responses:
@@ -180,6 +181,7 @@ paths:
required: true
schema:
type: string
format: hex
get:
description: Get timelines for tenant
responses:
@@ -230,6 +232,7 @@ paths:
required: true
schema:
type: string
format: hex
- name: timeline_id
in: path
required: true
@@ -335,6 +338,7 @@ paths:
required: true
schema:
type: string
format: hex
- name: timeline_id
in: path
required: true
@@ -397,6 +401,7 @@ paths:
required: true
schema:
type: string
format: hex
- name: timeline_id
in: path
required: true
@@ -464,6 +469,7 @@ paths:
required: true
schema:
type: string
format: hex
- name: timeline_id
in: path
required: true
@@ -517,6 +523,7 @@ paths:
required: true
schema:
type: string
format: hex
post:
description: |
Schedules attach operation to happen in the background for the given tenant.
@@ -617,98 +624,6 @@ paths:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_id}/location_config:
parameters:
- name: tenant_id
in: path
required: true
schema:
type: string
- name: flush_ms
in: query
required: false
schema:
type: integer
put:
description: |
Configures a _tenant location_, that is how a particular pageserver handles
a particular tenant. This includes _attached_ tenants, i.e. those ingesting WAL
and page service requests, and _secondary_ tenants, i.e. those which are just keeping
a warm cache in anticipation of transitioning to attached state in the future.
This is a declarative, idempotent API: there are not separate endpoints
for different tenant location configurations. Rather, this single endpoint accepts
a description of the desired location configuration, and makes whatever changes
are required to reach that state.
In imperative terms, this API is used to attach and detach tenants, and
to transition tenants to and from secondary mode.
This is a synchronous API: there is no 202 response. State transitions should always
be fast (milliseconds), with the exception of requests setting `flush_ms`, in which case
the caller controls the runtime of the request.
In some state transitions, it makes sense to flush dirty data to remote storage: this includes transitions
to AttachedStale and Detached. Flushing is never necessary for correctness, but is an
important optimization when doing migrations. The `flush_ms` parameter controls whether
flushing should be attempted, and how much time is allowed for flushing. If the time limit expires,
the requested transition will continue without waiting for any outstanding data to flush. Callers
should use a duration which is substantially less than their HTTP client's request
timeout. It is safe to supply flush_ms irrespective of the request body: in state transitions
where flushing doesn't make sense, the server will ignore it.
It is safe to retry requests, but if one receives a 409 or 503 response, it is not
useful to retry aggressively: there is probably an existing request still ongoing.
requestBody:
required: false
content:
application/json:
schema:
$ref: "#/components/schemas/TenantLocationConfigRequest"
responses:
"200":
description: Tenant is now in requested state
"503":
description: Tenant's state cannot be changed right now. Wait a few seconds and retry.
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"409":
description: |
The tenant is already known to Pageserver in some way,
and hence this `/attach` call has been rejected.
Some examples of how this can happen:
- tenant was created on this pageserver
- tenant attachment was started by an earlier call to `/attach`.
Callers should poll the tenant status's `attachment_status` field,
like for status 202. See the longer description for `POST /attach`
for details.
content:
application/json:
schema:
$ref: "#/components/schemas/ConflictError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
/v1/tenant/{tenant_id}/detach:
parameters:
- name: tenant_id
@@ -716,6 +631,7 @@ paths:
required: true
schema:
type: string
format: hex
- name: detach_ignored
in: query
required: false
@@ -775,6 +691,7 @@ paths:
required: true
schema:
type: string
format: hex
post:
description: |
Remove tenant data (including all corresponding timelines) from pageserver's memory.
@@ -823,6 +740,7 @@ paths:
required: true
schema:
type: string
format: hex
post:
description: |
Schedules an operation that attempts to load a tenant from the local disk and
@@ -879,6 +797,7 @@ paths:
required: true
schema:
type: string
format: hex
get:
description: |
Calculate tenant's synthetic size
@@ -921,6 +840,7 @@ paths:
required: true
schema:
type: string
format: hex
- name: inputs_only
in: query
required: false
@@ -990,10 +910,11 @@ paths:
required: true
schema:
type: string
format: hex
post:
description: |
Create a timeline. Returns new timeline id on success.
Recreating the same timeline will succeed if the parameters match the existing timeline.
Create a timeline. Returns new timeline id on success.\
If no new timeline id is specified in parameters, it would be generated. It's an error to recreate the same timeline.
If no pg_version is specified, assume DEFAULT_PG_VERSION hardcoded in the pageserver.
requestBody:
content:
@@ -1014,9 +935,6 @@ paths:
format: hex
pg_version:
type: integer
existing_initdb_timeline_id:
type: string
format: hex
responses:
"201":
description: TimelineInfo
@@ -1123,6 +1041,7 @@ paths:
application/json:
schema:
type: string
format: hex
"400":
description: Malformed tenant create request
content:
@@ -1219,6 +1138,7 @@ paths:
required: true
schema:
type: string
format: hex
get:
description: |
Returns tenant's config description: specific config overrides a tenant has
@@ -1324,6 +1244,7 @@ components:
properties:
new_tenant_id:
type: string
format: hex
generation:
type: integer
description: Attachment generation number.
@@ -1352,30 +1273,7 @@ components:
properties:
tenant_id:
type: string
TenantLocationConfigRequest:
type: object
required:
- tenant_id
properties:
tenant_id:
type: string
mode:
type: string
enum: ["AttachedSingle", "AttachedMulti", "AttachedStale", "Secondary", "Detached"]
description: Mode of functionality that this pageserver will run in for this tenant.
generation:
type: integer
description: Attachment generation number, mandatory when `mode` is an attached state
secondary_conf:
$ref: '#/components/schemas/SecondaryConfig'
tenant_conf:
$ref: '#/components/schemas/TenantConfig'
SecondaryConfig:
type: object
properties:
warm:
type: boolean
description: Whether to poll remote storage for layers to download. If false, secondary locations don't download anything.
format: hex
TenantConfig:
type: object
properties:
@@ -1405,8 +1303,6 @@ components:
type: integer
trace_read_requests:
type: boolean
heatmap_period:
type: integer
TenantConfigResponse:
type: object
properties:
@@ -1429,6 +1325,7 @@ components:
format: hex
tenant_id:
type: string
format: hex
last_record_lsn:
type: string
format: hex

File diff suppressed because it is too large Load Diff

View File

@@ -2,27 +2,19 @@
//! Import data and WAL from a PostgreSQL data directory and WAL segments into
//! a neon Timeline.
//!
use std::io::SeekFrom;
use std::path::{Path, PathBuf};
use anyhow::{bail, ensure, Context, Result};
use async_compression::tokio::bufread::ZstdDecoder;
use async_compression::{tokio::write::ZstdEncoder, zstd::CParameter, Level};
use bytes::Bytes;
use camino::Utf8Path;
use futures::StreamExt;
use nix::NixPath;
use tokio::fs::{File, OpenOptions};
use tokio::io::{AsyncBufRead, AsyncRead, AsyncReadExt, AsyncSeekExt, AsyncWriteExt};
use tokio::io::{AsyncRead, AsyncReadExt};
use tokio_tar::Archive;
use tokio_tar::Builder;
use tokio_tar::HeaderMode;
use tracing::*;
use walkdir::WalkDir;
use crate::context::RequestContext;
use crate::pgdatadir_mapping::*;
use crate::tenant::remote_timeline_client::INITDB_PATH;
use crate::tenant::Timeline;
use crate::walingest::WalIngest;
use crate::walrecord::DecodedWALRecord;
@@ -41,9 +33,7 @@ use utils::lsn::Lsn;
pub fn get_lsn_from_controlfile(path: &Utf8Path) -> Result<Lsn> {
// Read control file to extract the LSN
let controlfile_path = path.join("global").join("pg_control");
let controlfile_buf = std::fs::read(&controlfile_path)
.with_context(|| format!("reading controlfile: {controlfile_path}"))?;
let controlfile = ControlFileData::decode(&controlfile_buf)?;
let controlfile = ControlFileData::decode(&std::fs::read(controlfile_path)?)?;
let lsn = controlfile.checkPoint;
Ok(Lsn(lsn))
@@ -628,65 +618,3 @@ async fn read_all_bytes(reader: &mut (impl AsyncRead + Unpin)) -> Result<Bytes>
reader.read_to_end(&mut buf).await?;
Ok(Bytes::from(buf))
}
pub async fn create_tar_zst(pgdata_path: &Utf8Path, tmp_path: &Utf8Path) -> Result<(File, u64)> {
let file = OpenOptions::new()
.create(true)
.truncate(true)
.read(true)
.write(true)
.open(&tmp_path)
.await
.with_context(|| format!("tempfile creation {tmp_path}"))?;
let mut paths = Vec::new();
for entry in WalkDir::new(pgdata_path) {
let entry = entry?;
let metadata = entry.metadata().expect("error getting dir entry metadata");
// Also allow directories so that we also get empty directories
if !(metadata.is_file() || metadata.is_dir()) {
continue;
}
let path = entry.into_path();
paths.push(path);
}
// Do a sort to get a more consistent listing
paths.sort_unstable();
let zstd = ZstdEncoder::with_quality_and_params(
file,
Level::Default,
&[CParameter::enable_long_distance_matching(true)],
);
let mut builder = Builder::new(zstd);
// Use reproducible header mode
builder.mode(HeaderMode::Deterministic);
for path in paths {
let rel_path = path.strip_prefix(pgdata_path)?;
if rel_path.is_empty() {
// The top directory should not be compressed,
// the tar crate doesn't like that
continue;
}
builder.append_path_with_name(&path, rel_path).await?;
}
let mut zstd = builder.into_inner().await?;
zstd.shutdown().await?;
let mut compressed = zstd.into_inner();
let compressed_len = compressed.metadata().await?.len();
const INITDB_TAR_ZST_WARN_LIMIT: u64 = 2 * 1024 * 1024;
if compressed_len > INITDB_TAR_ZST_WARN_LIMIT {
warn!("compressed {INITDB_PATH} size of {compressed_len} is above limit {INITDB_TAR_ZST_WARN_LIMIT}.");
}
compressed.seek(SeekFrom::Start(0)).await?;
Ok((compressed, compressed_len))
}
pub async fn extract_tar_zst(
pgdata_path: &Utf8Path,
tar_zst: impl AsyncBufRead + Unpin,
) -> Result<()> {
let tar = Box::pin(ZstdDecoder::new(tar_zst));
let mut archive = Archive::new(tar);
archive.unpack(pgdata_path).await?;
Ok(())
}

View File

@@ -1,12 +1,11 @@
use crate::repository::{key_range_size, singleton_range, Key};
use postgres_ffi::BLCKSZ;
use std::ops::Range;
use crate::key::Key;
///
/// Represents a set of Keys, in a compact form.
///
#[derive(Clone, Debug, Default, PartialEq, Eq)]
#[derive(Clone, Debug, Default)]
pub struct KeySpace {
/// Contiguous ranges of keys that belong to the key space. In key order,
/// and with no overlap.
@@ -187,33 +186,6 @@ impl KeySpaceRandomAccum {
}
}
pub fn key_range_size(key_range: &Range<Key>) -> u32 {
let start = key_range.start;
let end = key_range.end;
if end.field1 != start.field1
|| end.field2 != start.field2
|| end.field3 != start.field3
|| end.field4 != start.field4
{
return u32::MAX;
}
let start = (start.field5 as u64) << 32 | start.field6 as u64;
let end = (end.field5 as u64) << 32 | end.field6 as u64;
let diff = end - start;
if diff > u32::MAX as u64 {
u32::MAX
} else {
diff as u32
}
}
pub fn singleton_range(key: Key) -> Range<Key> {
key..key.next()
}
#[cfg(test)]
mod tests {
use super::*;

View File

@@ -10,7 +10,7 @@ pub mod deletion_queue;
pub mod disk_usage_eviction_task;
pub mod http;
pub mod import_datadir;
pub use pageserver_api::keyspace;
pub mod keyspace;
pub mod metrics;
pub mod page_cache;
pub mod page_service;
@@ -186,6 +186,13 @@ pub struct InitializationOrder {
/// Each initial tenant load task carries this until completion.
pub initial_tenant_load: Option<utils::completion::Completion>,
/// Barrier for when we can start initial logical size calculations.
pub initial_logical_size_can_start: utils::completion::Barrier,
/// Each timeline owns a clone of this to be consumed on the initial logical size calculation
/// attempt. It is important to drop this once the attempt has completed.
pub initial_logical_size_attempt: Option<utils::completion::Completion>,
/// Barrier for when we can start any background jobs.
///
/// This can be broken up later on, but right now there is just one class of a background job.
@@ -205,7 +212,7 @@ async fn timed<Fut: std::future::Future>(
match tokio::time::timeout(warn_at, &mut fut).await {
Ok(ret) => {
tracing::info!(
stage = name,
task = name,
elapsed_ms = started.elapsed().as_millis(),
"completed"
);
@@ -213,7 +220,7 @@ async fn timed<Fut: std::future::Future>(
}
Err(_) => {
tracing::info!(
stage = name,
task = name,
elapsed_ms = started.elapsed().as_millis(),
"still waiting, taking longer than expected..."
);
@@ -222,7 +229,7 @@ async fn timed<Fut: std::future::Future>(
// this has a global allowed_errors
tracing::warn!(
stage = name,
task = name,
elapsed_ms = started.elapsed().as_millis(),
"completed, took longer than expected"
);

View File

@@ -2,13 +2,11 @@ use enum_map::EnumMap;
use metrics::metric_vec_duration::DurationResultObserver;
use metrics::{
register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec,
register_int_counter, register_int_counter_pair_vec, register_int_counter_vec,
register_int_gauge, register_int_gauge_vec, register_uint_gauge, register_uint_gauge_vec,
Counter, CounterVec, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPairVec,
IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
register_int_counter, register_int_counter_vec, register_int_gauge, register_int_gauge_vec,
register_uint_gauge, register_uint_gauge_vec, Counter, CounterVec, GaugeVec, Histogram,
HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
};
use once_cell::sync::Lazy;
use pageserver_api::shard::TenantShardId;
use strum::{EnumCount, IntoEnumIterator, VariantNames};
use strum_macros::{EnumVariantNames, IntoStaticStr};
use utils::id::{TenantId, TimelineId};
@@ -286,63 +284,6 @@ pub static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> = Lazy::new(|| PageCacheS
},
});
pub(crate) mod page_cache_eviction_metrics {
use std::num::NonZeroUsize;
use metrics::{register_int_counter_vec, IntCounter, IntCounterVec};
use once_cell::sync::Lazy;
#[derive(Clone, Copy)]
pub(crate) enum Outcome {
FoundSlotUnused { iters: NonZeroUsize },
FoundSlotEvicted { iters: NonZeroUsize },
ItersExceeded { iters: NonZeroUsize },
}
static ITERS_TOTAL_VEC: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_page_cache_find_victim_iters_total",
"Counter for the number of iterations in the find_victim loop",
&["outcome"],
)
.expect("failed to define a metric")
});
static CALLS_VEC: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_page_cache_find_victim_calls",
"Incremented at the end of each find_victim() call.\
Filter by outcome to get e.g., eviction rate.",
&["outcome"]
)
.unwrap()
});
pub(crate) fn observe(outcome: Outcome) {
macro_rules! dry {
($label:literal, $iters:expr) => {{
static LABEL: &'static str = $label;
static ITERS_TOTAL: Lazy<IntCounter> =
Lazy::new(|| ITERS_TOTAL_VEC.with_label_values(&[LABEL]));
static CALLS: Lazy<IntCounter> =
Lazy::new(|| CALLS_VEC.with_label_values(&[LABEL]));
ITERS_TOTAL.inc_by(($iters.get()) as u64);
CALLS.inc();
}};
}
match outcome {
Outcome::FoundSlotUnused { iters } => dry!("found_empty", iters),
Outcome::FoundSlotEvicted { iters } => {
dry!("found_evicted", iters)
}
Outcome::ItersExceeded { iters } => {
dry!("err_iters_exceeded", iters);
super::page_cache_errors_inc(super::PageCacheErrorKind::EvictIterLimit);
}
}
}
}
pub(crate) static PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_page_cache_acquire_pinned_slot_seconds",
@@ -352,6 +293,14 @@ pub(crate) static PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME: Lazy<Histogram> = Lazy::n
.expect("failed to define a metric")
});
pub(crate) static PAGE_CACHE_FIND_VICTIMS_ITERS_TOTAL: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_page_cache_find_victim_iters_total",
"Counter for the number of iterations in the find_victim loop",
)
.expect("failed to define a metric")
});
static PAGE_CACHE_ERRORS: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"page_cache_errors_total",
@@ -453,129 +402,6 @@ static CURRENT_LOGICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
.expect("failed to define current logical size metric")
});
pub(crate) mod initial_logical_size {
use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
use once_cell::sync::Lazy;
pub(crate) struct StartCalculation(IntCounterVec);
pub(crate) static START_CALCULATION: Lazy<StartCalculation> = Lazy::new(|| {
StartCalculation(
register_int_counter_vec!(
"pageserver_initial_logical_size_start_calculation",
"Incremented each time we start an initial logical size calculation attempt. \
The `circumstances` label provides some additional details.",
&["attempt", "circumstances"]
)
.unwrap(),
)
});
struct DropCalculation {
first: IntCounter,
retry: IntCounter,
}
static DROP_CALCULATION: Lazy<DropCalculation> = Lazy::new(|| {
let vec = register_int_counter_vec!(
"pageserver_initial_logical_size_drop_calculation",
"Incremented each time we abort a started size calculation attmpt.",
&["attempt"]
)
.unwrap();
DropCalculation {
first: vec.with_label_values(&["first"]),
retry: vec.with_label_values(&["retry"]),
}
});
pub(crate) struct Calculated {
pub(crate) births: IntCounter,
pub(crate) deaths: IntCounter,
}
pub(crate) static CALCULATED: Lazy<Calculated> = Lazy::new(|| Calculated {
births: register_int_counter!(
"pageserver_initial_logical_size_finish_calculation",
"Incremented every time we finish calculation of initial logical size.\
If everything is working well, this should happen at most once per Timeline object."
)
.unwrap(),
deaths: register_int_counter!(
"pageserver_initial_logical_size_drop_finished_calculation",
"Incremented when we drop a finished initial logical size calculation result.\
Mainly useful to turn pageserver_initial_logical_size_finish_calculation into a gauge."
)
.unwrap(),
});
pub(crate) struct OngoingCalculationGuard {
inc_drop_calculation: Option<IntCounter>,
}
#[derive(strum_macros::IntoStaticStr)]
pub(crate) enum StartCircumstances {
EmptyInitial,
SkippedConcurrencyLimiter,
AfterBackgroundTasksRateLimit,
}
impl StartCalculation {
pub(crate) fn first(&self, circumstances: StartCircumstances) -> OngoingCalculationGuard {
let circumstances_label: &'static str = circumstances.into();
self.0.with_label_values(&["first", circumstances_label]);
OngoingCalculationGuard {
inc_drop_calculation: Some(DROP_CALCULATION.first.clone()),
}
}
pub(crate) fn retry(&self, circumstances: StartCircumstances) -> OngoingCalculationGuard {
let circumstances_label: &'static str = circumstances.into();
self.0.with_label_values(&["retry", circumstances_label]);
OngoingCalculationGuard {
inc_drop_calculation: Some(DROP_CALCULATION.retry.clone()),
}
}
}
impl Drop for OngoingCalculationGuard {
fn drop(&mut self) {
if let Some(counter) = self.inc_drop_calculation.take() {
counter.inc();
}
}
}
impl OngoingCalculationGuard {
pub(crate) fn calculation_result_saved(mut self) -> FinishedCalculationGuard {
drop(self.inc_drop_calculation.take());
CALCULATED.births.inc();
FinishedCalculationGuard {
inc_on_drop: CALCULATED.deaths.clone(),
}
}
}
pub(crate) struct FinishedCalculationGuard {
inc_on_drop: IntCounter,
}
impl Drop for FinishedCalculationGuard {
fn drop(&mut self) {
self.inc_on_drop.inc();
}
}
// context: https://github.com/neondatabase/neon/issues/5963
pub(crate) static TIMELINES_WHERE_WALRECEIVER_GOT_APPROXIMATE_SIZE: Lazy<IntCounter> =
Lazy::new(|| {
register_int_counter!(
"pageserver_initial_logical_size_timelines_where_walreceiver_got_approximate_size",
"Counter for the following event: walreceiver calls\
Timeline::get_current_logical_size() and it returns `Approximate` for the first time."
)
.unwrap()
});
}
pub(crate) static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_tenant_states_count",
@@ -651,7 +477,7 @@ static EVICTIONS_WITH_LOW_RESIDENCE_DURATION: Lazy<IntCounterVec> = Lazy::new(||
"pageserver_evictions_with_low_residence_duration",
"If a layer is evicted that was resident for less than `low_threshold`, it is counted to this counter. \
Residence duration is determined using the `residence_duration_data_source`.",
&["tenant_id", "shard_id", "timeline_id", "residence_duration_data_source", "low_threshold_secs"]
&["tenant_id", "timeline_id", "residence_duration_data_source", "low_threshold_secs"]
)
.expect("failed to define a metric")
});
@@ -684,54 +510,14 @@ pub static STARTUP_IS_LOADING: Lazy<UIntGauge> = Lazy::new(|| {
.expect("Failed to register pageserver_startup_is_loading")
});
/// Metrics related to the lifecycle of a [`crate::tenant::Tenant`] object: things
/// like how long it took to load.
///
/// Note that these are process-global metrics, _not_ per-tenant metrics. Per-tenant
/// metrics are rather expensive, and usually fine grained stuff makes more sense
/// at a timeline level than tenant level.
pub(crate) struct TenantMetrics {
/// How long did tenants take to go from construction to active state?
pub(crate) activation: Histogram,
pub(crate) preload: Histogram,
pub(crate) attach: Histogram,
/// How many tenants are included in the initial startup of the pagesrever?
pub(crate) startup_scheduled: IntCounter,
pub(crate) startup_complete: IntCounter,
}
pub(crate) static TENANT: Lazy<TenantMetrics> = Lazy::new(|| {
TenantMetrics {
activation: register_histogram!(
/// How long did tenants take to go from construction to active state?
pub(crate) static TENANT_ACTIVATION: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_tenant_activation_seconds",
"Time taken by tenants to activate, in seconds",
CRITICAL_OP_BUCKETS.into()
)
.expect("Failed to register metric"),
preload: register_histogram!(
"pageserver_tenant_preload_seconds",
"Time taken by tenants to load remote metadata on startup/attach, in seconds",
CRITICAL_OP_BUCKETS.into()
)
.expect("Failed to register metric"),
attach: register_histogram!(
"pageserver_tenant_attach_seconds",
"Time taken by tenants to intialize, after remote metadata is already loaded",
CRITICAL_OP_BUCKETS.into()
)
.expect("Failed to register metric"),
startup_scheduled: register_int_counter!(
"pageserver_tenant_startup_scheduled",
"Number of tenants included in pageserver startup (doesn't count tenants attached later)"
).expect("Failed to register metric"),
startup_complete: register_int_counter!(
"pageserver_tenant_startup_complete",
"Number of tenants that have completed warm-up, or activated on-demand during initial startup: \
should eventually reach `pageserver_tenant_startup_scheduled_total`. Does not include broken \
tenants: such cases will lead to this metric never reaching the scheduled count."
).expect("Failed to register metric"),
}
.expect("Failed to register pageserver_tenant_activation_seconds metric")
});
/// Each `Timeline`'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
@@ -755,16 +541,10 @@ impl EvictionsWithLowResidenceDurationBuilder {
}
}
fn build(
&self,
tenant_id: &str,
shard_id: &str,
timeline_id: &str,
) -> EvictionsWithLowResidenceDuration {
fn build(&self, tenant_id: &str, timeline_id: &str) -> EvictionsWithLowResidenceDuration {
let counter = EVICTIONS_WITH_LOW_RESIDENCE_DURATION
.get_metric_with_label_values(&[
tenant_id,
shard_id,
timeline_id,
self.data_source,
&EvictionsWithLowResidenceDuration::threshold_label_value(self.threshold),
@@ -795,24 +575,21 @@ impl EvictionsWithLowResidenceDuration {
pub fn change_threshold(
&mut self,
tenant_id: &str,
shard_id: &str,
timeline_id: &str,
new_threshold: Duration,
) {
if new_threshold == self.threshold {
return;
}
let mut with_new = EvictionsWithLowResidenceDurationBuilder::new(
self.data_source,
new_threshold,
)
.build(tenant_id, shard_id, timeline_id);
let mut with_new =
EvictionsWithLowResidenceDurationBuilder::new(self.data_source, new_threshold)
.build(tenant_id, timeline_id);
std::mem::swap(self, &mut with_new);
with_new.remove(tenant_id, shard_id, timeline_id);
with_new.remove(tenant_id, timeline_id);
}
// This could be a `Drop` impl, but, we need the `tenant_id` and `timeline_id`.
fn remove(&mut self, tenant_id: &str, shard_id: &str, timeline_id: &str) {
fn remove(&mut self, tenant_id: &str, timeline_id: &str) {
let Some(_counter) = self.counter.take() else {
return;
};
@@ -821,7 +598,6 @@ impl EvictionsWithLowResidenceDuration {
let removed = EVICTIONS_WITH_LOW_RESIDENCE_DURATION.remove_label_values(&[
tenant_id,
shard_id,
timeline_id,
self.data_source,
&threshold,
@@ -862,7 +638,7 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
///
/// Operations:
/// - open ([`std::fs::OpenOptions::open`])
/// - close (dropping [`crate::virtual_file::VirtualFile`])
/// - close (dropping [`std::fs::File`])
/// - close-by-replace (close by replacement algorithm)
/// - read (`read_at`)
/// - write (`write_at`)
@@ -874,7 +650,6 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
)]
pub(crate) enum StorageIoOperation {
Open,
OpenAfterReplace,
Close,
CloseByReplace,
Read,
@@ -888,7 +663,6 @@ impl StorageIoOperation {
pub fn as_str(&self) -> &'static str {
match self {
StorageIoOperation::Open => "open",
StorageIoOperation::OpenAfterReplace => "open-after-replace",
StorageIoOperation::Close => "close",
StorageIoOperation::CloseByReplace => "close-by-replace",
StorageIoOperation::Read => "read",
@@ -943,25 +717,6 @@ pub(crate) static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub(crate) mod virtual_file_descriptor_cache {
use super::*;
pub(crate) static SIZE_MAX: Lazy<UIntGauge> = Lazy::new(|| {
register_uint_gauge!(
"pageserver_virtual_file_descriptor_cache_size_max",
"Maximum number of open file descriptors in the cache."
)
.unwrap()
});
// SIZE_CURRENT: derive it like so:
// ```
// sum (pageserver_io_operations_seconds_count{operation=~"^(open|open-after-replace)$")
// -ignoring(operation)
// sum(pageserver_io_operations_seconds_count{operation=~"^(close|close-by-replace)$"}
// ```
}
#[derive(Debug)]
struct GlobalAndPerTimelineHistogram {
global: Histogram,
@@ -1288,52 +1043,6 @@ pub(crate) static DELETION_QUEUE: Lazy<DeletionQueueMetrics> = Lazy::new(|| {
}
});
pub(crate) struct WalIngestMetrics {
pub(crate) records_received: IntCounter,
pub(crate) records_committed: IntCounter,
pub(crate) records_filtered: IntCounter,
}
pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMetrics {
records_received: register_int_counter!(
"pageserver_wal_ingest_records_received",
"Number of WAL records received from safekeepers"
)
.expect("failed to define a metric"),
records_committed: register_int_counter!(
"pageserver_wal_ingest_records_committed",
"Number of WAL records which resulted in writes to pageserver storage"
)
.expect("failed to define a metric"),
records_filtered: register_int_counter!(
"pageserver_wal_ingest_records_filtered",
"Number of WAL records filtered out due to sharding"
)
.expect("failed to define a metric"),
});
pub(crate) struct SecondaryModeMetrics {
pub(crate) upload_heatmap: IntCounter,
pub(crate) upload_heatmap_errors: IntCounter,
pub(crate) upload_heatmap_duration: Histogram,
}
pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| SecondaryModeMetrics {
upload_heatmap: register_int_counter!(
"pageserver_secondary_upload_heatmap",
"Number of heatmaps written to remote storage by attached tenants"
)
.expect("failed to define a metric"),
upload_heatmap_errors: register_int_counter!(
"pageserver_secondary_upload_heatmap_errors",
"Failures writing heatmap to remote storage"
)
.expect("failed to define a metric"),
upload_heatmap_duration: register_histogram!(
"pageserver_secondary_upload_heatmap_duration",
"Time to build and upload a heatmap, including any waiting inside the S3 client"
)
.expect("failed to define a metric"),
});
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum RemoteOpKind {
Upload,
@@ -1384,16 +1093,25 @@ pub(crate) static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
.expect("Failed to register tenant_task_events metric")
});
pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
register_int_counter_pair_vec!(
"pageserver_background_loop_semaphore_wait_start_count",
"Counter for background loop concurrency-limiting semaphore acquire calls started",
"pageserver_background_loop_semaphore_wait_finish_count",
"Counter for background loop concurrency-limiting semaphore acquire calls finished",
&["task"],
)
.unwrap()
});
pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_START_COUNT: Lazy<IntCounterVec> =
Lazy::new(|| {
register_int_counter_vec!(
"pageserver_background_loop_semaphore_wait_start_count",
"Counter for background loop concurrency-limiting semaphore acquire calls started",
&["task"],
)
.unwrap()
});
pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_FINISH_COUNT: Lazy<IntCounterVec> =
Lazy::new(|| {
register_int_counter_vec!(
"pageserver_background_loop_semaphore_wait_finish_count",
"Counter for background loop concurrency-limiting semaphore acquire calls finished",
&["task"],
)
.unwrap()
});
pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
@@ -1534,20 +1252,9 @@ pub(crate) static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
.unwrap()
});
pub(crate) static WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_wal_redo_process_launch_duration",
"Histogram of the duration of successful WalRedoProcess::launch calls",
redo_histogram_time_buckets!(),
)
.expect("failed to define a metric")
});
pub(crate) struct WalRedoProcessCounters {
pub(crate) started: IntCounter,
pub(crate) killed_by_cause: enum_map::EnumMap<WalRedoKillCause, IntCounter>,
pub(crate) active_stderr_logger_tasks_started: IntCounter,
pub(crate) active_stderr_logger_tasks_finished: IntCounter,
}
#[derive(Debug, enum_map::Enum, strum_macros::IntoStaticStr)]
@@ -1571,19 +1278,6 @@ impl Default for WalRedoProcessCounters {
&["cause"],
)
.unwrap();
let active_stderr_logger_tasks_started = register_int_counter!(
"pageserver_walredo_stderr_logger_tasks_started_total",
"Number of active walredo stderr logger tasks that have started",
)
.unwrap();
let active_stderr_logger_tasks_finished = register_int_counter!(
"pageserver_walredo_stderr_logger_tasks_finished_total",
"Number of active walredo stderr logger tasks that have finished",
)
.unwrap();
Self {
started,
killed_by_cause: EnumMap::from_array(std::array::from_fn(|i| {
@@ -1591,8 +1285,6 @@ impl Default for WalRedoProcessCounters {
let cause_str: &'static str = cause.into();
killed.with_label_values(&[cause_str])
})),
active_stderr_logger_tasks_started,
active_stderr_logger_tasks_finished,
}
}
}
@@ -1667,7 +1359,6 @@ impl StorageTimeMetrics {
#[derive(Debug)]
pub struct TimelineMetrics {
tenant_id: String,
shard_id: String,
timeline_id: String,
pub flush_time_histo: StorageTimeMetrics,
pub compact_time_histo: StorageTimeMetrics,
@@ -1688,12 +1379,11 @@ pub struct TimelineMetrics {
impl TimelineMetrics {
pub fn new(
tenant_shard_id: &TenantShardId,
tenant_id: &TenantId,
timeline_id: &TimelineId,
evictions_with_low_residence_duration_builder: EvictionsWithLowResidenceDurationBuilder,
) -> Self {
let tenant_id = tenant_shard_id.tenant_id.to_string();
let shard_id = format!("{}", tenant_shard_id.shard_slug());
let tenant_id = tenant_id.to_string();
let timeline_id = timeline_id.to_string();
let flush_time_histo =
StorageTimeMetrics::new(StorageTimeOperation::LayerFlush, &tenant_id, &timeline_id);
@@ -1730,12 +1420,11 @@ impl TimelineMetrics {
let evictions = EVICTIONS
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
let evictions_with_low_residence_duration = evictions_with_low_residence_duration_builder
.build(&tenant_id, &shard_id, &timeline_id);
let evictions_with_low_residence_duration =
evictions_with_low_residence_duration_builder.build(&tenant_id, &timeline_id);
TimelineMetrics {
tenant_id,
shard_id,
timeline_id,
flush_time_histo,
compact_time_histo,
@@ -1781,7 +1470,6 @@ impl Drop for TimelineMetrics {
fn drop(&mut self) {
let tenant_id = &self.tenant_id;
let timeline_id = &self.timeline_id;
let shard_id = &self.shard_id;
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
{
RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
@@ -1795,7 +1483,7 @@ impl Drop for TimelineMetrics {
self.evictions_with_low_residence_duration
.write()
.unwrap()
.remove(tenant_id, shard_id, timeline_id);
.remove(tenant_id, timeline_id);
// The following metrics are born outside of the TimelineMetrics lifecycle but still
// removed at the end of it. The idea is to have the metrics outlive the
@@ -1883,9 +1571,9 @@ pub struct RemoteTimelineClientMetrics {
}
impl RemoteTimelineClientMetrics {
pub fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self {
pub fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self {
RemoteTimelineClientMetrics {
tenant_id: tenant_shard_id.tenant_id.to_string(),
tenant_id: tenant_id.to_string(),
timeline_id: timeline_id.to_string(),
calls_unfinished_gauge: Mutex::new(HashMap::default()),
bytes_started_counter: Mutex::new(HashMap::default()),
@@ -2253,14 +1941,9 @@ pub fn preinitialize_metrics() {
// Deletion queue stats
Lazy::force(&DELETION_QUEUE);
// Tenant stats
Lazy::force(&TENANT);
// Tenant manager stats
Lazy::force(&TENANT_MANAGER);
Lazy::force(&crate::tenant::storage_layer::layer::LAYER_IMPL_METRICS);
// countervecs
[&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
.into_iter()
@@ -2278,7 +1961,6 @@ pub fn preinitialize_metrics() {
&WAL_REDO_TIME,
&WAL_REDO_RECORDS_HISTOGRAM,
&WAL_REDO_BYTES_HISTOGRAM,
&WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM,
]
.into_iter()
.for_each(|h| {

View File

@@ -28,7 +28,7 @@
//! Page cache maps from a cache key to a buffer slot.
//! The cache key uniquely identifies the piece of data that is being cached.
//!
//! The cache key for **materialized pages** is [`TenantShardId`], [`TimelineId`], [`Key`], and [`Lsn`].
//! The cache key for **materialized pages** is [`TenantId`], [`TimelineId`], [`Key`], and [`Lsn`].
//! Use [`PageCache::memorize_materialized_page`] and [`PageCache::lookup_materialized_page`] for fill & access.
//!
//! The cache key for **immutable file** pages is [`FileId`] and a block number.
@@ -83,15 +83,13 @@ use std::{
use anyhow::Context;
use once_cell::sync::OnceCell;
use pageserver_api::shard::TenantShardId;
use utils::{id::TimelineId, lsn::Lsn};
use crate::{
context::RequestContext,
metrics::{page_cache_eviction_metrics, PageCacheSizeMetrics},
repository::Key,
use utils::{
id::{TenantId, TimelineId},
lsn::Lsn,
};
use crate::{context::RequestContext, metrics::PageCacheSizeMetrics, repository::Key};
static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
const TEST_PAGE_CACHE_SIZE: usize = 50;
@@ -152,13 +150,7 @@ enum CacheKey {
#[derive(Debug, PartialEq, Eq, Hash, Clone)]
struct MaterializedPageHashKey {
/// Why is this TenantShardId rather than TenantId?
///
/// Usually, the materialized value of a page@lsn is identical on any shard in the same tenant. However, this
/// this not the case for certain internally-generated pages (e.g. relation sizes). In future, we may make this
/// key smaller by omitting the shard, if we ensure that reads to such pages always skip the cache, or are
/// special-cased in some other way.
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
timeline_id: TimelineId,
key: Key,
}
@@ -382,7 +374,7 @@ impl PageCache {
/// returned page.
pub async fn lookup_materialized_page(
&self,
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
timeline_id: TimelineId,
key: &Key,
lsn: Lsn,
@@ -399,7 +391,7 @@ impl PageCache {
let mut cache_key = CacheKey::MaterializedPage {
hash_key: MaterializedPageHashKey {
tenant_shard_id,
tenant_id,
timeline_id,
key: *key,
},
@@ -440,7 +432,7 @@ impl PageCache {
///
pub async fn memorize_materialized_page(
&self,
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
timeline_id: TimelineId,
key: Key,
lsn: Lsn,
@@ -448,7 +440,7 @@ impl PageCache {
) -> anyhow::Result<()> {
let cache_key = CacheKey::MaterializedPage {
hash_key: MaterializedPageHashKey {
tenant_shard_id,
tenant_id,
timeline_id,
key,
},
@@ -905,10 +897,8 @@ impl PageCache {
// Note that just yielding to tokio during iteration without such
// priority boosting is likely counter-productive. We'd just give more opportunities
// for B to bump usage count, further starving A.
page_cache_eviction_metrics::observe(
page_cache_eviction_metrics::Outcome::ItersExceeded {
iters: iters.try_into().unwrap(),
},
crate::metrics::page_cache_errors_inc(
crate::metrics::PageCacheErrorKind::EvictIterLimit,
);
anyhow::bail!("exceeded evict iter limit");
}
@@ -919,18 +909,8 @@ impl PageCache {
// remove mapping for old buffer
self.remove_mapping(old_key);
inner.key = None;
page_cache_eviction_metrics::observe(
page_cache_eviction_metrics::Outcome::FoundSlotEvicted {
iters: iters.try_into().unwrap(),
},
);
} else {
page_cache_eviction_metrics::observe(
page_cache_eviction_metrics::Outcome::FoundSlotUnused {
iters: iters.try_into().unwrap(),
},
);
}
crate::metrics::PAGE_CACHE_FIND_VICTIMS_ITERS_TOTAL.inc_by(iters as u64);
return Ok((slot_idx, inner));
}
}

View File

@@ -53,23 +53,21 @@ use crate::context::{DownloadBehavior, RequestContext};
use crate::import_datadir::import_wal_from_tar;
use crate::metrics;
use crate::metrics::LIVE_CONNECTIONS_COUNT;
use crate::pgdatadir_mapping::rel_block_to_key;
use crate::task_mgr;
use crate::task_mgr::TaskKind;
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::mgr;
use crate::tenant::mgr::get_active_tenant_with_timeout;
use crate::tenant::mgr::GetActiveTenantError;
use crate::tenant::mgr::ShardSelector;
use crate::tenant::Timeline;
use crate::trace::Tracer;
use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
use postgres_ffi::BLCKSZ;
// How long we may wait for a [`TenantSlot::InProgress`]` and/or a [`Tenant`] which
// How long we may block waiting for a [`TenantSlot::InProgress`]` and/or a [`Tenant`] which
// is not yet in state [`TenantState::Active`].
const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);
const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000);
/// Read the end of a tar archive.
///
@@ -401,25 +399,18 @@ impl PageServerHandler {
{
debug_assert_current_span_has_tenant_and_timeline_id();
// Note that since one connection may contain getpage requests that target different
// shards (e.g. during splitting when the compute is not yet aware of the split), the tenant
// that we look up here may not be the one that serves all the actual requests: we will double
// check the mapping of key->shard later before calling into Timeline for getpage requests.
// Make request tracer if needed
let tenant = mgr::get_active_tenant_with_timeout(
tenant_id,
ShardSelector::First,
ACTIVE_TENANT_TIMEOUT,
&task_mgr::shutdown_token(),
)
.await?;
// Make request tracer if needed
let mut tracer = if tenant.get_trace_read_requests() {
let connection_id = ConnectionId::generate();
let path =
tenant
.conf
.trace_path(&tenant.tenant_shard_id(), &timeline_id, &connection_id);
let path = tenant
.conf
.trace_path(&tenant_id, &timeline_id, &connection_id);
Some(Tracer::new(path))
} else {
None
@@ -571,7 +562,6 @@ impl PageServerHandler {
info!("creating new timeline");
let tenant = get_active_tenant_with_timeout(
tenant_id,
ShardSelector::Zero,
ACTIVE_TENANT_TIMEOUT,
&task_mgr::shutdown_token(),
)
@@ -634,7 +624,7 @@ impl PageServerHandler {
debug_assert_current_span_has_tenant_and_timeline_id();
let timeline = self
.get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
.get_active_tenant_timeline(tenant_id, timeline_id)
.await?;
let last_record_lsn = timeline.get_last_record_lsn();
if last_record_lsn != start_lsn {
@@ -813,49 +803,9 @@ impl PageServerHandler {
}
*/
let key = rel_block_to_key(req.rel, req.blkno);
let page = if timeline.get_shard_identity().is_key_local(&key) {
timeline
.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
.await?
} else {
// The Tenant shard we looked up at connection start does not hold this particular
// key: look for other shards in this tenant. This scenario occurs if a pageserver
// has multiple shards for the same tenant.
//
// TODO: optimize this (https://github.com/neondatabase/neon/pull/6037)
let timeline = match self
.get_active_tenant_timeline(
timeline.tenant_shard_id.tenant_id,
timeline.timeline_id,
ShardSelector::Page(key),
)
.await
{
Ok(t) => t,
Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => {
// We already know this tenant exists in general, because we resolved it at
// start of connection. Getting a NotFound here indicates that the shard containing
// the requested page is not present on this node.
// TODO: this should be some kind of structured error that the client will understand,
// so that it can block until its config is updated: this error is expected in the case
// that the Tenant's shards' placements are being updated and the client hasn't been
// informed yet.
//
// https://github.com/neondatabase/neon/issues/6038
return Err(anyhow::anyhow!("Request routed to wrong shard"));
}
Err(e) => return Err(e.into()),
};
// Take a GateGuard for the duration of this request. If we were using our main Timeline object,
// the GateGuard was already held over the whole connection.
let _timeline_guard = timeline.gate.enter().map_err(|_| QueryError::Shutdown)?;
timeline
.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
.await?
};
let page = timeline
.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
.await?;
Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
page,
@@ -884,7 +834,7 @@ impl PageServerHandler {
// check that the timeline exists
let timeline = self
.get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
.get_active_tenant_timeline(tenant_id, timeline_id)
.await?;
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
if let Some(lsn) = lsn {
@@ -990,11 +940,9 @@ impl PageServerHandler {
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
selector: ShardSelector,
) -> Result<Arc<Timeline>, GetActiveTimelineError> {
let tenant = get_active_tenant_with_timeout(
tenant_id,
selector,
ACTIVE_TENANT_TIMEOUT,
&task_mgr::shutdown_token(),
)
@@ -1168,7 +1116,7 @@ where
self.check_permission(Some(tenant_id))?;
let timeline = self
.get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
.get_active_tenant_timeline(tenant_id, timeline_id)
.await?;
let end_of_timeline = timeline.get_last_record_rlsn();
@@ -1355,7 +1303,6 @@ where
let tenant = get_active_tenant_with_timeout(
tenant_id,
ShardSelector::Zero,
ACTIVE_TENANT_TIMEOUT,
&task_mgr::shutdown_token(),
)

View File

@@ -13,7 +13,6 @@ use crate::repository::*;
use crate::walrecord::NeonWalRecord;
use anyhow::Context;
use bytes::{Buf, Bytes};
use pageserver_api::key::is_rel_block_key;
use pageserver_api::reltag::{RelTag, SlruKind};
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
use postgres_ffi::BLCKSZ;
@@ -22,7 +21,6 @@ use serde::{Deserialize, Serialize};
use std::collections::{hash_map, HashMap, HashSet};
use std::ops::ControlFlow;
use std::ops::Range;
use tokio_util::sync::CancellationToken;
use tracing::{debug, trace, warn};
use utils::bin_ser::DeserializeError;
use utils::{bin_ser::BeSer, lsn::Lsn};
@@ -283,10 +281,6 @@ impl Timeline {
}
/// Get a list of all existing relations in given tablespace and database.
///
/// # Cancel-Safety
///
/// This method is cancellation-safe.
pub async fn list_rels(
&self,
spcnode: Oid,
@@ -371,7 +365,6 @@ impl Timeline {
pub async fn find_lsn_for_timestamp(
&self,
search_timestamp: TimestampTz,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> Result<LsnForTimestamp, PageReconstructError> {
let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn();
@@ -390,9 +383,6 @@ impl Timeline {
let mut found_smaller = false;
let mut found_larger = false;
while low < high {
if cancel.is_cancelled() {
return Err(PageReconstructError::Cancelled);
}
// cannot overflow, high and low are both smaller than u64::MAX / 2
let mid = (high + low) / 2;
@@ -635,10 +625,6 @@ impl Timeline {
///
/// Only relation blocks are counted currently. That excludes metadata,
/// SLRUs, twophase files etc.
///
/// # Cancel-Safety
///
/// This method is cancellation-safe.
pub async fn get_current_logical_size_non_incremental(
&self,
lsn: Lsn,
@@ -822,7 +808,10 @@ impl<'a> DatadirModification<'a> {
self.put(DBDIR_KEY, Value::Image(buf.into()));
// Create AuxFilesDirectory
self.init_aux_dir()?;
let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
files: HashMap::new(),
})?;
self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf)));
let buf = TwoPhaseDirectory::ser(&TwoPhaseDirectory {
xids: HashSet::new(),
@@ -930,7 +919,10 @@ impl<'a> DatadirModification<'a> {
self.put(DBDIR_KEY, Value::Image(buf.into()));
// Create AuxFilesDirectory as well
self.init_aux_dir()?;
let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
files: HashMap::new(),
})?;
self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf)));
}
if r.is_none() {
// Create RelDirectory
@@ -1255,14 +1247,6 @@ impl<'a> DatadirModification<'a> {
Ok(())
}
pub fn init_aux_dir(&mut self) -> anyhow::Result<()> {
let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
files: HashMap::new(),
})?;
self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf)));
Ok(())
}
pub async fn put_file(
&mut self,
path: &str,
@@ -1325,7 +1309,7 @@ impl<'a> DatadirModification<'a> {
// Flush relation and SLRU data blocks, keep metadata.
let mut retained_pending_updates = HashMap::new();
for (key, value) in self.pending_updates.drain() {
if is_rel_block_key(&key) || is_slru_block_key(key) {
if is_rel_block_key(key) || is_slru_block_key(key) {
// This bails out on first error without modifying pending_updates.
// That's Ok, cf this function's doc comment.
writer.put(key, self.lsn, &value, ctx).await?;
@@ -1370,10 +1354,6 @@ impl<'a> DatadirModification<'a> {
Ok(())
}
pub(crate) fn is_empty(&self) -> bool {
self.pending_updates.is_empty() && self.pending_deletions.is_empty()
}
// Internal helper functions to batch the modifications
async fn get(&self, key: Key, ctx: &RequestContext) -> Result<Bytes, PageReconstructError> {
@@ -1585,7 +1565,7 @@ fn rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key {
}
}
pub(crate) fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
Key {
field1: 0x00,
field2: rel.spcnode,
@@ -1769,13 +1749,6 @@ const AUX_FILES_KEY: Key = Key {
// Reverse mappings for a few Keys.
// These are needed by WAL redo manager.
// AUX_FILES currently stores only data for logical replication (slots etc), and
// we don't preserve these on a branch because safekeepers can't follow timeline
// switch (and generally it likely should be optional), so ignore these.
pub fn is_inherited_key(key: Key) -> bool {
key != AUX_FILES_KEY
}
pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
Ok(match key.field1 {
0x00 => (
@@ -1791,6 +1764,10 @@ pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
})
}
fn is_rel_block_key(key: Key) -> bool {
key.field1 == 0x00 && key.field4 != 0
}
pub fn is_rel_fsm_block_key(key: Key) -> bool {
key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff
}

View File

@@ -2,11 +2,38 @@ use crate::walrecord::NeonWalRecord;
use anyhow::Result;
use bytes::Bytes;
use serde::{Deserialize, Serialize};
use std::ops::AddAssign;
use std::ops::{AddAssign, Range};
use std::time::Duration;
pub use pageserver_api::key::{Key, KEY_SIZE};
pub fn key_range_size(key_range: &Range<Key>) -> u32 {
let start = key_range.start;
let end = key_range.end;
if end.field1 != start.field1
|| end.field2 != start.field2
|| end.field3 != start.field3
|| end.field4 != start.field4
{
return u32::MAX;
}
let start = (start.field5 as u64) << 32 | start.field6 as u64;
let end = (end.field5 as u64) << 32 | end.field6 as u64;
let diff = end - start;
if diff > u32::MAX as u64 {
u32::MAX
} else {
diff as u32
}
}
pub fn singleton_range(key: Key) -> Range<Key> {
key..key.next()
}
/// A 'value' stored for a one Key.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(test, derive(PartialEq))]
@@ -111,14 +138,6 @@ pub struct GcResult {
#[serde(serialize_with = "serialize_duration_as_millis")]
pub elapsed: Duration,
/// The layers which were garbage collected.
///
/// Used in `/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc` to wait for the layers to be
/// dropped in tests.
#[cfg(feature = "testing")]
#[serde(skip)]
pub(crate) doomed_layers: Vec<crate::tenant::storage_layer::Layer>,
}
// helper function for `GcResult`, serializing a `Duration` as an integer number of milliseconds
@@ -139,11 +158,5 @@ impl AddAssign for GcResult {
self.layers_removed += other.layers_removed;
self.elapsed += other.elapsed;
#[cfg(feature = "testing")]
{
let mut other = other;
self.doomed_layers.append(&mut other.doomed_layers);
}
}
}

View File

@@ -42,7 +42,6 @@ use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::{Arc, Mutex};
use futures::FutureExt;
use pageserver_api::shard::TenantShardId;
use tokio::runtime::Runtime;
use tokio::task::JoinHandle;
use tokio::task_local;
@@ -52,7 +51,7 @@ use tracing::{debug, error, info, warn};
use once_cell::sync::Lazy;
use utils::id::TimelineId;
use utils::id::{TenantId, TimelineId};
use crate::shutdown_pageserver;
@@ -258,9 +257,6 @@ pub enum TaskKind {
/// See [`crate::disk_usage_eviction_task`].
DiskUsageEviction,
/// See [`crate::tenant::secondary`].
SecondaryUploads,
// Initial logical size calculation
InitialLogicalSizeCalculation,
@@ -321,7 +317,7 @@ struct PageServerTask {
/// Tasks may optionally be launched for a particular tenant/timeline, enabling
/// later cancelling tasks for that tenant/timeline in [`shutdown_tasks`]
tenant_shard_id: Option<TenantShardId>,
tenant_id: Option<TenantId>,
timeline_id: Option<TimelineId>,
mutable: Mutex<MutableTaskState>,
@@ -333,7 +329,7 @@ struct PageServerTask {
pub fn spawn<F>(
runtime: &tokio::runtime::Handle,
kind: TaskKind,
tenant_shard_id: Option<TenantShardId>,
tenant_id: Option<TenantId>,
timeline_id: Option<TimelineId>,
name: &str,
shutdown_process_on_error: bool,
@@ -349,7 +345,7 @@ where
kind,
name: name.to_string(),
cancel: cancel.clone(),
tenant_shard_id,
tenant_id,
timeline_id,
mutable: Mutex::new(MutableTaskState { join_handle: None }),
});
@@ -428,28 +424,28 @@ async fn task_finish(
Ok(Err(err)) => {
if shutdown_process_on_error {
error!(
"Shutting down: task '{}' tenant_shard_id: {:?}, timeline_id: {:?} exited with error: {:?}",
task_name, task.tenant_shard_id, task.timeline_id, err
"Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}",
task_name, task.tenant_id, task.timeline_id, err
);
shutdown_process = true;
} else {
error!(
"Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} exited with error: {:?}",
task_name, task.tenant_shard_id, task.timeline_id, err
"Task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}",
task_name, task.tenant_id, task.timeline_id, err
);
}
}
Err(err) => {
if shutdown_process_on_error {
error!(
"Shutting down: task '{}' tenant_shard_id: {:?}, timeline_id: {:?} panicked: {:?}",
task_name, task.tenant_shard_id, task.timeline_id, err
"Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}",
task_name, task.tenant_id, task.timeline_id, err
);
shutdown_process = true;
} else {
error!(
"Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} panicked: {:?}",
task_name, task.tenant_shard_id, task.timeline_id, err
"Task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}",
task_name, task.tenant_id, task.timeline_id, err
);
}
}
@@ -471,11 +467,11 @@ async fn task_finish(
///
/// Or to shut down all tasks for given timeline:
///
/// shutdown_tasks(None, Some(tenant_shard_id), Some(timeline_id))
/// shutdown_tasks(None, Some(tenant_id), Some(timeline_id))
///
pub async fn shutdown_tasks(
kind: Option<TaskKind>,
tenant_shard_id: Option<TenantShardId>,
tenant_id: Option<TenantId>,
timeline_id: Option<TimelineId>,
) {
let mut victim_tasks = Vec::new();
@@ -484,35 +480,35 @@ pub async fn shutdown_tasks(
let tasks = TASKS.lock().unwrap();
for task in tasks.values() {
if (kind.is_none() || Some(task.kind) == kind)
&& (tenant_shard_id.is_none() || task.tenant_shard_id == tenant_shard_id)
&& (tenant_id.is_none() || task.tenant_id == tenant_id)
&& (timeline_id.is_none() || task.timeline_id == timeline_id)
{
task.cancel.cancel();
victim_tasks.push((
Arc::clone(task),
task.kind,
task.tenant_shard_id,
task.tenant_id,
task.timeline_id,
));
}
}
}
let log_all = kind.is_none() && tenant_shard_id.is_none() && timeline_id.is_none();
let log_all = kind.is_none() && tenant_id.is_none() && timeline_id.is_none();
for (task, task_kind, tenant_shard_id, timeline_id) in victim_tasks {
for (task, task_kind, tenant_id, timeline_id) in victim_tasks {
let join_handle = {
let mut task_mut = task.mutable.lock().unwrap();
task_mut.join_handle.take()
};
if let Some(mut join_handle) = join_handle {
if log_all {
if tenant_shard_id.is_none() {
if tenant_id.is_none() {
// there are quite few of these
info!(name = task.name, kind = ?task_kind, "stopping global task");
} else {
// warn to catch these in tests; there shouldn't be any
warn!(name = task.name, tenant_shard_id = ?tenant_shard_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
warn!(name = task.name, tenant_id = ?tenant_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
}
}
if tokio::time::timeout(std::time::Duration::from_secs(1), &mut join_handle)
@@ -521,13 +517,12 @@ pub async fn shutdown_tasks(
{
// allow some time to elapse before logging to cut down the number of log
// lines.
info!("waiting for task {} to shut down", task.name);
info!("waiting for {} to shut down", task.name);
// we never handled this return value, but:
// - we don't deschedule which would lead to is_cancelled
// - panics are already logged (is_panicked)
// - task errors are already logged in the wrapper
let _ = join_handle.await;
info!("task {} completed", task.name);
}
} else {
// Possibly one of:
@@ -561,14 +556,9 @@ pub async fn shutdown_watcher() {
/// cancelled. It can however be moved to other tasks, such as `tokio::task::spawn_blocking` or
/// `tokio::task::JoinSet::spawn`.
pub fn shutdown_token() -> CancellationToken {
let res = SHUTDOWN_TOKEN.try_with(|t| t.clone());
if cfg!(test) {
// in tests this method is called from non-taskmgr spawned tasks, and that is all ok.
res.unwrap_or_default()
} else {
res.expect("shutdown_token() called in an unexpected task or thread")
}
SHUTDOWN_TOKEN
.try_with(|t| t.clone())
.expect("shutdown_token() called in an unexpected task or thread")
}
/// Has the current task been requested to shut down?

File diff suppressed because it is too large Load Diff

View File

@@ -8,12 +8,9 @@
//! We cannot use global or default config instead, because wrong settings
//! may lead to a data loss.
//!
use anyhow::bail;
use anyhow::Context;
use pageserver_api::models;
use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
use serde::de::IntoDeserializer;
use serde::{Deserialize, Serialize};
use serde_json::Value;
use std::num::NonZeroU64;
use std::time::Duration;
use utils::generation::Generation;
@@ -91,14 +88,6 @@ pub(crate) struct LocationConf {
/// The location-specific part of the configuration, describes the operating
/// mode of this pageserver for this tenant.
pub(crate) mode: LocationMode,
/// The detailed shard identity. This structure is already scoped within
/// a TenantShardId, but we need the full ShardIdentity to enable calculating
/// key->shard mappings.
#[serde(default = "ShardIdentity::unsharded")]
#[serde(skip_serializing_if = "ShardIdentity::is_unsharded")]
pub(crate) shard: ShardIdentity,
/// The pan-cluster tenant configuration, the same on all locations
pub(crate) tenant_conf: TenantConfOpt,
}
@@ -171,8 +160,6 @@ impl LocationConf {
generation,
attach_mode: AttachmentMode::Single,
}),
// Legacy configuration loads are always from tenants created before sharding existed.
shard: ShardIdentity::unsharded(),
tenant_conf,
}
}
@@ -200,7 +187,6 @@ impl LocationConf {
fn get_generation(conf: &'_ models::LocationConfig) -> Result<Generation, anyhow::Error> {
conf.generation
.map(Generation::new)
.ok_or_else(|| anyhow::anyhow!("Generation must be set when attaching"))
}
@@ -240,21 +226,7 @@ impl LocationConf {
}
};
let shard = if conf.shard_count == 0 {
ShardIdentity::unsharded()
} else {
ShardIdentity::new(
ShardNumber(conf.shard_number),
ShardCount(conf.shard_count),
ShardStripeSize(conf.shard_stripe_size),
)?
};
Ok(Self {
shard,
mode,
tenant_conf,
})
Ok(Self { mode, tenant_conf })
}
}
@@ -269,7 +241,6 @@ impl Default for LocationConf {
attach_mode: AttachmentMode::Single,
}),
tenant_conf: TenantConfOpt::default(),
shard: ShardIdentity::unsharded(),
}
}
}
@@ -334,11 +305,6 @@ pub struct TenantConf {
#[serde(with = "humantime_serde")]
pub evictions_low_residence_duration_metric_threshold: Duration,
pub gc_feedback: bool,
/// If non-zero, the period between uploads of a heatmap from attached tenants. This
/// may be disabled if a Tenant will not have secondary locations: only secondary
/// locations will use the heatmap uploaded by attached locations.
pub heatmap_period: Duration,
}
/// Same as TenantConf, but this struct preserves the information about
@@ -419,11 +385,6 @@ pub struct TenantConfOpt {
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub gc_feedback: Option<bool>,
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(with = "humantime_serde")]
#[serde(default)]
pub heatmap_period: Option<Duration>,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
@@ -492,7 +453,6 @@ impl TenantConfOpt {
.evictions_low_residence_duration_metric_threshold
.unwrap_or(global_conf.evictions_low_residence_duration_metric_threshold),
gc_feedback: self.gc_feedback.unwrap_or(global_conf.gc_feedback),
heatmap_period: self.heatmap_period.unwrap_or(global_conf.heatmap_period),
}
}
}
@@ -530,54 +490,109 @@ impl Default for TenantConf {
)
.expect("cannot parse default evictions_low_residence_duration_metric_threshold"),
gc_feedback: false,
heatmap_period: Duration::ZERO,
}
}
}
// Helper function to standardize the error messages we produce on bad durations
//
// Intended to be used with anyhow's `with_context`, e.g.:
//
// let value = result.with_context(bad_duration("name", &value))?;
//
fn bad_duration<'a>(field_name: &'static str, value: &'a str) -> impl 'a + Fn() -> String {
move || format!("Cannot parse `{field_name}` duration {value:?}")
}
impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt {
type Error = anyhow::Error;
fn try_from(request_data: &'_ models::TenantConfig) -> Result<Self, Self::Error> {
// Convert the request_data to a JSON Value
let json_value: Value = serde_json::to_value(request_data)?;
let mut tenant_conf = TenantConfOpt::default();
// Create a Deserializer from the JSON Value
let deserializer = json_value.into_deserializer();
if let Some(gc_period) = &request_data.gc_period {
tenant_conf.gc_period = Some(
humantime::parse_duration(gc_period)
.with_context(bad_duration("gc_period", gc_period))?,
);
}
tenant_conf.gc_horizon = request_data.gc_horizon;
tenant_conf.image_creation_threshold = request_data.image_creation_threshold;
// Use serde_path_to_error to deserialize the JSON Value into TenantConfOpt
let tenant_conf: TenantConfOpt = serde_path_to_error::deserialize(deserializer)?;
if let Some(pitr_interval) = &request_data.pitr_interval {
tenant_conf.pitr_interval = Some(
humantime::parse_duration(pitr_interval)
.with_context(bad_duration("pitr_interval", pitr_interval))?,
);
}
if let Some(walreceiver_connect_timeout) = &request_data.walreceiver_connect_timeout {
tenant_conf.walreceiver_connect_timeout = Some(
humantime::parse_duration(walreceiver_connect_timeout).with_context(
bad_duration("walreceiver_connect_timeout", walreceiver_connect_timeout),
)?,
);
}
if let Some(lagging_wal_timeout) = &request_data.lagging_wal_timeout {
tenant_conf.lagging_wal_timeout = Some(
humantime::parse_duration(lagging_wal_timeout)
.with_context(bad_duration("lagging_wal_timeout", lagging_wal_timeout))?,
);
}
if let Some(max_lsn_wal_lag) = request_data.max_lsn_wal_lag {
tenant_conf.max_lsn_wal_lag = Some(max_lsn_wal_lag);
}
if let Some(trace_read_requests) = request_data.trace_read_requests {
tenant_conf.trace_read_requests = Some(trace_read_requests);
}
tenant_conf.checkpoint_distance = request_data.checkpoint_distance;
if let Some(checkpoint_timeout) = &request_data.checkpoint_timeout {
tenant_conf.checkpoint_timeout = Some(
humantime::parse_duration(checkpoint_timeout)
.with_context(bad_duration("checkpoint_timeout", checkpoint_timeout))?,
);
}
tenant_conf.compaction_target_size = request_data.compaction_target_size;
tenant_conf.compaction_threshold = request_data.compaction_threshold;
if let Some(compaction_period) = &request_data.compaction_period {
tenant_conf.compaction_period = Some(
humantime::parse_duration(compaction_period)
.with_context(bad_duration("compaction_period", compaction_period))?,
);
}
if let Some(eviction_policy) = &request_data.eviction_policy {
tenant_conf.eviction_policy = Some(
serde::Deserialize::deserialize(eviction_policy)
.context("parse field `eviction_policy`")?,
);
}
tenant_conf.min_resident_size_override = request_data.min_resident_size_override;
if let Some(evictions_low_residence_duration_metric_threshold) =
&request_data.evictions_low_residence_duration_metric_threshold
{
tenant_conf.evictions_low_residence_duration_metric_threshold = Some(
humantime::parse_duration(evictions_low_residence_duration_metric_threshold)
.with_context(bad_duration(
"evictions_low_residence_duration_metric_threshold",
evictions_low_residence_duration_metric_threshold,
))?,
);
}
tenant_conf.gc_feedback = request_data.gc_feedback;
Ok(tenant_conf)
}
}
impl TryFrom<toml_edit::Item> for TenantConfOpt {
type Error = anyhow::Error;
fn try_from(item: toml_edit::Item) -> Result<Self, Self::Error> {
match item {
toml_edit::Item::Value(value) => {
let d = value.into_deserializer();
return serde_path_to_error::deserialize(d)
.map_err(|e| anyhow::anyhow!("{}: {}", e.path(), e.inner().message()));
}
toml_edit::Item::Table(table) => {
let deserializer = toml_edit::de::Deserializer::new(table.into());
return serde_path_to_error::deserialize(deserializer)
.map_err(|e| anyhow::anyhow!("{}: {}", e.path(), e.inner().message()));
}
_ => {
bail!("expected non-inline table but found {item}")
}
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use models::TenantConfig;
#[test]
fn de_serializing_pageserver_config_omits_empty_values() {
@@ -594,38 +609,4 @@ mod tests {
assert_eq!(json_form, "{\"gc_horizon\":42}");
assert_eq!(small_conf, serde_json::from_str(&json_form).unwrap());
}
#[test]
fn test_try_from_models_tenant_config_err() {
let tenant_config = models::TenantConfig {
lagging_wal_timeout: Some("5a".to_string()),
..TenantConfig::default()
};
let tenant_conf_opt = TenantConfOpt::try_from(&tenant_config);
assert!(
tenant_conf_opt.is_err(),
"Suceeded to convert TenantConfig to TenantConfOpt"
);
let expected_error_str =
"lagging_wal_timeout: invalid value: string \"5a\", expected a duration";
assert_eq!(tenant_conf_opt.unwrap_err().to_string(), expected_error_str);
}
#[test]
fn test_try_from_models_tenant_config_success() {
let tenant_config = models::TenantConfig {
lagging_wal_timeout: Some("5s".to_string()),
..TenantConfig::default()
};
let tenant_conf_opt = TenantConfOpt::try_from(&tenant_config).unwrap();
assert_eq!(
tenant_conf_opt.lagging_wal_timeout,
Some(Duration::from_secs(5))
);
}
}

View File

@@ -2,19 +2,22 @@ use std::sync::Arc;
use anyhow::Context;
use camino::{Utf8Path, Utf8PathBuf};
use pageserver_api::{models::TenantState, shard::TenantShardId};
use pageserver_api::models::TenantState;
use remote_storage::{GenericRemoteStorage, RemotePath};
use tokio::sync::OwnedMutexGuard;
use tokio_util::sync::CancellationToken;
use tracing::{error, instrument, Instrument, Span};
use tracing::{error, instrument, warn, Instrument, Span};
use utils::{backoff, completion, crashsafe, fs_ext, id::TimelineId};
use utils::{
backoff, completion, crashsafe, fs_ext,
id::{TenantId, TimelineId},
};
use crate::{
config::PageServerConf,
context::RequestContext,
task_mgr::{self, TaskKind},
tenant::mgr::{TenantSlot, TenantsMapRemoveResult},
InitializationOrder,
};
use super::{
@@ -56,10 +59,10 @@ type DeletionGuard = tokio::sync::OwnedMutexGuard<DeleteTenantFlow>;
fn remote_tenant_delete_mark_path(
conf: &PageServerConf,
tenant_shard_id: &TenantShardId,
tenant_id: &TenantId,
) -> anyhow::Result<RemotePath> {
let tenant_remote_path = conf
.tenant_path(tenant_shard_id)
.tenant_path(tenant_id)
.strip_prefix(&conf.workdir)
.context("Failed to strip workdir prefix")
.and_then(RemotePath::new)
@@ -70,25 +73,23 @@ fn remote_tenant_delete_mark_path(
async fn create_remote_delete_mark(
conf: &PageServerConf,
remote_storage: &GenericRemoteStorage,
tenant_shard_id: &TenantShardId,
cancel: &CancellationToken,
tenant_id: &TenantId,
) -> Result<(), DeleteTenantError> {
let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;
let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_id)?;
let data: &[u8] = &[];
backoff::retry(
|| async {
let data = bytes::Bytes::from_static(data);
let stream = futures::stream::once(futures::future::ready(Ok(data)));
remote_storage
.upload(stream, 0, &remote_mark_path, None)
.upload(data, 0, &remote_mark_path, None)
.await
},
|_e| false,
FAILED_UPLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"mark_upload",
backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled")),
// TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
)
.await
.context("mark_upload")?;
@@ -98,9 +99,9 @@ async fn create_remote_delete_mark(
async fn create_local_delete_mark(
conf: &PageServerConf,
tenant_shard_id: &TenantShardId,
tenant_id: &TenantId,
) -> Result<(), DeleteTenantError> {
let marker_path = conf.tenant_deleted_mark_file_path(tenant_shard_id);
let marker_path = conf.tenant_deleted_mark_file_path(tenant_id);
// Note: we're ok to replace existing file.
let _ = std::fs::OpenOptions::new()
@@ -169,18 +170,18 @@ async fn ensure_timelines_dir_empty(timelines_path: &Utf8Path) -> Result<(), Del
async fn remove_tenant_remote_delete_mark(
conf: &PageServerConf,
remote_storage: Option<&GenericRemoteStorage>,
tenant_shard_id: &TenantShardId,
cancel: &CancellationToken,
tenant_id: &TenantId,
) -> Result<(), DeleteTenantError> {
if let Some(remote_storage) = remote_storage {
let path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;
let path = remote_tenant_delete_mark_path(conf, tenant_id)?;
backoff::retry(
|| async { remote_storage.delete(&path).await },
|_e| false,
FAILED_UPLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"remove_tenant_remote_delete_mark",
backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled")),
// TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
)
.await
.context("remove_tenant_remote_delete_mark")?;
@@ -191,7 +192,7 @@ async fn remove_tenant_remote_delete_mark(
// Cleanup fs traces: tenant config, timelines dir local delete mark, tenant dir
async fn cleanup_remaining_fs_traces(
conf: &PageServerConf,
tenant_shard_id: &TenantShardId,
tenant_id: &TenantId,
) -> Result<(), DeleteTenantError> {
let rm = |p: Utf8PathBuf, is_dir: bool| async move {
if is_dir {
@@ -203,8 +204,8 @@ async fn cleanup_remaining_fs_traces(
.with_context(|| format!("failed to delete {p}"))
};
rm(conf.tenant_config_path(tenant_shard_id), false).await?;
rm(conf.tenant_location_config_path(tenant_shard_id), false).await?;
rm(conf.tenant_config_path(tenant_id), false).await?;
rm(conf.tenant_location_config_path(tenant_id), false).await?;
fail::fail_point!("tenant-delete-before-remove-timelines-dir", |_| {
Err(anyhow::anyhow!(
@@ -212,7 +213,7 @@ async fn cleanup_remaining_fs_traces(
))?
});
rm(conf.timelines_path(tenant_shard_id), true).await?;
rm(conf.timelines_path(tenant_id), true).await?;
fail::fail_point!("tenant-delete-before-remove-deleted-mark", |_| {
Err(anyhow::anyhow!(
@@ -226,14 +227,14 @@ async fn cleanup_remaining_fs_traces(
// to be reordered later and thus missed if a crash occurs.
// Note that we dont need to sync after mark file is removed
// because we can tolerate the case when mark file reappears on startup.
let tenant_path = &conf.tenant_path(tenant_shard_id);
let tenant_path = &conf.tenant_path(tenant_id);
if tenant_path.exists() {
crashsafe::fsync_async(&conf.tenant_path(tenant_shard_id))
crashsafe::fsync_async(&conf.tenant_path(tenant_id))
.await
.context("fsync_pre_mark_remove")?;
}
rm(conf.tenant_deleted_mark_file_path(tenant_shard_id), false).await?;
rm(conf.tenant_deleted_mark_file_path(tenant_id), false).await?;
fail::fail_point!("tenant-delete-before-remove-tenant-dir", |_| {
Err(anyhow::anyhow!(
@@ -241,7 +242,7 @@ async fn cleanup_remaining_fs_traces(
))?
});
rm(conf.tenant_path(tenant_shard_id), true).await?;
rm(conf.tenant_path(tenant_id), true).await?;
Ok(())
}
@@ -286,8 +287,6 @@ impl DeleteTenantFlow {
) -> Result<(), DeleteTenantError> {
span::debug_assert_current_span_has_tenant_id();
pausable_failpoint!("tenant-delete-before-run");
let mut guard = Self::prepare(&tenant).await?;
if let Err(e) = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await {
@@ -322,15 +321,9 @@ impl DeleteTenantFlow {
// Though sounds scary, different mark name?
// Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state.
if let Some(remote_storage) = &remote_storage {
create_remote_delete_mark(
conf,
remote_storage,
&tenant.tenant_shard_id,
// Can't use tenant.cancel, it's already shut down. TODO: wire in an appropriate token
&CancellationToken::new(),
)
.await
.context("remote_mark")?
create_remote_delete_mark(conf, remote_storage, &tenant.tenant_id)
.await
.context("remote_mark")?
}
fail::fail_point!("tenant-delete-before-create-local-mark", |_| {
@@ -339,7 +332,7 @@ impl DeleteTenantFlow {
))?
});
create_local_delete_mark(conf, &tenant.tenant_shard_id)
create_local_delete_mark(conf, &tenant.tenant_id)
.await
.context("local delete mark")?;
@@ -381,11 +374,9 @@ impl DeleteTenantFlow {
return Ok(acquire(tenant));
}
let tenant_id = tenant.tenant_id;
// Check local mark first, if its there there is no need to go to s3 to check whether remote one exists.
if conf
.tenant_deleted_mark_file_path(&tenant.tenant_shard_id)
.exists()
{
if conf.tenant_deleted_mark_file_path(&tenant_id).exists() {
Ok(acquire(tenant))
} else {
Ok(None)
@@ -397,6 +388,7 @@ impl DeleteTenantFlow {
tenant: &Arc<Tenant>,
preload: Option<TenantPreload>,
tenants: &'static std::sync::RwLock<TenantsMap>,
init_order: Option<InitializationOrder>,
ctx: &RequestContext,
) -> Result<(), DeleteTenantError> {
let (_, progress) = completion::channel();
@@ -406,7 +398,10 @@ impl DeleteTenantFlow {
.await
.expect("cant be stopping or broken");
tenant.attach(preload, ctx).await.context("attach")?;
tenant
.attach(init_order, preload, ctx)
.await
.context("attach")?;
Self::background(
guard,
@@ -464,12 +459,12 @@ impl DeleteTenantFlow {
tenants: &'static std::sync::RwLock<TenantsMap>,
tenant: Arc<Tenant>,
) {
let tenant_shard_id = tenant.tenant_shard_id;
let tenant_id = tenant.tenant_id;
task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
TaskKind::TimelineDeletionWorker,
Some(tenant_shard_id),
Some(tenant_id),
None,
"tenant_delete",
false,
@@ -483,7 +478,7 @@ impl DeleteTenantFlow {
Ok(())
}
.instrument({
let span = tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug());
let span = tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_id);
span.follows_from(Span::current());
span
}),
@@ -521,7 +516,7 @@ impl DeleteTenantFlow {
}
}
let timelines_path = conf.timelines_path(&tenant.tenant_shard_id);
let timelines_path = conf.timelines_path(&tenant.tenant_id);
// May not exist if we fail in cleanup_remaining_fs_traces after removing it
if timelines_path.exists() {
// sanity check to guard against layout changes
@@ -530,14 +525,7 @@ impl DeleteTenantFlow {
.context("timelines dir not empty")?;
}
remove_tenant_remote_delete_mark(
conf,
remote_storage.as_ref(),
&tenant.tenant_shard_id,
// Can't use tenant.cancel, it's already shut down. TODO: wire in an appropriate token
&CancellationToken::new(),
)
.await?;
remove_tenant_remote_delete_mark(conf, remote_storage.as_ref(), &tenant.tenant_id).await?;
fail::fail_point!("tenant-delete-before-cleanup-remaining-fs-traces", |_| {
Err(anyhow::anyhow!(
@@ -545,73 +533,21 @@ impl DeleteTenantFlow {
))?
});
cleanup_remaining_fs_traces(conf, &tenant.tenant_shard_id)
cleanup_remaining_fs_traces(conf, &tenant.tenant_id)
.await
.context("cleanup_remaining_fs_traces")?;
{
pausable_failpoint!("tenant-delete-before-map-remove");
let mut locked = tenants.write().unwrap();
if locked.remove(&tenant.tenant_id).is_none() {
warn!("Tenant got removed from tenants map during deletion");
};
// This block is simply removing the TenantSlot for this tenant. It requires a loop because
// we might conflict with a TenantSlot::InProgress marker and need to wait for it.
//
// This complexity will go away when we simplify how deletion works:
// https://github.com/neondatabase/neon/issues/5080
loop {
// Under the TenantMap lock, try to remove the tenant. We usually succeed, but if
// we encounter an InProgress marker, yield the barrier it contains and wait on it.
let barrier = {
let mut locked = tenants.write().unwrap();
let removed = locked.remove(tenant.tenant_shard_id);
// FIXME: we should not be modifying this from outside of mgr.rs.
// This will go away when we simplify deletion (https://github.com/neondatabase/neon/issues/5080)
crate::metrics::TENANT_MANAGER
.tenant_slots
.set(locked.len() as u64);
match removed {
TenantsMapRemoveResult::Occupied(TenantSlot::Attached(tenant)) => {
match tenant.current_state() {
TenantState::Stopping { .. } | TenantState::Broken { .. } => {
// Expected: we put the tenant into stopping state before we start deleting it
}
state => {
// Unexpected state
tracing::warn!(
"Tenant in unexpected state {state} after deletion"
);
}
}
break;
}
TenantsMapRemoveResult::Occupied(TenantSlot::Secondary) => {
// This is unexpected: this secondary tenants should not have been created, and we
// are not in a position to shut it down from here.
tracing::warn!("Tenant transitioned to secondary mode while deleting!");
break;
}
TenantsMapRemoveResult::Occupied(TenantSlot::InProgress(_)) => {
unreachable!("TenantsMap::remove handles InProgress separately, should never return it here");
}
TenantsMapRemoveResult::Vacant => {
tracing::warn!(
"Tenant removed from TenantsMap before deletion completed"
);
break;
}
TenantsMapRemoveResult::InProgress(barrier) => {
// An InProgress entry was found, we must wait on its barrier
barrier
}
}
};
tracing::info!(
"Waiting for competing operation to complete before deleting state for tenant"
);
barrier.wait().await;
}
// FIXME: we should not be modifying this from outside of mgr.rs.
// This will go away when we simplify deletion (https://github.com/neondatabase/neon/issues/5080)
crate::metrics::TENANT_MANAGER
.tenant_slots
.set(locked.len() as u64);
}
*guard = Self::Finished;

View File

@@ -7,19 +7,18 @@ use crate::page_cache::{self, PAGE_SZ};
use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader};
use crate::virtual_file::VirtualFile;
use camino::Utf8PathBuf;
use pageserver_api::shard::TenantShardId;
use std::cmp::min;
use std::fs::OpenOptions;
use std::io::{self, ErrorKind};
use std::ops::DerefMut;
use std::sync::atomic::AtomicU64;
use tracing::*;
use utils::id::TimelineId;
use utils::id::{TenantId, TimelineId};
pub struct EphemeralFile {
page_cache_file_id: page_cache::FileId,
_tenant_shard_id: TenantShardId,
_tenant_id: TenantId,
_timeline_id: TimelineId,
file: VirtualFile,
len: u64,
@@ -32,7 +31,7 @@ pub struct EphemeralFile {
impl EphemeralFile {
pub async fn create(
conf: &PageServerConf,
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> Result<EphemeralFile, io::Error> {
static NEXT_FILENAME: AtomicU64 = AtomicU64::new(1);
@@ -40,7 +39,7 @@ impl EphemeralFile {
NEXT_FILENAME.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
let filename = conf
.timeline_path(&tenant_shard_id, &timeline_id)
.timeline_path(&tenant_id, &timeline_id)
.join(Utf8PathBuf::from(format!(
"ephemeral-{filename_disambiguator}"
)));
@@ -53,7 +52,7 @@ impl EphemeralFile {
Ok(EphemeralFile {
page_cache_file_id: page_cache::next_file_id(),
_tenant_shard_id: tenant_shard_id,
_tenant_id: tenant_id,
_timeline_id: timeline_id,
file,
len: 0,
@@ -283,7 +282,7 @@ mod tests {
) -> Result<
(
&'static PageServerConf,
TenantShardId,
TenantId,
TimelineId,
RequestContext,
),
@@ -296,13 +295,13 @@ mod tests {
// OK in a test.
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
let tenant_shard_id = TenantShardId::from_str("11000000000000000000000000000000").unwrap();
let tenant_id = TenantId::from_str("11000000000000000000000000000000").unwrap();
let timeline_id = TimelineId::from_str("22000000000000000000000000000000").unwrap();
fs::create_dir_all(conf.timeline_path(&tenant_shard_id, &timeline_id))?;
fs::create_dir_all(conf.timeline_path(&tenant_id, &timeline_id))?;
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
Ok((conf, tenant_shard_id, timeline_id, ctx))
Ok((conf, tenant_id, timeline_id, ctx))
}
#[tokio::test]

View File

@@ -11,12 +11,15 @@
use std::io::{self};
use anyhow::{ensure, Context};
use pageserver_api::shard::TenantShardId;
use serde::{de::Error, Deserialize, Serialize, Serializer};
use thiserror::Error;
use utils::bin_ser::SerializeError;
use utils::crashsafe::path_with_suffix_extension;
use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn};
use utils::{
bin_ser::BeSer,
id::{TenantId, TimelineId},
lsn::Lsn,
};
use crate::config::PageServerConf;
use crate::virtual_file::VirtualFile;
@@ -269,14 +272,14 @@ impl Serialize for TimelineMetadata {
}
/// Save timeline metadata to file
#[tracing::instrument(skip_all, fields(%tenant_id=tenant_shard_id.tenant_id, %shard_id=tenant_shard_id.shard_slug(), %timeline_id))]
#[tracing::instrument(skip_all, fields(%tenant_id, %timeline_id))]
pub async fn save_metadata(
conf: &'static PageServerConf,
tenant_shard_id: &TenantShardId,
tenant_id: &TenantId,
timeline_id: &TimelineId,
data: &TimelineMetadata,
) -> anyhow::Result<()> {
let path = conf.metadata_path(tenant_shard_id, timeline_id);
let path = conf.metadata_path(tenant_id, timeline_id);
let temp_path = path_with_suffix_extension(&path, TEMP_FILE_SUFFIX);
let metadata_bytes = data.to_bytes().context("serialize metadata")?;
VirtualFile::crashsafe_overwrite(&path, &temp_path, &metadata_bytes)
@@ -296,10 +299,10 @@ pub enum LoadMetadataError {
pub fn load_metadata(
conf: &'static PageServerConf,
tenant_shard_id: &TenantShardId,
tenant_id: &TenantId,
timeline_id: &TimelineId,
) -> Result<TimelineMetadata, LoadMetadataError> {
let metadata_path = conf.metadata_path(tenant_shard_id, timeline_id);
let metadata_path = conf.metadata_path(tenant_id, timeline_id);
let metadata_bytes = std::fs::read(metadata_path)?;
Ok(TimelineMetadata::from_bytes(&metadata_bytes)?)

File diff suppressed because it is too large Load Diff

View File

@@ -180,7 +180,7 @@
//! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync
//! [`Timeline::load_layer_map`]: super::Timeline::load_layer_map
pub(crate) mod download;
mod download;
pub mod index;
mod upload;
@@ -188,20 +188,15 @@ use anyhow::Context;
use camino::Utf8Path;
use chrono::{NaiveDateTime, Utc};
pub(crate) use download::download_initdb_tar_zst;
use pageserver_api::shard::{ShardIndex, TenantShardId};
use scopeguard::ScopeGuard;
use tokio_util::sync::CancellationToken;
pub(crate) use upload::upload_initdb_dir;
use utils::backoff::{
self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
};
use utils::timeout::{timeout_cancellable, TimeoutCancellableError};
use std::collections::{HashMap, VecDeque};
use std::sync::atomic::{AtomicU32, Ordering};
use std::sync::{Arc, Mutex};
use std::time::Duration;
use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
use std::ops::DerefMut;
@@ -254,11 +249,6 @@ pub(crate) const FAILED_REMOTE_OP_RETRIES: u32 = 10;
// retries. Uploads and deletions are retried forever, though.
pub(crate) const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
pub(crate) const INITDB_PATH: &str = "initdb.tar.zst";
/// Default buffer size when interfacing with [`tokio::fs::File`].
pub(crate) const BUFFER_SIZE: usize = 32 * 1024;
pub enum MaybeDeletedIndexPart {
IndexPart(IndexPart),
Deleted(IndexPart),
@@ -307,7 +297,7 @@ pub struct RemoteTimelineClient {
runtime: tokio::runtime::Handle,
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
timeline_id: TimelineId,
generation: Generation,
@@ -318,47 +308,6 @@ pub struct RemoteTimelineClient {
storage_impl: GenericRemoteStorage,
deletion_queue_client: DeletionQueueClient,
cancel: CancellationToken,
}
/// This timeout is intended to deal with hangs in lower layers, e.g. stuck TCP flows. It is not
/// intended to be snappy enough for prompt shutdown, as we have a CancellationToken for that.
const UPLOAD_TIMEOUT: Duration = Duration::from_secs(120);
const DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(120);
/// Wrapper for timeout_cancellable that flattens result and converts TimeoutCancellableError to anyhow.
///
/// This is a convenience for the various upload functions. In future
/// the anyhow::Error result should be replaced with a more structured type that
/// enables callers to avoid handling shutdown as an error.
async fn upload_cancellable<F>(cancel: &CancellationToken, future: F) -> anyhow::Result<()>
where
F: std::future::Future<Output = anyhow::Result<()>>,
{
match timeout_cancellable(UPLOAD_TIMEOUT, cancel, future).await {
Ok(Ok(())) => Ok(()),
Ok(Err(e)) => Err(e),
Err(TimeoutCancellableError::Timeout) => Err(anyhow::anyhow!("Timeout")),
Err(TimeoutCancellableError::Cancelled) => Err(anyhow::anyhow!("Shutting down")),
}
}
/// Wrapper for timeout_cancellable that flattens result and converts TimeoutCancellableError to DownloaDError.
async fn download_cancellable<F, R>(
cancel: &CancellationToken,
future: F,
) -> Result<R, DownloadError>
where
F: std::future::Future<Output = Result<R, DownloadError>>,
{
match timeout_cancellable(DOWNLOAD_TIMEOUT, cancel, future).await {
Ok(Ok(r)) => Ok(r),
Ok(Err(e)) => Err(e),
Err(TimeoutCancellableError::Timeout) => {
Err(DownloadError::Other(anyhow::anyhow!("Timed out")))
}
Err(TimeoutCancellableError::Cancelled) => Err(DownloadError::Cancelled),
}
}
impl RemoteTimelineClient {
@@ -372,7 +321,7 @@ impl RemoteTimelineClient {
remote_storage: GenericRemoteStorage,
deletion_queue_client: DeletionQueueClient,
conf: &'static PageServerConf,
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
timeline_id: TimelineId,
generation: Generation,
) -> RemoteTimelineClient {
@@ -384,17 +333,13 @@ impl RemoteTimelineClient {
} else {
BACKGROUND_RUNTIME.handle().clone()
},
tenant_shard_id,
tenant_id,
timeline_id,
generation,
storage_impl: remote_storage,
deletion_queue_client,
upload_queue: Mutex::new(UploadQueue::Uninitialized),
metrics: Arc::new(RemoteTimelineClientMetrics::new(
&tenant_shard_id,
&timeline_id,
)),
cancel: CancellationToken::new(),
metrics: Arc::new(RemoteTimelineClientMetrics::new(&tenant_id, &timeline_id)),
}
}
@@ -515,13 +460,13 @@ impl RemoteTimelineClient {
let index_part = download::download_index_part(
&self.storage_impl,
&self.tenant_shard_id,
&self.tenant_id,
&self.timeline_id,
self.generation,
cancel,
)
.measure_remote_op(
self.tenant_shard_id.tenant_id,
self.tenant_id,
self.timeline_id,
RemoteOpFileKind::Index,
RemoteOpKind::Download,
@@ -545,7 +490,6 @@ impl RemoteTimelineClient {
&self,
layer_file_name: &LayerFileName,
layer_metadata: &LayerFileMetadata,
cancel: &CancellationToken,
) -> anyhow::Result<u64> {
let downloaded_size = {
let _unfinished_gauge_guard = self.metrics.call_begin(
@@ -558,14 +502,13 @@ impl RemoteTimelineClient {
download::download_layer_file(
self.conf,
&self.storage_impl,
self.tenant_shard_id,
self.tenant_id,
self.timeline_id,
layer_file_name,
layer_metadata,
cancel,
)
.measure_remote_op(
self.tenant_shard_id.tenant_id,
self.tenant_id,
self.timeline_id,
RemoteOpFileKind::Layer,
RemoteOpKind::Download,
@@ -711,10 +654,10 @@ impl RemoteTimelineClient {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
let with_metadata =
let with_generations =
self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names.iter().cloned());
self.schedule_deletion_of_unlinked0(upload_queue, with_metadata);
self.schedule_deletion_of_unlinked0(upload_queue, with_generations);
// Launch the tasks immediately, if possible
self.launch_queued_tasks(upload_queue);
@@ -749,7 +692,7 @@ impl RemoteTimelineClient {
self: &Arc<Self>,
upload_queue: &mut UploadQueueInitialized,
names: I,
) -> Vec<(LayerFileName, LayerFileMetadata)>
) -> Vec<(LayerFileName, Generation)>
where
I: IntoIterator<Item = LayerFileName>,
{
@@ -757,17 +700,16 @@ impl RemoteTimelineClient {
// so we don't need update it. Just serialize it.
let metadata = upload_queue.latest_metadata.clone();
// Decorate our list of names with each name's metadata, dropping
// names that are unexpectedly missing from our metadata. This metadata
// is later used when physically deleting layers, to construct key paths.
let with_metadata: Vec<_> = names
// Decorate our list of names with each name's generation, dropping
// names that are unexpectedly missing from our metadata.
let with_generations: Vec<_> = names
.into_iter()
.filter_map(|name| {
let meta = upload_queue.latest_files.remove(&name);
if let Some(meta) = meta {
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
Some((name, meta))
Some((name, meta.generation))
} else {
// This can only happen if we forgot to to schedule the file upload
// before scheduling the delete. Log it because it is a rare/strange
@@ -780,10 +722,9 @@ impl RemoteTimelineClient {
.collect();
#[cfg(feature = "testing")]
for (name, metadata) in &with_metadata {
let gen = metadata.generation;
if let Some(unexpected) = upload_queue.dangling_files.insert(name.to_owned(), gen) {
if unexpected == gen {
for (name, gen) in &with_generations {
if let Some(unexpected) = upload_queue.dangling_files.insert(name.to_owned(), *gen) {
if &unexpected == gen {
tracing::error!("{name} was unlinked twice with same generation");
} else {
tracing::error!("{name} was unlinked twice with different generations {gen:?} and {unexpected:?}");
@@ -798,14 +739,14 @@ impl RemoteTimelineClient {
self.schedule_index_upload(upload_queue, metadata);
}
with_metadata
with_generations
}
/// Schedules deletion for layer files which have previously been unlinked from the
/// `index_part.json` with [`Self::schedule_gc_update`] or [`Self::schedule_compaction_update`].
pub(crate) fn schedule_deletion_of_unlinked(
self: &Arc<Self>,
layers: Vec<(LayerFileName, LayerFileMetadata)>,
layers: Vec<(LayerFileName, Generation)>,
) -> anyhow::Result<()> {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
@@ -818,22 +759,16 @@ impl RemoteTimelineClient {
fn schedule_deletion_of_unlinked0(
self: &Arc<Self>,
upload_queue: &mut UploadQueueInitialized,
with_metadata: Vec<(LayerFileName, LayerFileMetadata)>,
with_generations: Vec<(LayerFileName, Generation)>,
) {
for (name, meta) in &with_metadata {
info!(
"scheduling deletion of layer {}{} (shard {})",
name,
meta.generation.get_suffix(),
meta.shard
);
for (name, gen) in &with_generations {
info!("scheduling deletion of layer {}{}", name, gen.get_suffix());
}
#[cfg(feature = "testing")]
for (name, meta) in &with_metadata {
let gen = meta.generation;
for (name, gen) in &with_generations {
match upload_queue.dangling_files.remove(name) {
Some(same) if same == gen => { /* expected */ }
Some(same) if &same == gen => { /* expected */ }
Some(other) => {
tracing::error!("{name} was unlinked with {other:?} but deleted with {gen:?}");
}
@@ -845,7 +780,7 @@ impl RemoteTimelineClient {
// schedule the actual deletions
let op = UploadOp::Delete(Delete {
layers: with_metadata,
layers: with_generations,
});
self.calls_unfinished_metric_begin(&op);
upload_queue.queued_operations.push_back(op);
@@ -874,29 +809,23 @@ impl RemoteTimelineClient {
Ok(())
}
///
/// Wait for all previously scheduled uploads/deletions to complete
pub(crate) async fn wait_completion(self: &Arc<Self>) -> anyhow::Result<()> {
///
pub async fn wait_completion(self: &Arc<Self>) -> anyhow::Result<()> {
let mut receiver = {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
self.schedule_barrier0(upload_queue)
self.schedule_barrier(upload_queue)
};
if receiver.changed().await.is_err() {
anyhow::bail!("wait_completion aborted because upload queue was stopped");
}
Ok(())
}
pub(crate) fn schedule_barrier(self: &Arc<Self>) -> anyhow::Result<()> {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
self.schedule_barrier0(upload_queue);
Ok(())
}
fn schedule_barrier0(
fn schedule_barrier(
self: &Arc<Self>,
upload_queue: &mut UploadQueueInitialized,
) -> tokio::sync::watch::Receiver<()> {
@@ -912,56 +841,6 @@ impl RemoteTimelineClient {
receiver
}
/// Wait for all previously scheduled operations to complete, and then stop.
///
/// Not cancellation safe
pub(crate) async fn shutdown(self: &Arc<Self>) -> Result<(), StopError> {
// On cancellation the queue is left in ackward state of refusing new operations but
// proper stop is yet to be called. On cancel the original or some later task must call
// `stop` or `shutdown`.
let sg = scopeguard::guard((), |_| {
tracing::error!("RemoteTimelineClient::shutdown was cancelled; this should not happen, do not make this into an allowed_error")
});
let fut = {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = match &mut *guard {
UploadQueue::Stopped(_) => return Ok(()),
UploadQueue::Uninitialized => return Err(StopError::QueueUninitialized),
UploadQueue::Initialized(ref mut init) => init,
};
// if the queue is already stuck due to a shutdown operation which was cancelled, then
// just don't add more of these as they would never complete.
//
// TODO: if launch_queued_tasks were to be refactored to accept a &mut UploadQueue
// in every place we would not have to jump through this hoop, and this method could be
// made cancellable.
if !upload_queue.shutting_down {
upload_queue.shutting_down = true;
upload_queue.queued_operations.push_back(UploadOp::Shutdown);
// this operation is not counted similar to Barrier
self.launch_queued_tasks(upload_queue);
}
upload_queue.shutdown_ready.clone().acquire_owned()
};
let res = fut.await;
scopeguard::ScopeGuard::into_inner(sg);
match res {
Ok(_permit) => unreachable!("shutdown_ready should not have been added permits"),
Err(_closed) => {
// expected
}
}
self.stop()
}
/// Set the deleted_at field in the remote index file.
///
/// This fails if the upload queue has not been `stop()`ed.
@@ -1013,11 +892,10 @@ impl RemoteTimelineClient {
|| {
upload::upload_index_part(
&self.storage_impl,
&self.tenant_shard_id,
&self.tenant_id,
&self.timeline_id,
self.generation,
&index_part_with_deleted_at,
&self.cancel,
)
},
|_e| false,
@@ -1027,7 +905,8 @@ impl RemoteTimelineClient {
// when executed as part of tenant deletion this happens in the background
2,
"persist_index_part_with_deleted_flag",
backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Cancelled")),
// TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
)
.await?;
@@ -1071,9 +950,8 @@ impl RemoteTimelineClient {
.drain()
.map(|(file_name, meta)| {
remote_layer_path(
&self.tenant_shard_id.tenant_id,
&self.tenant_id,
&self.timeline_id,
meta.shard,
&file_name,
meta.generation,
)
@@ -1086,7 +964,7 @@ impl RemoteTimelineClient {
// Do not delete index part yet, it is needed for possible retry. If we remove it first
// and retry will arrive to different pageserver there wont be any traces of it on remote storage
let timeline_storage_path = remote_timeline_path(&self.tenant_shard_id, &self.timeline_id);
let timeline_storage_path = remote_timeline_path(&self.tenant_id, &self.timeline_id);
// Execute all pending deletions, so that when we proceed to do a list_prefixes below, we aren't
// taking the burden of listing all the layers that we already know we should delete.
@@ -1122,22 +1000,12 @@ impl RemoteTimelineClient {
.unwrap_or(
// No generation-suffixed indices, assume we are dealing with
// a legacy index.
remote_index_path(&self.tenant_shard_id, &self.timeline_id, Generation::none()),
remote_index_path(&self.tenant_id, &self.timeline_id, Generation::none()),
);
let remaining_layers: Vec<RemotePath> = remaining
.into_iter()
.filter(|p| {
if p == &latest_index {
return false;
}
if let Some(name) = p.object_name() {
if name == INITDB_PATH {
return false;
}
}
true
})
.filter(|p| p!= &latest_index)
.inspect(|path| {
if let Some(name) = path.object_name() {
info!(%name, "deleting a file not referenced from index_part.json");
@@ -1203,9 +1071,7 @@ impl RemoteTimelineClient {
upload_queue.num_inprogress_deletions == upload_queue.inprogress_tasks.len()
}
UploadOp::Barrier(_) | UploadOp::Shutdown => {
upload_queue.inprogress_tasks.is_empty()
}
UploadOp::Barrier(_) => upload_queue.inprogress_tasks.is_empty(),
};
// If we cannot launch this task, don't look any further.
@@ -1218,13 +1084,6 @@ impl RemoteTimelineClient {
break;
}
if let UploadOp::Shutdown = next_op {
// leave the op in the queue but do not start more tasks; it will be dropped when
// the stop is called.
upload_queue.shutdown_ready.close();
break;
}
// We can launch this task. Remove it from the queue first.
let next_op = upload_queue.queued_operations.pop_front().unwrap();
@@ -1245,7 +1104,6 @@ impl RemoteTimelineClient {
sender.send_replace(());
continue;
}
UploadOp::Shutdown => unreachable!("shutdown is intentionally never popped off"),
};
// Assign unique ID to this task
@@ -1264,12 +1122,12 @@ impl RemoteTimelineClient {
// Spawn task to perform the task
let self_rc = Arc::clone(self);
let tenant_shard_id = self.tenant_shard_id;
let tenant_id = self.tenant_id;
let timeline_id = self.timeline_id;
task_mgr::spawn(
&self.runtime,
TaskKind::RemoteUploadTask,
Some(self.tenant_shard_id),
Some(self.tenant_id),
Some(self.timeline_id),
"remote upload",
false,
@@ -1277,7 +1135,7 @@ impl RemoteTimelineClient {
self_rc.perform_upload_task(task).await;
Ok(())
}
.instrument(info_span!(parent: None, "remote_upload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id, %upload_task_id)),
.instrument(info_span!(parent: None, "remote_upload", %tenant_id, %timeline_id, %upload_task_id)),
);
// Loop back to process next task
@@ -1327,10 +1185,9 @@ impl RemoteTimelineClient {
path,
layer_metadata,
self.generation,
&self.cancel,
)
.measure_remote_op(
self.tenant_shard_id.tenant_id,
self.tenant_id,
self.timeline_id,
RemoteOpFileKind::Layer,
RemoteOpKind::Upload,
@@ -1350,14 +1207,13 @@ impl RemoteTimelineClient {
let res = upload::upload_index_part(
&self.storage_impl,
&self.tenant_shard_id,
&self.tenant_id,
&self.timeline_id,
self.generation,
index_part,
&self.cancel,
)
.measure_remote_op(
self.tenant_shard_id.tenant_id,
self.tenant_id,
self.timeline_id,
RemoteOpFileKind::Index,
RemoteOpKind::Upload,
@@ -1373,22 +1229,20 @@ impl RemoteTimelineClient {
}
res
}
UploadOp::Delete(delete) => {
pausable_failpoint!("before-delete-layer-pausable");
self.deletion_queue_client
.push_layers(
self.tenant_shard_id,
self.timeline_id,
self.generation,
delete.layers.clone(),
)
.await
.map_err(|e| anyhow::anyhow!(e))
}
unexpected @ UploadOp::Barrier(_) | unexpected @ UploadOp::Shutdown => {
UploadOp::Delete(delete) => self
.deletion_queue_client
.push_layers(
self.tenant_id,
self.timeline_id,
self.generation,
delete.layers.clone(),
)
.await
.map_err(|e| anyhow::anyhow!(e)),
UploadOp::Barrier(_) => {
// unreachable. Barrier operations are handled synchronously in
// launch_queued_tasks
warn!("unexpected {unexpected:?} operation in perform_upload_task");
warn!("unexpected Barrier operation in perform_upload_task");
break;
}
};
@@ -1482,7 +1336,7 @@ impl RemoteTimelineClient {
upload_queue.num_inprogress_deletions -= 1;
None
}
UploadOp::Barrier(..) | UploadOp::Shutdown => unreachable!(),
UploadOp::Barrier(_) => unreachable!(),
};
// Launch any queued tasks that were unblocked by this one.
@@ -1496,7 +1350,7 @@ impl RemoteTimelineClient {
// data safety guarantees (see docs/rfcs/025-generation-numbers.md)
self.deletion_queue_client
.update_remote_consistent_lsn(
self.tenant_shard_id,
self.tenant_id,
self.timeline_id,
self.generation,
lsn,
@@ -1537,7 +1391,7 @@ impl RemoteTimelineClient {
reason: "should we track deletes? positive or negative sign?",
},
),
UploadOp::Barrier(..) | UploadOp::Shutdown => {
UploadOp::Barrier(_) => {
// we do not account these
return None;
}
@@ -1563,13 +1417,10 @@ impl RemoteTimelineClient {
}
/// Close the upload queue for new operations and cancel queued operations.
///
/// Use [`RemoteTimelineClient::shutdown`] for graceful stop.
///
/// In-progress operations will still be running after this function returns.
/// Use `task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id))`
/// to wait for them to complete, after calling this function.
pub(crate) fn stop(&self) -> Result<(), StopError> {
pub fn stop(&self) -> Result<(), StopError> {
// Whichever *task* for this RemoteTimelineClient grabs the mutex first will transition the queue
// into stopped state, thereby dropping all off the queued *ops* which haven't become *tasks* yet.
// The other *tasks* will come here and observe an already shut down queue and hence simply wrap up their business.
@@ -1607,8 +1458,6 @@ impl RemoteTimelineClient {
queued_operations: VecDeque::default(),
#[cfg(feature = "testing")]
dangling_files: HashMap::default(),
shutting_down: false,
shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
};
let upload_queue = std::mem::replace(
@@ -1652,51 +1501,26 @@ impl RemoteTimelineClient {
}
}
}
pub(crate) fn get_layers_metadata(
&self,
layers: Vec<LayerFileName>,
) -> anyhow::Result<Vec<Option<LayerFileMetadata>>> {
let q = self.upload_queue.lock().unwrap();
let q = match &*q {
UploadQueue::Stopped(_) | UploadQueue::Uninitialized => {
anyhow::bail!("queue is in state {}", q.as_str())
}
UploadQueue::Initialized(inner) => inner,
};
let decorated = layers.into_iter().map(|l| q.latest_files.get(&l).cloned());
Ok(decorated.collect())
}
}
pub fn remote_timelines_path(tenant_shard_id: &TenantShardId) -> RemotePath {
let path = format!("tenants/{tenant_shard_id}/{TIMELINES_SEGMENT_NAME}");
pub fn remote_timelines_path(tenant_id: &TenantId) -> RemotePath {
let path = format!("tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}");
RemotePath::from_string(&path).expect("Failed to construct path")
}
pub fn remote_timeline_path(
tenant_shard_id: &TenantShardId,
timeline_id: &TimelineId,
) -> RemotePath {
remote_timelines_path(tenant_shard_id).join(Utf8Path::new(&timeline_id.to_string()))
pub fn remote_timeline_path(tenant_id: &TenantId, timeline_id: &TimelineId) -> RemotePath {
remote_timelines_path(tenant_id).join(Utf8Path::new(&timeline_id.to_string()))
}
/// Note that the shard component of a remote layer path is _not_ always the same
/// as in the TenantShardId of the caller: tenants may reference layers from a different
/// ShardIndex. Use the ShardIndex from the layer's metadata.
pub fn remote_layer_path(
tenant_id: &TenantId,
timeline_id: &TimelineId,
shard: ShardIndex,
layer_file_name: &LayerFileName,
generation: Generation,
) -> RemotePath {
// Generation-aware key format
let path = format!(
"tenants/{tenant_id}{0}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{1}{2}",
shard.get_suffix(),
"tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{0}{1}",
layer_file_name.file_name(),
generation.get_suffix()
);
@@ -1704,33 +1528,19 @@ pub fn remote_layer_path(
RemotePath::from_string(&path).expect("Failed to construct path")
}
pub fn remote_initdb_archive_path(tenant_id: &TenantId, timeline_id: &TimelineId) -> RemotePath {
RemotePath::from_string(&format!(
"tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{INITDB_PATH}"
))
.expect("Failed to construct path")
}
pub fn remote_index_path(
tenant_shard_id: &TenantShardId,
tenant_id: &TenantId,
timeline_id: &TimelineId,
generation: Generation,
) -> RemotePath {
RemotePath::from_string(&format!(
"tenants/{tenant_shard_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{0}{1}",
"tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{0}{1}",
IndexPart::FILE_NAME,
generation.get_suffix()
))
.expect("Failed to construct path")
}
pub const HEATMAP_BASENAME: &str = "heatmap-v1.json";
pub(crate) fn remote_heatmap_path(tenant_shard_id: &TenantShardId) -> RemotePath {
RemotePath::from_string(&format!("tenants/{tenant_shard_id}/{HEATMAP_BASENAME}"))
.expect("Failed to construct path")
}
/// Given the key of an index, parse out the generation part of the name
pub fn parse_remote_index_path(path: RemotePath) -> Option<Generation> {
let file_name = match path.get_path().file_name() {
@@ -1866,17 +1676,16 @@ mod tests {
Arc::new(RemoteTimelineClient {
conf: self.harness.conf,
runtime: tokio::runtime::Handle::current(),
tenant_shard_id: self.harness.tenant_shard_id,
tenant_id: self.harness.tenant_id,
timeline_id: TIMELINE_ID,
generation,
storage_impl: self.harness.remote_storage.clone(),
deletion_queue_client: self.harness.deletion_queue.new_client(),
upload_queue: Mutex::new(UploadQueue::Uninitialized),
metrics: Arc::new(RemoteTimelineClientMetrics::new(
&self.harness.tenant_shard_id,
&self.harness.tenant_id,
&TIMELINE_ID,
)),
cancel: CancellationToken::new(),
})
}
@@ -1950,7 +1759,6 @@ mod tests {
println!("remote_timeline_dir: {remote_timeline_dir}");
let generation = harness.generation;
let shard = harness.shard;
// Create a couple of dummy files, schedule upload for them
@@ -1967,7 +1775,7 @@ mod tests {
harness.conf,
&timeline,
name,
LayerFileMetadata::new(contents.len() as u64, generation, shard),
LayerFileMetadata::new(contents.len() as u64, generation),
)
}).collect::<Vec<_>>();
@@ -2116,7 +1924,7 @@ mod tests {
harness.conf,
&timeline,
layer_file_name_1.clone(),
LayerFileMetadata::new(content_1.len() as u64, harness.generation, harness.shard),
LayerFileMetadata::new(content_1.len() as u64, harness.generation),
);
#[derive(Debug, PartialEq, Clone, Copy)]
@@ -2202,12 +2010,7 @@ mod tests {
std::fs::create_dir_all(remote_timeline_dir).expect("creating test dir should work");
let index_path = test_state.harness.remote_fs_dir.join(
remote_index_path(
&test_state.harness.tenant_shard_id,
&TIMELINE_ID,
generation,
)
.get_path(),
remote_index_path(&test_state.harness.tenant_id, &TIMELINE_ID, generation).get_path(),
);
eprintln!("Writing {index_path}");
std::fs::write(&index_path, index_part_bytes).unwrap();

Some files were not shown because too many files have changed in this diff Show More