mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-20 06:30:43 +00:00
Compare commits
9 Commits
skyzh/uplo
...
ci-run/pr-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
7cdb292b37 | ||
|
|
a55e0192dc | ||
|
|
fa3ceab30e | ||
|
|
7710c18761 | ||
|
|
1e14d784f4 | ||
|
|
2574bbe072 | ||
|
|
d74c715602 | ||
|
|
69cfd1f7e0 | ||
|
|
5dea3e2195 |
1
.github/actionlint.yml
vendored
1
.github/actionlint.yml
vendored
@@ -6,7 +6,6 @@ self-hosted-runner:
|
||||
- small
|
||||
- small-metal
|
||||
- small-arm64
|
||||
- unit-perf
|
||||
- us-east-2
|
||||
config-variables:
|
||||
- AWS_ECR_REGION
|
||||
|
||||
@@ -70,7 +70,6 @@ runs:
|
||||
|
||||
- name: Install Allure
|
||||
shell: bash -euxo pipefail {0}
|
||||
working-directory: /tmp
|
||||
run: |
|
||||
if ! which allure; then
|
||||
ALLURE_ZIP=allure-${ALLURE_VERSION}.zip
|
||||
|
||||
@@ -113,6 +113,8 @@ runs:
|
||||
TEST_OUTPUT: /tmp/test_output
|
||||
BUILD_TYPE: ${{ inputs.build_type }}
|
||||
COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg${{ inputs.pg_version }}
|
||||
ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'backward compatibility breakage')
|
||||
ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage')
|
||||
RERUN_FAILED: ${{ inputs.rerun_failed }}
|
||||
PG_VERSION: ${{ inputs.pg_version }}
|
||||
SANITIZERS: ${{ inputs.sanitizers }}
|
||||
|
||||
12
.github/workflows/_build-and-test-locally.yml
vendored
12
.github/workflows/_build-and-test-locally.yml
vendored
@@ -272,13 +272,10 @@ jobs:
|
||||
# run pageserver tests with different settings
|
||||
for get_vectored_concurrent_io in sequential sidecar-task; do
|
||||
for io_engine in std-fs tokio-epoll-uring ; do
|
||||
for io_mode in buffered direct direct-rw ; do
|
||||
NEON_PAGESERVER_UNIT_TEST_GET_VECTORED_CONCURRENT_IO=$get_vectored_concurrent_io \
|
||||
NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine \
|
||||
NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOMODE=$io_mode \
|
||||
${cov_prefix} \
|
||||
cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(pageserver)'
|
||||
done
|
||||
NEON_PAGESERVER_UNIT_TEST_GET_VECTORED_CONCURRENT_IO=$get_vectored_concurrent_io \
|
||||
NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine \
|
||||
${cov_prefix} \
|
||||
cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(pageserver)'
|
||||
done
|
||||
done
|
||||
|
||||
@@ -395,7 +392,6 @@ jobs:
|
||||
BUILD_TAG: ${{ inputs.build-tag }}
|
||||
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
|
||||
PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task
|
||||
PAGESERVER_VIRTUAL_FILE_IO_MODE: direct
|
||||
USE_LFC: ${{ matrix.lfc_state == 'with-lfc' && 'true' || 'false' }}
|
||||
|
||||
# Temporarily disable this step until we figure out why it's so flaky
|
||||
|
||||
11
.github/workflows/_create-release-pr.yml
vendored
11
.github/workflows/_create-release-pr.yml
vendored
@@ -53,13 +53,10 @@ jobs:
|
||||
|| inputs.component-name == 'Compute' && 'release-compute'
|
||||
}}
|
||||
run: |
|
||||
now_date=$(date -u +'%Y-%m-%d')
|
||||
now_time=$(date -u +'%H-%M-%Z')
|
||||
{
|
||||
echo "title=${COMPONENT_NAME} release ${now_date}"
|
||||
echo "rc-branch=rc/${RELEASE_BRANCH}/${now_date}_${now_time}"
|
||||
echo "release-branch=${RELEASE_BRANCH}"
|
||||
} | tee -a ${GITHUB_OUTPUT}
|
||||
today=$(date +'%Y-%m-%d')
|
||||
echo "title=${COMPONENT_NAME} release ${today}" | tee -a ${GITHUB_OUTPUT}
|
||||
echo "rc-branch=rc/${RELEASE_BRANCH}/${today}" | tee -a ${GITHUB_OUTPUT}
|
||||
echo "release-branch=${RELEASE_BRANCH}" | tee -a ${GITHUB_OUTPUT}
|
||||
|
||||
- name: Configure git
|
||||
run: |
|
||||
|
||||
6
.github/workflows/build_and_test.yml
vendored
6
.github/workflows/build_and_test.yml
vendored
@@ -284,7 +284,7 @@ jobs:
|
||||
statuses: write
|
||||
contents: write
|
||||
pull-requests: write
|
||||
runs-on: [ self-hosted, unit-perf ]
|
||||
runs-on: [ self-hosted, small-metal ]
|
||||
container:
|
||||
image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
|
||||
credentials:
|
||||
@@ -323,8 +323,6 @@ jobs:
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
|
||||
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
|
||||
PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task
|
||||
PAGESERVER_VIRTUAL_FILE_IO_MODE: direct
|
||||
SYNC_BETWEEN_TESTS: true
|
||||
# XXX: no coverage data handling here, since benchmarks are run on release builds,
|
||||
# while coverage is currently collected for the debug ones
|
||||
@@ -1273,7 +1271,7 @@ jobs:
|
||||
exit 1
|
||||
|
||||
deploy:
|
||||
needs: [ check-permissions, push-neon-image-dev, push-compute-image-dev, push-neon-image-prod, push-compute-image-prod, meta, trigger-custom-extensions-build-and-wait ]
|
||||
needs: [ check-permissions, push-neon-image-dev, push-compute-image-dev, push-neon-image-prod, push-compute-image-prod, meta, build-and-test-locally, trigger-custom-extensions-build-and-wait ]
|
||||
# `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-neon-image-prod` and `push-compute-image-prod`
|
||||
if: ${{ contains(fromJSON('["push-main", "storage-release", "proxy-release", "compute-release"]'), needs.meta.outputs.run-kind) && !failure() && !cancelled() }}
|
||||
permissions:
|
||||
|
||||
4
.github/workflows/pg-clients.yml
vendored
4
.github/workflows/pg-clients.yml
vendored
@@ -30,7 +30,7 @@ permissions:
|
||||
statuses: write # require for posting a status update
|
||||
|
||||
env:
|
||||
DEFAULT_PG_VERSION: 17
|
||||
DEFAULT_PG_VERSION: 16
|
||||
PLATFORM: neon-captest-new
|
||||
AWS_DEFAULT_REGION: eu-central-1
|
||||
|
||||
@@ -42,8 +42,6 @@ jobs:
|
||||
github-event-name: ${{ github.event_name }}
|
||||
|
||||
build-build-tools-image:
|
||||
permissions:
|
||||
packages: write
|
||||
needs: [ check-permissions ]
|
||||
uses: ./.github/workflows/build-build-tools-image.yml
|
||||
secrets: inherit
|
||||
|
||||
29
Cargo.lock
generated
29
Cargo.lock
generated
@@ -1416,7 +1416,6 @@ name = "control_plane"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"base64 0.13.1",
|
||||
"camino",
|
||||
"clap",
|
||||
"comfy-table",
|
||||
@@ -1426,12 +1425,10 @@ dependencies = [
|
||||
"humantime",
|
||||
"humantime-serde",
|
||||
"hyper 0.14.30",
|
||||
"jsonwebtoken",
|
||||
"nix 0.27.1",
|
||||
"once_cell",
|
||||
"pageserver_api",
|
||||
"pageserver_client",
|
||||
"pem",
|
||||
"postgres_backend",
|
||||
"postgres_connection",
|
||||
"regex",
|
||||
@@ -1440,8 +1437,6 @@ dependencies = [
|
||||
"scopeguard",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"sha2",
|
||||
"spki 0.7.3",
|
||||
"storage_broker",
|
||||
"thiserror 1.0.69",
|
||||
"tokio",
|
||||
@@ -2822,7 +2817,6 @@ dependencies = [
|
||||
"hyper 0.14.30",
|
||||
"itertools 0.10.5",
|
||||
"jemalloc_pprof",
|
||||
"jsonwebtoken",
|
||||
"metrics",
|
||||
"once_cell",
|
||||
"pprof",
|
||||
@@ -2843,7 +2837,6 @@ dependencies = [
|
||||
"utils",
|
||||
"uuid",
|
||||
"workspace_hack",
|
||||
"x509-cert",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4251,7 +4244,6 @@ dependencies = [
|
||||
"arc-swap",
|
||||
"async-compression",
|
||||
"async-stream",
|
||||
"base64 0.13.1",
|
||||
"bincode",
|
||||
"bit_field",
|
||||
"byteorder",
|
||||
@@ -4276,7 +4268,6 @@ dependencies = [
|
||||
"hyper 0.14.30",
|
||||
"indoc",
|
||||
"itertools 0.10.5",
|
||||
"jsonwebtoken",
|
||||
"md5",
|
||||
"metrics",
|
||||
"nix 0.27.1",
|
||||
@@ -4299,7 +4290,6 @@ dependencies = [
|
||||
"rand 0.8.5",
|
||||
"range-set-blaze",
|
||||
"regex",
|
||||
"remote_keys",
|
||||
"remote_storage",
|
||||
"reqwest",
|
||||
"rpds",
|
||||
@@ -4354,7 +4344,6 @@ dependencies = [
|
||||
"humantime-serde",
|
||||
"itertools 0.10.5",
|
||||
"nix 0.27.1",
|
||||
"once_cell",
|
||||
"postgres_backend",
|
||||
"postgres_ffi",
|
||||
"rand 0.8.5",
|
||||
@@ -5505,16 +5494,6 @@ version = "1.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c707298afce11da2efef2f600116fa93ffa7a032b5d7b628aa17711ec81383ca"
|
||||
|
||||
[[package]]
|
||||
name = "remote_keys"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"rand 0.8.5",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "remote_storage"
|
||||
version = "0.1.0"
|
||||
@@ -5530,7 +5509,6 @@ dependencies = [
|
||||
"azure_identity",
|
||||
"azure_storage",
|
||||
"azure_storage_blobs",
|
||||
"base64 0.13.1",
|
||||
"bytes",
|
||||
"camino",
|
||||
"camino-tempfile",
|
||||
@@ -5541,7 +5519,6 @@ dependencies = [
|
||||
"humantime-serde",
|
||||
"hyper 1.4.1",
|
||||
"itertools 0.10.5",
|
||||
"md5",
|
||||
"metrics",
|
||||
"once_cell",
|
||||
"pin-project-lite",
|
||||
@@ -5707,9 +5684,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "ring"
|
||||
version = "0.17.14"
|
||||
version = "0.17.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7"
|
||||
checksum = "70ac5d832aa16abd7d1def883a8545280c20a60f523a370aa3a9617c2b8550ee"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"cfg-if",
|
||||
@@ -6010,7 +5987,6 @@ dependencies = [
|
||||
"humantime",
|
||||
"hyper 0.14.30",
|
||||
"itertools 0.10.5",
|
||||
"jsonwebtoken",
|
||||
"metrics",
|
||||
"once_cell",
|
||||
"pageserver_api",
|
||||
@@ -7895,7 +7871,6 @@ dependencies = [
|
||||
"metrics",
|
||||
"nix 0.27.1",
|
||||
"once_cell",
|
||||
"pem",
|
||||
"pin-project-lite",
|
||||
"postgres_connection",
|
||||
"pprof",
|
||||
|
||||
@@ -30,7 +30,6 @@ members = [
|
||||
"libs/tenant_size_model",
|
||||
"libs/metrics",
|
||||
"libs/postgres_connection",
|
||||
"libs/remote_keys",
|
||||
"libs/remote_storage",
|
||||
"libs/tracing-utils",
|
||||
"libs/postgres_ffi/wal_craft",
|
||||
@@ -142,7 +141,6 @@ parking_lot = "0.12"
|
||||
parquet = { version = "53", default-features = false, features = ["zstd"] }
|
||||
parquet_derive = "53"
|
||||
pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
|
||||
pem = "3.0.3"
|
||||
pin-project-lite = "0.2"
|
||||
pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointer", "prost-codec"] }
|
||||
procfs = "0.16"
|
||||
@@ -176,7 +174,6 @@ signal-hook = "0.3"
|
||||
smallvec = "1.11"
|
||||
smol_str = { version = "0.2.0", features = ["serde"] }
|
||||
socket2 = "0.5"
|
||||
spki = "0.7.3"
|
||||
strum = "0.26"
|
||||
strum_macros = "0.26"
|
||||
"subtle" = "2.5.0"
|
||||
@@ -256,7 +253,6 @@ postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" }
|
||||
postgres_initdb = { path = "./libs/postgres_initdb" }
|
||||
pq_proto = { version = "0.1", path = "./libs/pq_proto/" }
|
||||
remote_storage = { version = "0.1", path = "./libs/remote_storage/" }
|
||||
remote_keys = { version = "0.1", path = "./libs/remote_keys/" }
|
||||
safekeeper_api = { version = "0.1", path = "./libs/safekeeper_api" }
|
||||
safekeeper_client = { path = "./safekeeper/client" }
|
||||
desim = { version = "0.1", path = "./libs/desim" }
|
||||
|
||||
@@ -270,7 +270,7 @@ By default, this runs both debug and release modes, and all supported postgres v
|
||||
testing locally, it is convenient to run just one set of permutations, like this:
|
||||
|
||||
```sh
|
||||
DEFAULT_PG_VERSION=17 BUILD_TYPE=release ./scripts/pytest
|
||||
DEFAULT_PG_VERSION=16 BUILD_TYPE=release ./scripts/pytest
|
||||
```
|
||||
|
||||
## Flamegraphs
|
||||
|
||||
@@ -12,5 +12,3 @@ disallowed-macros = [
|
||||
# cannot disallow this, because clippy finds used from tokio macros
|
||||
#"tokio::pin",
|
||||
]
|
||||
|
||||
allow-unwrap-in-tests = true
|
||||
|
||||
@@ -15,7 +15,7 @@ index 7a4b88c..56678af 100644
|
||||
HEADERS = src/halfvec.h src/sparsevec.h src/vector.h
|
||||
|
||||
diff --git a/src/hnswbuild.c b/src/hnswbuild.c
|
||||
index b667478..1298aa1 100644
|
||||
index b667478..dc95d89 100644
|
||||
--- a/src/hnswbuild.c
|
||||
+++ b/src/hnswbuild.c
|
||||
@@ -843,9 +843,17 @@ HnswParallelBuildMain(dsm_segment *seg, shm_toc *toc)
|
||||
@@ -36,7 +36,7 @@ index b667478..1298aa1 100644
|
||||
/* Close relations within worker */
|
||||
index_close(indexRel, indexLockmode);
|
||||
table_close(heapRel, heapLockmode);
|
||||
@@ -1100,13 +1108,25 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo,
|
||||
@@ -1100,12 +1108,39 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo,
|
||||
SeedRandom(42);
|
||||
#endif
|
||||
|
||||
@@ -48,17 +48,32 @@ index b667478..1298aa1 100644
|
||||
|
||||
BuildGraph(buildstate, forkNum);
|
||||
|
||||
- if (RelationNeedsWAL(index) || forkNum == INIT_FORKNUM)
|
||||
+#ifdef NEON_SMGR
|
||||
+ smgr_finish_unlogged_build_phase_1(RelationGetSmgr(index));
|
||||
+#endif
|
||||
+
|
||||
if (RelationNeedsWAL(index) || forkNum == INIT_FORKNUM)
|
||||
+ if (RelationNeedsWAL(index) || forkNum == INIT_FORKNUM) {
|
||||
log_newpage_range(index, forkNum, 0, RelationGetNumberOfBlocksInFork(index, forkNum), true);
|
||||
|
||||
+#ifdef NEON_SMGR
|
||||
+ {
|
||||
+#if PG_VERSION_NUM >= 160000
|
||||
+ RelFileLocator rlocator = RelationGetSmgr(index)->smgr_rlocator.locator;
|
||||
+#else
|
||||
+ RelFileNode rlocator = RelationGetSmgr(index)->smgr_rnode.node;
|
||||
+#endif
|
||||
+ if (set_lwlsn_block_range_hook)
|
||||
+ set_lwlsn_block_range_hook(XactLastRecEnd, rlocator,
|
||||
+ MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index));
|
||||
+ if (set_lwlsn_relation_hook)
|
||||
+ set_lwlsn_relation_hook(XactLastRecEnd, rlocator, MAIN_FORKNUM);
|
||||
+ }
|
||||
+#endif
|
||||
+ }
|
||||
+
|
||||
+#ifdef NEON_SMGR
|
||||
+ smgr_end_unlogged_build(RelationGetSmgr(index));
|
||||
+#endif
|
||||
+
|
||||
|
||||
FreeBuildState(buildstate);
|
||||
}
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
diff --git a/src/ruminsert.c b/src/ruminsert.c
|
||||
index 255e616..1c6edb7 100644
|
||||
index 255e616..7a2240f 100644
|
||||
--- a/src/ruminsert.c
|
||||
+++ b/src/ruminsert.c
|
||||
@@ -628,6 +628,10 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo)
|
||||
@@ -24,12 +24,24 @@ index 255e616..1c6edb7 100644
|
||||
/*
|
||||
* Write index to xlog
|
||||
*/
|
||||
@@ -713,6 +721,10 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo)
|
||||
@@ -713,6 +721,22 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo)
|
||||
UnlockReleaseBuffer(buffer);
|
||||
}
|
||||
|
||||
+#ifdef NEON_SMGR
|
||||
+ smgr_end_unlogged_build(index->rd_smgr);
|
||||
+ {
|
||||
+#if PG_VERSION_NUM >= 160000
|
||||
+ RelFileLocator rlocator = RelationGetSmgr(index)->smgr_rlocator.locator;
|
||||
+#else
|
||||
+ RelFileNode rlocator = RelationGetSmgr(index)->smgr_rnode.node;
|
||||
+#endif
|
||||
+ if (set_lwlsn_block_range_hook)
|
||||
+ set_lwlsn_block_range_hook(XactLastRecEnd, rlocator, MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index));
|
||||
+ if (set_lwlsn_relation_hook)
|
||||
+ set_lwlsn_relation_hook(XactLastRecEnd, rlocator, MAIN_FORKNUM);
|
||||
+
|
||||
+ smgr_end_unlogged_build(index->rd_smgr);
|
||||
+ }
|
||||
+#endif
|
||||
+
|
||||
/*
|
||||
|
||||
@@ -22,7 +22,7 @@ commands:
|
||||
- name: local_proxy
|
||||
user: postgres
|
||||
sysvInitAction: respawn
|
||||
shell: 'RUST_LOG="info,proxy::serverless::sql_over_http=warn" /usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432'
|
||||
shell: '/usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432'
|
||||
- name: postgres-exporter
|
||||
user: nobody
|
||||
sysvInitAction: respawn
|
||||
|
||||
@@ -22,7 +22,7 @@ commands:
|
||||
- name: local_proxy
|
||||
user: postgres
|
||||
sysvInitAction: respawn
|
||||
shell: 'RUST_LOG="info,proxy::serverless::sql_over_http=warn" /usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432'
|
||||
shell: '/usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432'
|
||||
- name: postgres-exporter
|
||||
user: nobody
|
||||
sysvInitAction: respawn
|
||||
|
||||
@@ -29,12 +29,13 @@
|
||||
//! ```sh
|
||||
//! compute_ctl -D /var/db/postgres/compute \
|
||||
//! -C 'postgresql://cloud_admin@localhost/postgres' \
|
||||
//! -c /var/db/postgres/configs/config.json \
|
||||
//! -S /var/db/postgres/specs/current.json \
|
||||
//! -b /usr/local/bin/postgres \
|
||||
//! -r http://pg-ext-s3-gateway \
|
||||
//! ```
|
||||
use std::ffi::OsString;
|
||||
use std::fs::File;
|
||||
use std::path::Path;
|
||||
use std::process::exit;
|
||||
use std::sync::mpsc;
|
||||
use std::thread;
|
||||
@@ -42,7 +43,8 @@ use std::time::Duration;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use clap::Parser;
|
||||
use compute_api::responses::ComputeConfig;
|
||||
use compute_api::responses::ComputeCtlConfig;
|
||||
use compute_api::spec::ComputeSpec;
|
||||
use compute_tools::compute::{
|
||||
BUILD_TAG, ComputeNode, ComputeNodeParams, forward_termination_signal,
|
||||
};
|
||||
@@ -57,13 +59,24 @@ use tracing::{error, info};
|
||||
use url::Url;
|
||||
use utils::failpoint_support;
|
||||
|
||||
// Compatibility hack: if the control plane specified any remote-ext-config
|
||||
// use the default value for extension storage proxy gateway.
|
||||
// Remove this once the control plane is updated to pass the gateway URL
|
||||
fn parse_remote_ext_config(arg: &str) -> Result<String> {
|
||||
if arg.starts_with("http") {
|
||||
Ok(arg.trim_end_matches('/').to_string())
|
||||
} else {
|
||||
Ok("http://pg-ext-s3-gateway".to_string())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Parser)]
|
||||
#[command(rename_all = "kebab-case")]
|
||||
struct Cli {
|
||||
#[arg(short = 'b', long, default_value = "postgres", env = "POSTGRES_PATH")]
|
||||
pub pgbin: String,
|
||||
|
||||
#[arg(short = 'r', long)]
|
||||
#[arg(short = 'r', long, value_parser = parse_remote_ext_config)]
|
||||
pub remote_ext_config: Option<String>,
|
||||
|
||||
/// The port to bind the external listening HTTP server to. Clients running
|
||||
@@ -105,8 +118,8 @@ struct Cli {
|
||||
#[arg(long)]
|
||||
pub set_disk_quota_for_fs: Option<String>,
|
||||
|
||||
#[arg(short = 'c', long)]
|
||||
pub config: Option<OsString>,
|
||||
#[arg(short = 'S', long, group = "spec-path")]
|
||||
pub spec_path: Option<OsString>,
|
||||
|
||||
#[arg(short = 'i', long, group = "compute-id")]
|
||||
pub compute_id: String,
|
||||
@@ -114,9 +127,8 @@ struct Cli {
|
||||
#[arg(
|
||||
short = 'p',
|
||||
long,
|
||||
conflicts_with = "config",
|
||||
value_name = "CONTROL_PLANE_API_BASE_URL",
|
||||
requires = "compute-id"
|
||||
conflicts_with = "spec-path",
|
||||
value_name = "CONTROL_PLANE_API_BASE_URL"
|
||||
)]
|
||||
pub control_plane_uri: Option<String>,
|
||||
}
|
||||
@@ -126,7 +138,7 @@ fn main() -> Result<()> {
|
||||
|
||||
let scenario = failpoint_support::init();
|
||||
|
||||
// For historical reasons, the main thread that processes the config and launches postgres
|
||||
// For historical reasons, the main thread that processes the spec and launches postgres
|
||||
// is synchronous, but we always have this tokio runtime available and we "enter" it so
|
||||
// that you can use tokio::spawn() and tokio::runtime::Handle::current().block_on(...)
|
||||
// from all parts of compute_ctl.
|
||||
@@ -142,7 +154,7 @@ fn main() -> Result<()> {
|
||||
|
||||
let connstr = Url::parse(&cli.connstr).context("cannot parse connstr as a URL")?;
|
||||
|
||||
let config = get_config(&cli)?;
|
||||
let cli_spec = try_spec_from_cli(&cli)?;
|
||||
|
||||
let compute_node = ComputeNode::new(
|
||||
ComputeNodeParams {
|
||||
@@ -163,7 +175,8 @@ fn main() -> Result<()> {
|
||||
#[cfg(target_os = "linux")]
|
||||
vm_monitor_addr: cli.vm_monitor_addr,
|
||||
},
|
||||
config,
|
||||
cli_spec.spec,
|
||||
cli_spec.compute_ctl_config,
|
||||
)?;
|
||||
|
||||
let exit_code = compute_node.run()?;
|
||||
@@ -188,17 +201,27 @@ async fn init() -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn get_config(cli: &Cli) -> Result<ComputeConfig> {
|
||||
// First, read the config from the path if provided
|
||||
if let Some(ref config) = cli.config {
|
||||
let file = File::open(config)?;
|
||||
return Ok(serde_json::from_reader(&file)?);
|
||||
fn try_spec_from_cli(cli: &Cli) -> Result<CliSpecParams> {
|
||||
// First, read spec from the path if provided
|
||||
if let Some(ref spec_path) = cli.spec_path {
|
||||
let file = File::open(Path::new(spec_path))?;
|
||||
return Ok(CliSpecParams {
|
||||
spec: Some(serde_json::from_reader(file)?),
|
||||
compute_ctl_config: ComputeCtlConfig::default(),
|
||||
});
|
||||
}
|
||||
|
||||
// If the config wasn't provided in the CLI arguments, then retrieve it from
|
||||
if cli.control_plane_uri.is_none() {
|
||||
panic!("must specify --control-plane-uri");
|
||||
};
|
||||
|
||||
// If the spec wasn't provided in the CLI arguments, then retrieve it from
|
||||
// the control plane
|
||||
match get_config_from_control_plane(cli.control_plane_uri.as_ref().unwrap(), &cli.compute_id) {
|
||||
Ok(config) => Ok(config),
|
||||
match get_spec_from_control_plane(cli.control_plane_uri.as_ref().unwrap(), &cli.compute_id) {
|
||||
Ok(resp) => Ok(CliSpecParams {
|
||||
spec: resp.0,
|
||||
compute_ctl_config: resp.1,
|
||||
}),
|
||||
Err(e) => {
|
||||
error!(
|
||||
"cannot get response from control plane: {}\n\
|
||||
@@ -210,6 +233,13 @@ fn get_config(cli: &Cli) -> Result<ComputeConfig> {
|
||||
}
|
||||
}
|
||||
|
||||
struct CliSpecParams {
|
||||
/// If a spec was provided via CLI or file, the [`ComputeSpec`]
|
||||
spec: Option<ComputeSpec>,
|
||||
#[allow(dead_code)]
|
||||
compute_ctl_config: ComputeCtlConfig,
|
||||
}
|
||||
|
||||
fn deinit_and_exit(exit_code: Option<i32>) -> ! {
|
||||
// Shutdown trace pipeline gracefully, so that it has a chance to send any
|
||||
// pending traces before we exit. Shutting down OTEL tracing provider may
|
||||
|
||||
@@ -11,7 +11,7 @@ use std::{env, fs};
|
||||
use anyhow::{Context, Result};
|
||||
use chrono::{DateTime, Utc};
|
||||
use compute_api::privilege::Privilege;
|
||||
use compute_api::responses::{ComputeConfig, ComputeCtlConfig, ComputeMetrics, ComputeStatus};
|
||||
use compute_api::responses::{ComputeCtlConfig, ComputeMetrics, ComputeStatus};
|
||||
use compute_api::spec::{
|
||||
ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PgIdent,
|
||||
};
|
||||
@@ -303,7 +303,11 @@ struct StartVmMonitorResult {
|
||||
}
|
||||
|
||||
impl ComputeNode {
|
||||
pub fn new(params: ComputeNodeParams, config: ComputeConfig) -> Result<Self> {
|
||||
pub fn new(
|
||||
params: ComputeNodeParams,
|
||||
cli_spec: Option<ComputeSpec>,
|
||||
compute_ctl_config: ComputeCtlConfig,
|
||||
) -> Result<Self> {
|
||||
let connstr = params.connstr.as_str();
|
||||
let conn_conf = postgres::config::Config::from_str(connstr)
|
||||
.context("cannot build postgres config from connstr")?;
|
||||
@@ -311,8 +315,8 @@ impl ComputeNode {
|
||||
.context("cannot build tokio postgres config from connstr")?;
|
||||
|
||||
let mut new_state = ComputeState::new();
|
||||
if let Some(spec) = config.spec {
|
||||
let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?;
|
||||
if let Some(cli_spec) = cli_spec {
|
||||
let pspec = ParsedSpec::try_from(cli_spec).map_err(|msg| anyhow::anyhow!(msg))?;
|
||||
new_state.pspec = Some(pspec);
|
||||
}
|
||||
|
||||
@@ -323,7 +327,7 @@ impl ComputeNode {
|
||||
state: Mutex::new(new_state),
|
||||
state_changed: Condvar::new(),
|
||||
ext_download_progress: RwLock::new(HashMap::new()),
|
||||
compute_ctl_config: config.compute_ctl_config,
|
||||
compute_ctl_config,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -519,14 +523,11 @@ impl ComputeNode {
|
||||
|
||||
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
|
||||
info!(
|
||||
"starting compute for project {}, operation {}, tenant {}, timeline {}, project {}, branch {}, endpoint {}, features {:?}, spec.remote_extensions {:?}",
|
||||
"starting compute for project {}, operation {}, tenant {}, timeline {}, features {:?}, spec.remote_extensions {:?}",
|
||||
pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None"),
|
||||
pspec.spec.operation_uuid.as_deref().unwrap_or("None"),
|
||||
pspec.tenant_id,
|
||||
pspec.timeline_id,
|
||||
pspec.spec.project_id.as_deref().unwrap_or("None"),
|
||||
pspec.spec.branch_id.as_deref().unwrap_or("None"),
|
||||
pspec.spec.endpoint_id.as_deref().unwrap_or("None"),
|
||||
pspec.spec.features,
|
||||
pspec.spec.remote_extensions,
|
||||
);
|
||||
@@ -630,23 +631,19 @@ impl ComputeNode {
|
||||
});
|
||||
}
|
||||
|
||||
// Configure and start rsyslog for compliance audit logging
|
||||
match pspec.spec.audit_log_level {
|
||||
ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => {
|
||||
let remote_endpoint =
|
||||
std::env::var("AUDIT_LOGGING_ENDPOINT").unwrap_or("".to_string());
|
||||
if remote_endpoint.is_empty() {
|
||||
anyhow::bail!("AUDIT_LOGGING_ENDPOINT is empty");
|
||||
}
|
||||
|
||||
let log_directory_path = Path::new(&self.params.pgdata).join("log");
|
||||
let log_directory_path = log_directory_path.to_string_lossy().to_string();
|
||||
configure_audit_rsyslog(log_directory_path.clone(), "hipaa", &remote_endpoint)?;
|
||||
|
||||
// Launch a background task to clean up the audit logs
|
||||
launch_pgaudit_gc(log_directory_path);
|
||||
// Configure and start rsyslog for HIPAA if necessary
|
||||
if let ComputeAudit::Hipaa = pspec.spec.audit_log_level {
|
||||
let remote_endpoint = std::env::var("AUDIT_LOGGING_ENDPOINT").unwrap_or("".to_string());
|
||||
if remote_endpoint.is_empty() {
|
||||
anyhow::bail!("AUDIT_LOGGING_ENDPOINT is empty");
|
||||
}
|
||||
_ => {}
|
||||
|
||||
let log_directory_path = Path::new(&self.params.pgdata).join("log");
|
||||
let log_directory_path = log_directory_path.to_string_lossy().to_string();
|
||||
configure_audit_rsyslog(log_directory_path.clone(), "hipaa", &remote_endpoint)?;
|
||||
|
||||
// Launch a background task to clean up the audit logs
|
||||
launch_pgaudit_gc(log_directory_path);
|
||||
}
|
||||
|
||||
// Configure and start rsyslog for Postgres logs export
|
||||
|
||||
@@ -89,15 +89,6 @@ pub fn write_postgres_conf(
|
||||
escape_conf_value(&s.to_string())
|
||||
)?;
|
||||
}
|
||||
if let Some(s) = &spec.project_id {
|
||||
writeln!(file, "neon.project_id={}", escape_conf_value(s))?;
|
||||
}
|
||||
if let Some(s) = &spec.branch_id {
|
||||
writeln!(file, "neon.branch_id={}", escape_conf_value(s))?;
|
||||
}
|
||||
if let Some(s) = &spec.endpoint_id {
|
||||
writeln!(file, "neon.endpoint_id={}", escape_conf_value(s))?;
|
||||
}
|
||||
|
||||
// tls
|
||||
if let Some(tls_config) = tls_config {
|
||||
@@ -178,7 +169,7 @@ pub fn write_postgres_conf(
|
||||
// and don't allow the user or the control plane admin to change them.
|
||||
match spec.audit_log_level {
|
||||
ComputeAudit::Disabled => {}
|
||||
ComputeAudit::Log | ComputeAudit::Base => {
|
||||
ComputeAudit::Log => {
|
||||
writeln!(file, "# Managed by compute_ctl base audit settings: start")?;
|
||||
writeln!(file, "pgaudit.log='ddl,role'")?;
|
||||
// Disable logging of catalog queries to reduce the noise
|
||||
@@ -202,20 +193,16 @@ pub fn write_postgres_conf(
|
||||
}
|
||||
writeln!(file, "# Managed by compute_ctl base audit settings: end")?;
|
||||
}
|
||||
ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => {
|
||||
ComputeAudit::Hipaa => {
|
||||
writeln!(
|
||||
file,
|
||||
"# Managed by compute_ctl compliance audit settings: begin"
|
||||
)?;
|
||||
// Enable logging of parameters.
|
||||
// This is very verbose and may contain sensitive data.
|
||||
if spec.audit_log_level == ComputeAudit::Full {
|
||||
writeln!(file, "pgaudit.log_parameter=on")?;
|
||||
writeln!(file, "pgaudit.log='all'")?;
|
||||
} else {
|
||||
writeln!(file, "pgaudit.log_parameter=off")?;
|
||||
writeln!(file, "pgaudit.log='all, -misc'")?;
|
||||
}
|
||||
// This log level is very verbose
|
||||
// but this is necessary for HIPAA compliance.
|
||||
// Exclude 'misc' category, because it doesn't contain anything relevant.
|
||||
writeln!(file, "pgaudit.log='all, -misc'")?;
|
||||
writeln!(file, "pgaudit.log_parameter=on")?;
|
||||
// Disable logging of catalog queries
|
||||
// The catalog doesn't contain sensitive data, so we don't need to audit it.
|
||||
writeln!(file, "pgaudit.log_catalog=off")?;
|
||||
|
||||
@@ -6,5 +6,4 @@ pub(crate) mod request_id;
|
||||
pub(crate) use json::Json;
|
||||
pub(crate) use path::Path;
|
||||
pub(crate) use query::Query;
|
||||
#[allow(unused)]
|
||||
pub(crate) use request_id::RequestId;
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use std::collections::HashSet;
|
||||
use std::{collections::HashSet, net::SocketAddr};
|
||||
|
||||
use anyhow::{Result, anyhow};
|
||||
use axum::{RequestExt, body::Body};
|
||||
use axum::{RequestExt, body::Body, extract::ConnectInfo};
|
||||
use axum_extra::{
|
||||
TypedHeader,
|
||||
headers::{Authorization, authorization::Bearer},
|
||||
@@ -11,9 +11,9 @@ use futures::future::BoxFuture;
|
||||
use http::{Request, Response, StatusCode};
|
||||
use jsonwebtoken::{Algorithm, DecodingKey, TokenData, Validation, jwk::JwkSet};
|
||||
use tower_http::auth::AsyncAuthorizeRequest;
|
||||
use tracing::{debug, warn};
|
||||
use tracing::warn;
|
||||
|
||||
use crate::http::JsonResponse;
|
||||
use crate::http::{JsonResponse, extract::RequestId};
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub(in crate::http) struct Authorize {
|
||||
@@ -52,6 +52,31 @@ impl AsyncAuthorizeRequest<Body> for Authorize {
|
||||
let validation = self.validation.clone();
|
||||
|
||||
Box::pin(async move {
|
||||
let request_id = request.extract_parts::<RequestId>().await.unwrap();
|
||||
|
||||
// TODO: Remove this stanza after teaching neon_local and the
|
||||
// regression tests to use a JWT + JWKS.
|
||||
//
|
||||
// https://github.com/neondatabase/neon/issues/11316
|
||||
if cfg!(feature = "testing") {
|
||||
warn!(%request_id, "Skipping compute_ctl authorization check");
|
||||
|
||||
return Ok(request);
|
||||
}
|
||||
|
||||
let connect_info = request
|
||||
.extract_parts::<ConnectInfo<SocketAddr>>()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// In the event the request is coming from the loopback interface,
|
||||
// allow all requests
|
||||
if connect_info.ip().is_loopback() {
|
||||
warn!(%request_id, "Bypassed authorization because request is coming from the loopback interface");
|
||||
|
||||
return Ok(request);
|
||||
}
|
||||
|
||||
let TypedHeader(Authorization(bearer)) = request
|
||||
.extract_parts::<TypedHeader<Authorization<Bearer>>>()
|
||||
.await
|
||||
@@ -67,7 +92,7 @@ impl AsyncAuthorizeRequest<Body> for Authorize {
|
||||
if data.claims.compute_id != compute_id {
|
||||
return Err(JsonResponse::error(
|
||||
StatusCode::UNAUTHORIZED,
|
||||
"invalid compute ID in authorization token claims",
|
||||
"invalid claims in authorization token",
|
||||
));
|
||||
}
|
||||
|
||||
@@ -87,16 +112,12 @@ impl Authorize {
|
||||
token: &str,
|
||||
validation: &Validation,
|
||||
) -> Result<TokenData<ComputeClaims>> {
|
||||
debug_assert!(!jwks.keys.is_empty());
|
||||
|
||||
debug!("verifying token {}", token);
|
||||
|
||||
for jwk in jwks.keys.iter() {
|
||||
let decoding_key = match DecodingKey::from_jwk(jwk) {
|
||||
Ok(key) => key,
|
||||
Err(e) => {
|
||||
warn!(
|
||||
"failed to construct decoding key from {}: {}",
|
||||
"Failed to construct decoding key from {}: {}",
|
||||
jwk.common.key_id.as_ref().unwrap(),
|
||||
e
|
||||
);
|
||||
@@ -109,7 +130,7 @@ impl Authorize {
|
||||
Ok(data) => return Ok(data),
|
||||
Err(e) => {
|
||||
warn!(
|
||||
"failed to decode authorization token using {}: {}",
|
||||
"Failed to decode authorization token using {}: {}",
|
||||
jwk.common.key_id.as_ref().unwrap(),
|
||||
e
|
||||
);
|
||||
@@ -119,6 +140,6 @@ impl Authorize {
|
||||
}
|
||||
}
|
||||
|
||||
Err(anyhow!("failed to verify authorization token"))
|
||||
Err(anyhow!("Failed to verify authorization token"))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -19,13 +19,13 @@ pub(crate) static INSTALLED_EXTENSIONS: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
// but for all our APIs we defined a 'slug'/method/operationId in the OpenAPI spec.
|
||||
// And it's fair to call it an 'RPC' (Remote Procedure Call).
|
||||
pub enum CPlaneRequestRPC {
|
||||
GetConfig,
|
||||
GetSpec,
|
||||
}
|
||||
|
||||
impl CPlaneRequestRPC {
|
||||
pub fn as_str(&self) -> &str {
|
||||
match self {
|
||||
CPlaneRequestRPC::GetConfig => "GetConfig",
|
||||
CPlaneRequestRPC::GetSpec => "GetSpec",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,8 +3,9 @@ use std::path::Path;
|
||||
|
||||
use anyhow::{Result, anyhow, bail};
|
||||
use compute_api::responses::{
|
||||
ComputeConfig, ControlPlaneComputeStatus, ControlPlaneConfigResponse,
|
||||
ComputeCtlConfig, ControlPlaneComputeStatus, ControlPlaneSpecResponse,
|
||||
};
|
||||
use compute_api::spec::ComputeSpec;
|
||||
use reqwest::StatusCode;
|
||||
use tokio_postgres::Client;
|
||||
use tracing::{error, info, instrument};
|
||||
@@ -20,7 +21,7 @@ use crate::params::PG_HBA_ALL_MD5;
|
||||
fn do_control_plane_request(
|
||||
uri: &str,
|
||||
jwt: &str,
|
||||
) -> Result<ControlPlaneConfigResponse, (bool, String, String)> {
|
||||
) -> Result<ControlPlaneSpecResponse, (bool, String, String)> {
|
||||
let resp = reqwest::blocking::Client::new()
|
||||
.get(uri)
|
||||
.header("Authorization", format!("Bearer {}", jwt))
|
||||
@@ -28,14 +29,14 @@ fn do_control_plane_request(
|
||||
.map_err(|e| {
|
||||
(
|
||||
true,
|
||||
format!("could not perform request to control plane: {:?}", e),
|
||||
format!("could not perform spec request to control plane: {:?}", e),
|
||||
UNKNOWN_HTTP_STATUS.to_string(),
|
||||
)
|
||||
})?;
|
||||
|
||||
let status = resp.status();
|
||||
match status {
|
||||
StatusCode::OK => match resp.json::<ControlPlaneConfigResponse>() {
|
||||
StatusCode::OK => match resp.json::<ControlPlaneSpecResponse>() {
|
||||
Ok(spec_resp) => Ok(spec_resp),
|
||||
Err(e) => Err((
|
||||
true,
|
||||
@@ -68,35 +69,40 @@ fn do_control_plane_request(
|
||||
}
|
||||
}
|
||||
|
||||
/// Request config from the control-plane by compute_id. If
|
||||
/// `NEON_CONTROL_PLANE_TOKEN` env variable is set, it will be used for
|
||||
/// authorization.
|
||||
pub fn get_config_from_control_plane(base_uri: &str, compute_id: &str) -> Result<ComputeConfig> {
|
||||
/// Request spec from the control-plane by compute_id. If `NEON_CONTROL_PLANE_TOKEN`
|
||||
/// env variable is set, it will be used for authorization.
|
||||
pub fn get_spec_from_control_plane(
|
||||
base_uri: &str,
|
||||
compute_id: &str,
|
||||
) -> Result<(Option<ComputeSpec>, ComputeCtlConfig)> {
|
||||
let cp_uri = format!("{base_uri}/compute/api/v2/computes/{compute_id}/spec");
|
||||
let jwt: String = std::env::var("NEON_CONTROL_PLANE_TOKEN").unwrap_or_default();
|
||||
let jwt: String = match std::env::var("NEON_CONTROL_PLANE_TOKEN") {
|
||||
Ok(v) => v,
|
||||
Err(_) => "".to_string(),
|
||||
};
|
||||
let mut attempt = 1;
|
||||
|
||||
info!("getting config from control plane: {}", cp_uri);
|
||||
info!("getting spec from control plane: {}", cp_uri);
|
||||
|
||||
// Do 3 attempts to get spec from the control plane using the following logic:
|
||||
// - network error -> then retry
|
||||
// - compute id is unknown or any other error -> bail out
|
||||
// - no spec for compute yet (Empty state) -> return Ok(None)
|
||||
// - got config -> return Ok(Some(config))
|
||||
// - got spec -> return Ok(Some(spec))
|
||||
while attempt < 4 {
|
||||
let result = match do_control_plane_request(&cp_uri, &jwt) {
|
||||
Ok(config_resp) => {
|
||||
Ok(spec_resp) => {
|
||||
CPLANE_REQUESTS_TOTAL
|
||||
.with_label_values(&[
|
||||
CPlaneRequestRPC::GetConfig.as_str(),
|
||||
CPlaneRequestRPC::GetSpec.as_str(),
|
||||
&StatusCode::OK.to_string(),
|
||||
])
|
||||
.inc();
|
||||
match config_resp.status {
|
||||
ControlPlaneComputeStatus::Empty => Ok(config_resp.into()),
|
||||
match spec_resp.status {
|
||||
ControlPlaneComputeStatus::Empty => Ok((None, spec_resp.compute_ctl_config)),
|
||||
ControlPlaneComputeStatus::Attached => {
|
||||
if config_resp.spec.is_some() {
|
||||
Ok(config_resp.into())
|
||||
if let Some(spec) = spec_resp.spec {
|
||||
Ok((Some(spec), spec_resp.compute_ctl_config))
|
||||
} else {
|
||||
bail!("compute is attached, but spec is empty")
|
||||
}
|
||||
@@ -105,7 +111,7 @@ pub fn get_config_from_control_plane(base_uri: &str, compute_id: &str) -> Result
|
||||
}
|
||||
Err((retry, msg, status)) => {
|
||||
CPLANE_REQUESTS_TOTAL
|
||||
.with_label_values(&[CPlaneRequestRPC::GetConfig.as_str(), &status])
|
||||
.with_label_values(&[CPlaneRequestRPC::GetSpec.as_str(), &status])
|
||||
.inc();
|
||||
if retry {
|
||||
Err(anyhow!(msg))
|
||||
@@ -116,7 +122,7 @@ pub fn get_config_from_control_plane(base_uri: &str, compute_id: &str) -> Result
|
||||
};
|
||||
|
||||
if let Err(e) = &result {
|
||||
error!("attempt {} to get config failed with: {}", attempt, e);
|
||||
error!("attempt {} to get spec failed with: {}", attempt, e);
|
||||
} else {
|
||||
return result;
|
||||
}
|
||||
@@ -127,13 +133,13 @@ pub fn get_config_from_control_plane(base_uri: &str, compute_id: &str) -> Result
|
||||
|
||||
// All attempts failed, return error.
|
||||
Err(anyhow::anyhow!(
|
||||
"Exhausted all attempts to retrieve the config from the control plane"
|
||||
"Exhausted all attempts to retrieve the spec from the control plane"
|
||||
))
|
||||
}
|
||||
|
||||
/// Check `pg_hba.conf` and update if needed to allow external connections.
|
||||
pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> {
|
||||
// XXX: consider making it a part of config.json
|
||||
// XXX: consider making it a part of spec.json
|
||||
let pghba_path = pgdata_path.join("pg_hba.conf");
|
||||
|
||||
if config::line_in_file(&pghba_path, PG_HBA_ALL_MD5)? {
|
||||
@@ -147,7 +153,7 @@ pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> {
|
||||
|
||||
/// Create a standby.signal file
|
||||
pub fn add_standby_signal(pgdata_path: &Path) -> Result<()> {
|
||||
// XXX: consider making it a part of config.json
|
||||
// XXX: consider making it a part of spec.json
|
||||
let signalfile = pgdata_path.join("standby.signal");
|
||||
|
||||
if !signalfile.exists() {
|
||||
|
||||
@@ -278,12 +278,12 @@ impl ComputeNode {
|
||||
// so that all config operations are audit logged.
|
||||
match spec.audit_log_level
|
||||
{
|
||||
ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => {
|
||||
ComputeAudit::Hipaa => {
|
||||
phases.push(CreatePgauditExtension);
|
||||
phases.push(CreatePgauditlogtofileExtension);
|
||||
phases.push(DisablePostgresDBPgAudit);
|
||||
}
|
||||
ComputeAudit::Log | ComputeAudit::Base => {
|
||||
ComputeAudit::Log => {
|
||||
phases.push(CreatePgauditExtension);
|
||||
phases.push(DisablePostgresDBPgAudit);
|
||||
}
|
||||
|
||||
@@ -6,16 +6,13 @@ license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
base64.workspace = true
|
||||
camino.workspace = true
|
||||
clap.workspace = true
|
||||
comfy-table.workspace = true
|
||||
futures.workspace = true
|
||||
humantime.workspace = true
|
||||
jsonwebtoken.workspace = true
|
||||
nix.workspace = true
|
||||
once_cell.workspace = true
|
||||
pem.workspace = true
|
||||
humantime-serde.workspace = true
|
||||
hyper0.workspace = true
|
||||
regex.workspace = true
|
||||
@@ -23,8 +20,6 @@ reqwest = { workspace = true, features = ["blocking", "json"] }
|
||||
scopeguard.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
sha2.workspace = true
|
||||
spki.workspace = true
|
||||
thiserror.workspace = true
|
||||
toml.workspace = true
|
||||
toml_edit.workspace = true
|
||||
|
||||
@@ -63,7 +63,7 @@ const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1);
|
||||
const DEFAULT_BRANCH_NAME: &str = "main";
|
||||
project_git_version!(GIT_VERSION);
|
||||
|
||||
const DEFAULT_PG_VERSION: u32 = 17;
|
||||
const DEFAULT_PG_VERSION: u32 = 16;
|
||||
|
||||
const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/";
|
||||
|
||||
@@ -552,7 +552,6 @@ enum EndpointCmd {
|
||||
Start(EndpointStartCmdArgs),
|
||||
Reconfigure(EndpointReconfigureCmdArgs),
|
||||
Stop(EndpointStopCmdArgs),
|
||||
GenerateJwt(EndpointGenerateJwtCmdArgs),
|
||||
}
|
||||
|
||||
#[derive(clap::Args)]
|
||||
@@ -700,13 +699,6 @@ struct EndpointStopCmdArgs {
|
||||
mode: String,
|
||||
}
|
||||
|
||||
// Arguments carried by the `EndpointCmd::GenerateJwt` subcommand.
// Plain `//` comments are used on purpose: the user-visible help text comes
// from the explicit `about` / `help` attributes below.
#[derive(clap::Args)]
|
||||
#[clap(about = "Generate a JWT for an endpoint")]
|
||||
struct EndpointGenerateJwtCmdArgs {
|
||||
// ID of the endpoint the token is generated for.
#[clap(help = "Postgres endpoint id")]
|
||||
endpoint_id: String,
|
||||
}
|
||||
|
||||
#[derive(clap::Subcommand)]
|
||||
#[clap(about = "Manage neon_local branch name mappings")]
|
||||
enum MappingsCmd {
|
||||
@@ -1536,16 +1528,6 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
|
||||
.with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
|
||||
endpoint.stop(&args.mode, args.destroy)?;
|
||||
}
|
||||
EndpointCmd::GenerateJwt(args) => {
|
||||
let endpoint_id = &args.endpoint_id;
|
||||
let endpoint = cplane
|
||||
.endpoints
|
||||
.get(endpoint_id)
|
||||
.with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
|
||||
let jwt = endpoint.generate_jwt()?;
|
||||
|
||||
print!("{jwt}");
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -29,7 +29,7 @@
|
||||
//! compute.log - log output of `compute_ctl` and `postgres`
|
||||
//! endpoint.json - serialized `EndpointConf` struct
|
||||
//! postgresql.conf - postgresql settings
|
||||
//! config.json - passed to `compute_ctl`
|
||||
//! spec.json - passed to `compute_ctl`
|
||||
//! pgdata/
|
||||
//! postgresql.conf - copy of postgresql.conf created by `compute_ctl`
|
||||
//! zenith.signal
|
||||
@@ -42,30 +42,20 @@ use std::path::PathBuf;
|
||||
use std::process::Command;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, Instant};
|
||||
use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
|
||||
|
||||
use anyhow::{Context, Result, anyhow, bail};
|
||||
use compute_api::requests::{ComputeClaims, ConfigurationRequest};
|
||||
use compute_api::responses::{
|
||||
ComputeConfig, ComputeCtlConfig, ComputeStatus, ComputeStatusResponse, TlsConfig,
|
||||
};
|
||||
use compute_api::requests::ConfigurationRequest;
|
||||
use compute_api::responses::{ComputeCtlConfig, ComputeStatus, ComputeStatusResponse};
|
||||
use compute_api::spec::{
|
||||
Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PgIdent,
|
||||
RemoteExtSpec, Role,
|
||||
};
|
||||
use jsonwebtoken::jwk::{
|
||||
AlgorithmParameters, CommonParameters, EllipticCurve, Jwk, JwkSet, KeyAlgorithm, KeyOperations,
|
||||
OctetKeyPairParameters, OctetKeyPairType, PublicKeyUse,
|
||||
};
|
||||
use nix::sys::signal::{Signal, kill};
|
||||
use pageserver_api::shard::ShardStripeSize;
|
||||
use pem::Pem;
|
||||
use reqwest::header::CONTENT_TYPE;
|
||||
use safekeeper_api::membership::SafekeeperGeneration;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use sha2::{Digest, Sha256};
|
||||
use spki::der::Decode;
|
||||
use spki::{SubjectPublicKeyInfo, SubjectPublicKeyInfoRef};
|
||||
use tracing::debug;
|
||||
use url::Host;
|
||||
use utils::id::{NodeId, TenantId, TimelineId};
|
||||
@@ -90,7 +80,6 @@ pub struct EndpointConf {
|
||||
drop_subscriptions_before_start: bool,
|
||||
features: Vec<ComputeFeature>,
|
||||
cluster: Option<Cluster>,
|
||||
compute_ctl_config: ComputeCtlConfig,
|
||||
}
|
||||
|
||||
//
|
||||
@@ -146,37 +135,6 @@ impl ComputeControlPlane {
|
||||
.unwrap_or(self.base_port)
|
||||
}
|
||||
|
||||
/// Create a JSON Web Key Set. This ideally matches the way we create a JWKS
|
||||
/// from the production control plane.
///
/// The PEM contents are decoded as a DER `SubjectPublicKeyInfo`, and the raw
/// public key bytes taken from it become the single key of the returned set.
/// That key is described as an Ed25519 octet key pair used for verifying
/// EdDSA signatures.
///
/// Returns an error if the PEM contents cannot be decoded as a
/// `SubjectPublicKeyInfo`.
|
||||
fn create_jwks_from_pem(pem: &Pem) -> Result<JwkSet> {
|
||||
let spki: SubjectPublicKeyInfoRef = SubjectPublicKeyInfo::from_der(pem.contents())?;
|
||||
let public_key = spki.subject_public_key.raw_bytes();
|
||||
|
||||
// The key ID below is derived from the SHA-256 digest of the raw public key.
let mut hasher = Sha256::new();
|
||||
hasher.update(public_key);
|
||||
let key_hash = hasher.finalize();
|
||||
|
||||
Ok(JwkSet {
|
||||
keys: vec![Jwk {
|
||||
common: CommonParameters {
|
||||
public_key_use: Some(PublicKeyUse::Signature),
|
||||
key_operations: Some(vec![KeyOperations::Verify]),
|
||||
key_algorithm: Some(KeyAlgorithm::EdDSA),
|
||||
// URL-safe, unpadded base64 of the SHA-256 digest computed above.
key_id: Some(base64::encode_config(key_hash, base64::URL_SAFE_NO_PAD)),
|
||||
x509_url: None::<String>,
|
||||
x509_chain: None::<Vec<String>>,
|
||||
x509_sha1_fingerprint: None::<String>,
|
||||
x509_sha256_fingerprint: None::<String>,
|
||||
},
|
||||
algorithm: AlgorithmParameters::OctetKeyPair(OctetKeyPairParameters {
|
||||
key_type: OctetKeyPairType::OctetKeyPair,
|
||||
curve: EllipticCurve::Ed25519,
|
||||
// `x` is the raw public key itself, URL-safe base64 without padding.
x: base64::encode_config(public_key, base64::URL_SAFE_NO_PAD),
|
||||
}),
|
||||
}],
|
||||
})
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn new_endpoint(
|
||||
&mut self,
|
||||
@@ -194,10 +152,6 @@ impl ComputeControlPlane {
|
||||
let pg_port = pg_port.unwrap_or_else(|| self.get_port());
|
||||
let external_http_port = external_http_port.unwrap_or_else(|| self.get_port() + 1);
|
||||
let internal_http_port = internal_http_port.unwrap_or_else(|| external_http_port + 1);
|
||||
let compute_ctl_config = ComputeCtlConfig {
|
||||
jwks: Self::create_jwks_from_pem(&self.env.read_public_key()?)?,
|
||||
tls: None::<TlsConfig>,
|
||||
};
|
||||
let ep = Arc::new(Endpoint {
|
||||
endpoint_id: endpoint_id.to_owned(),
|
||||
pg_address: SocketAddr::new(IpAddr::from(Ipv4Addr::LOCALHOST), pg_port),
|
||||
@@ -225,7 +179,6 @@ impl ComputeControlPlane {
|
||||
reconfigure_concurrency: 1,
|
||||
features: vec![],
|
||||
cluster: None,
|
||||
compute_ctl_config: compute_ctl_config.clone(),
|
||||
});
|
||||
|
||||
ep.create_endpoint_dir()?;
|
||||
@@ -245,7 +198,6 @@ impl ComputeControlPlane {
|
||||
reconfigure_concurrency: 1,
|
||||
features: vec![],
|
||||
cluster: None,
|
||||
compute_ctl_config,
|
||||
})?,
|
||||
)?;
|
||||
std::fs::write(
|
||||
@@ -288,6 +240,7 @@ impl ComputeControlPlane {
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct Endpoint {
|
||||
/// used as the directory name
|
||||
endpoint_id: String,
|
||||
@@ -316,9 +269,6 @@ pub struct Endpoint {
|
||||
features: Vec<ComputeFeature>,
|
||||
// Cluster settings
|
||||
cluster: Option<Cluster>,
|
||||
|
||||
/// The compute_ctl config for the endpoint's compute.
|
||||
compute_ctl_config: ComputeCtlConfig,
|
||||
}
|
||||
|
||||
#[derive(PartialEq, Eq)]
|
||||
@@ -381,7 +331,6 @@ impl Endpoint {
|
||||
drop_subscriptions_before_start: conf.drop_subscriptions_before_start,
|
||||
features: conf.features,
|
||||
cluster: conf.cluster,
|
||||
compute_ctl_config: conf.compute_ctl_config,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -629,13 +578,6 @@ impl Endpoint {
|
||||
Ok(safekeeper_connstrings)
|
||||
}
|
||||
|
||||
/// Generate a JWT with the correct claims.
///
/// The token carries a single claim, `compute_id`, set to this endpoint's ID,
/// and is produced by the local environment's `generate_auth_token`. Any
/// error from that call is returned to the caller.
|
||||
pub fn generate_jwt(&self) -> Result<String> {
|
||||
self.env.generate_auth_token(&ComputeClaims {
|
||||
compute_id: self.endpoint_id.clone(),
|
||||
})
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub async fn start(
|
||||
&self,
|
||||
@@ -677,97 +619,87 @@ impl Endpoint {
|
||||
remote_extensions = None;
|
||||
};
|
||||
|
||||
// Create config file
|
||||
let config = {
|
||||
let mut spec = ComputeSpec {
|
||||
skip_pg_catalog_updates: self.skip_pg_catalog_updates,
|
||||
format_version: 1.0,
|
||||
operation_uuid: None,
|
||||
features: self.features.clone(),
|
||||
swap_size_bytes: None,
|
||||
disk_quota_bytes: None,
|
||||
disable_lfc_resizing: None,
|
||||
cluster: Cluster {
|
||||
cluster_id: None, // project ID: not used
|
||||
name: None, // project name: not used
|
||||
state: None,
|
||||
roles: if create_test_user {
|
||||
vec![Role {
|
||||
name: PgIdent::from_str("test").unwrap(),
|
||||
encrypted_password: None,
|
||||
options: None,
|
||||
}]
|
||||
} else {
|
||||
Vec::new()
|
||||
},
|
||||
databases: if create_test_user {
|
||||
vec![Database {
|
||||
name: PgIdent::from_str("neondb").unwrap(),
|
||||
owner: PgIdent::from_str("test").unwrap(),
|
||||
options: None,
|
||||
restrict_conn: false,
|
||||
invalid: false,
|
||||
}]
|
||||
} else {
|
||||
Vec::new()
|
||||
},
|
||||
settings: None,
|
||||
postgresql_conf: Some(postgresql_conf.clone()),
|
||||
},
|
||||
delta_operations: None,
|
||||
tenant_id: Some(self.tenant_id),
|
||||
timeline_id: Some(self.timeline_id),
|
||||
project_id: None,
|
||||
branch_id: None,
|
||||
endpoint_id: Some(self.endpoint_id.clone()),
|
||||
mode: self.mode,
|
||||
pageserver_connstring: Some(pageserver_connstring),
|
||||
safekeepers_generation: safekeepers_generation.map(|g| g.into_inner()),
|
||||
safekeeper_connstrings,
|
||||
storage_auth_token: auth_token.clone(),
|
||||
remote_extensions,
|
||||
pgbouncer_settings: None,
|
||||
shard_stripe_size: Some(shard_stripe_size),
|
||||
local_proxy_config: None,
|
||||
reconfigure_concurrency: self.reconfigure_concurrency,
|
||||
drop_subscriptions_before_start: self.drop_subscriptions_before_start,
|
||||
audit_log_level: ComputeAudit::Disabled,
|
||||
logs_export_host: None::<String>,
|
||||
};
|
||||
|
||||
// this strange code is needed to support respec() in tests
|
||||
if self.cluster.is_some() {
|
||||
debug!("Cluster is already set in the endpoint spec, using it");
|
||||
spec.cluster = self.cluster.clone().unwrap();
|
||||
|
||||
debug!("spec.cluster {:?}", spec.cluster);
|
||||
|
||||
// fill missing fields again
|
||||
if create_test_user {
|
||||
spec.cluster.roles.push(Role {
|
||||
// Create spec file
|
||||
let mut spec = ComputeSpec {
|
||||
skip_pg_catalog_updates: self.skip_pg_catalog_updates,
|
||||
format_version: 1.0,
|
||||
operation_uuid: None,
|
||||
features: self.features.clone(),
|
||||
swap_size_bytes: None,
|
||||
disk_quota_bytes: None,
|
||||
disable_lfc_resizing: None,
|
||||
cluster: Cluster {
|
||||
cluster_id: None, // project ID: not used
|
||||
name: None, // project name: not used
|
||||
state: None,
|
||||
roles: if create_test_user {
|
||||
vec![Role {
|
||||
name: PgIdent::from_str("test").unwrap(),
|
||||
encrypted_password: None,
|
||||
options: None,
|
||||
});
|
||||
spec.cluster.databases.push(Database {
|
||||
}]
|
||||
} else {
|
||||
Vec::new()
|
||||
},
|
||||
databases: if create_test_user {
|
||||
vec![Database {
|
||||
name: PgIdent::from_str("neondb").unwrap(),
|
||||
owner: PgIdent::from_str("test").unwrap(),
|
||||
options: None,
|
||||
restrict_conn: false,
|
||||
invalid: false,
|
||||
});
|
||||
}
|
||||
spec.cluster.postgresql_conf = Some(postgresql_conf);
|
||||
}
|
||||
|
||||
ComputeConfig {
|
||||
spec: Some(spec),
|
||||
compute_ctl_config: self.compute_ctl_config.clone(),
|
||||
}
|
||||
}]
|
||||
} else {
|
||||
Vec::new()
|
||||
},
|
||||
settings: None,
|
||||
postgresql_conf: Some(postgresql_conf.clone()),
|
||||
},
|
||||
delta_operations: None,
|
||||
tenant_id: Some(self.tenant_id),
|
||||
timeline_id: Some(self.timeline_id),
|
||||
mode: self.mode,
|
||||
pageserver_connstring: Some(pageserver_connstring),
|
||||
safekeepers_generation: safekeepers_generation.map(|g| g.into_inner()),
|
||||
safekeeper_connstrings,
|
||||
storage_auth_token: auth_token.clone(),
|
||||
remote_extensions,
|
||||
pgbouncer_settings: None,
|
||||
shard_stripe_size: Some(shard_stripe_size),
|
||||
local_proxy_config: None,
|
||||
reconfigure_concurrency: self.reconfigure_concurrency,
|
||||
drop_subscriptions_before_start: self.drop_subscriptions_before_start,
|
||||
audit_log_level: ComputeAudit::Disabled,
|
||||
logs_export_host: None::<String>,
|
||||
};
|
||||
|
||||
let config_path = self.endpoint_path().join("config.json");
|
||||
std::fs::write(config_path, serde_json::to_string_pretty(&config)?)?;
|
||||
// this strange code is needed to support respec() in tests
|
||||
if self.cluster.is_some() {
|
||||
debug!("Cluster is already set in the endpoint spec, using it");
|
||||
spec.cluster = self.cluster.clone().unwrap();
|
||||
|
||||
debug!("spec.cluster {:?}", spec.cluster);
|
||||
|
||||
// fill missing fields again
|
||||
if create_test_user {
|
||||
spec.cluster.roles.push(Role {
|
||||
name: PgIdent::from_str("test").unwrap(),
|
||||
encrypted_password: None,
|
||||
options: None,
|
||||
});
|
||||
spec.cluster.databases.push(Database {
|
||||
name: PgIdent::from_str("neondb").unwrap(),
|
||||
owner: PgIdent::from_str("test").unwrap(),
|
||||
options: None,
|
||||
restrict_conn: false,
|
||||
invalid: false,
|
||||
});
|
||||
}
|
||||
spec.cluster.postgresql_conf = Some(postgresql_conf);
|
||||
}
|
||||
|
||||
let spec_path = self.endpoint_path().join("spec.json");
|
||||
std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
|
||||
|
||||
// Open log file. We'll redirect the stdout and stderr of `compute_ctl` to it.
|
||||
let logfile = std::fs::OpenOptions::new()
|
||||
@@ -793,8 +725,10 @@ impl Endpoint {
|
||||
])
|
||||
.args(["--pgdata", self.pgdata().to_str().unwrap()])
|
||||
.args(["--connstr", &conn_str])
|
||||
.arg("--config")
|
||||
.arg(self.endpoint_path().join("config.json").as_os_str())
|
||||
.args([
|
||||
"--spec-path",
|
||||
self.endpoint_path().join("spec.json").to_str().unwrap(),
|
||||
])
|
||||
.args([
|
||||
"--pgbin",
|
||||
self.env
|
||||
@@ -805,7 +739,16 @@ impl Endpoint {
|
||||
])
|
||||
// TODO: It would be nice if we generated compute IDs with the same
|
||||
// algorithm as the real control plane.
|
||||
.args(["--compute-id", &self.endpoint_id])
|
||||
.args([
|
||||
"--compute-id",
|
||||
&format!(
|
||||
"compute-{}",
|
||||
SystemTime::now()
|
||||
.duration_since(UNIX_EPOCH)
|
||||
.unwrap()
|
||||
.as_secs()
|
||||
),
|
||||
])
|
||||
.stdin(std::process::Stdio::null())
|
||||
.stderr(logfile.try_clone()?)
|
||||
.stdout(logfile);
|
||||
@@ -903,7 +846,6 @@ impl Endpoint {
|
||||
self.external_http_address.port()
|
||||
),
|
||||
)
|
||||
.bearer_auth(self.generate_jwt()?)
|
||||
.send()
|
||||
.await?;
|
||||
|
||||
@@ -928,12 +870,10 @@ impl Endpoint {
|
||||
stripe_size: Option<ShardStripeSize>,
|
||||
safekeepers: Option<Vec<NodeId>>,
|
||||
) -> Result<()> {
|
||||
let (mut spec, compute_ctl_config) = {
|
||||
let config_path = self.endpoint_path().join("config.json");
|
||||
let file = std::fs::File::open(config_path)?;
|
||||
let config: ComputeConfig = serde_json::from_reader(file)?;
|
||||
|
||||
(config.spec.unwrap(), config.compute_ctl_config)
|
||||
let mut spec: ComputeSpec = {
|
||||
let spec_path = self.endpoint_path().join("spec.json");
|
||||
let file = std::fs::File::open(spec_path)?;
|
||||
serde_json::from_reader(file)?
|
||||
};
|
||||
|
||||
let postgresql_conf = self.read_postgresql_conf()?;
|
||||
@@ -980,11 +920,10 @@ impl Endpoint {
|
||||
self.external_http_address.port()
|
||||
))
|
||||
.header(CONTENT_TYPE.as_str(), "application/json")
|
||||
.bearer_auth(self.generate_jwt()?)
|
||||
.body(
|
||||
serde_json::to_string(&ConfigurationRequest {
|
||||
spec,
|
||||
compute_ctl_config,
|
||||
compute_ctl_config: ComputeCtlConfig::default(),
|
||||
})
|
||||
.unwrap(),
|
||||
)
|
||||
|
||||
@@ -12,7 +12,6 @@ use std::{env, fs};
|
||||
|
||||
use anyhow::{Context, bail};
|
||||
use clap::ValueEnum;
|
||||
use pem::Pem;
|
||||
use postgres_backend::AuthType;
|
||||
use reqwest::Url;
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -23,7 +22,7 @@ use crate::object_storage::{OBJECT_STORAGE_REMOTE_STORAGE_DIR, ObjectStorage};
|
||||
use crate::pageserver::{PAGESERVER_REMOTE_STORAGE_DIR, PageServerNode};
|
||||
use crate::safekeeper::SafekeeperNode;
|
||||
|
||||
pub const DEFAULT_PG_VERSION: u32 = 17;
|
||||
pub const DEFAULT_PG_VERSION: u32 = 16;
|
||||
|
||||
//
|
||||
// This data structure represents neon_local CLI config
|
||||
@@ -57,7 +56,6 @@ pub struct LocalEnv {
|
||||
|
||||
// used to issue tokens during e.g pg start
|
||||
pub private_key_path: PathBuf,
|
||||
/// Path to environment's public key
|
||||
pub public_key_path: PathBuf,
|
||||
|
||||
pub broker: NeonBroker,
|
||||
@@ -760,11 +758,11 @@ impl LocalEnv {
|
||||
|
||||
// this function is used only for testing purposes in the CLI, e.g. to generate tokens during init
|
||||
pub fn generate_auth_token<S: Serialize>(&self, claims: &S) -> anyhow::Result<String> {
|
||||
let key = self.read_private_key()?;
|
||||
encode_from_key_file(claims, &key)
|
||||
let private_key_path = self.get_private_key_path();
|
||||
let key_data = fs::read(private_key_path)?;
|
||||
encode_from_key_file(claims, &key_data)
|
||||
}
|
||||
|
||||
/// Get the path to the private key.
|
||||
pub fn get_private_key_path(&self) -> PathBuf {
|
||||
if self.private_key_path.is_absolute() {
|
||||
self.private_key_path.to_path_buf()
|
||||
@@ -773,29 +771,6 @@ impl LocalEnv {
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the path to the public key.
///
/// An absolute `public_key_path` is returned as-is; a relative one is
/// resolved against `base_data_dir`.
|
||||
pub fn get_public_key_path(&self) -> PathBuf {
|
||||
if self.public_key_path.is_absolute() {
|
||||
self.public_key_path.to_path_buf()
|
||||
} else {
|
||||
self.base_data_dir.join(&self.public_key_path)
|
||||
}
|
||||
}
|
||||
|
||||
/// Read the contents of the private key file.
///
/// The file located by `get_private_key_path()` is read in full and parsed
/// as PEM. Returns an error if the file cannot be read or is not valid PEM.
|
||||
pub fn read_private_key(&self) -> anyhow::Result<Pem> {
|
||||
let private_key_path = self.get_private_key_path();
|
||||
let pem = pem::parse(fs::read(private_key_path)?)?;
|
||||
Ok(pem)
|
||||
}
|
||||
|
||||
/// Read the contents of the public key file.
///
/// The file located by `get_public_key_path()` is read in full and parsed
/// as PEM. Returns an error if the file cannot be read or is not valid PEM.
|
||||
pub fn read_public_key(&self) -> anyhow::Result<Pem> {
|
||||
let public_key_path = self.get_public_key_path();
|
||||
let pem = pem::parse(fs::read(public_key_path)?)?;
|
||||
Ok(pem)
|
||||
}
|
||||
|
||||
/// Materialize the [`NeonLocalInitConf`] to disk. Called during [`neon_local init`].
|
||||
pub fn init(conf: NeonLocalInitConf, force: &InitForceMode) -> anyhow::Result<()> {
|
||||
let base_path = base_path();
|
||||
@@ -981,7 +956,6 @@ fn generate_auth_keys(private_key_path: &Path, public_key_path: &Path) -> anyhow
|
||||
String::from_utf8_lossy(&keygen_output.stderr)
|
||||
);
|
||||
}
|
||||
|
||||
// Extract the public key from the private key file
|
||||
//
|
||||
// openssl pkey -in auth_private_key.pem -pubout -out auth_public_key.pem
|
||||
@@ -998,7 +972,6 @@ fn generate_auth_keys(private_key_path: &Path, public_key_path: &Path) -> anyhow
|
||||
String::from_utf8_lossy(&keygen_output.stderr)
|
||||
);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1007,7 +980,7 @@ fn generate_ssl_ca_cert(cert_path: &Path, key_path: &Path) -> anyhow::Result<()>
|
||||
// -out rootCA.crt -keyout rootCA.key
|
||||
let keygen_output = Command::new("openssl")
|
||||
.args([
|
||||
"req", "-x509", "-newkey", "ed25519", "-nodes", "-days", "36500",
|
||||
"req", "-x509", "-newkey", "rsa:2048", "-nodes", "-days", "36500",
|
||||
])
|
||||
.args(["-subj", "/CN=Neon Local CA"])
|
||||
.args(["-out", cert_path.to_str().unwrap()])
|
||||
@@ -1037,7 +1010,7 @@ fn generate_ssl_cert(
|
||||
// -subj "/CN=localhost" -addext "subjectAltName=DNS:localhost,IP:127.0.0.1"
|
||||
let keygen_output = Command::new("openssl")
|
||||
.args(["req", "-new", "-nodes"])
|
||||
.args(["-newkey", "ed25519"])
|
||||
.args(["-newkey", "rsa:2048"])
|
||||
.args(["-subj", "/CN=localhost"])
|
||||
.args(["-addext", "subjectAltName=DNS:localhost,IP:127.0.0.1"])
|
||||
.args(["-keyout", key_path.to_str().unwrap()])
|
||||
|
||||
@@ -318,7 +318,7 @@ impl PageServerNode {
|
||||
self.conf.id, datadir,
|
||||
)
|
||||
})?;
|
||||
let args = vec!["-D", datadir_path_str];
|
||||
let args = vec!["-D", datadir_path_str, "--dev"];
|
||||
|
||||
background_process::start_process(
|
||||
"pageserver",
|
||||
@@ -413,11 +413,6 @@ impl PageServerNode {
|
||||
.map(serde_json::from_str)
|
||||
.transpose()
|
||||
.context("Failed to parse 'compaction_algorithm' json")?,
|
||||
compaction_shard_ancestor: settings
|
||||
.remove("compaction_shard_ancestor")
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'compaction_shard_ancestor' as a bool")?,
|
||||
compaction_l0_first: settings
|
||||
.remove("compaction_l0_first")
|
||||
.map(|x| x.parse::<bool>())
|
||||
@@ -540,11 +535,6 @@ impl PageServerNode {
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'gc_compaction_enabled' as bool")?,
|
||||
gc_compaction_verification: settings
|
||||
.remove("gc_compaction_verification")
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'gc_compaction_verification' as bool")?,
|
||||
gc_compaction_initial_threshold_kb: settings
|
||||
.remove("gc_compaction_initial_threshold_kb")
|
||||
.map(|x| x.parse::<u64>())
|
||||
|
||||
@@ -162,6 +162,7 @@ impl SafekeeperNode {
|
||||
listen_http,
|
||||
"--availability-zone".to_owned(),
|
||||
availability_zone,
|
||||
"--dev".to_owned(),
|
||||
];
|
||||
if let Some(pg_tenant_only_port) = self.conf.pg_tenant_only_port {
|
||||
let listen_pg_tenant_only = format!("{}:{}", self.listen_addr, pg_tenant_only_port);
|
||||
|
||||
@@ -13,12 +13,9 @@ use pageserver_api::controller_api::{
|
||||
NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest,
|
||||
TenantCreateResponse, TenantLocateResponse,
|
||||
};
|
||||
use pageserver_api::models::{
|
||||
TenantConfig, TenantConfigRequest, TimelineCreateRequest, TimelineInfo,
|
||||
};
|
||||
use pageserver_api::models::{TenantConfigRequest, TimelineCreateRequest, TimelineInfo};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_client::mgmt_api::ResponseErrorMessageExt;
|
||||
use pem::Pem;
|
||||
use postgres_backend::AuthType;
|
||||
use reqwest::{Certificate, Method};
|
||||
use serde::de::DeserializeOwned;
|
||||
@@ -35,8 +32,8 @@ use crate::local_env::{LocalEnv, NeonStorageControllerConf};
|
||||
|
||||
pub struct StorageController {
|
||||
env: LocalEnv,
|
||||
private_key: Option<Pem>,
|
||||
public_key: Option<Pem>,
|
||||
private_key: Option<Vec<u8>>,
|
||||
public_key: Option<String>,
|
||||
client: reqwest::Client,
|
||||
config: NeonStorageControllerConf,
|
||||
|
||||
@@ -85,8 +82,7 @@ impl NeonStorageControllerStopArgs {
|
||||
pub struct AttachHookRequest {
|
||||
pub tenant_shard_id: TenantShardId,
|
||||
pub node_id: Option<NodeId>,
|
||||
pub generation_override: Option<i32>, // only new tenants
|
||||
pub config: Option<TenantConfig>, // only new tenants
|
||||
pub generation_override: Option<i32>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
@@ -117,9 +113,7 @@ impl StorageController {
|
||||
AuthType::Trust => (None, None),
|
||||
AuthType::NeonJWT => {
|
||||
let private_key_path = env.get_private_key_path();
|
||||
let private_key =
|
||||
pem::parse(fs::read(private_key_path).expect("failed to read private key"))
|
||||
.expect("failed to parse PEM file");
|
||||
let private_key = fs::read(private_key_path).expect("failed to read private key");
|
||||
|
||||
// If pageserver auth is enabled, this implicitly enables auth for this service,
|
||||
// using the same credentials.
|
||||
@@ -141,13 +135,9 @@ impl StorageController {
|
||||
.expect("Empty key dir")
|
||||
.expect("Error reading key dir");
|
||||
|
||||
pem::parse(std::fs::read_to_string(dent.path()).expect("Can't read public key"))
|
||||
.expect("Failed to parse PEM file")
|
||||
std::fs::read_to_string(dent.path()).expect("Can't read public key")
|
||||
} else {
|
||||
pem::parse(
|
||||
std::fs::read_to_string(&public_key_path).expect("Can't read public key"),
|
||||
)
|
||||
.expect("Failed to parse PEM file")
|
||||
std::fs::read_to_string(&public_key_path).expect("Can't read public key")
|
||||
};
|
||||
(Some(private_key), Some(public_key))
|
||||
}
|
||||
@@ -815,7 +805,6 @@ impl StorageController {
|
||||
tenant_shard_id,
|
||||
node_id: Some(pageserver_id),
|
||||
generation_override: None,
|
||||
config: None,
|
||||
};
|
||||
|
||||
let response = self
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
|
||||
# Example docker compose configuration
|
||||
|
||||
The configuration in this directory is used for testing Neon docker images: it is
|
||||
@@ -7,13 +8,3 @@ you can experiment with a miniature Neon system, use `cargo neon` rather than co
|
||||
This configuration does not start the storage controller, because the controller
|
||||
needs a way to reconfigure running computes, and no such thing exists in this setup.
|
||||
|
||||
## Generating the JWKS for a compute
|
||||
|
||||
```shell
|
||||
openssl genpkey -algorithm Ed25519 -out private-key.pem
|
||||
openssl pkey -in private-key.pem -pubout -out public-key.pem
|
||||
openssl pkey -pubin -inform pem -in public-key.pem -pubout -outform der -out public-key.der
|
||||
key="$(xxd -plain -cols 32 -s -32 public-key.der)"
|
||||
key_id="$(printf '%s' "$key" | sha256sum | awk '{ print $1 }' | basenc --base64url --wrap=0)"
|
||||
x="$(printf '%s' "$key" | basenc --base64url --wrap=0)"
|
||||
```
|
||||
|
||||
@@ -1,3 +0,0 @@
|
||||
-----BEGIN PRIVATE KEY-----
|
||||
MC4CAQAwBQYDK2VwBCIEIOmnRbzt2AJ0d+S3aU1hiYOl/tXpvz1FmWBfwHYBgOma
|
||||
-----END PRIVATE KEY-----
|
||||
Binary file not shown.
@@ -1,3 +0,0 @@
|
||||
-----BEGIN PUBLIC KEY-----
|
||||
MCowBQYDK2VwAyEADY0al/U0bgB3+9fUGk+3PKWnsck9OyxN5DjHIN6Xep0=
|
||||
-----END PUBLIC KEY-----
|
||||
@@ -11,8 +11,8 @@ generate_id() {
|
||||
|
||||
PG_VERSION=${PG_VERSION:-14}
|
||||
|
||||
CONFIG_FILE_ORG=/var/db/postgres/configs/config.json
|
||||
CONFIG_FILE=/tmp/config.json
|
||||
SPEC_FILE_ORG=/var/db/postgres/specs/spec.json
|
||||
SPEC_FILE=/tmp/spec.json
|
||||
|
||||
echo "Waiting pageserver become ready."
|
||||
while ! nc -z pageserver 6400; do
|
||||
@@ -20,7 +20,7 @@ while ! nc -z pageserver 6400; do
|
||||
done
|
||||
echo "Page server is ready."
|
||||
|
||||
cp ${CONFIG_FILE_ORG} ${CONFIG_FILE}
|
||||
cp ${SPEC_FILE_ORG} ${SPEC_FILE}
|
||||
|
||||
if [ -n "${TENANT_ID:-}" ] && [ -n "${TIMELINE_ID:-}" ]; then
|
||||
tenant_id=${TENANT_ID}
|
||||
@@ -73,17 +73,17 @@ else
|
||||
ulid_extension=ulid
|
||||
fi
|
||||
echo "Adding pgx_ulid"
|
||||
shared_libraries=$(jq -r '.spec.cluster.settings[] | select(.name=="shared_preload_libraries").value' ${CONFIG_FILE})
|
||||
sed -i "s/${shared_libraries}/${shared_libraries},${ulid_extension}/" ${CONFIG_FILE}
|
||||
shared_libraries=$(jq -r '.cluster.settings[] | select(.name=="shared_preload_libraries").value' ${SPEC_FILE})
|
||||
sed -i "s/${shared_libraries}/${shared_libraries},${ulid_extension}/" ${SPEC_FILE}
|
||||
echo "Overwrite tenant id and timeline id in spec file"
|
||||
sed -i "s/TENANT_ID/${tenant_id}/" ${CONFIG_FILE}
|
||||
sed -i "s/TIMELINE_ID/${timeline_id}/" ${CONFIG_FILE}
|
||||
sed -i "s/TENANT_ID/${tenant_id}/" ${SPEC_FILE}
|
||||
sed -i "s/TIMELINE_ID/${timeline_id}/" ${SPEC_FILE}
|
||||
|
||||
cat ${CONFIG_FILE}
|
||||
cat ${SPEC_FILE}
|
||||
|
||||
echo "Start compute node"
|
||||
/usr/local/bin/compute_ctl --pgdata /var/db/postgres/compute \
|
||||
-C "postgresql://cloud_admin@localhost:55433/postgres" \
|
||||
-b /usr/local/bin/postgres \
|
||||
--compute-id "compute-$RANDOM" \
|
||||
--config "$CONFIG_FILE"
|
||||
-S ${SPEC_FILE}
|
||||
|
||||
@@ -1,160 +0,0 @@
|
||||
{
|
||||
"spec": {
|
||||
"format_version": 1.0,
|
||||
|
||||
"timestamp": "2022-10-12T18:00:00.000Z",
|
||||
"operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8c",
|
||||
|
||||
"cluster": {
|
||||
"cluster_id": "docker_compose",
|
||||
"name": "docker_compose_test",
|
||||
"state": "restarted",
|
||||
"roles": [
|
||||
{
|
||||
"name": "cloud_admin",
|
||||
"encrypted_password": "b093c0d3b281ba6da1eacc608620abd8",
|
||||
"options": null
|
||||
}
|
||||
],
|
||||
"databases": [
|
||||
],
|
||||
"settings": [
|
||||
{
|
||||
"name": "fsync",
|
||||
"value": "off",
|
||||
"vartype": "bool"
|
||||
},
|
||||
{
|
||||
"name": "wal_level",
|
||||
"value": "logical",
|
||||
"vartype": "enum"
|
||||
},
|
||||
{
|
||||
"name": "wal_log_hints",
|
||||
"value": "on",
|
||||
"vartype": "bool"
|
||||
},
|
||||
{
|
||||
"name": "log_connections",
|
||||
"value": "on",
|
||||
"vartype": "bool"
|
||||
},
|
||||
{
|
||||
"name": "port",
|
||||
"value": "55433",
|
||||
"vartype": "integer"
|
||||
},
|
||||
{
|
||||
"name": "shared_buffers",
|
||||
"value": "1MB",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "max_connections",
|
||||
"value": "100",
|
||||
"vartype": "integer"
|
||||
},
|
||||
{
|
||||
"name": "listen_addresses",
|
||||
"value": "0.0.0.0",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "max_wal_senders",
|
||||
"value": "10",
|
||||
"vartype": "integer"
|
||||
},
|
||||
{
|
||||
"name": "max_replication_slots",
|
||||
"value": "10",
|
||||
"vartype": "integer"
|
||||
},
|
||||
{
|
||||
"name": "wal_sender_timeout",
|
||||
"value": "5s",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "wal_keep_size",
|
||||
"value": "0",
|
||||
"vartype": "integer"
|
||||
},
|
||||
{
|
||||
"name": "password_encryption",
|
||||
"value": "md5",
|
||||
"vartype": "enum"
|
||||
},
|
||||
{
|
||||
"name": "restart_after_crash",
|
||||
"value": "off",
|
||||
"vartype": "bool"
|
||||
},
|
||||
{
|
||||
"name": "synchronous_standby_names",
|
||||
"value": "walproposer",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "shared_preload_libraries",
|
||||
"value": "neon,pg_cron,timescaledb,pg_stat_statements",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "neon.safekeepers",
|
||||
"value": "safekeeper1:5454,safekeeper2:5454,safekeeper3:5454",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "neon.timeline_id",
|
||||
"value": "TIMELINE_ID",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "neon.tenant_id",
|
||||
"value": "TENANT_ID",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "neon.pageserver_connstring",
|
||||
"value": "host=pageserver port=6400",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "max_replication_write_lag",
|
||||
"value": "500MB",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "max_replication_flush_lag",
|
||||
"value": "10GB",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "cron.database",
|
||||
"value": "postgres",
|
||||
"vartype": "string"
|
||||
}
|
||||
]
|
||||
},
|
||||
|
||||
"delta_operations": [
|
||||
]
|
||||
},
|
||||
"compute_ctl_config": {
|
||||
"jwks": {
|
||||
"keys": [
|
||||
{
|
||||
"use": "sig",
|
||||
"key_ops": [
|
||||
"verify"
|
||||
],
|
||||
"alg": "EdDSA",
|
||||
"kid": "ZGIxMzAzOGY0YWQwODk2ODU1MTk1NzMxMDFkYmUyOWU2NzZkOWNjNjMyMGRkZGJjOWY0MjdjYWVmNzE1MjUyOAo=",
|
||||
"kty": "OKP",
|
||||
"crv": "Ed25519",
|
||||
"x": "MGQ4ZDFhOTdmNTM0NmUwMDc3ZmJkN2Q0MWE0ZmI3M2NhNWE3YjFjOTNkM2IyYzRkZTQzOGM3MjBkZTk3N2E5ZAo="
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
141
docker-compose/compute_wrapper/var/db/postgres/specs/spec.json
Normal file
141
docker-compose/compute_wrapper/var/db/postgres/specs/spec.json
Normal file
@@ -0,0 +1,141 @@
|
||||
{
|
||||
"format_version": 1.0,
|
||||
|
||||
"timestamp": "2022-10-12T18:00:00.000Z",
|
||||
"operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8c",
|
||||
|
||||
"cluster": {
|
||||
"cluster_id": "docker_compose",
|
||||
"name": "docker_compose_test",
|
||||
"state": "restarted",
|
||||
"roles": [
|
||||
{
|
||||
"name": "cloud_admin",
|
||||
"encrypted_password": "b093c0d3b281ba6da1eacc608620abd8",
|
||||
"options": null
|
||||
}
|
||||
],
|
||||
"databases": [
|
||||
],
|
||||
"settings": [
|
||||
{
|
||||
"name": "fsync",
|
||||
"value": "off",
|
||||
"vartype": "bool"
|
||||
},
|
||||
{
|
||||
"name": "wal_level",
|
||||
"value": "logical",
|
||||
"vartype": "enum"
|
||||
},
|
||||
{
|
||||
"name": "wal_log_hints",
|
||||
"value": "on",
|
||||
"vartype": "bool"
|
||||
},
|
||||
{
|
||||
"name": "log_connections",
|
||||
"value": "on",
|
||||
"vartype": "bool"
|
||||
},
|
||||
{
|
||||
"name": "port",
|
||||
"value": "55433",
|
||||
"vartype": "integer"
|
||||
},
|
||||
{
|
||||
"name": "shared_buffers",
|
||||
"value": "1MB",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "max_connections",
|
||||
"value": "100",
|
||||
"vartype": "integer"
|
||||
},
|
||||
{
|
||||
"name": "listen_addresses",
|
||||
"value": "0.0.0.0",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "max_wal_senders",
|
||||
"value": "10",
|
||||
"vartype": "integer"
|
||||
},
|
||||
{
|
||||
"name": "max_replication_slots",
|
||||
"value": "10",
|
||||
"vartype": "integer"
|
||||
},
|
||||
{
|
||||
"name": "wal_sender_timeout",
|
||||
"value": "5s",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "wal_keep_size",
|
||||
"value": "0",
|
||||
"vartype": "integer"
|
||||
},
|
||||
{
|
||||
"name": "password_encryption",
|
||||
"value": "md5",
|
||||
"vartype": "enum"
|
||||
},
|
||||
{
|
||||
"name": "restart_after_crash",
|
||||
"value": "off",
|
||||
"vartype": "bool"
|
||||
},
|
||||
{
|
||||
"name": "synchronous_standby_names",
|
||||
"value": "walproposer",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "shared_preload_libraries",
|
||||
"value": "neon,pg_cron,timescaledb,pg_stat_statements",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "neon.safekeepers",
|
||||
"value": "safekeeper1:5454,safekeeper2:5454,safekeeper3:5454",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "neon.timeline_id",
|
||||
"value": "TIMELINE_ID",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "neon.tenant_id",
|
||||
"value": "TENANT_ID",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "neon.pageserver_connstring",
|
||||
"value": "host=pageserver port=6400",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "max_replication_write_lag",
|
||||
"value": "500MB",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "max_replication_flush_lag",
|
||||
"value": "10GB",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "cron.database",
|
||||
"value": "postgres",
|
||||
"vartype": "string"
|
||||
}
|
||||
]
|
||||
},
|
||||
|
||||
"delta_operations": [
|
||||
]
|
||||
}
|
||||
@@ -159,7 +159,7 @@ services:
|
||||
#- RUST_BACKTRACE=1
|
||||
# Mount the test files directly, for faster editing cycle.
|
||||
volumes:
|
||||
- ./compute_wrapper/var/db/postgres/configs/:/var/db/postgres/configs/
|
||||
- ./compute_wrapper/var/db/postgres/specs/:/var/db/postgres/specs/
|
||||
- ./compute_wrapper/shell/:/shell/
|
||||
ports:
|
||||
- 55433:55433 # pg protocol handler
|
||||
|
||||
@@ -1,8 +0,0 @@
|
||||
EXTENSION = pg_jsonschema
|
||||
DATA = pg_jsonschema--1.0.sql
|
||||
REGRESS = jsonschema_valid_api jsonschema_edge_cases
|
||||
REGRESS_OPTS = --load-extension=pg_jsonschema
|
||||
|
||||
PG_CONFIG ?= pg_config
|
||||
PGXS := $(shell $(PG_CONFIG) --pgxs)
|
||||
include $(PGXS)
|
||||
@@ -1,87 +0,0 @@
|
||||
-- Schema with enums, nulls, extra properties disallowed
|
||||
SELECT jsonschema_is_valid('{
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"status": { "type": "string", "enum": ["active", "inactive", "pending"] },
|
||||
"email": { "type": ["string", "null"], "format": "email" }
|
||||
},
|
||||
"required": ["status"],
|
||||
"additionalProperties": false
|
||||
}'::json);
|
||||
jsonschema_is_valid
|
||||
---------------------
|
||||
t
|
||||
(1 row)
|
||||
|
||||
-- Valid enum and null email
|
||||
SELECT jsonschema_validation_errors(
|
||||
'{
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"status": { "type": "string", "enum": ["active", "inactive", "pending"] },
|
||||
"email": { "type": ["string", "null"], "format": "email" }
|
||||
},
|
||||
"required": ["status"],
|
||||
"additionalProperties": false
|
||||
}'::json,
|
||||
'{"status": "active", "email": null}'::json
|
||||
);
|
||||
jsonschema_validation_errors
|
||||
------------------------------
|
||||
{}
|
||||
(1 row)
|
||||
|
||||
-- Invalid enum value
|
||||
SELECT jsonschema_validation_errors(
|
||||
'{
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"status": { "type": "string", "enum": ["active", "inactive", "pending"] },
|
||||
"email": { "type": ["string", "null"], "format": "email" }
|
||||
},
|
||||
"required": ["status"],
|
||||
"additionalProperties": false
|
||||
}'::json,
|
||||
'{"status": "disabled", "email": null}'::json
|
||||
);
|
||||
jsonschema_validation_errors
|
||||
----------------------------------------------------------------------
|
||||
{"\"disabled\" is not one of [\"active\",\"inactive\",\"pending\"]"}
|
||||
(1 row)
|
||||
|
||||
-- Invalid email format (assuming format is validated)
|
||||
SELECT jsonschema_validation_errors(
|
||||
'{
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"status": { "type": "string", "enum": ["active", "inactive", "pending"] },
|
||||
"email": { "type": ["string", "null"], "format": "email" }
|
||||
},
|
||||
"required": ["status"],
|
||||
"additionalProperties": false
|
||||
}'::json,
|
||||
'{"status": "active", "email": "not-an-email"}'::json
|
||||
);
|
||||
jsonschema_validation_errors
|
||||
-----------------------------------------
|
||||
{"\"not-an-email\" is not a \"email\""}
|
||||
(1 row)
|
||||
|
||||
-- Extra property not allowed
|
||||
SELECT jsonschema_validation_errors(
|
||||
'{
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"status": { "type": "string", "enum": ["active", "inactive", "pending"] },
|
||||
"email": { "type": ["string", "null"], "format": "email" }
|
||||
},
|
||||
"required": ["status"],
|
||||
"additionalProperties": false
|
||||
}'::json,
|
||||
'{"status": "active", "extra": "should not be here"}'::json
|
||||
);
|
||||
jsonschema_validation_errors
|
||||
--------------------------------------------------------------------
|
||||
{"Additional properties are not allowed ('extra' was unexpected)"}
|
||||
(1 row)
|
||||
|
||||
@@ -1,65 +0,0 @@
|
||||
-- Define schema
|
||||
SELECT jsonschema_is_valid('{
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"username": { "type": "string" },
|
||||
"age": { "type": "integer" }
|
||||
},
|
||||
"required": ["username"]
|
||||
}'::json);
|
||||
jsonschema_is_valid
|
||||
---------------------
|
||||
t
|
||||
(1 row)
|
||||
|
||||
-- Valid instance
|
||||
SELECT jsonschema_validation_errors(
|
||||
'{
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"username": { "type": "string" },
|
||||
"age": { "type": "integer" }
|
||||
},
|
||||
"required": ["username"]
|
||||
}'::json,
|
||||
'{"username": "alice", "age": 25}'::json
|
||||
);
|
||||
jsonschema_validation_errors
|
||||
------------------------------
|
||||
{}
|
||||
(1 row)
|
||||
|
||||
-- Invalid instance: missing required "username"
|
||||
SELECT jsonschema_validation_errors(
|
||||
'{
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"username": { "type": "string" },
|
||||
"age": { "type": "integer" }
|
||||
},
|
||||
"required": ["username"]
|
||||
}'::json,
|
||||
'{"age": 25}'::json
|
||||
);
|
||||
jsonschema_validation_errors
|
||||
-----------------------------------------
|
||||
{"\"username\" is a required property"}
|
||||
(1 row)
|
||||
|
||||
-- Invalid instance: wrong type for "age"
|
||||
SELECT jsonschema_validation_errors(
|
||||
'{
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"username": { "type": "string" },
|
||||
"age": { "type": "integer" }
|
||||
},
|
||||
"required": ["username"]
|
||||
}'::json,
|
||||
'{"username": "bob", "age": "twenty"}'::json
|
||||
);
|
||||
jsonschema_validation_errors
|
||||
-------------------------------------------
|
||||
{"\"twenty\" is not of type \"integer\""}
|
||||
(1 row)
|
||||
|
||||
@@ -1,66 +0,0 @@
|
||||
-- Schema with enums, nulls, extra properties disallowed
|
||||
SELECT jsonschema_is_valid('{
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"status": { "type": "string", "enum": ["active", "inactive", "pending"] },
|
||||
"email": { "type": ["string", "null"], "format": "email" }
|
||||
},
|
||||
"required": ["status"],
|
||||
"additionalProperties": false
|
||||
}'::json);
|
||||
|
||||
-- Valid enum and null email
|
||||
SELECT jsonschema_validation_errors(
|
||||
'{
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"status": { "type": "string", "enum": ["active", "inactive", "pending"] },
|
||||
"email": { "type": ["string", "null"], "format": "email" }
|
||||
},
|
||||
"required": ["status"],
|
||||
"additionalProperties": false
|
||||
}'::json,
|
||||
'{"status": "active", "email": null}'::json
|
||||
);
|
||||
|
||||
-- Invalid enum value
|
||||
SELECT jsonschema_validation_errors(
|
||||
'{
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"status": { "type": "string", "enum": ["active", "inactive", "pending"] },
|
||||
"email": { "type": ["string", "null"], "format": "email" }
|
||||
},
|
||||
"required": ["status"],
|
||||
"additionalProperties": false
|
||||
}'::json,
|
||||
'{"status": "disabled", "email": null}'::json
|
||||
);
|
||||
|
||||
-- Invalid email format (assuming format is validated)
|
||||
SELECT jsonschema_validation_errors(
|
||||
'{
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"status": { "type": "string", "enum": ["active", "inactive", "pending"] },
|
||||
"email": { "type": ["string", "null"], "format": "email" }
|
||||
},
|
||||
"required": ["status"],
|
||||
"additionalProperties": false
|
||||
}'::json,
|
||||
'{"status": "active", "email": "not-an-email"}'::json
|
||||
);
|
||||
|
||||
-- Extra property not allowed
|
||||
SELECT jsonschema_validation_errors(
|
||||
'{
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"status": { "type": "string", "enum": ["active", "inactive", "pending"] },
|
||||
"email": { "type": ["string", "null"], "format": "email" }
|
||||
},
|
||||
"required": ["status"],
|
||||
"additionalProperties": false
|
||||
}'::json,
|
||||
'{"status": "active", "extra": "should not be here"}'::json
|
||||
);
|
||||
@@ -1,48 +0,0 @@
|
||||
-- Define schema
|
||||
SELECT jsonschema_is_valid('{
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"username": { "type": "string" },
|
||||
"age": { "type": "integer" }
|
||||
},
|
||||
"required": ["username"]
|
||||
}'::json);
|
||||
|
||||
-- Valid instance
|
||||
SELECT jsonschema_validation_errors(
|
||||
'{
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"username": { "type": "string" },
|
||||
"age": { "type": "integer" }
|
||||
},
|
||||
"required": ["username"]
|
||||
}'::json,
|
||||
'{"username": "alice", "age": 25}'::json
|
||||
);
|
||||
|
||||
-- Invalid instance: missing required "username"
|
||||
SELECT jsonschema_validation_errors(
|
||||
'{
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"username": { "type": "string" },
|
||||
"age": { "type": "integer" }
|
||||
},
|
||||
"required": ["username"]
|
||||
}'::json,
|
||||
'{"age": 25}'::json
|
||||
);
|
||||
|
||||
-- Invalid instance: wrong type for "age"
|
||||
SELECT jsonschema_validation_errors(
|
||||
'{
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"username": { "type": "string" },
|
||||
"age": { "type": "integer" }
|
||||
},
|
||||
"required": ["username"]
|
||||
}'::json,
|
||||
'{"username": "bob", "age": "twenty"}'::json
|
||||
);
|
||||
@@ -1,9 +0,0 @@
|
||||
EXTENSION = pg_session_jwt
|
||||
|
||||
REGRESS = basic_functions
|
||||
REGRESS_OPTS = --load-extension=$(EXTENSION)
|
||||
export PGOPTIONS = -c pg_session_jwt.jwk={"crv":"Ed25519","kty":"OKP","x":"R_Abz-63zJ00l-IraL5fQhwkhGVZCSooQFV5ntC3C7M"}
|
||||
|
||||
PG_CONFIG ?= pg_config
|
||||
PGXS := $(shell $(PG_CONFIG) --pgxs)
|
||||
include $(PGXS)
|
||||
@@ -1,35 +0,0 @@
|
||||
-- Basic functionality tests for pg_session_jwt
|
||||
-- Test auth.init() function
|
||||
SELECT auth.init();
|
||||
init
|
||||
------
|
||||
|
||||
(1 row)
|
||||
|
||||
-- Test an invalid JWT
|
||||
SELECT auth.jwt_session_init('INVALID-JWT');
|
||||
ERROR: invalid JWT encoding
|
||||
-- Test creating a session with an expired JWT
|
||||
SELECT auth.jwt_session_init('eyJhbGciOiJFZERTQSJ9.eyJleHAiOjE3NDI1NjQ0MzIsImlhdCI6MTc0MjU2NDI1MiwianRpIjo0MjQyNDIsInN1YiI6InVzZXIxMjMifQ.A6FwKuaSduHB9O7Gz37g0uoD_U9qVS0JNtT7YABGVgB7HUD1AMFc9DeyhNntWBqncg8k5brv-hrNTuUh5JYMAw');
|
||||
ERROR: Token used after it has expired
|
||||
-- Test creating a session with a valid JWT
|
||||
SELECT auth.jwt_session_init('eyJhbGciOiJFZERTQSJ9.eyJleHAiOjQ4OTYxNjQyNTIsImlhdCI6MTc0MjU2NDI1MiwianRpIjo0MzQzNDMsInN1YiI6InVzZXIxMjMifQ.2TXVgjb6JSUq6_adlvp-m_SdOxZSyGS30RS9TLB0xu2N83dMSs2NybwE1NMU8Fb0tcAZR_ET7M2rSxbTrphfCg');
|
||||
jwt_session_init
|
||||
------------------
|
||||
|
||||
(1 row)
|
||||
|
||||
-- Test auth.session() function
|
||||
SELECT auth.session();
|
||||
session
|
||||
-------------------------------------------------------------------------
|
||||
{"exp": 4896164252, "iat": 1742564252, "jti": 434343, "sub": "user123"}
|
||||
(1 row)
|
||||
|
||||
-- Test auth.user_id() function
|
||||
SELECT auth.user_id() AS user_id;
|
||||
user_id
|
||||
---------
|
||||
user123
|
||||
(1 row)
|
||||
|
||||
@@ -1,19 +0,0 @@
|
||||
-- Basic functionality tests for pg_session_jwt
|
||||
|
||||
-- Test auth.init() function
|
||||
SELECT auth.init();
|
||||
|
||||
-- Test an invalid JWT
|
||||
SELECT auth.jwt_session_init('INVALID-JWT');
|
||||
|
||||
-- Test creating a session with an expired JWT
|
||||
SELECT auth.jwt_session_init('eyJhbGciOiJFZERTQSJ9.eyJleHAiOjE3NDI1NjQ0MzIsImlhdCI6MTc0MjU2NDI1MiwianRpIjo0MjQyNDIsInN1YiI6InVzZXIxMjMifQ.A6FwKuaSduHB9O7Gz37g0uoD_U9qVS0JNtT7YABGVgB7HUD1AMFc9DeyhNntWBqncg8k5brv-hrNTuUh5JYMAw');
|
||||
|
||||
-- Test creating a session with a valid JWT
|
||||
SELECT auth.jwt_session_init('eyJhbGciOiJFZERTQSJ9.eyJleHAiOjQ4OTYxNjQyNTIsImlhdCI6MTc0MjU2NDI1MiwianRpIjo0MzQzNDMsInN1YiI6InVzZXIxMjMifQ.2TXVgjb6JSUq6_adlvp-m_SdOxZSyGS30RS9TLB0xu2N83dMSs2NybwE1NMU8Fb0tcAZR_ET7M2rSxbTrphfCg');
|
||||
|
||||
-- Test auth.session() function
|
||||
SELECT auth.session();
|
||||
|
||||
-- Test auth.user_id() function
|
||||
SELECT auth.user_id() AS user_id;
|
||||
@@ -14,32 +14,6 @@ pub struct GenericAPIError {
|
||||
pub error: String,
|
||||
}
|
||||
|
||||
/// All configuration parameters necessary for a compute. When
|
||||
/// [`ComputeConfig::spec`] is provided, it means that the compute is attached
|
||||
/// to a tenant. [`ComputeConfig::compute_ctl_config`] will always be provided
|
||||
/// and contains parameters necessary for operating `compute_ctl` independently
|
||||
/// of whether a tenant is attached to the compute or not.
|
||||
///
|
||||
/// This also happens to be the body of `compute_ctl`'s /configure request.
|
||||
#[derive(Debug, Deserialize, Serialize)]
|
||||
pub struct ComputeConfig {
|
||||
/// The compute spec
|
||||
pub spec: Option<ComputeSpec>,
|
||||
|
||||
/// The compute_ctl configuration
|
||||
#[allow(dead_code)]
|
||||
pub compute_ctl_config: ComputeCtlConfig,
|
||||
}
|
||||
|
||||
impl From<ControlPlaneConfigResponse> for ComputeConfig {
|
||||
fn from(value: ControlPlaneConfigResponse) -> Self {
|
||||
Self {
|
||||
spec: value.spec,
|
||||
compute_ctl_config: value.compute_ctl_config,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub struct ExtensionInstallResponse {
|
||||
pub extension: PgIdent,
|
||||
@@ -160,7 +134,7 @@ pub struct CatalogObjects {
|
||||
pub databases: Vec<Database>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
|
||||
#[derive(Clone, Debug, Deserialize, Serialize)]
|
||||
pub struct ComputeCtlConfig {
|
||||
/// Set of JSON web keys that the compute can use to authenticate
|
||||
/// communication from the control plane.
|
||||
@@ -179,7 +153,7 @@ impl Default for ComputeCtlConfig {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
|
||||
#[derive(Clone, Debug, Deserialize, Serialize)]
|
||||
pub struct TlsConfig {
|
||||
pub key_path: String,
|
||||
pub cert_path: String,
|
||||
@@ -187,7 +161,7 @@ pub struct TlsConfig {
|
||||
|
||||
/// Response of the `/computes/{compute_id}/spec` control-plane API.
|
||||
#[derive(Deserialize, Debug)]
|
||||
pub struct ControlPlaneConfigResponse {
|
||||
pub struct ControlPlaneSpecResponse {
|
||||
pub spec: Option<ComputeSpec>,
|
||||
pub status: ControlPlaneComputeStatus,
|
||||
pub compute_ctl_config: ComputeCtlConfig,
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
//! The ComputeSpec contains all the information needed to start up
|
||||
//! the right version of PostgreSQL, and connect it to the storage nodes.
|
||||
//! It can be passed as part of the `config.json`, or the control plane can
|
||||
//! provide it by calling the compute_ctl's `/compute_ctl` endpoint, or
|
||||
//! compute_ctl can fetch it by calling the control plane's API.
|
||||
//! `ComputeSpec` represents the contents of the spec.json file.
|
||||
//!
|
||||
//! The spec.json file is used to pass information to 'compute_ctl'. It contains
|
||||
//! all the information needed to start up the right version of PostgreSQL,
|
||||
//! and connect it to the storage nodes.
|
||||
use std::collections::HashMap;
|
||||
|
||||
use indexmap::IndexMap;
|
||||
@@ -104,12 +104,6 @@ pub struct ComputeSpec {
|
||||
pub timeline_id: Option<TimelineId>,
|
||||
pub pageserver_connstring: Option<String>,
|
||||
|
||||
// More neon ids that we expose to the compute_ctl
|
||||
// and to postgres as neon extension GUCs.
|
||||
pub project_id: Option<String>,
|
||||
pub branch_id: Option<String>,
|
||||
pub endpoint_id: Option<String>,
|
||||
|
||||
/// Safekeeper membership config generation. It is put in
|
||||
/// neon.safekeepers GUC and serves two purposes:
|
||||
/// 1) Non zero value forces walproposer to use membership configurations.
|
||||
@@ -165,7 +159,13 @@ pub struct ComputeSpec {
|
||||
#[serde(default)] // Default false
|
||||
pub drop_subscriptions_before_start: bool,
|
||||
|
||||
/// Log level for compute audit logging
|
||||
/// Log level for audit logging:
|
||||
///
|
||||
/// Disabled - no audit logging. This is the default.
|
||||
/// log - log masked statements to the postgres log using pgaudit extension
|
||||
/// hipaa - log unmasked statements to the file using pgaudit and pgauditlogtofile extension
|
||||
///
|
||||
/// Extensions should be present in shared_preload_libraries
|
||||
#[serde(default)]
|
||||
pub audit_log_level: ComputeAudit,
|
||||
|
||||
@@ -289,25 +289,14 @@ impl ComputeMode {
|
||||
}
|
||||
|
||||
/// Log level for audit logging
|
||||
/// Disabled, log, hipaa
|
||||
/// Default is Disabled
|
||||
#[derive(Clone, Debug, Default, Eq, PartialEq, Deserialize, Serialize)]
|
||||
pub enum ComputeAudit {
|
||||
#[default]
|
||||
Disabled,
|
||||
// Deprecated, use Base instead
|
||||
Log,
|
||||
// (pgaudit.log = 'ddl', pgaudit.log_parameter='off')
|
||||
// logged to the standard postgresql log stream
|
||||
Base,
|
||||
// Deprecated, use Full or Extended instead
|
||||
Hipaa,
|
||||
// (pgaudit.log = 'all, -misc', pgaudit.log_parameter='off')
|
||||
// logged to separate files collected by rsyslog
|
||||
// into dedicated log storage with strict access
|
||||
Extended,
|
||||
// (pgaudit.log='all', pgaudit.log_parameter='on'),
|
||||
// logged to separate files collected by rsyslog
|
||||
// into dedicated log storage with strict access.
|
||||
Full,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Default, Deserialize, Serialize, PartialEq, Eq)]
|
||||
|
||||
@@ -14,7 +14,6 @@ futures.workspace = true
|
||||
hyper0.workspace = true
|
||||
itertools.workspace = true
|
||||
jemalloc_pprof.workspace = true
|
||||
jsonwebtoken.workspace = true
|
||||
once_cell.workspace = true
|
||||
pprof.workspace = true
|
||||
regex.workspace = true
|
||||
@@ -31,7 +30,6 @@ tokio.workspace = true
|
||||
tracing.workspace = true
|
||||
url.workspace = true
|
||||
uuid.workspace = true
|
||||
x509-cert.workspace = true
|
||||
|
||||
# to use tokio channels as streams, this is faster to compile than async_stream
|
||||
# why is it only here? no other crate should use it, streams are rarely needed.
|
||||
|
||||
@@ -8,7 +8,6 @@ use bytes::{Bytes, BytesMut};
|
||||
use hyper::header::{AUTHORIZATION, CONTENT_DISPOSITION, CONTENT_TYPE, HeaderName};
|
||||
use hyper::http::HeaderValue;
|
||||
use hyper::{Body, Method, Request, Response};
|
||||
use jsonwebtoken::TokenData;
|
||||
use metrics::{Encoder, IntCounter, TextEncoder, register_int_counter};
|
||||
use once_cell::sync::Lazy;
|
||||
use pprof::ProfilerGuardBuilder;
|
||||
@@ -619,7 +618,7 @@ pub fn auth_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
|
||||
})?;
|
||||
let token = parse_token(header_value)?;
|
||||
|
||||
let data: TokenData<Claims> = auth.decode(token).map_err(|err| {
|
||||
let data = auth.decode(token).map_err(|err| {
|
||||
warn!("Authentication error: {err}");
|
||||
// Rely on From<AuthError> for ApiError impl
|
||||
err
|
||||
|
||||
@@ -4,8 +4,6 @@ use futures::StreamExt;
|
||||
use futures::stream::FuturesUnordered;
|
||||
use hyper0::Body;
|
||||
use hyper0::server::conn::Http;
|
||||
use metrics::{IntCounterVec, register_int_counter_vec};
|
||||
use once_cell::sync::Lazy;
|
||||
use routerify::{RequestService, RequestServiceBuilder};
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tokio_rustls::TlsAcceptor;
|
||||
@@ -28,24 +26,6 @@ pub struct Server {
|
||||
tls_acceptor: Option<TlsAcceptor>,
|
||||
}
|
||||
|
||||
static CONNECTION_STARTED_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"http_server_connection_started_total",
|
||||
"Number of established http/https connections",
|
||||
&["scheme"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static CONNECTION_ERROR_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"http_server_connection_errors_total",
|
||||
"Number of occured connection errors by type",
|
||||
&["type"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
impl Server {
|
||||
pub fn new(
|
||||
request_service: Arc<RequestServiceBuilder<Body, ApiError>>,
|
||||
@@ -80,15 +60,6 @@ impl Server {
|
||||
false
|
||||
}
|
||||
|
||||
let tcp_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["tcp"]);
|
||||
let tls_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["tls"]);
|
||||
let http_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["http"]);
|
||||
let https_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["https"]);
|
||||
let panic_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["panic"]);
|
||||
|
||||
let http_connection_cnt = CONNECTION_STARTED_COUNT.with_label_values(&["http"]);
|
||||
let https_connection_cnt = CONNECTION_STARTED_COUNT.with_label_values(&["https"]);
|
||||
|
||||
let mut connections = FuturesUnordered::new();
|
||||
loop {
|
||||
tokio::select! {
|
||||
@@ -96,7 +67,6 @@ impl Server {
|
||||
let (tcp_stream, remote_addr) = match stream {
|
||||
Ok(stream) => stream,
|
||||
Err(err) => {
|
||||
tcp_error_cnt.inc();
|
||||
if !suppress_io_error(&err) {
|
||||
info!("Failed to accept TCP connection: {err:#}");
|
||||
}
|
||||
@@ -108,18 +78,11 @@ impl Server {
|
||||
let tls_acceptor = self.tls_acceptor.clone();
|
||||
let cancel = cancel.clone();
|
||||
|
||||
let tls_error_cnt = tls_error_cnt.clone();
|
||||
let http_error_cnt = http_error_cnt.clone();
|
||||
let https_error_cnt = https_error_cnt.clone();
|
||||
let http_connection_cnt = http_connection_cnt.clone();
|
||||
let https_connection_cnt = https_connection_cnt.clone();
|
||||
|
||||
connections.push(tokio::spawn(
|
||||
async move {
|
||||
match tls_acceptor {
|
||||
Some(tls_acceptor) => {
|
||||
// Handle HTTPS connection.
|
||||
https_connection_cnt.inc();
|
||||
let tls_stream = tokio::select! {
|
||||
tls_stream = tls_acceptor.accept(tcp_stream) => tls_stream,
|
||||
_ = cancel.cancelled() => return,
|
||||
@@ -127,7 +90,6 @@ impl Server {
|
||||
let tls_stream = match tls_stream {
|
||||
Ok(tls_stream) => tls_stream,
|
||||
Err(err) => {
|
||||
tls_error_cnt.inc();
|
||||
if !suppress_io_error(&err) {
|
||||
info!(%remote_addr, "Failed to accept TLS connection: {err:#}");
|
||||
}
|
||||
@@ -135,7 +97,6 @@ impl Server {
|
||||
}
|
||||
};
|
||||
if let Err(err) = Self::serve_connection(tls_stream, service, cancel).await {
|
||||
https_error_cnt.inc();
|
||||
if !suppress_hyper_error(&err) {
|
||||
info!(%remote_addr, "Failed to serve HTTPS connection: {err:#}");
|
||||
}
|
||||
@@ -143,9 +104,7 @@ impl Server {
|
||||
}
|
||||
None => {
|
||||
// Handle HTTP connection.
|
||||
http_connection_cnt.inc();
|
||||
if let Err(err) = Self::serve_connection(tcp_stream, service, cancel).await {
|
||||
http_error_cnt.inc();
|
||||
if !suppress_hyper_error(&err) {
|
||||
info!(%remote_addr, "Failed to serve HTTP connection: {err:#}");
|
||||
}
|
||||
@@ -156,7 +115,6 @@ impl Server {
|
||||
}
|
||||
Some(conn) = connections.next() => {
|
||||
if let Err(err) = conn {
|
||||
panic_error_cnt.inc();
|
||||
error!("Connection panicked: {err:#}");
|
||||
}
|
||||
}
|
||||
@@ -164,7 +122,6 @@ impl Server {
|
||||
// Wait for graceful shutdown of all connections.
|
||||
while let Some(conn) = connections.next().await {
|
||||
if let Err(err) = conn {
|
||||
panic_error_cnt.inc();
|
||||
error!("Connection panicked: {err:#}");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,14 +3,11 @@ use std::{sync::Arc, time::Duration};
|
||||
use anyhow::Context;
|
||||
use arc_swap::ArcSwap;
|
||||
use camino::Utf8Path;
|
||||
use metrics::{IntCounterVec, UIntGaugeVec, register_int_counter_vec, register_uint_gauge_vec};
|
||||
use once_cell::sync::Lazy;
|
||||
use rustls::{
|
||||
pki_types::{CertificateDer, PrivateKeyDer, UnixTime},
|
||||
pki_types::{CertificateDer, PrivateKeyDer},
|
||||
server::{ClientHello, ResolvesServerCert},
|
||||
sign::CertifiedKey,
|
||||
};
|
||||
use x509_cert::der::Reader;
|
||||
|
||||
pub async fn load_cert_chain(filename: &Utf8Path) -> anyhow::Result<Vec<CertificateDer<'static>>> {
|
||||
let cert_data = tokio::fs::read(filename)
|
||||
@@ -56,76 +53,6 @@ pub async fn load_certified_key(
|
||||
Ok(certified_key)
|
||||
}
|
||||
|
||||
/// rustls's CertifiedKey with extra parsed fields used for metrics.
|
||||
struct ParsedCertifiedKey {
|
||||
certified_key: CertifiedKey,
|
||||
expiration_time: UnixTime,
|
||||
}
|
||||
|
||||
/// Parse expiration time from an X509 certificate.
|
||||
fn parse_expiration_time(cert: &CertificateDer<'_>) -> anyhow::Result<UnixTime> {
|
||||
let parsed_cert = x509_cert::der::SliceReader::new(cert)
|
||||
.context("Failed to parse cerficiate")?
|
||||
.decode::<x509_cert::Certificate>()
|
||||
.context("Failed to parse cerficiate")?;
|
||||
|
||||
Ok(UnixTime::since_unix_epoch(
|
||||
parsed_cert
|
||||
.tbs_certificate
|
||||
.validity
|
||||
.not_after
|
||||
.to_unix_duration(),
|
||||
))
|
||||
}
|
||||
|
||||
async fn load_and_parse_certified_key(
|
||||
key_filename: &Utf8Path,
|
||||
cert_filename: &Utf8Path,
|
||||
) -> anyhow::Result<ParsedCertifiedKey> {
|
||||
let certified_key = load_certified_key(key_filename, cert_filename).await?;
|
||||
let expiration_time = parse_expiration_time(certified_key.end_entity_cert()?)?;
|
||||
Ok(ParsedCertifiedKey {
|
||||
certified_key,
|
||||
expiration_time,
|
||||
})
|
||||
}
|
||||
|
||||
static CERT_EXPIRATION_TIME: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"tls_certs_expiration_time_seconds",
|
||||
"Expiration time of the loaded certificate since unix epoch in seconds",
|
||||
&["resolver_name"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static CERT_RELOAD_STARTED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"tls_certs_reload_started_total",
|
||||
"Number of certificate reload loop iterations started",
|
||||
&["resolver_name"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static CERT_RELOAD_UPDATED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"tls_certs_reload_updated_total",
|
||||
"Number of times the certificate was updated to the new one",
|
||||
&["resolver_name"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static CERT_RELOAD_FAILED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"tls_certs_reload_failed_total",
|
||||
"Number of times the certificate reload failed",
|
||||
&["resolver_name"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
/// Implementation of [`rustls::server::ResolvesServerCert`] which reloads certificates from
|
||||
/// the disk periodically.
|
||||
#[derive(Debug)]
|
||||
@@ -136,28 +63,16 @@ pub struct ReloadingCertificateResolver {
|
||||
impl ReloadingCertificateResolver {
|
||||
/// Creates a new Resolver by loading certificate and private key from FS and
|
||||
/// creating tokio::task to reload them with provided reload_period.
|
||||
/// resolver_name is used as metric's label.
|
||||
pub async fn new(
|
||||
resolver_name: &str,
|
||||
key_filename: &Utf8Path,
|
||||
cert_filename: &Utf8Path,
|
||||
reload_period: Duration,
|
||||
) -> anyhow::Result<Arc<Self>> {
|
||||
// Create metrics for current resolver.
|
||||
let cert_expiration_time = CERT_EXPIRATION_TIME.with_label_values(&[resolver_name]);
|
||||
let cert_reload_started_counter =
|
||||
CERT_RELOAD_STARTED_COUNTER.with_label_values(&[resolver_name]);
|
||||
let cert_reload_updated_counter =
|
||||
CERT_RELOAD_UPDATED_COUNTER.with_label_values(&[resolver_name]);
|
||||
let cert_reload_failed_counter =
|
||||
CERT_RELOAD_FAILED_COUNTER.with_label_values(&[resolver_name]);
|
||||
|
||||
let parsed_key = load_and_parse_certified_key(key_filename, cert_filename).await?;
|
||||
|
||||
let this = Arc::new(Self {
|
||||
certified_key: ArcSwap::from_pointee(parsed_key.certified_key),
|
||||
certified_key: ArcSwap::from_pointee(
|
||||
load_certified_key(key_filename, cert_filename).await?,
|
||||
),
|
||||
});
|
||||
cert_expiration_time.set(parsed_key.expiration_time.as_secs());
|
||||
|
||||
tokio::spawn({
|
||||
let weak_this = Arc::downgrade(&this);
|
||||
@@ -173,22 +88,17 @@ impl ReloadingCertificateResolver {
|
||||
Some(this) => this,
|
||||
None => break, // Resolver has been destroyed, exit.
|
||||
};
|
||||
cert_reload_started_counter.inc();
|
||||
|
||||
match load_and_parse_certified_key(&key_filename, &cert_filename).await {
|
||||
Ok(parsed_key) => {
|
||||
if parsed_key.certified_key.cert == this.certified_key.load().cert {
|
||||
match load_certified_key(&key_filename, &cert_filename).await {
|
||||
Ok(new_certified_key) => {
|
||||
if new_certified_key.cert == this.certified_key.load().cert {
|
||||
tracing::debug!("Certificate has not changed since last reloading");
|
||||
} else {
|
||||
tracing::info!("Certificate has been reloaded");
|
||||
this.certified_key.store(Arc::new(parsed_key.certified_key));
|
||||
cert_expiration_time.set(parsed_key.expiration_time.as_secs());
|
||||
cert_reload_updated_counter.inc();
|
||||
this.certified_key.store(Arc::new(new_certified_key));
|
||||
}
|
||||
last_reload_failed = false;
|
||||
}
|
||||
Err(err) => {
|
||||
cert_reload_failed_counter.inc();
|
||||
// Note: Reloading certs may fail if it conflicts with the script updating
|
||||
// the files at the same time. Warn only if the error is persistent.
|
||||
if last_reload_failed {
|
||||
|
||||
@@ -35,7 +35,6 @@ nix = {workspace = true, optional = true}
|
||||
reqwest.workspace = true
|
||||
rand.workspace = true
|
||||
tracing-utils.workspace = true
|
||||
once_cell.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
bincode.workspace = true
|
||||
|
||||
@@ -180,7 +180,6 @@ pub struct ConfigToml {
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub generate_unarchival_heatmap: Option<bool>,
|
||||
pub tracing: Option<Tracing>,
|
||||
pub enable_tls_page_service_api: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
@@ -207,10 +206,6 @@ pub struct PageServicePipeliningConfigPipelined {
|
||||
/// Causes runtime errors if larger than max get_vectored batch size.
|
||||
pub max_batch_size: NonZeroUsize,
|
||||
pub execution: PageServiceProtocolPipelinedExecutionStrategy,
|
||||
// The default below is such that new versions of the software can start
|
||||
// with the old configuration.
|
||||
#[serde(default)]
|
||||
pub batching: PageServiceProtocolPipelinedBatchingStrategy,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
@@ -220,19 +215,6 @@ pub enum PageServiceProtocolPipelinedExecutionStrategy {
|
||||
Tasks,
|
||||
}
|
||||
|
||||
#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
#[serde(rename_all = "kebab-case")]
|
||||
pub enum PageServiceProtocolPipelinedBatchingStrategy {
|
||||
/// All get page requests in a batch will be at the same LSN
|
||||
#[default]
|
||||
UniformLsn,
|
||||
/// Get page requests in a batch may be at different LSN
|
||||
///
|
||||
/// One key cannot be present more than once at different LSNs in
|
||||
/// the same batch.
|
||||
ScatteredLsn,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
#[serde(tag = "mode", rename_all = "kebab-case")]
|
||||
pub enum GetVectoredConcurrentIo {
|
||||
@@ -379,8 +361,6 @@ pub struct TenantConfigToml {
|
||||
/// size exceeds `compaction_upper_limit * checkpoint_distance`.
|
||||
pub compaction_upper_limit: usize,
|
||||
pub compaction_algorithm: crate::models::CompactionAlgorithmSettings,
|
||||
/// If true, enable shard ancestor compaction (enabled by default).
|
||||
pub compaction_shard_ancestor: bool,
|
||||
/// If true, compact down L0 across all tenant timelines before doing regular compaction. L0
|
||||
/// compaction must be responsive to avoid read amp during heavy ingestion. Defaults to true.
|
||||
pub compaction_l0_first: bool,
|
||||
@@ -471,8 +451,6 @@ pub struct TenantConfigToml {
|
||||
// gc-compaction related configs
|
||||
/// Enable automatic gc-compaction trigger on this tenant.
|
||||
pub gc_compaction_enabled: bool,
|
||||
/// Enable verification of gc-compaction results.
|
||||
pub gc_compaction_verification: bool,
|
||||
/// The initial threshold for gc-compaction in KB. Once the total size of layers below the gc-horizon is above this threshold,
|
||||
/// gc-compaction will be triggered.
|
||||
pub gc_compaction_initial_threshold_kb: u64,
|
||||
@@ -634,12 +612,9 @@ impl Default for ConfigToml {
|
||||
page_service_pipelining: if !cfg!(test) {
|
||||
PageServicePipeliningConfig::Serial
|
||||
} else {
|
||||
// Do not turn this into the default until scattered reads have been
|
||||
// validated and rolled-out fully.
|
||||
PageServicePipeliningConfig::Pipelined(PageServicePipeliningConfigPipelined {
|
||||
max_batch_size: NonZeroUsize::new(32).unwrap(),
|
||||
execution: PageServiceProtocolPipelinedExecutionStrategy::ConcurrentFutures,
|
||||
batching: PageServiceProtocolPipelinedBatchingStrategy::ScatteredLsn,
|
||||
})
|
||||
},
|
||||
get_vectored_concurrent_io: if !cfg!(test) {
|
||||
@@ -656,7 +631,6 @@ impl Default for ConfigToml {
|
||||
load_previous_heatmap: None,
|
||||
generate_unarchival_heatmap: None,
|
||||
tracing: None,
|
||||
enable_tls_page_service_api: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -679,7 +653,6 @@ pub mod tenant_conf_defaults {
|
||||
|
||||
pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s";
|
||||
pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10;
|
||||
pub const DEFAULT_COMPACTION_SHARD_ANCESTOR: bool = true;
|
||||
|
||||
// This value needs to be tuned to avoid OOM. We have 3/4*CPUs threads for L0 compaction, that's
|
||||
// 3/4*16=9 on most of our pageservers. Compacting 20 layers requires about 1 GB memory (could
|
||||
@@ -717,7 +690,6 @@ pub mod tenant_conf_defaults {
|
||||
// image layers should be created.
|
||||
pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2;
|
||||
pub const DEFAULT_GC_COMPACTION_ENABLED: bool = false;
|
||||
pub const DEFAULT_GC_COMPACTION_VERIFICATION: bool = true;
|
||||
pub const DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB: u64 = 5 * 1024 * 1024; // 5GB
|
||||
pub const DEFAULT_GC_COMPACTION_RATIO_PERCENT: u64 = 100;
|
||||
}
|
||||
@@ -737,7 +709,6 @@ impl Default for TenantConfigToml {
|
||||
compaction_algorithm: crate::models::CompactionAlgorithmSettings {
|
||||
kind: DEFAULT_COMPACTION_ALGORITHM,
|
||||
},
|
||||
compaction_shard_ancestor: DEFAULT_COMPACTION_SHARD_ANCESTOR,
|
||||
compaction_l0_first: DEFAULT_COMPACTION_L0_FIRST,
|
||||
compaction_l0_semaphore: DEFAULT_COMPACTION_L0_SEMAPHORE,
|
||||
l0_flush_delay_threshold: None,
|
||||
@@ -773,7 +744,6 @@ impl Default for TenantConfigToml {
|
||||
wal_receiver_protocol_override: None,
|
||||
rel_size_v2_enabled: false,
|
||||
gc_compaction_enabled: DEFAULT_GC_COMPACTION_ENABLED,
|
||||
gc_compaction_verification: DEFAULT_GC_COMPACTION_VERIFICATION,
|
||||
gc_compaction_initial_threshold_kb: DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB,
|
||||
gc_compaction_ratio_percent: DEFAULT_GC_COMPACTION_RATIO_PERCENT,
|
||||
sampling_ratio: None,
|
||||
|
||||
@@ -7,8 +7,7 @@ use std::time::{Duration, Instant};
|
||||
/// API (`/control/v1` prefix). Implemented by the server
|
||||
/// in [`storage_controller::http`]
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::id::{NodeId, TenantId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
use utils::id::{NodeId, TenantId};
|
||||
|
||||
use crate::models::{PageserverUtilization, ShardParameters, TenantConfig};
|
||||
use crate::shard::{ShardStripeSize, TenantShardId};
|
||||
@@ -500,15 +499,6 @@ pub struct SafekeeperSchedulingPolicyRequest {
|
||||
pub scheduling_policy: SkSchedulingPolicy,
|
||||
}
|
||||
|
||||
/// Import request for safekeeper timelines.
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
pub struct TimelineImportRequest {
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: TimelineId,
|
||||
pub start_lsn: Lsn,
|
||||
pub sk_set: Vec<NodeId>,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use serde_json;
|
||||
|
||||
@@ -927,7 +927,7 @@ impl Key {
|
||||
|
||||
/// Guaranteed to return `Ok()` if [`Self::is_rel_block_key`] returns `true` for `key`.
|
||||
#[inline(always)]
|
||||
pub fn to_rel_block(self) -> Result<(RelTag, BlockNumber), ToRelBlockError> {
|
||||
pub fn to_rel_block(self) -> anyhow::Result<(RelTag, BlockNumber)> {
|
||||
Ok(match self.field1 {
|
||||
0x00 => (
|
||||
RelTag {
|
||||
@@ -938,7 +938,7 @@ impl Key {
|
||||
},
|
||||
self.field6,
|
||||
),
|
||||
_ => return Err(ToRelBlockError(self.field1)),
|
||||
_ => anyhow::bail!("unexpected value kind 0x{:02x}", self.field1),
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -951,17 +951,6 @@ impl std::str::FromStr for Key {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct ToRelBlockError(u8);
|
||||
|
||||
impl fmt::Display for ToRelBlockError {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(f, "unexpected value kind 0x{:02x}", self.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::error::Error for ToRelBlockError {}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::str::FromStr;
|
||||
|
||||
@@ -526,8 +526,6 @@ pub struct TenantConfigPatch {
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub compaction_algorithm: FieldPatch<CompactionAlgorithmSettings>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub compaction_shard_ancestor: FieldPatch<bool>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub compaction_l0_first: FieldPatch<bool>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub compaction_l0_semaphore: FieldPatch<bool>,
|
||||
@@ -578,8 +576,6 @@ pub struct TenantConfigPatch {
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub gc_compaction_enabled: FieldPatch<bool>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub gc_compaction_verification: FieldPatch<bool>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub gc_compaction_initial_threshold_kb: FieldPatch<u64>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub gc_compaction_ratio_percent: FieldPatch<u64>,
|
||||
@@ -617,9 +613,6 @@ pub struct TenantConfig {
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub compaction_algorithm: Option<CompactionAlgorithmSettings>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub compaction_shard_ancestor: Option<bool>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub compaction_l0_first: Option<bool>,
|
||||
|
||||
@@ -703,9 +696,6 @@ pub struct TenantConfig {
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub gc_compaction_enabled: Option<bool>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub gc_compaction_verification: Option<bool>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub gc_compaction_initial_threshold_kb: Option<u64>,
|
||||
|
||||
@@ -729,7 +719,6 @@ impl TenantConfig {
|
||||
mut compaction_threshold,
|
||||
mut compaction_upper_limit,
|
||||
mut compaction_algorithm,
|
||||
mut compaction_shard_ancestor,
|
||||
mut compaction_l0_first,
|
||||
mut compaction_l0_semaphore,
|
||||
mut l0_flush_delay_threshold,
|
||||
@@ -755,7 +744,6 @@ impl TenantConfig {
|
||||
mut wal_receiver_protocol_override,
|
||||
mut rel_size_v2_enabled,
|
||||
mut gc_compaction_enabled,
|
||||
mut gc_compaction_verification,
|
||||
mut gc_compaction_initial_threshold_kb,
|
||||
mut gc_compaction_ratio_percent,
|
||||
mut sampling_ratio,
|
||||
@@ -778,9 +766,6 @@ impl TenantConfig {
|
||||
.compaction_upper_limit
|
||||
.apply(&mut compaction_upper_limit);
|
||||
patch.compaction_algorithm.apply(&mut compaction_algorithm);
|
||||
patch
|
||||
.compaction_shard_ancestor
|
||||
.apply(&mut compaction_shard_ancestor);
|
||||
patch.compaction_l0_first.apply(&mut compaction_l0_first);
|
||||
patch
|
||||
.compaction_l0_semaphore
|
||||
@@ -850,9 +835,6 @@ impl TenantConfig {
|
||||
patch
|
||||
.gc_compaction_enabled
|
||||
.apply(&mut gc_compaction_enabled);
|
||||
patch
|
||||
.gc_compaction_verification
|
||||
.apply(&mut gc_compaction_verification);
|
||||
patch
|
||||
.gc_compaction_initial_threshold_kb
|
||||
.apply(&mut gc_compaction_initial_threshold_kb);
|
||||
@@ -869,7 +851,6 @@ impl TenantConfig {
|
||||
compaction_threshold,
|
||||
compaction_upper_limit,
|
||||
compaction_algorithm,
|
||||
compaction_shard_ancestor,
|
||||
compaction_l0_first,
|
||||
compaction_l0_semaphore,
|
||||
l0_flush_delay_threshold,
|
||||
@@ -895,7 +876,6 @@ impl TenantConfig {
|
||||
wal_receiver_protocol_override,
|
||||
rel_size_v2_enabled,
|
||||
gc_compaction_enabled,
|
||||
gc_compaction_verification,
|
||||
gc_compaction_initial_threshold_kb,
|
||||
gc_compaction_ratio_percent,
|
||||
sampling_ratio,
|
||||
@@ -930,9 +910,6 @@ impl TenantConfig {
|
||||
.as_ref()
|
||||
.unwrap_or(&global_conf.compaction_algorithm)
|
||||
.clone(),
|
||||
compaction_shard_ancestor: self
|
||||
.compaction_shard_ancestor
|
||||
.unwrap_or(global_conf.compaction_shard_ancestor),
|
||||
compaction_l0_first: self
|
||||
.compaction_l0_first
|
||||
.unwrap_or(global_conf.compaction_l0_first),
|
||||
@@ -997,9 +974,6 @@ impl TenantConfig {
|
||||
gc_compaction_enabled: self
|
||||
.gc_compaction_enabled
|
||||
.unwrap_or(global_conf.gc_compaction_enabled),
|
||||
gc_compaction_verification: self
|
||||
.gc_compaction_verification
|
||||
.unwrap_or(global_conf.gc_compaction_verification),
|
||||
gc_compaction_initial_threshold_kb: self
|
||||
.gc_compaction_initial_threshold_kb
|
||||
.unwrap_or(global_conf.gc_compaction_initial_threshold_kb),
|
||||
@@ -1817,34 +1791,8 @@ pub mod virtual_file {
|
||||
}
|
||||
|
||||
impl IoMode {
|
||||
pub fn preferred() -> Self {
|
||||
// The default behavior when running Rust unit tests without any further
|
||||
// flags is to use the newest behavior if available on the platform (Direct).
|
||||
// The CI uses the following environment variable to unit tests for all
|
||||
// different modes.
|
||||
// NB: the Python regression & perf tests have their own defaults management
|
||||
// that writes pageserver.toml; they do not use this variable.
|
||||
if cfg!(test) {
|
||||
use once_cell::sync::Lazy;
|
||||
static CACHED: Lazy<IoMode> = Lazy::new(|| {
|
||||
utils::env::var_serde_json_string(
|
||||
"NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IO_MODE",
|
||||
)
|
||||
.unwrap_or({
|
||||
#[cfg(target_os = "linux")]
|
||||
{
|
||||
IoMode::Direct
|
||||
}
|
||||
#[cfg(not(target_os = "linux"))]
|
||||
{
|
||||
IoMode::Buffered
|
||||
}
|
||||
})
|
||||
});
|
||||
*CACHED
|
||||
} else {
|
||||
IoMode::Buffered
|
||||
}
|
||||
pub const fn preferred() -> Self {
|
||||
Self::Buffered
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,13 +0,0 @@
|
||||
[package]
|
||||
name = "remote_keys"
|
||||
version = "0.1.0"
|
||||
edition = "2024"
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
utils.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
rand.workspace = true
|
||||
@@ -1,42 +0,0 @@
|
||||
//! A module that provides a KMS implementation that generates and unwraps the keys.
|
||||
//!
|
||||
|
||||
/// A KMS implementation that does static wrapping and unwrapping of the keys.
|
||||
pub struct NaiveKms {
|
||||
account_id: String,
|
||||
}
|
||||
|
||||
impl NaiveKms {
|
||||
pub fn new(account_id: String) -> Self {
|
||||
Self { account_id }
|
||||
}
|
||||
|
||||
pub fn encrypt(&self, plain: &[u8]) -> anyhow::Result<Vec<u8>> {
|
||||
let wrapped = [self.account_id.as_bytes(), "-wrapped-".as_bytes(), plain].concat();
|
||||
Ok(wrapped)
|
||||
}
|
||||
|
||||
pub fn decrypt(&self, wrapped: &[u8]) -> anyhow::Result<Vec<u8>> {
|
||||
let Some(wrapped) = wrapped.strip_prefix(self.account_id.as_bytes()) else {
|
||||
return Err(anyhow::anyhow!("invalid key"));
|
||||
};
|
||||
let Some(plain) = wrapped.strip_prefix(b"-wrapped-") else {
|
||||
return Err(anyhow::anyhow!("invalid key"));
|
||||
};
|
||||
Ok(plain.to_vec())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_generate_key() {
|
||||
let kms = NaiveKms::new("test-tenant".to_string());
|
||||
let data = rand::random::<[u8; 32]>().to_vec();
|
||||
let encrypted = kms.encrypt(&data).unwrap();
|
||||
let decrypted = kms.decrypt(&encrypted).unwrap();
|
||||
assert_eq!(data, decrypted);
|
||||
}
|
||||
}
|
||||
@@ -13,7 +13,6 @@ aws-smithy-async.workspace = true
|
||||
aws-smithy-types.workspace = true
|
||||
aws-config.workspace = true
|
||||
aws-sdk-s3.workspace = true
|
||||
base64.workspace = true
|
||||
bytes.workspace = true
|
||||
camino = { workspace = true, features = ["serde1"] }
|
||||
humantime-serde.workspace = true
|
||||
@@ -28,7 +27,6 @@ tokio-util = { workspace = true, features = ["compat"] }
|
||||
toml_edit.workspace = true
|
||||
tracing.workspace = true
|
||||
scopeguard.workspace = true
|
||||
md5.workspace = true
|
||||
metrics.workspace = true
|
||||
utils = { path = "../utils", default-features = false }
|
||||
pin-project-lite.workspace = true
|
||||
|
||||
@@ -550,19 +550,6 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
self.download_for_builder(builder, timeout, cancel).await
|
||||
}
|
||||
|
||||
#[allow(unused_variables)]
|
||||
async fn upload_with_encryption(
|
||||
&self,
|
||||
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||
data_size_bytes: usize,
|
||||
to: &RemotePath,
|
||||
metadata: Option<StorageMetadata>,
|
||||
encryption_key: Option<&[u8]>,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
unimplemented!()
|
||||
}
|
||||
|
||||
async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()> {
|
||||
self.delete_objects(std::array::from_ref(path), cancel)
|
||||
.await
|
||||
|
||||
@@ -190,8 +190,6 @@ pub struct DownloadOpts {
|
||||
/// timeouts: for something like an index/manifest/heatmap, we should time out faster than
|
||||
/// for layer files
|
||||
pub kind: DownloadKind,
|
||||
/// The encryption key to use for the download.
|
||||
pub encryption_key: Option<Vec<u8>>,
|
||||
}
|
||||
|
||||
pub enum DownloadKind {
|
||||
@@ -206,7 +204,6 @@ impl Default for DownloadOpts {
|
||||
byte_start: Bound::Unbounded,
|
||||
byte_end: Bound::Unbounded,
|
||||
kind: DownloadKind::Large,
|
||||
encryption_key: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -244,15 +241,6 @@ impl DownloadOpts {
|
||||
None => format!("bytes={start}-"),
|
||||
})
|
||||
}
|
||||
|
||||
pub fn with_encryption_key(mut self, encryption_key: Option<impl AsRef<[u8]>>) -> Self {
|
||||
self.encryption_key = encryption_key.map(|k| k.as_ref().to_vec());
|
||||
self
|
||||
}
|
||||
|
||||
pub fn encryption_key(&self) -> Option<&[u8]> {
|
||||
self.encryption_key.as_deref()
|
||||
}
|
||||
}
|
||||
|
||||
/// Storage (potentially remote) API to manage its state.
|
||||
@@ -343,19 +331,6 @@ pub trait RemoteStorage: Send + Sync + 'static {
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Download, DownloadError>;
|
||||
|
||||
/// Same as upload, but with remote encryption if the backend supports it (e.g. SSE-C on AWS).
|
||||
async fn upload_with_encryption(
|
||||
&self,
|
||||
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||
// S3 PUT request requires the content length to be specified,
|
||||
// otherwise it starts to fail with the concurrent connection count increasing.
|
||||
data_size_bytes: usize,
|
||||
to: &RemotePath,
|
||||
metadata: Option<StorageMetadata>,
|
||||
encryption_key: Option<&[u8]>,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()>;
|
||||
|
||||
/// Delete a single path from remote storage.
|
||||
///
|
||||
/// If the operation fails because of timeout or cancellation, the root cause of the error will be
|
||||
@@ -640,63 +615,6 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn upload_with_encryption(
|
||||
&self,
|
||||
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||
data_size_bytes: usize,
|
||||
to: &RemotePath,
|
||||
metadata: Option<StorageMetadata>,
|
||||
encryption_key: Option<&[u8]>,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
match self {
|
||||
Self::LocalFs(s) => {
|
||||
s.upload_with_encryption(
|
||||
from,
|
||||
data_size_bytes,
|
||||
to,
|
||||
metadata,
|
||||
encryption_key,
|
||||
cancel,
|
||||
)
|
||||
.await
|
||||
}
|
||||
Self::AwsS3(s) => {
|
||||
s.upload_with_encryption(
|
||||
from,
|
||||
data_size_bytes,
|
||||
to,
|
||||
metadata,
|
||||
encryption_key,
|
||||
cancel,
|
||||
)
|
||||
.await
|
||||
}
|
||||
Self::AzureBlob(s) => {
|
||||
s.upload_with_encryption(
|
||||
from,
|
||||
data_size_bytes,
|
||||
to,
|
||||
metadata,
|
||||
encryption_key,
|
||||
cancel,
|
||||
)
|
||||
.await
|
||||
}
|
||||
Self::Unreliable(s) => {
|
||||
s.upload_with_encryption(
|
||||
from,
|
||||
data_size_bytes,
|
||||
to,
|
||||
metadata,
|
||||
encryption_key,
|
||||
cancel,
|
||||
)
|
||||
.await
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl GenericRemoteStorage {
|
||||
|
||||
@@ -198,10 +198,6 @@ impl LocalFs {
|
||||
let mut entries = cur_folder.read_dir_utf8()?;
|
||||
while let Some(Ok(entry)) = entries.next() {
|
||||
let file_name = entry.file_name();
|
||||
if file_name.ends_with(".metadata") || file_name.ends_with(".enc") {
|
||||
// ignore metadata and encryption key files
|
||||
continue;
|
||||
}
|
||||
let full_file_name = cur_folder.join(file_name);
|
||||
if full_file_name.as_str().starts_with(prefix) {
|
||||
let file_remote_path = self.local_file_to_relative_path(full_file_name.clone());
|
||||
@@ -222,7 +218,6 @@ impl LocalFs {
|
||||
data_size_bytes: usize,
|
||||
to: &RemotePath,
|
||||
metadata: Option<StorageMetadata>,
|
||||
enctyption_key: Option<&[u8]>,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
let target_file_path = to.with_base(&self.storage_root);
|
||||
@@ -311,8 +306,6 @@ impl LocalFs {
|
||||
)
|
||||
})?;
|
||||
|
||||
// TODO: we might need to make the following writes atomic with the file write operation above
|
||||
|
||||
if let Some(storage_metadata) = metadata {
|
||||
// FIXME: we must not be using metadata much, since this would forget the old metadata
|
||||
// for new writes? or perhaps metadata is sticky; could consider removing if it's never
|
||||
@@ -331,15 +324,6 @@ impl LocalFs {
|
||||
})?;
|
||||
}
|
||||
|
||||
if let Some(encryption_key) = enctyption_key {
|
||||
let encryption_key_path = storage_encryption_key_path(&target_file_path);
|
||||
fs::write(&encryption_key_path, encryption_key).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to write encryption key to the local storage at '{encryption_key_path}'",
|
||||
)
|
||||
})?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -466,7 +450,6 @@ impl RemoteStorage for LocalFs {
|
||||
key: &RemotePath,
|
||||
_cancel: &CancellationToken,
|
||||
) -> Result<ListingObject, DownloadError> {
|
||||
// TODO: check encryption key
|
||||
let target_file_path = key.with_base(&self.storage_root);
|
||||
let metadata = file_metadata(&target_file_path).await?;
|
||||
Ok(ListingObject {
|
||||
@@ -478,14 +461,34 @@ impl RemoteStorage for LocalFs {
|
||||
|
||||
async fn upload(
|
||||
&self,
|
||||
data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||
data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync,
|
||||
data_size_bytes: usize,
|
||||
to: &RemotePath,
|
||||
metadata: Option<StorageMetadata>,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
self.upload_with_encryption(data, data_size_bytes, to, metadata, None, cancel)
|
||||
.await
|
||||
let cancel = cancel.child_token();
|
||||
|
||||
let op = self.upload0(data, data_size_bytes, to, metadata, &cancel);
|
||||
let mut op = std::pin::pin!(op);
|
||||
|
||||
// race the upload0 to the timeout; if it goes over, do a graceful shutdown
|
||||
let (res, timeout) = tokio::select! {
|
||||
res = &mut op => (res, false),
|
||||
_ = tokio::time::sleep(self.timeout) => {
|
||||
cancel.cancel();
|
||||
(op.await, true)
|
||||
}
|
||||
};
|
||||
|
||||
match res {
|
||||
Err(e) if timeout && TimeoutOrCancel::caused_by_cancel(&e) => {
|
||||
// we caused this cancel (or they happened simultaneously) -- swap it out to
|
||||
// Timeout
|
||||
Err(TimeoutOrCancel::Timeout.into())
|
||||
}
|
||||
res => res,
|
||||
}
|
||||
}
|
||||
|
||||
async fn download(
|
||||
@@ -503,22 +506,6 @@ impl RemoteStorage for LocalFs {
|
||||
return Err(DownloadError::Unmodified);
|
||||
}
|
||||
|
||||
let key = match fs::read(storage_encryption_key_path(&target_path)).await {
|
||||
Ok(key) => Some(key),
|
||||
Err(e) if e.kind() == ErrorKind::NotFound => None,
|
||||
Err(e) => {
|
||||
return Err(DownloadError::Other(
|
||||
anyhow::anyhow!(e).context("cannot read encryption key"),
|
||||
));
|
||||
}
|
||||
};
|
||||
|
||||
if key != opts.encryption_key {
|
||||
return Err(DownloadError::Other(anyhow::anyhow!(
|
||||
"encryption key mismatch"
|
||||
)));
|
||||
}
|
||||
|
||||
let mut file = fs::OpenOptions::new()
|
||||
.read(true)
|
||||
.open(&target_path)
|
||||
@@ -564,53 +551,12 @@ impl RemoteStorage for LocalFs {
|
||||
async fn delete(&self, path: &RemotePath, _cancel: &CancellationToken) -> anyhow::Result<()> {
|
||||
let file_path = path.with_base(&self.storage_root);
|
||||
match fs::remove_file(&file_path).await {
|
||||
Ok(()) => {}
|
||||
Ok(()) => Ok(()),
|
||||
// The file doesn't exist. This shouldn't yield an error to mirror S3's behaviour.
|
||||
// See https://docs.aws.amazon.com/AmazonS3/latest/API/API_DeleteObject.html
|
||||
// > If there isn't a null version, Amazon S3 does not remove any objects but will still respond that the command was successful.
|
||||
Err(e) if e.kind() == ErrorKind::NotFound => {}
|
||||
Err(e) => return Err(anyhow::anyhow!(e)),
|
||||
};
|
||||
fs::remove_file(&storage_metadata_path(&file_path))
|
||||
.await
|
||||
.ok();
|
||||
fs::remove_file(&storage_encryption_key_path(&file_path))
|
||||
.await
|
||||
.ok();
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[allow(unused_variables)]
|
||||
async fn upload_with_encryption(
|
||||
&self,
|
||||
data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||
data_size_bytes: usize,
|
||||
to: &RemotePath,
|
||||
metadata: Option<StorageMetadata>,
|
||||
encryption_key: Option<&[u8]>,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
let cancel = cancel.child_token();
|
||||
|
||||
let op = self.upload0(data, data_size_bytes, to, metadata, encryption_key, &cancel);
|
||||
let mut op = std::pin::pin!(op);
|
||||
|
||||
// race the upload0 to the timeout; if it goes over, do a graceful shutdown
|
||||
let (res, timeout) = tokio::select! {
|
||||
res = &mut op => (res, false),
|
||||
_ = tokio::time::sleep(self.timeout) => {
|
||||
cancel.cancel();
|
||||
(op.await, true)
|
||||
}
|
||||
};
|
||||
|
||||
match res {
|
||||
Err(e) if timeout && TimeoutOrCancel::caused_by_cancel(&e) => {
|
||||
// we caused this cancel (or they happened simultaneously) -- swap it out to
|
||||
// Timeout
|
||||
Err(TimeoutOrCancel::Timeout.into())
|
||||
}
|
||||
res => res,
|
||||
Err(e) if e.kind() == ErrorKind::NotFound => Ok(()),
|
||||
Err(e) => Err(anyhow::anyhow!(e)),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -645,7 +591,6 @@ impl RemoteStorage for LocalFs {
|
||||
to_path = to_path
|
||||
)
|
||||
})?;
|
||||
// TODO: copy metadata and encryption key
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -664,10 +609,6 @@ fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf {
|
||||
path_with_suffix_extension(original_path, "metadata")
|
||||
}
|
||||
|
||||
fn storage_encryption_key_path(original_path: &Utf8Path) -> Utf8PathBuf {
|
||||
path_with_suffix_extension(original_path, "enc")
|
||||
}
|
||||
|
||||
async fn create_target_directory(target_file_path: &Utf8Path) -> anyhow::Result<()> {
|
||||
let target_dir = match target_file_path.parent() {
|
||||
Some(parent_dir) => parent_dir,
|
||||
|
||||
@@ -66,10 +66,7 @@ struct GetObjectRequest {
|
||||
key: String,
|
||||
etag: Option<String>,
|
||||
range: Option<String>,
|
||||
/// Base64 encoded SSE-C key for server-side encryption.
|
||||
sse_c_key: Option<Vec<u8>>,
|
||||
}
|
||||
|
||||
impl S3Bucket {
|
||||
/// Creates the S3 storage, errors if incorrect AWS S3 configuration provided.
|
||||
pub async fn new(remote_storage_config: &S3Config, timeout: Duration) -> anyhow::Result<Self> {
|
||||
@@ -260,13 +257,6 @@ impl S3Bucket {
|
||||
builder = builder.if_none_match(etag);
|
||||
}
|
||||
|
||||
if let Some(encryption_key) = request.sse_c_key {
|
||||
builder = builder.sse_customer_algorithm("AES256");
|
||||
builder = builder.sse_customer_key(base64::encode(&encryption_key));
|
||||
builder = builder
|
||||
.sse_customer_key_md5(base64::encode(md5::compute(&encryption_key).as_slice()));
|
||||
}
|
||||
|
||||
let get_object = builder.send();
|
||||
|
||||
let get_object = tokio::select! {
|
||||
@@ -703,13 +693,12 @@ impl RemoteStorage for S3Bucket {
|
||||
})
|
||||
}
|
||||
|
||||
async fn upload_with_encryption(
|
||||
async fn upload(
|
||||
&self,
|
||||
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||
from_size_bytes: usize,
|
||||
to: &RemotePath,
|
||||
metadata: Option<StorageMetadata>,
|
||||
encryption_key: Option<&[u8]>,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
let kind = RequestKind::Put;
|
||||
@@ -720,7 +709,7 @@ impl RemoteStorage for S3Bucket {
|
||||
let body = StreamBody::new(from.map(|x| x.map(Frame::data)));
|
||||
let bytes_stream = ByteStream::new(SdkBody::from_body_1_x(body));
|
||||
|
||||
let mut upload = self
|
||||
let upload = self
|
||||
.client
|
||||
.put_object()
|
||||
.bucket(self.bucket_name.clone())
|
||||
@@ -728,17 +717,8 @@ impl RemoteStorage for S3Bucket {
|
||||
.set_metadata(metadata.map(|m| m.0))
|
||||
.set_storage_class(self.upload_storage_class.clone())
|
||||
.content_length(from_size_bytes.try_into()?)
|
||||
.body(bytes_stream);
|
||||
|
||||
if let Some(encryption_key) = encryption_key {
|
||||
upload = upload.sse_customer_algorithm("AES256");
|
||||
let base64_key = base64::encode(encryption_key);
|
||||
upload = upload.sse_customer_key(&base64_key);
|
||||
upload = upload
|
||||
.sse_customer_key_md5(base64::encode(md5::compute(encryption_key).as_slice()));
|
||||
}
|
||||
|
||||
let upload = upload.send();
|
||||
.body(bytes_stream)
|
||||
.send();
|
||||
|
||||
let upload = tokio::time::timeout(self.timeout, upload);
|
||||
|
||||
@@ -762,18 +742,6 @@ impl RemoteStorage for S3Bucket {
|
||||
}
|
||||
}
|
||||
|
||||
async fn upload(
|
||||
&self,
|
||||
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||
data_size_bytes: usize,
|
||||
to: &RemotePath,
|
||||
metadata: Option<StorageMetadata>,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
self.upload_with_encryption(from, data_size_bytes, to, metadata, None, cancel)
|
||||
.await
|
||||
}
|
||||
|
||||
async fn copy(
|
||||
&self,
|
||||
from: &RemotePath,
|
||||
@@ -833,7 +801,6 @@ impl RemoteStorage for S3Bucket {
|
||||
key: self.relative_path_to_s3_object(from),
|
||||
etag: opts.etag.as_ref().map(|e| e.to_string()),
|
||||
range: opts.byte_range_header(),
|
||||
sse_c_key: opts.encryption_key.clone(),
|
||||
},
|
||||
cancel,
|
||||
)
|
||||
|
||||
@@ -178,19 +178,6 @@ impl RemoteStorage for UnreliableWrapper {
|
||||
self.inner.download(from, opts, cancel).await
|
||||
}
|
||||
|
||||
#[allow(unused_variables)]
|
||||
async fn upload_with_encryption(
|
||||
&self,
|
||||
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||
data_size_bytes: usize,
|
||||
to: &RemotePath,
|
||||
metadata: Option<StorageMetadata>,
|
||||
encryption_key: Option<&[u8]>,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
unimplemented!()
|
||||
}
|
||||
|
||||
async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()> {
|
||||
self.delete_inner(path, true, cancel).await
|
||||
}
|
||||
|
||||
@@ -421,7 +421,7 @@ async fn download_is_timeouted(ctx: &mut MaybeEnabledStorage) {
|
||||
))
|
||||
.unwrap();
|
||||
|
||||
let len = upload_large_enough_file(&ctx.client, &path, &cancel, None).await;
|
||||
let len = upload_large_enough_file(&ctx.client, &path, &cancel).await;
|
||||
|
||||
let timeout = std::time::Duration::from_secs(5);
|
||||
|
||||
@@ -500,7 +500,7 @@ async fn download_is_cancelled(ctx: &mut MaybeEnabledStorage) {
|
||||
))
|
||||
.unwrap();
|
||||
|
||||
let file_len = upload_large_enough_file(&ctx.client, &path, &cancel, None).await;
|
||||
let file_len = upload_large_enough_file(&ctx.client, &path, &cancel).await;
|
||||
|
||||
{
|
||||
let stream = ctx
|
||||
@@ -555,7 +555,6 @@ async fn upload_large_enough_file(
|
||||
client: &GenericRemoteStorage,
|
||||
path: &RemotePath,
|
||||
cancel: &CancellationToken,
|
||||
encryption_key: Option<&[u8]>,
|
||||
) -> usize {
|
||||
let header = bytes::Bytes::from_static("remote blob data content".as_bytes());
|
||||
let body = bytes::Bytes::from(vec![0u8; 1024]);
|
||||
@@ -566,54 +565,9 @@ async fn upload_large_enough_file(
|
||||
let contents = futures::stream::iter(contents.map(std::io::Result::Ok));
|
||||
|
||||
client
|
||||
.upload_with_encryption(contents, len, path, None, encryption_key, cancel)
|
||||
.upload(contents, len, path, None, cancel)
|
||||
.await
|
||||
.expect("upload succeeds");
|
||||
|
||||
len
|
||||
}
|
||||
|
||||
#[test_context(MaybeEnabledStorage)]
|
||||
#[tokio::test]
|
||||
async fn encryption_works(ctx: &mut MaybeEnabledStorage) {
|
||||
let MaybeEnabledStorage::Enabled(ctx) = ctx else {
|
||||
return;
|
||||
};
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
let path = RemotePath::new(Utf8Path::new(
|
||||
format!("{}/file_to_copy", ctx.base_prefix).as_str(),
|
||||
))
|
||||
.unwrap();
|
||||
|
||||
let key = rand::random::<[u8; 32]>();
|
||||
let file_len = upload_large_enough_file(&ctx.client, &path, &cancel, Some(&key)).await;
|
||||
|
||||
{
|
||||
let download = ctx
|
||||
.client
|
||||
.download(
|
||||
&path,
|
||||
&DownloadOpts::default().with_encryption_key(Some(&key)),
|
||||
&cancel,
|
||||
)
|
||||
.await
|
||||
.expect("should succeed");
|
||||
let vec = download_to_vec(download).await.expect("should succeed");
|
||||
assert_eq!(vec.len(), file_len);
|
||||
}
|
||||
|
||||
{
|
||||
// Download without encryption key should fail
|
||||
let download = ctx
|
||||
.client
|
||||
.download(&path, &DownloadOpts::default(), &cancel)
|
||||
.await;
|
||||
assert!(download.is_err());
|
||||
}
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
ctx.client.delete_objects(&[path], &cancel).await.unwrap();
|
||||
}
|
||||
|
||||
@@ -29,7 +29,6 @@ futures = { workspace = true }
|
||||
jsonwebtoken.workspace = true
|
||||
nix = { workspace = true, features = ["ioctl"] }
|
||||
once_cell.workspace = true
|
||||
pem.workspace = true
|
||||
pin-project-lite.workspace = true
|
||||
regex.workspace = true
|
||||
serde.workspace = true
|
||||
|
||||
@@ -11,8 +11,7 @@ use camino::Utf8Path;
|
||||
use jsonwebtoken::{
|
||||
Algorithm, DecodingKey, EncodingKey, Header, TokenData, Validation, decode, encode,
|
||||
};
|
||||
use pem::Pem;
|
||||
use serde::{Deserialize, Serialize, de::DeserializeOwned};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::id::TenantId;
|
||||
|
||||
@@ -74,10 +73,7 @@ impl SwappableJwtAuth {
|
||||
pub fn swap(&self, jwt_auth: JwtAuth) {
|
||||
self.0.swap(Arc::new(jwt_auth));
|
||||
}
|
||||
pub fn decode<D: DeserializeOwned>(
|
||||
&self,
|
||||
token: &str,
|
||||
) -> std::result::Result<TokenData<D>, AuthError> {
|
||||
pub fn decode(&self, token: &str) -> std::result::Result<TokenData<Claims>, AuthError> {
|
||||
self.0.load().decode(token)
|
||||
}
|
||||
}
|
||||
@@ -152,10 +148,7 @@ impl JwtAuth {
|
||||
/// The function tries the stored decoding keys in succession,
|
||||
/// and returns the first yielding a successful result.
|
||||
/// If there is no working decoding key, it returns the last error.
|
||||
pub fn decode<D: DeserializeOwned>(
|
||||
&self,
|
||||
token: &str,
|
||||
) -> std::result::Result<TokenData<D>, AuthError> {
|
||||
pub fn decode(&self, token: &str) -> std::result::Result<TokenData<Claims>, AuthError> {
|
||||
let mut res = None;
|
||||
for decoding_key in &self.decoding_keys {
|
||||
res = Some(decode(token, decoding_key, &self.validation));
|
||||
@@ -180,8 +173,8 @@ impl std::fmt::Debug for JwtAuth {
|
||||
}
|
||||
|
||||
// this function is used only for testing purposes in CLI e g generate tokens during init
|
||||
pub fn encode_from_key_file<S: Serialize>(claims: &S, pem: &Pem) -> Result<String> {
|
||||
let key = EncodingKey::from_ed_der(pem.contents());
|
||||
pub fn encode_from_key_file<S: Serialize>(claims: &S, key_data: &[u8]) -> Result<String> {
|
||||
let key = EncodingKey::from_ed_pem(key_data)?;
|
||||
Ok(encode(&Header::new(STORAGE_TOKEN_ALGORITHM), claims, &key)?)
|
||||
}
|
||||
|
||||
@@ -195,13 +188,13 @@ mod tests {
|
||||
//
|
||||
// openssl genpkey -algorithm ed25519 -out ed25519-priv.pem
|
||||
// openssl pkey -in ed25519-priv.pem -pubout -out ed25519-pub.pem
|
||||
const TEST_PUB_KEY_ED25519: &str = r#"
|
||||
const TEST_PUB_KEY_ED25519: &[u8] = br#"
|
||||
-----BEGIN PUBLIC KEY-----
|
||||
MCowBQYDK2VwAyEARYwaNBayR+eGI0iXB4s3QxE3Nl2g1iWbr6KtLWeVD/w=
|
||||
-----END PUBLIC KEY-----
|
||||
"#;
|
||||
|
||||
const TEST_PRIV_KEY_ED25519: &str = r#"
|
||||
const TEST_PRIV_KEY_ED25519: &[u8] = br#"
|
||||
-----BEGIN PRIVATE KEY-----
|
||||
MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
|
||||
-----END PRIVATE KEY-----
|
||||
@@ -229,9 +222,9 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
|
||||
|
||||
// Check it can be validated with the public key
|
||||
let auth = JwtAuth::new(vec![
|
||||
DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519.as_bytes()).unwrap(),
|
||||
DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap(),
|
||||
]);
|
||||
let claims_from_token: Claims = auth.decode(encoded_eddsa).unwrap().claims;
|
||||
let claims_from_token = auth.decode(encoded_eddsa).unwrap().claims;
|
||||
assert_eq!(claims_from_token, expected_claims);
|
||||
}
|
||||
|
||||
@@ -242,14 +235,13 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
|
||||
scope: Scope::Tenant,
|
||||
};
|
||||
|
||||
let pem = pem::parse(TEST_PRIV_KEY_ED25519).unwrap();
|
||||
let encoded = encode_from_key_file(&claims, &pem).unwrap();
|
||||
let encoded = encode_from_key_file(&claims, TEST_PRIV_KEY_ED25519).unwrap();
|
||||
|
||||
// decode it back
|
||||
let auth = JwtAuth::new(vec![
|
||||
DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519.as_bytes()).unwrap(),
|
||||
DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap(),
|
||||
]);
|
||||
let decoded: TokenData<Claims> = auth.decode(&encoded).unwrap();
|
||||
let decoded = auth.decode(&encoded).unwrap();
|
||||
|
||||
assert_eq!(decoded.claims, claims);
|
||||
}
|
||||
|
||||
@@ -10,14 +10,11 @@ default = []
|
||||
# which adds some runtime cost to run tests on outage conditions
|
||||
testing = ["fail/failpoints", "pageserver_api/testing", "wal_decoder/testing", "pageserver_client/testing"]
|
||||
|
||||
fuzz-read-path = ["testing"]
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
arc-swap.workspace = true
|
||||
async-compression.workspace = true
|
||||
async-stream.workspace = true
|
||||
base64.workspace = true
|
||||
bit_field.workspace = true
|
||||
bincode.workspace = true
|
||||
byteorder.workspace = true
|
||||
@@ -36,7 +33,6 @@ humantime.workspace = true
|
||||
humantime-serde.workspace = true
|
||||
hyper0.workspace = true
|
||||
itertools.workspace = true
|
||||
jsonwebtoken.workspace = true
|
||||
md5.workspace = true
|
||||
nix.workspace = true
|
||||
# hack to get the number of worker threads tokio uses
|
||||
@@ -83,7 +79,6 @@ postgres_connection.workspace = true
|
||||
postgres_ffi.workspace = true
|
||||
pq_proto.workspace = true
|
||||
remote_storage.workspace = true
|
||||
remote_keys.workspace = true
|
||||
storage_broker.workspace = true
|
||||
tenant_size_model.workspace = true
|
||||
http-utils.workspace = true
|
||||
|
||||
@@ -126,7 +126,7 @@ async fn ingest(
|
||||
max_concurrency: NonZeroUsize::new(1).unwrap(),
|
||||
});
|
||||
let (_desc, path) = layer
|
||||
.write_to_disk(&ctx, None, l0_flush_state.inner(), &gate, cancel.clone())
|
||||
.write_to_disk(&ctx, None, l0_flush_state.inner())
|
||||
.await?
|
||||
.unwrap();
|
||||
tokio::fs::remove_file(path).await?;
|
||||
|
||||
@@ -45,7 +45,6 @@ fn bench_upload_queue_next_ready(c: &mut Criterion) {
|
||||
shard: ShardIndex::new(ShardNumber(1), ShardCount(2)),
|
||||
generation: Generation::Valid(1),
|
||||
file_size: 0,
|
||||
encryption_key: None,
|
||||
};
|
||||
|
||||
// Construct the (initial and uploaded) index with layer0.
|
||||
|
||||
@@ -34,7 +34,7 @@ use utils::lsn::Lsn;
|
||||
use crate::context::RequestContext;
|
||||
use crate::pgdatadir_mapping::Version;
|
||||
use crate::tenant::storage_layer::IoConcurrency;
|
||||
use crate::tenant::timeline::{GetVectoredError, VersionedKeySpaceQuery};
|
||||
use crate::tenant::timeline::GetVectoredError;
|
||||
use crate::tenant::{PageReconstructError, Timeline};
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
@@ -353,10 +353,9 @@ where
|
||||
let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar);
|
||||
|
||||
for part in slru_partitions.parts {
|
||||
let query = VersionedKeySpaceQuery::uniform(part, self.lsn);
|
||||
let blocks = self
|
||||
.timeline
|
||||
.get_vectored(query, self.io_concurrency.clone(), self.ctx)
|
||||
.get_vectored(part, self.lsn, self.io_concurrency.clone(), self.ctx)
|
||||
.await?;
|
||||
|
||||
for (key, block) in blocks {
|
||||
|
||||
@@ -9,7 +9,7 @@ use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::{Context, anyhow};
|
||||
use anyhow::{Context, anyhow, bail};
|
||||
use camino::Utf8Path;
|
||||
use clap::{Arg, ArgAction, Command};
|
||||
use http_utils::tls_certs::ReloadingCertificateResolver;
|
||||
@@ -79,6 +79,8 @@ fn main() -> anyhow::Result<()> {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let dev_mode = arg_matches.get_flag("dev");
|
||||
|
||||
// Initialize up failpoints support
|
||||
let scenario = failpoint_support::init();
|
||||
|
||||
@@ -99,6 +101,20 @@ fn main() -> anyhow::Result<()> {
|
||||
|
||||
let (conf, ignored) = initialize_config(&identity_file_path, &cfg_file_path, &workdir)?;
|
||||
|
||||
if !dev_mode {
|
||||
if matches!(conf.http_auth_type, AuthType::Trust)
|
||||
|| matches!(conf.pg_auth_type, AuthType::Trust)
|
||||
{
|
||||
bail!(
|
||||
"Pageserver refuses to start with HTTP or PostgreSQL API authentication disabled.\n\
|
||||
Run with --dev to allow running without authentication.\n\
|
||||
This is insecure and should only be used in development environments."
|
||||
);
|
||||
}
|
||||
} else {
|
||||
warn!("Starting in dev mode: this may be an insecure configuration.");
|
||||
}
|
||||
|
||||
// Initialize logging.
|
||||
//
|
||||
// It must be initialized before the custom panic hook is installed below.
|
||||
@@ -452,24 +468,6 @@ fn start_pageserver(
|
||||
info!("Using auth for http API: {:#?}", conf.http_auth_type);
|
||||
info!("Using auth for pg connections: {:#?}", conf.pg_auth_type);
|
||||
|
||||
let tls_server_config = if conf.listen_https_addr.is_some() || conf.enable_tls_page_service_api
|
||||
{
|
||||
let resolver = BACKGROUND_RUNTIME.block_on(ReloadingCertificateResolver::new(
|
||||
"main",
|
||||
&conf.ssl_key_file,
|
||||
&conf.ssl_cert_file,
|
||||
conf.ssl_cert_reload_period,
|
||||
))?;
|
||||
|
||||
let server_config = rustls::ServerConfig::builder()
|
||||
.with_no_client_auth()
|
||||
.with_cert_resolver(resolver);
|
||||
|
||||
Some(Arc::new(server_config))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
match var("NEON_AUTH_TOKEN") {
|
||||
Ok(v) => {
|
||||
info!("Loaded JWT token for authentication with Safekeeper");
|
||||
@@ -688,11 +686,17 @@ fn start_pageserver(
|
||||
|
||||
let https_task = match https_listener {
|
||||
Some(https_listener) => {
|
||||
let tls_server_config = tls_server_config
|
||||
.clone()
|
||||
.expect("tls_server_config is set earlier if https is enabled");
|
||||
let resolver = MGMT_REQUEST_RUNTIME.block_on(ReloadingCertificateResolver::new(
|
||||
&conf.ssl_key_file,
|
||||
&conf.ssl_cert_file,
|
||||
conf.ssl_cert_reload_period,
|
||||
))?;
|
||||
|
||||
let tls_acceptor = tokio_rustls::TlsAcceptor::from(tls_server_config);
|
||||
let server_config = rustls::ServerConfig::builder()
|
||||
.with_no_client_auth()
|
||||
.with_cert_resolver(resolver);
|
||||
|
||||
let tls_acceptor = tokio_rustls::TlsAcceptor::from(Arc::new(server_config));
|
||||
|
||||
let server =
|
||||
http_utils::server::Server::new(service, https_listener, Some(tls_acceptor))?;
|
||||
@@ -748,11 +752,6 @@ fn start_pageserver(
|
||||
tokio::net::TcpListener::from_std(pageserver_listener)
|
||||
.context("create tokio listener")?
|
||||
},
|
||||
if conf.enable_tls_page_service_api {
|
||||
tls_server_config
|
||||
} else {
|
||||
None
|
||||
},
|
||||
);
|
||||
|
||||
// All started up! Now just sit and wait for shutdown signal.
|
||||
@@ -833,6 +832,12 @@ fn cli() -> Command {
|
||||
.action(ArgAction::SetTrue)
|
||||
.help("Show enabled compile time features"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("dev")
|
||||
.long("dev")
|
||||
.action(ArgAction::SetTrue)
|
||||
.help("Run in development mode (disables security checks)"),
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -118,13 +118,13 @@ pub struct PageServerConf {
|
||||
/// A lower value implicitly deprioritizes loading such tenants, vs. other work in the system.
|
||||
pub concurrent_tenant_warmup: ConfigurableSemaphore,
|
||||
|
||||
/// Number of concurrent [`TenantShard::gather_size_inputs`](crate::tenant::TenantShard::gather_size_inputs) allowed.
|
||||
/// Number of concurrent [`Tenant::gather_size_inputs`](crate::tenant::Tenant::gather_size_inputs) allowed.
|
||||
pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore,
|
||||
/// Limit of concurrent [`TenantShard::gather_size_inputs`] issued by module `eviction_task`.
|
||||
/// Limit of concurrent [`Tenant::gather_size_inputs`] issued by module `eviction_task`.
|
||||
/// The number of permits is the same as `concurrent_tenant_size_logical_size_queries`.
|
||||
/// See the comment in `eviction_task` for details.
|
||||
///
|
||||
/// [`TenantShard::gather_size_inputs`]: crate::tenant::TenantShard::gather_size_inputs
|
||||
/// [`Tenant::gather_size_inputs`]: crate::tenant::Tenant::gather_size_inputs
|
||||
pub eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore,
|
||||
|
||||
// How often to collect metrics and send them to the metrics endpoint.
|
||||
@@ -219,11 +219,6 @@ pub struct PageServerConf {
|
||||
pub generate_unarchival_heatmap: bool,
|
||||
|
||||
pub tracing: Option<pageserver_api::config::Tracing>,
|
||||
|
||||
/// Enable TLS in page service API.
|
||||
/// Does not force TLS: the client negotiates TLS usage during the handshake.
|
||||
/// Uses key and certificate from ssl_key_file/ssl_cert_file.
|
||||
pub enable_tls_page_service_api: bool,
|
||||
}
|
||||
|
||||
/// Token for authentication to safekeepers
|
||||
@@ -396,7 +391,6 @@ impl PageServerConf {
|
||||
load_previous_heatmap,
|
||||
generate_unarchival_heatmap,
|
||||
tracing,
|
||||
enable_tls_page_service_api,
|
||||
} = config_toml;
|
||||
|
||||
let mut conf = PageServerConf {
|
||||
@@ -447,7 +441,6 @@ impl PageServerConf {
|
||||
page_service_pipelining,
|
||||
get_vectored_concurrent_io,
|
||||
tracing,
|
||||
enable_tls_page_service_api,
|
||||
|
||||
// ------------------------------------------------------------
|
||||
// fields that require additional validation or custom handling
|
||||
@@ -588,10 +581,10 @@ impl ConfigurableSemaphore {
|
||||
/// Initializse using a non-zero amount of permits.
|
||||
///
|
||||
/// Require a non-zero initial permits, because using permits == 0 is a crude way to disable a
|
||||
/// feature such as [`TenantShard::gather_size_inputs`]. Otherwise any semaphore using future will
|
||||
/// feature such as [`Tenant::gather_size_inputs`]. Otherwise any semaphore using future will
|
||||
/// behave like [`futures::future::pending`], just waiting until new permits are added.
|
||||
///
|
||||
/// [`TenantShard::gather_size_inputs`]: crate::tenant::TenantShard::gather_size_inputs
|
||||
/// [`Tenant::gather_size_inputs`]: crate::tenant::Tenant::gather_size_inputs
|
||||
pub fn new(initial_permits: NonZeroUsize) -> Self {
|
||||
ConfigurableSemaphore {
|
||||
initial_permits,
|
||||
|
||||
@@ -24,7 +24,7 @@ use crate::task_mgr::{self, BACKGROUND_RUNTIME, TaskKind};
|
||||
use crate::tenant::mgr::TenantManager;
|
||||
use crate::tenant::size::CalculateSyntheticSizeError;
|
||||
use crate::tenant::tasks::BackgroundLoopKind;
|
||||
use crate::tenant::{LogicalSizeCalculationCause, TenantShard};
|
||||
use crate::tenant::{LogicalSizeCalculationCause, Tenant};
|
||||
|
||||
mod disk_cache;
|
||||
mod metrics;
|
||||
@@ -428,7 +428,7 @@ async fn calculate_synthetic_size_worker(
|
||||
}
|
||||
}
|
||||
|
||||
async fn calculate_and_log(tenant: &TenantShard, cancel: &CancellationToken, ctx: &RequestContext) {
|
||||
async fn calculate_and_log(tenant: &Tenant, cancel: &CancellationToken, ctx: &RequestContext) {
|
||||
const CAUSE: LogicalSizeCalculationCause =
|
||||
LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize;
|
||||
|
||||
|
||||
@@ -175,9 +175,9 @@ impl MetricsKey {
|
||||
.absolute_values()
|
||||
}
|
||||
|
||||
/// [`TenantShard::remote_size`]
|
||||
/// [`Tenant::remote_size`]
|
||||
///
|
||||
/// [`TenantShard::remote_size`]: crate::tenant::TenantShard::remote_size
|
||||
/// [`Tenant::remote_size`]: crate::tenant::Tenant::remote_size
|
||||
const fn remote_storage_size(tenant_id: TenantId) -> AbsoluteValueFactory {
|
||||
MetricsKey {
|
||||
tenant_id,
|
||||
@@ -199,9 +199,9 @@ impl MetricsKey {
|
||||
.absolute_values()
|
||||
}
|
||||
|
||||
/// [`TenantShard::cached_synthetic_size`] as refreshed by [`calculate_synthetic_size_worker`].
|
||||
/// [`Tenant::cached_synthetic_size`] as refreshed by [`calculate_synthetic_size_worker`].
|
||||
///
|
||||
/// [`TenantShard::cached_synthetic_size`]: crate::tenant::TenantShard::cached_synthetic_size
|
||||
/// [`Tenant::cached_synthetic_size`]: crate::tenant::Tenant::cached_synthetic_size
|
||||
/// [`calculate_synthetic_size_worker`]: super::calculate_synthetic_size_worker
|
||||
const fn synthetic_size(tenant_id: TenantId) -> AbsoluteValueFactory {
|
||||
MetricsKey {
|
||||
@@ -254,7 +254,7 @@ pub(super) async fn collect_all_metrics(
|
||||
|
||||
async fn collect<S>(tenants: S, cache: &Cache, ctx: &RequestContext) -> Vec<NewRawMetric>
|
||||
where
|
||||
S: futures::stream::Stream<Item = (TenantId, Arc<crate::tenant::TenantShard>)>,
|
||||
S: futures::stream::Stream<Item = (TenantId, Arc<crate::tenant::Tenant>)>,
|
||||
{
|
||||
let mut current_metrics: Vec<NewRawMetric> = Vec::new();
|
||||
|
||||
@@ -308,7 +308,7 @@ impl TenantSnapshot {
|
||||
///
|
||||
/// `resident_size` is calculated of the timelines we had access to for other metrics, so we
|
||||
/// cannot just list timelines here.
|
||||
fn collect(t: &Arc<crate::tenant::TenantShard>, resident_size: u64) -> Self {
|
||||
fn collect(t: &Arc<crate::tenant::Tenant>, resident_size: u64) -> Self {
|
||||
TenantSnapshot {
|
||||
resident_size,
|
||||
remote_size: t.remote_size(),
|
||||
|
||||
@@ -1873,7 +1873,7 @@ async fn update_tenant_config_handler(
|
||||
&ShardParameters::default(),
|
||||
);
|
||||
|
||||
crate::tenant::TenantShard::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
|
||||
crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
|
||||
.await
|
||||
.map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;
|
||||
|
||||
@@ -1917,7 +1917,7 @@ async fn patch_tenant_config_handler(
|
||||
&ShardParameters::default(),
|
||||
);
|
||||
|
||||
crate::tenant::TenantShard::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
|
||||
crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
|
||||
.await
|
||||
.map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;
|
||||
|
||||
@@ -3253,7 +3253,7 @@ async fn ingest_aux_files(
|
||||
modification
|
||||
.put_file(&fname, content.as_bytes(), &ctx)
|
||||
.await
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
}
|
||||
modification
|
||||
.commit(&ctx)
|
||||
|
||||
@@ -27,7 +27,7 @@ use crate::context::RequestContext;
|
||||
use crate::metrics::WAL_INGEST;
|
||||
use crate::pgdatadir_mapping::*;
|
||||
use crate::tenant::Timeline;
|
||||
use crate::walingest::{WalIngest, WalIngestErrorKind};
|
||||
use crate::walingest::WalIngest;
|
||||
|
||||
// Returns checkpoint LSN from controlfile
|
||||
pub fn get_lsn_from_controlfile(path: &Utf8Path) -> Result<Lsn> {
|
||||
@@ -157,9 +157,9 @@ async fn import_rel(
|
||||
.put_rel_creation(rel, nblocks as u32, ctx)
|
||||
.await
|
||||
{
|
||||
match e.kind {
|
||||
WalIngestErrorKind::RelationAlreadyExists(rel) => {
|
||||
debug!("Relation {rel} already exists. We must be extending it.")
|
||||
match e {
|
||||
RelationError::AlreadyExists => {
|
||||
debug!("Relation {} already exist. We must be extending it.", rel)
|
||||
}
|
||||
_ => return Err(e.into()),
|
||||
}
|
||||
|
||||
@@ -49,7 +49,7 @@ use tracing::{info, info_span};
|
||||
/// backwards-compatible changes to the metadata format.
|
||||
pub const STORAGE_FORMAT_VERSION: u16 = 3;
|
||||
|
||||
pub const DEFAULT_PG_VERSION: u32 = 17;
|
||||
pub const DEFAULT_PG_VERSION: u32 = 16;
|
||||
|
||||
// Magic constants used to identify different kinds of files
|
||||
pub const IMAGE_FILE_MAGIC: u16 = 0x5A60;
|
||||
|
||||
@@ -17,7 +17,7 @@ use metrics::{
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver_api::config::{
|
||||
PageServicePipeliningConfig, PageServicePipeliningConfigPipelined,
|
||||
PageServiceProtocolPipelinedBatchingStrategy, PageServiceProtocolPipelinedExecutionStrategy,
|
||||
PageServiceProtocolPipelinedExecutionStrategy,
|
||||
};
|
||||
use pageserver_api::models::InMemoryLayerInfo;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
@@ -1086,7 +1086,7 @@ pub(crate) static TIMELINE_EPHEMERAL_BYTES: Lazy<UIntGauge> = Lazy::new(|| {
|
||||
.expect("Failed to register metric")
|
||||
});
|
||||
|
||||
/// Metrics related to the lifecycle of a [`crate::tenant::TenantShard`] object: things
|
||||
/// Metrics related to the lifecycle of a [`crate::tenant::Tenant`] object: things
|
||||
/// like how long it took to load.
|
||||
///
|
||||
/// Note that these are process-global metrics, _not_ per-tenant metrics. Per-tenant
|
||||
@@ -1714,28 +1714,6 @@ pub enum SmgrQueryType {
|
||||
Test,
|
||||
}
|
||||
|
||||
#[derive(
|
||||
Debug,
|
||||
Clone,
|
||||
Copy,
|
||||
IntoStaticStr,
|
||||
strum_macros::EnumCount,
|
||||
strum_macros::EnumIter,
|
||||
strum_macros::FromRepr,
|
||||
enum_map::Enum,
|
||||
)]
|
||||
#[strum(serialize_all = "snake_case")]
|
||||
pub enum GetPageBatchBreakReason {
|
||||
BatchFull,
|
||||
NonBatchableRequest,
|
||||
NonUniformLsn,
|
||||
SamePageAtDifferentLsn,
|
||||
NonUniformTimeline,
|
||||
ExecutorSteal,
|
||||
#[cfg(feature = "testing")]
|
||||
NonUniformKey,
|
||||
}
|
||||
|
||||
pub(crate) struct SmgrQueryTimePerTimeline {
|
||||
global_started: [IntCounter; SmgrQueryType::COUNT],
|
||||
global_latency: [Histogram; SmgrQueryType::COUNT],
|
||||
@@ -1747,8 +1725,6 @@ pub(crate) struct SmgrQueryTimePerTimeline {
|
||||
per_timeline_flush_in_progress_micros: IntCounter,
|
||||
global_batch_wait_time: Histogram,
|
||||
per_timeline_batch_wait_time: Histogram,
|
||||
global_batch_break_reason: [IntCounter; GetPageBatchBreakReason::COUNT],
|
||||
per_timeline_batch_break_reason: GetPageBatchBreakReasonTimelineMetrics,
|
||||
throttling: Arc<tenant_throttling::Pagestream>,
|
||||
}
|
||||
|
||||
@@ -1882,55 +1858,12 @@ static PAGE_SERVICE_BATCH_SIZE_PER_TENANT_TIMELINE: Lazy<HistogramVec> = Lazy::n
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static PAGE_SERVICE_BATCH_BREAK_REASON_GLOBAL: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
// it's a counter, but, name is prepared to extend it to a histogram of queue depth
|
||||
"pageserver_page_service_batch_break_reason_global",
|
||||
"Reason for breaking batches of get page requests",
|
||||
&["reason"],
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
struct GetPageBatchBreakReasonTimelineMetrics {
|
||||
map: EnumMap<GetPageBatchBreakReason, IntCounter>,
|
||||
}
|
||||
|
||||
impl GetPageBatchBreakReasonTimelineMetrics {
|
||||
fn new(tenant_id: &str, shard_slug: &str, timeline_id: &str) -> Self {
|
||||
GetPageBatchBreakReasonTimelineMetrics {
|
||||
map: EnumMap::from_array(std::array::from_fn(|reason_idx| {
|
||||
let reason = GetPageBatchBreakReason::from_usize(reason_idx);
|
||||
PAGE_SERVICE_BATCH_BREAK_REASON_PER_TENANT_TIMELINE.with_label_values(&[
|
||||
tenant_id,
|
||||
shard_slug,
|
||||
timeline_id,
|
||||
reason.into(),
|
||||
])
|
||||
})),
|
||||
}
|
||||
}
|
||||
|
||||
fn inc(&self, reason: GetPageBatchBreakReason) {
|
||||
self.map[reason].inc()
|
||||
}
|
||||
}
|
||||
|
||||
static PAGE_SERVICE_BATCH_BREAK_REASON_PER_TENANT_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_page_service_batch_break_reason",
|
||||
"Reason for breaking batches of get page requests",
|
||||
&["tenant_id", "shard_id", "timeline_id", "reason"],
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
register_int_gauge_vec!(
|
||||
"pageserver_page_service_config_max_batch_size",
|
||||
"Configured maximum batch size for the server-side batching functionality of page_service. \
|
||||
Labels expose more of the configuration parameters.",
|
||||
&["mode", "execution", "batching"]
|
||||
&["mode", "execution"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
@@ -1938,11 +1871,10 @@ pub(crate) static PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE: Lazy<IntGaugeVec> = Lazy::
|
||||
fn set_page_service_config_max_batch_size(conf: &PageServicePipeliningConfig) {
|
||||
PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE.reset();
|
||||
let (label_values, value) = match conf {
|
||||
PageServicePipeliningConfig::Serial => (["serial", "-", "-"], 1),
|
||||
PageServicePipeliningConfig::Serial => (["serial", "-"], 1),
|
||||
PageServicePipeliningConfig::Pipelined(PageServicePipeliningConfigPipelined {
|
||||
max_batch_size,
|
||||
execution,
|
||||
batching,
|
||||
}) => {
|
||||
let mode = "pipelined";
|
||||
let execution = match execution {
|
||||
@@ -1951,12 +1883,7 @@ fn set_page_service_config_max_batch_size(conf: &PageServicePipeliningConfig) {
|
||||
}
|
||||
PageServiceProtocolPipelinedExecutionStrategy::Tasks => "tasks",
|
||||
};
|
||||
let batching = match batching {
|
||||
PageServiceProtocolPipelinedBatchingStrategy::UniformLsn => "uniform-lsn",
|
||||
PageServiceProtocolPipelinedBatchingStrategy::ScatteredLsn => "scattered-lsn",
|
||||
};
|
||||
|
||||
([mode, execution, batching], max_batch_size.get())
|
||||
([mode, execution], max_batch_size.get())
|
||||
}
|
||||
};
|
||||
PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE
|
||||
@@ -2052,15 +1979,6 @@ impl SmgrQueryTimePerTimeline {
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id])
|
||||
.unwrap();
|
||||
|
||||
let global_batch_break_reason = std::array::from_fn(|i| {
|
||||
let reason = GetPageBatchBreakReason::from_usize(i);
|
||||
PAGE_SERVICE_BATCH_BREAK_REASON_GLOBAL
|
||||
.get_metric_with_label_values(&[reason.into()])
|
||||
.unwrap()
|
||||
});
|
||||
let per_timeline_batch_break_reason =
|
||||
GetPageBatchBreakReasonTimelineMetrics::new(&tenant_id, &shard_slug, &timeline_id);
|
||||
|
||||
let global_flush_in_progress_micros =
|
||||
PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL.clone();
|
||||
let per_timeline_flush_in_progress_micros = PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS
|
||||
@@ -2078,8 +1996,6 @@ impl SmgrQueryTimePerTimeline {
|
||||
per_timeline_flush_in_progress_micros,
|
||||
global_batch_wait_time,
|
||||
per_timeline_batch_wait_time,
|
||||
global_batch_break_reason,
|
||||
per_timeline_batch_break_reason,
|
||||
throttling: pagestream_throttle_metrics,
|
||||
}
|
||||
}
|
||||
@@ -2108,16 +2024,9 @@ impl SmgrQueryTimePerTimeline {
|
||||
}
|
||||
|
||||
/// TODO: do something about this? seems odd, we have a similar call on SmgrOpTimer
|
||||
pub(crate) fn observe_getpage_batch_start(
|
||||
&self,
|
||||
batch_size: usize,
|
||||
break_reason: GetPageBatchBreakReason,
|
||||
) {
|
||||
pub(crate) fn observe_getpage_batch_start(&self, batch_size: usize) {
|
||||
self.global_batch_size.observe(batch_size as f64);
|
||||
self.per_timeline_batch_size.observe(batch_size as f64);
|
||||
|
||||
self.global_batch_break_reason[break_reason.into_usize()].inc();
|
||||
self.per_timeline_batch_break_reason.inc(break_reason);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3483,15 +3392,6 @@ impl TimelineMetrics {
|
||||
shard_id,
|
||||
timeline_id,
|
||||
]);
|
||||
|
||||
for reason in GetPageBatchBreakReason::iter() {
|
||||
let _ = PAGE_SERVICE_BATCH_BREAK_REASON_PER_TENANT_TIMELINE.remove_label_values(&[
|
||||
tenant_id,
|
||||
shard_id,
|
||||
timeline_id,
|
||||
reason.into(),
|
||||
]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4370,7 +4270,6 @@ pub fn preinitialize_metrics(
|
||||
[
|
||||
&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT,
|
||||
&SMGR_QUERY_STARTED_GLOBAL,
|
||||
&PAGE_SERVICE_BATCH_BREAK_REASON_GLOBAL,
|
||||
]
|
||||
.into_iter()
|
||||
.for_each(|c| {
|
||||
|
||||
@@ -15,11 +15,10 @@ use async_compression::tokio::write::GzipEncoder;
|
||||
use bytes::Buf;
|
||||
use futures::FutureExt;
|
||||
use itertools::Itertools;
|
||||
use jsonwebtoken::TokenData;
|
||||
use once_cell::sync::OnceCell;
|
||||
use pageserver_api::config::{
|
||||
PageServicePipeliningConfig, PageServicePipeliningConfigPipelined,
|
||||
PageServiceProtocolPipelinedBatchingStrategy, PageServiceProtocolPipelinedExecutionStrategy,
|
||||
PageServiceProtocolPipelinedExecutionStrategy,
|
||||
};
|
||||
use pageserver_api::key::rel_block_to_key;
|
||||
use pageserver_api::models::{
|
||||
@@ -59,8 +58,8 @@ use crate::context::{
|
||||
DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder,
|
||||
};
|
||||
use crate::metrics::{
|
||||
self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, GetPageBatchBreakReason, LIVE_CONNECTIONS,
|
||||
SmgrOpTimer, TimelineMetrics,
|
||||
self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, LIVE_CONNECTIONS, SmgrOpTimer,
|
||||
TimelineMetrics,
|
||||
};
|
||||
use crate::pgdatadir_mapping::Version;
|
||||
use crate::span::{
|
||||
@@ -76,7 +75,7 @@ use crate::tenant::timeline::{self, WaitLsnError};
|
||||
use crate::tenant::{GetTimelineError, PageReconstructError, Timeline};
|
||||
use crate::{basebackup, timed_after_cancellation};
|
||||
|
||||
/// How long we may wait for a [`crate::tenant::mgr::TenantSlot::InProgress`] and/or a [`crate::tenant::TenantShard`] which
|
||||
/// How long we may wait for a [`crate::tenant::mgr::TenantSlot::InProgress`] and/or a [`crate::tenant::Tenant`] which
|
||||
/// is not yet in state [`TenantState::Active`].
|
||||
///
|
||||
/// NB: this is a different value than [`crate::http::routes::ACTIVE_TENANT_TIMEOUT`].
|
||||
@@ -106,7 +105,6 @@ pub fn spawn(
|
||||
pg_auth: Option<Arc<SwappableJwtAuth>>,
|
||||
perf_trace_dispatch: Option<Dispatch>,
|
||||
tcp_listener: tokio::net::TcpListener,
|
||||
tls_config: Option<Arc<rustls::ServerConfig>>,
|
||||
) -> Listener {
|
||||
let cancel = CancellationToken::new();
|
||||
let libpq_ctx = RequestContext::todo_child(
|
||||
@@ -126,7 +124,6 @@ pub fn spawn(
|
||||
perf_trace_dispatch,
|
||||
tcp_listener,
|
||||
conf.pg_auth_type,
|
||||
tls_config,
|
||||
conf.page_service_pipelining.clone(),
|
||||
libpq_ctx,
|
||||
cancel.clone(),
|
||||
@@ -184,7 +181,6 @@ pub async fn libpq_listener_main(
|
||||
perf_trace_dispatch: Option<Dispatch>,
|
||||
listener: tokio::net::TcpListener,
|
||||
auth_type: AuthType,
|
||||
tls_config: Option<Arc<rustls::ServerConfig>>,
|
||||
pipelining_config: PageServicePipeliningConfig,
|
||||
listener_ctx: RequestContext,
|
||||
listener_cancel: CancellationToken,
|
||||
@@ -227,7 +223,6 @@ pub async fn libpq_listener_main(
|
||||
local_auth,
|
||||
socket,
|
||||
auth_type,
|
||||
tls_config.clone(),
|
||||
pipelining_config.clone(),
|
||||
connection_ctx,
|
||||
connections_cancel.child_token(),
|
||||
@@ -269,7 +264,6 @@ async fn page_service_conn_main(
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
socket: tokio::net::TcpStream,
|
||||
auth_type: AuthType,
|
||||
tls_config: Option<Arc<rustls::ServerConfig>>,
|
||||
pipelining_config: PageServicePipeliningConfig,
|
||||
connection_ctx: RequestContext,
|
||||
cancel: CancellationToken,
|
||||
@@ -340,8 +334,7 @@ async fn page_service_conn_main(
|
||||
cancel.clone(),
|
||||
gate_guard,
|
||||
);
|
||||
let pgbackend =
|
||||
PostgresBackend::new_from_io(socket_fd, socket, peer_addr, auth_type, tls_config)?;
|
||||
let pgbackend = PostgresBackend::new_from_io(socket_fd, socket, peer_addr, auth_type, None)?;
|
||||
|
||||
match pgbackend.run(&mut conn_handler, &cancel).await {
|
||||
Ok(()) => {
|
||||
@@ -642,7 +635,6 @@ impl std::fmt::Display for BatchedPageStreamError {
|
||||
struct BatchedGetPageRequest {
|
||||
req: PagestreamGetPageRequest,
|
||||
timer: SmgrOpTimer,
|
||||
effective_request_lsn: Lsn,
|
||||
ctx: RequestContext,
|
||||
}
|
||||
|
||||
@@ -672,8 +664,8 @@ enum BatchedFeMessage {
|
||||
GetPage {
|
||||
span: Span,
|
||||
shard: timeline::handle::WeakHandle<TenantManagerTypes>,
|
||||
effective_request_lsn: Lsn,
|
||||
pages: smallvec::SmallVec<[BatchedGetPageRequest; 1]>,
|
||||
batch_break_reason: GetPageBatchBreakReason,
|
||||
},
|
||||
DbSize {
|
||||
span: Span,
|
||||
@@ -726,119 +718,6 @@ impl BatchedFeMessage {
|
||||
BatchedFeMessage::RespondError { .. } => {}
|
||||
}
|
||||
}
|
||||
|
||||
fn should_break_batch(
|
||||
&self,
|
||||
other: &BatchedFeMessage,
|
||||
max_batch_size: NonZeroUsize,
|
||||
batching_strategy: PageServiceProtocolPipelinedBatchingStrategy,
|
||||
) -> Option<GetPageBatchBreakReason> {
|
||||
match (self, other) {
|
||||
(
|
||||
BatchedFeMessage::GetPage {
|
||||
shard: accum_shard,
|
||||
pages: accum_pages,
|
||||
..
|
||||
},
|
||||
BatchedFeMessage::GetPage {
|
||||
shard: this_shard,
|
||||
pages: this_pages,
|
||||
..
|
||||
},
|
||||
) => {
|
||||
assert_eq!(this_pages.len(), 1);
|
||||
if accum_pages.len() >= max_batch_size.get() {
|
||||
trace!(%max_batch_size, "stopping batching because of batch size");
|
||||
assert_eq!(accum_pages.len(), max_batch_size.get());
|
||||
|
||||
return Some(GetPageBatchBreakReason::BatchFull);
|
||||
}
|
||||
if !accum_shard.is_same_handle_as(this_shard) {
|
||||
trace!("stopping batching because timeline object mismatch");
|
||||
// TODO: we _could_ batch & execute each shard separately (and in parallel).
|
||||
// But the current logic for keeping responses in order does not support that.
|
||||
|
||||
return Some(GetPageBatchBreakReason::NonUniformTimeline);
|
||||
}
|
||||
|
||||
match batching_strategy {
|
||||
PageServiceProtocolPipelinedBatchingStrategy::UniformLsn => {
|
||||
if let Some(last_in_batch) = accum_pages.last() {
|
||||
if last_in_batch.effective_request_lsn
|
||||
!= this_pages[0].effective_request_lsn
|
||||
{
|
||||
trace!(
|
||||
accum_lsn = %last_in_batch.effective_request_lsn,
|
||||
this_lsn = %this_pages[0].effective_request_lsn,
|
||||
"stopping batching because LSN changed"
|
||||
);
|
||||
|
||||
return Some(GetPageBatchBreakReason::NonUniformLsn);
|
||||
}
|
||||
}
|
||||
}
|
||||
PageServiceProtocolPipelinedBatchingStrategy::ScatteredLsn => {
|
||||
// The read path doesn't currently support serving the same page at different LSNs.
|
||||
// While technically possible, it's uncertain if the complexity is worth it.
|
||||
// Break the batch if such a case is encountered.
|
||||
let same_page_different_lsn = accum_pages.iter().any(|batched| {
|
||||
batched.req.rel == this_pages[0].req.rel
|
||||
&& batched.req.blkno == this_pages[0].req.blkno
|
||||
&& batched.effective_request_lsn
|
||||
!= this_pages[0].effective_request_lsn
|
||||
});
|
||||
|
||||
if same_page_different_lsn {
|
||||
trace!(
|
||||
rel=%this_pages[0].req.rel,
|
||||
blkno=%this_pages[0].req.blkno,
|
||||
lsn=%this_pages[0].effective_request_lsn,
|
||||
"stopping batching because same page was requested at different LSNs"
|
||||
);
|
||||
|
||||
return Some(GetPageBatchBreakReason::SamePageAtDifferentLsn);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
#[cfg(feature = "testing")]
|
||||
(
|
||||
BatchedFeMessage::Test {
|
||||
shard: accum_shard,
|
||||
requests: accum_requests,
|
||||
..
|
||||
},
|
||||
BatchedFeMessage::Test {
|
||||
shard: this_shard,
|
||||
requests: this_requests,
|
||||
..
|
||||
},
|
||||
) => {
|
||||
assert!(this_requests.len() == 1);
|
||||
if accum_requests.len() >= max_batch_size.get() {
|
||||
trace!(%max_batch_size, "stopping batching because of batch size");
|
||||
assert_eq!(accum_requests.len(), max_batch_size.get());
|
||||
return Some(GetPageBatchBreakReason::BatchFull);
|
||||
}
|
||||
if !accum_shard.is_same_handle_as(this_shard) {
|
||||
trace!("stopping batching because timeline object mismatch");
|
||||
// TODO: we _could_ batch & execute each shard separately (and in parallel).
|
||||
// But the current logic for keeping responses in order does not support that.
|
||||
return Some(GetPageBatchBreakReason::NonUniformTimeline);
|
||||
}
|
||||
let this_batch_key = this_requests[0].req.batch_key;
|
||||
let accum_batch_key = accum_requests[0].req.batch_key;
|
||||
if this_requests[0].req.batch_key != accum_requests[0].req.batch_key {
|
||||
trace!(%accum_batch_key, %this_batch_key, "stopping batching because batch key changed");
|
||||
return Some(GetPageBatchBreakReason::NonUniformKey);
|
||||
}
|
||||
None
|
||||
}
|
||||
(_, _) => Some(GetPageBatchBreakReason::NonBatchableRequest),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl PageServerHandler {
|
||||
@@ -1140,32 +1019,34 @@ impl PageServerHandler {
|
||||
.await?;
|
||||
|
||||
// We're holding the Handle
|
||||
let effective_request_lsn = match Self::effective_request_lsn(
|
||||
// TODO: if we actually need to wait for lsn here, it delays the entire batch which doesn't need to wait
|
||||
let res = Self::wait_or_get_last_lsn(
|
||||
&shard,
|
||||
shard.get_last_record_lsn(),
|
||||
req.hdr.request_lsn,
|
||||
req.hdr.not_modified_since,
|
||||
&shard.get_applied_gc_cutoff_lsn(),
|
||||
) {
|
||||
&ctx,
|
||||
)
|
||||
.maybe_perf_instrument(&ctx, |current_perf_span| {
|
||||
info_span!(
|
||||
target: PERF_TRACE_TARGET,
|
||||
parent: current_perf_span,
|
||||
"WAIT_LSN",
|
||||
)
|
||||
})
|
||||
.await;
|
||||
|
||||
let effective_request_lsn = match res {
|
||||
Ok(lsn) => lsn,
|
||||
Err(e) => {
|
||||
return respond_error!(span, e);
|
||||
}
|
||||
};
|
||||
|
||||
BatchedFeMessage::GetPage {
|
||||
span,
|
||||
shard: shard.downgrade(),
|
||||
pages: smallvec::smallvec![BatchedGetPageRequest {
|
||||
req,
|
||||
timer,
|
||||
effective_request_lsn,
|
||||
ctx,
|
||||
}],
|
||||
// The executor grabs the batch when it becomes idle.
|
||||
// Hence, [`GetPageBatchBreakReason::ExecutorSteal`] is the
|
||||
// default reason for breaking the batch.
|
||||
batch_break_reason: GetPageBatchBreakReason::ExecutorSteal,
|
||||
effective_request_lsn,
|
||||
pages: smallvec::smallvec![BatchedGetPageRequest { req, timer, ctx }],
|
||||
}
|
||||
}
|
||||
#[cfg(feature = "testing")]
|
||||
@@ -1191,7 +1072,6 @@ impl PageServerHandler {
|
||||
#[instrument(skip_all, level = tracing::Level::TRACE)]
|
||||
#[allow(clippy::boxed_local)]
|
||||
fn pagestream_do_batch(
|
||||
batching_strategy: PageServiceProtocolPipelinedBatchingStrategy,
|
||||
max_batch_size: NonZeroUsize,
|
||||
batch: &mut Result<BatchedFeMessage, QueryError>,
|
||||
this_msg: Result<BatchedFeMessage, QueryError>,
|
||||
@@ -1203,58 +1083,89 @@ impl PageServerHandler {
|
||||
Err(e) => return Err(Err(e)),
|
||||
};
|
||||
|
||||
let eligible_batch = match batch {
|
||||
Ok(b) => b,
|
||||
Err(_) => {
|
||||
return Err(Ok(this_msg));
|
||||
}
|
||||
};
|
||||
|
||||
let batch_break =
|
||||
eligible_batch.should_break_batch(&this_msg, max_batch_size, batching_strategy);
|
||||
|
||||
match batch_break {
|
||||
Some(reason) => {
|
||||
if let BatchedFeMessage::GetPage {
|
||||
batch_break_reason, ..
|
||||
} = eligible_batch
|
||||
{
|
||||
*batch_break_reason = reason;
|
||||
match (&mut *batch, this_msg) {
|
||||
// something batched already, let's see if we can add this message to the batch
|
||||
(
|
||||
Ok(BatchedFeMessage::GetPage {
|
||||
span: _,
|
||||
shard: accum_shard,
|
||||
pages: accum_pages,
|
||||
effective_request_lsn: accum_lsn,
|
||||
}),
|
||||
BatchedFeMessage::GetPage {
|
||||
span: _,
|
||||
shard: this_shard,
|
||||
pages: this_pages,
|
||||
effective_request_lsn: this_lsn,
|
||||
},
|
||||
) if (|| {
|
||||
assert_eq!(this_pages.len(), 1);
|
||||
if accum_pages.len() >= max_batch_size.get() {
|
||||
trace!(%accum_lsn, %this_lsn, %max_batch_size, "stopping batching because of batch size");
|
||||
assert_eq!(accum_pages.len(), max_batch_size.get());
|
||||
return false;
|
||||
}
|
||||
|
||||
Err(Ok(this_msg))
|
||||
}
|
||||
None => {
|
||||
if !accum_shard.is_same_handle_as(&this_shard) {
|
||||
trace!(%accum_lsn, %this_lsn, "stopping batching because timeline object mismatch");
|
||||
// TODO: we _could_ batch & execute each shard separately (and in parallel).
|
||||
// But the current logic for keeping responses in order does not support that.
|
||||
return false;
|
||||
}
|
||||
// the vectored get currently only supports a single LSN, so, bounce as soon
|
||||
// as the effective request_lsn changes
|
||||
if *accum_lsn != this_lsn {
|
||||
trace!(%accum_lsn, %this_lsn, "stopping batching because LSN changed");
|
||||
return false;
|
||||
}
|
||||
true
|
||||
})() =>
|
||||
{
|
||||
// ok to batch
|
||||
match (eligible_batch, this_msg) {
|
||||
(
|
||||
BatchedFeMessage::GetPage {
|
||||
pages: accum_pages, ..
|
||||
},
|
||||
BatchedFeMessage::GetPage {
|
||||
pages: this_pages, ..
|
||||
},
|
||||
) => {
|
||||
accum_pages.extend(this_pages);
|
||||
Ok(())
|
||||
}
|
||||
#[cfg(feature = "testing")]
|
||||
(
|
||||
BatchedFeMessage::Test {
|
||||
requests: accum_requests,
|
||||
..
|
||||
},
|
||||
BatchedFeMessage::Test {
|
||||
requests: this_requests,
|
||||
..
|
||||
},
|
||||
) => {
|
||||
accum_requests.extend(this_requests);
|
||||
Ok(())
|
||||
}
|
||||
// Shape guaranteed by [`BatchedFeMessage::should_break_batch`]
|
||||
_ => unreachable!(),
|
||||
accum_pages.extend(this_pages);
|
||||
Ok(())
|
||||
}
|
||||
#[cfg(feature = "testing")]
|
||||
(
|
||||
Ok(BatchedFeMessage::Test {
|
||||
shard: accum_shard,
|
||||
requests: accum_requests,
|
||||
..
|
||||
}),
|
||||
BatchedFeMessage::Test {
|
||||
shard: this_shard,
|
||||
requests: this_requests,
|
||||
..
|
||||
},
|
||||
) if (|| {
|
||||
assert!(this_requests.len() == 1);
|
||||
if accum_requests.len() >= max_batch_size.get() {
|
||||
trace!(%max_batch_size, "stopping batching because of batch size");
|
||||
assert_eq!(accum_requests.len(), max_batch_size.get());
|
||||
return false;
|
||||
}
|
||||
if !accum_shard.is_same_handle_as(&this_shard) {
|
||||
trace!("stopping batching because timeline object mismatch");
|
||||
// TODO: we _could_ batch & execute each shard separately (and in parallel).
|
||||
// But the current logic for keeping responses in order does not support that.
|
||||
return false;
|
||||
}
|
||||
let this_batch_key = this_requests[0].req.batch_key;
|
||||
let accum_batch_key = accum_requests[0].req.batch_key;
|
||||
if this_requests[0].req.batch_key != accum_requests[0].req.batch_key {
|
||||
trace!(%accum_batch_key, %this_batch_key, "stopping batching because batch key changed");
|
||||
return false;
|
||||
}
|
||||
true
|
||||
})() =>
|
||||
{
|
||||
// ok to batch
|
||||
accum_requests.extend(this_requests);
|
||||
Ok(())
|
||||
}
|
||||
// something batched already but this message is unbatchable
|
||||
(_, this_msg) => {
|
||||
// by default, don't continue batching
|
||||
Err(Ok(this_msg))
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1476,8 +1387,8 @@ impl PageServerHandler {
|
||||
BatchedFeMessage::GetPage {
|
||||
span,
|
||||
shard,
|
||||
effective_request_lsn,
|
||||
pages,
|
||||
batch_break_reason,
|
||||
} => {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::getpage");
|
||||
let (shard, ctx) = upgrade_handle_and_set_context!(shard);
|
||||
@@ -1488,9 +1399,9 @@ impl PageServerHandler {
|
||||
let res = self
|
||||
.handle_get_page_at_lsn_request_batched(
|
||||
&shard,
|
||||
effective_request_lsn,
|
||||
pages,
|
||||
io_concurrency,
|
||||
batch_break_reason,
|
||||
&ctx,
|
||||
)
|
||||
.instrument(span.clone())
|
||||
@@ -1807,7 +1718,6 @@ impl PageServerHandler {
|
||||
let PageServicePipeliningConfigPipelined {
|
||||
max_batch_size,
|
||||
execution,
|
||||
batching: batching_strategy,
|
||||
} = pipelining_config;
|
||||
|
||||
// Macro to _define_ a pipeline stage.
|
||||
@@ -1859,7 +1769,7 @@ impl PageServerHandler {
|
||||
exit |= read_res.is_err();
|
||||
let could_send = batch_tx
|
||||
.send(read_res, |batch, res| {
|
||||
Self::pagestream_do_batch(batching_strategy, max_batch_size, batch, res)
|
||||
Self::pagestream_do_batch(max_batch_size, batch, res)
|
||||
})
|
||||
.await;
|
||||
exit |= could_send.is_err();
|
||||
@@ -1955,39 +1865,7 @@ impl PageServerHandler {
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Lsn, PageStreamError> {
|
||||
let last_record_lsn = timeline.get_last_record_lsn();
|
||||
let effective_request_lsn = Self::effective_request_lsn(
|
||||
timeline,
|
||||
last_record_lsn,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
latest_gc_cutoff_lsn,
|
||||
)?;
|
||||
|
||||
if effective_request_lsn > last_record_lsn {
|
||||
timeline
|
||||
.wait_lsn(
|
||||
not_modified_since,
|
||||
crate::tenant::timeline::WaitLsnWaiter::PageService,
|
||||
timeline::WaitLsnTimeout::Default,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
// Since we waited for 'effective_request_lsn' to arrive, that is now the last
|
||||
// record LSN. (Or close enough for our purposes; the last-record LSN can
|
||||
// advance immediately after we return anyway)
|
||||
}
|
||||
|
||||
Ok(effective_request_lsn)
|
||||
}
|
||||
|
||||
fn effective_request_lsn(
|
||||
timeline: &Timeline,
|
||||
last_record_lsn: Lsn,
|
||||
request_lsn: Lsn,
|
||||
not_modified_since: Lsn,
|
||||
latest_gc_cutoff_lsn: &RcuReadGuard<Lsn>,
|
||||
) -> Result<Lsn, PageStreamError> {
|
||||
// Sanity check the request
|
||||
if request_lsn < not_modified_since {
|
||||
return Err(PageStreamError::BadRequest(
|
||||
@@ -2022,7 +1900,19 @@ impl PageServerHandler {
|
||||
}
|
||||
}
|
||||
|
||||
// Wait for WAL up to 'not_modified_since' to arrive, if necessary
|
||||
if not_modified_since > last_record_lsn {
|
||||
timeline
|
||||
.wait_lsn(
|
||||
not_modified_since,
|
||||
crate::tenant::timeline::WaitLsnWaiter::PageService,
|
||||
timeline::WaitLsnTimeout::Default,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
// Since we waited for 'not_modified_since' to arrive, that is now the last
|
||||
// record LSN. (Or close enough for our purposes; the last-record LSN can
|
||||
// advance immediately after we return anyway)
|
||||
Ok(not_modified_since)
|
||||
} else {
|
||||
// It might be better to use max(not_modified_since, latest_gc_cutoff_lsn)
|
||||
@@ -2177,16 +2067,16 @@ impl PageServerHandler {
|
||||
async fn handle_get_page_at_lsn_request_batched(
|
||||
&mut self,
|
||||
timeline: &Timeline,
|
||||
effective_lsn: Lsn,
|
||||
requests: smallvec::SmallVec<[BatchedGetPageRequest; 1]>,
|
||||
io_concurrency: IoConcurrency,
|
||||
batch_break_reason: GetPageBatchBreakReason,
|
||||
ctx: &RequestContext,
|
||||
) -> Vec<Result<(PagestreamBeMessage, SmgrOpTimer), BatchedPageStreamError>> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
timeline
|
||||
.query_metrics
|
||||
.observe_getpage_batch_start(requests.len(), batch_break_reason);
|
||||
.observe_getpage_batch_start(requests.len());
|
||||
|
||||
// If a page trace is running, submit an event for this request.
|
||||
if let Some(page_trace) = timeline.page_trace.load().as_ref() {
|
||||
@@ -2196,81 +2086,20 @@ impl PageServerHandler {
|
||||
// Ignore error (trace buffer may be full or tracer may have disconnected).
|
||||
_ = page_trace.try_send(PageTraceEvent {
|
||||
key,
|
||||
effective_lsn: batch.effective_request_lsn,
|
||||
effective_lsn,
|
||||
time,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// If any request in the batch needs to wait for LSN, then do so now.
|
||||
let mut perf_instrument = false;
|
||||
let max_effective_lsn = requests
|
||||
.iter()
|
||||
.map(|req| {
|
||||
if req.ctx.has_perf_span() {
|
||||
perf_instrument = true;
|
||||
}
|
||||
|
||||
req.effective_request_lsn
|
||||
})
|
||||
.max()
|
||||
.expect("batch is never empty");
|
||||
|
||||
let ctx = match perf_instrument {
|
||||
true => RequestContextBuilder::from(ctx)
|
||||
.root_perf_span(|| {
|
||||
info_span!(
|
||||
target: PERF_TRACE_TARGET,
|
||||
"GET_VECTORED",
|
||||
tenant_id = %timeline.tenant_shard_id.tenant_id,
|
||||
timeline_id = %timeline.timeline_id,
|
||||
shard = %timeline.tenant_shard_id.shard_slug(),
|
||||
%max_effective_lsn
|
||||
)
|
||||
})
|
||||
.attached_child(),
|
||||
false => ctx.attached_child(),
|
||||
};
|
||||
|
||||
let last_record_lsn = timeline.get_last_record_lsn();
|
||||
if max_effective_lsn > last_record_lsn {
|
||||
if let Err(e) = timeline
|
||||
.wait_lsn(
|
||||
max_effective_lsn,
|
||||
crate::tenant::timeline::WaitLsnWaiter::PageService,
|
||||
timeline::WaitLsnTimeout::Default,
|
||||
&ctx,
|
||||
)
|
||||
.maybe_perf_instrument(&ctx, |current_perf_span| {
|
||||
info_span!(
|
||||
target: PERF_TRACE_TARGET,
|
||||
parent: current_perf_span,
|
||||
"WAIT_LSN",
|
||||
)
|
||||
})
|
||||
.await
|
||||
{
|
||||
return Vec::from_iter(requests.into_iter().map(|req| {
|
||||
Err(BatchedPageStreamError {
|
||||
err: PageStreamError::from(e.clone()),
|
||||
req: req.req.hdr,
|
||||
})
|
||||
}));
|
||||
}
|
||||
}
|
||||
|
||||
let results = timeline
|
||||
.get_rel_page_at_lsn_batched(
|
||||
requests.iter().map(|p| {
|
||||
(
|
||||
&p.req.rel,
|
||||
&p.req.blkno,
|
||||
p.effective_request_lsn,
|
||||
p.ctx.attached_child(),
|
||||
)
|
||||
}),
|
||||
requests
|
||||
.iter()
|
||||
.map(|p| (&p.req.rel, &p.req.blkno, p.ctx.attached_child())),
|
||||
effective_lsn,
|
||||
io_concurrency,
|
||||
&ctx,
|
||||
ctx,
|
||||
)
|
||||
.await;
|
||||
assert_eq!(results.len(), requests.len());
|
||||
@@ -2838,7 +2667,7 @@ where
|
||||
) -> Result<(), QueryError> {
|
||||
// this unwrap is never triggered, because check_auth_jwt only called when auth_type is NeonJWT
|
||||
// which requires auth to be present
|
||||
let data: TokenData<Claims> = self
|
||||
let data = self
|
||||
.auth
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
|
||||
@@ -6,14 +6,14 @@
|
||||
//! walingest.rs handles a few things like implicit relation creation and extension.
|
||||
//! Clarify that)
|
||||
//!
|
||||
use std::collections::{HashMap, HashSet, hash_map};
|
||||
use std::collections::{BTreeMap, HashMap, HashSet, hash_map};
|
||||
use std::ops::{ControlFlow, Range};
|
||||
|
||||
use crate::walingest::{WalIngestError, WalIngestErrorKind};
|
||||
use crate::{PERF_TRACE_TARGET, ensure_walingest};
|
||||
use anyhow::Context;
|
||||
use crate::PERF_TRACE_TARGET;
|
||||
use anyhow::{Context, ensure};
|
||||
use bytes::{Buf, Bytes, BytesMut};
|
||||
use enum_map::Enum;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::key::{
|
||||
AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, CompactKey, DBDIR_KEY, Key, RelDirExists,
|
||||
TWOPHASEDIR_KEY, dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range,
|
||||
@@ -21,7 +21,7 @@ use pageserver_api::key::{
|
||||
repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key,
|
||||
slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
|
||||
};
|
||||
use pageserver_api::keyspace::{KeySpaceRandomAccum, SparseKeySpace};
|
||||
use pageserver_api::keyspace::SparseKeySpace;
|
||||
use pageserver_api::models::RelSizeMigration;
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
|
||||
@@ -40,7 +40,7 @@ use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta};
|
||||
|
||||
use super::tenant::{PageReconstructError, Timeline};
|
||||
use crate::aux_file;
|
||||
use crate::context::{PerfInstrumentFutureExt, RequestContext};
|
||||
use crate::context::{PerfInstrumentFutureExt, RequestContext, RequestContextBuilder};
|
||||
use crate::keyspace::{KeySpace, KeySpaceAccum};
|
||||
use crate::metrics::{
|
||||
RELSIZE_CACHE_ENTRIES, RELSIZE_CACHE_HITS, RELSIZE_CACHE_MISSES, RELSIZE_CACHE_MISSES_OLD,
|
||||
@@ -50,7 +50,7 @@ use crate::span::{
|
||||
debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id,
|
||||
};
|
||||
use crate::tenant::storage_layer::IoConcurrency;
|
||||
use crate::tenant::timeline::{GetVectoredError, VersionedKeySpaceQuery};
|
||||
use crate::tenant::timeline::GetVectoredError;
|
||||
|
||||
/// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached.
|
||||
pub const MAX_AUX_FILE_DELTAS: usize = 1024;
|
||||
@@ -136,8 +136,12 @@ impl From<PageReconstructError> for CalculateLogicalSizeError {
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum RelationError {
|
||||
#[error("Relation Already Exists")]
|
||||
AlreadyExists,
|
||||
#[error("invalid relnode")]
|
||||
InvalidRelnode,
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
///
|
||||
@@ -206,9 +210,10 @@ impl Timeline {
|
||||
let pages: smallvec::SmallVec<[_; 1]> = smallvec::smallvec![(tag, blknum)];
|
||||
let res = self
|
||||
.get_rel_page_at_lsn_batched(
|
||||
pages.iter().map(|(tag, blknum)| {
|
||||
(tag, blknum, effective_lsn, ctx.attached_child())
|
||||
}),
|
||||
pages
|
||||
.iter()
|
||||
.map(|(tag, blknum)| (tag, blknum, ctx.attached_child())),
|
||||
effective_lsn,
|
||||
io_concurrency.clone(),
|
||||
ctx,
|
||||
)
|
||||
@@ -246,7 +251,8 @@ impl Timeline {
|
||||
/// The ordering of the returned vec corresponds to the ordering of `pages`.
|
||||
pub(crate) async fn get_rel_page_at_lsn_batched(
|
||||
&self,
|
||||
pages: impl ExactSizeIterator<Item = (&RelTag, &BlockNumber, Lsn, RequestContext)>,
|
||||
pages: impl ExactSizeIterator<Item = (&RelTag, &BlockNumber, RequestContext)>,
|
||||
effective_lsn: Lsn,
|
||||
io_concurrency: IoConcurrency,
|
||||
ctx: &RequestContext,
|
||||
) -> Vec<Result<Bytes, PageReconstructError>> {
|
||||
@@ -259,13 +265,11 @@ impl Timeline {
|
||||
let mut result = Vec::with_capacity(pages.len());
|
||||
let result_slots = result.spare_capacity_mut();
|
||||
|
||||
let mut keys_slots: HashMap<Key, smallvec::SmallVec<[(usize, RequestContext); 1]>> =
|
||||
HashMap::with_capacity(pages.len());
|
||||
let mut keys_slots: BTreeMap<Key, smallvec::SmallVec<[(usize, RequestContext); 1]>> =
|
||||
BTreeMap::default();
|
||||
|
||||
let mut req_keyspaces: HashMap<Lsn, KeySpaceRandomAccum> =
|
||||
HashMap::with_capacity(pages.len());
|
||||
|
||||
for (response_slot_idx, (tag, blknum, lsn, ctx)) in pages.enumerate() {
|
||||
let mut perf_instrument = false;
|
||||
for (response_slot_idx, (tag, blknum, ctx)) in pages.enumerate() {
|
||||
if tag.relnode == 0 {
|
||||
result_slots[response_slot_idx].write(Err(PageReconstructError::Other(
|
||||
RelationError::InvalidRelnode.into(),
|
||||
@@ -276,14 +280,14 @@ impl Timeline {
|
||||
}
|
||||
|
||||
let nblocks = match self
|
||||
.get_rel_size(*tag, Version::Lsn(lsn), &ctx)
|
||||
.get_rel_size(*tag, Version::Lsn(effective_lsn), &ctx)
|
||||
.maybe_perf_instrument(&ctx, |crnt_perf_span| {
|
||||
info_span!(
|
||||
target: PERF_TRACE_TARGET,
|
||||
parent: crnt_perf_span,
|
||||
"GET_REL_SIZE",
|
||||
reltag=%tag,
|
||||
lsn=%lsn,
|
||||
lsn=%effective_lsn,
|
||||
)
|
||||
})
|
||||
.await
|
||||
@@ -299,7 +303,7 @@ impl Timeline {
|
||||
if *blknum >= nblocks {
|
||||
debug!(
|
||||
"read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page",
|
||||
tag, blknum, lsn, nblocks
|
||||
tag, blknum, effective_lsn, nblocks
|
||||
);
|
||||
result_slots[response_slot_idx].write(Ok(ZERO_PAGE.clone()));
|
||||
slots_filled += 1;
|
||||
@@ -308,29 +312,46 @@ impl Timeline {
|
||||
|
||||
let key = rel_block_to_key(*tag, *blknum);
|
||||
|
||||
if ctx.has_perf_span() {
|
||||
perf_instrument = true;
|
||||
}
|
||||
|
||||
let key_slots = keys_slots.entry(key).or_default();
|
||||
key_slots.push((response_slot_idx, ctx));
|
||||
|
||||
let acc = req_keyspaces.entry(lsn).or_default();
|
||||
acc.add_key(key);
|
||||
}
|
||||
|
||||
let query: Vec<(Lsn, KeySpace)> = req_keyspaces
|
||||
.into_iter()
|
||||
.map(|(lsn, acc)| (lsn, acc.to_keyspace()))
|
||||
.collect();
|
||||
let keyspace = {
|
||||
// add_key requires monotonicity
|
||||
let mut acc = KeySpaceAccum::new();
|
||||
for key in keys_slots
|
||||
.keys()
|
||||
// in fact it requires strong monotonicity
|
||||
.dedup()
|
||||
{
|
||||
acc.add_key(*key);
|
||||
}
|
||||
acc.to_keyspace()
|
||||
};
|
||||
|
||||
let ctx = match perf_instrument {
|
||||
true => RequestContextBuilder::from(ctx)
|
||||
.root_perf_span(|| {
|
||||
info_span!(
|
||||
target: PERF_TRACE_TARGET,
|
||||
"GET_VECTORED",
|
||||
tenant_id = %self.tenant_shard_id.tenant_id,
|
||||
timeline_id = %self.timeline_id,
|
||||
lsn = %effective_lsn,
|
||||
shard = %self.tenant_shard_id.shard_slug(),
|
||||
)
|
||||
})
|
||||
.attached_child(),
|
||||
false => ctx.attached_child(),
|
||||
};
|
||||
|
||||
let query = VersionedKeySpaceQuery::scattered(query);
|
||||
let res = self
|
||||
.get_vectored(query, io_concurrency, ctx)
|
||||
.maybe_perf_instrument(ctx, |current_perf_span| {
|
||||
info_span!(
|
||||
target: PERF_TRACE_TARGET,
|
||||
parent: current_perf_span,
|
||||
"GET_BATCH",
|
||||
batch_size = %page_count,
|
||||
)
|
||||
})
|
||||
.get_vectored(keyspace, effective_lsn, io_concurrency, &ctx)
|
||||
.maybe_perf_instrument(&ctx, |current_perf_span| current_perf_span.clone())
|
||||
.await;
|
||||
|
||||
match res {
|
||||
@@ -360,12 +381,12 @@ impl Timeline {
|
||||
// There is no standardized way to express that the batched span followed from N request spans.
|
||||
// So, abuse the system and mark the request contexts as follows_from the batch span, so we get
|
||||
// some linkage in our trace viewer. It allows us to answer: which GET_VECTORED did this GET_PAGE wait for.
|
||||
req_ctx.perf_follows_from(ctx);
|
||||
req_ctx.perf_follows_from(&ctx);
|
||||
slots_filled += 1;
|
||||
}
|
||||
|
||||
result_slots[first_slot].write(res);
|
||||
first_req_ctx.perf_follows_from(ctx);
|
||||
first_req_ctx.perf_follows_from(&ctx);
|
||||
slots_filled += 1;
|
||||
}
|
||||
}
|
||||
@@ -404,7 +425,7 @@ impl Timeline {
|
||||
}
|
||||
};
|
||||
|
||||
req_ctx.perf_follows_from(ctx);
|
||||
req_ctx.perf_follows_from(&ctx);
|
||||
result_slots[*slot].write(err);
|
||||
}
|
||||
|
||||
@@ -643,9 +664,8 @@ impl Timeline {
|
||||
|
||||
let mut segment = BytesMut::with_capacity(n_blocks as usize * BLCKSZ as usize);
|
||||
for batch in batches.parts {
|
||||
let query = VersionedKeySpaceQuery::uniform(batch, lsn);
|
||||
let blocks = self
|
||||
.get_vectored(query, io_concurrency.clone(), ctx)
|
||||
.get_vectored(batch, lsn, io_concurrency.clone(), ctx)
|
||||
.await?;
|
||||
|
||||
for (_key, block) in blocks {
|
||||
@@ -882,9 +902,8 @@ impl Timeline {
|
||||
);
|
||||
|
||||
for batch in batches.parts.into_iter().rev() {
|
||||
let query = VersionedKeySpaceQuery::uniform(batch, probe_lsn);
|
||||
let blocks = self
|
||||
.get_vectored(query, io_concurrency.clone(), ctx)
|
||||
.get_vectored(batch, probe_lsn, io_concurrency.clone(), ctx)
|
||||
.await?;
|
||||
|
||||
for (_key, clog_page) in blocks.into_iter().rev() {
|
||||
@@ -1459,8 +1478,8 @@ impl DatadirModification<'_> {
|
||||
}
|
||||
|
||||
/// Set the current lsn
|
||||
pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> Result<(), WalIngestError> {
|
||||
ensure_walingest!(
|
||||
pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> {
|
||||
ensure!(
|
||||
lsn >= self.lsn,
|
||||
"setting an older lsn {} than {} is not allowed",
|
||||
lsn,
|
||||
@@ -1559,7 +1578,7 @@ impl DatadirModification<'_> {
|
||||
&mut self,
|
||||
rel: RelTag,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<u32, WalIngestError> {
|
||||
) -> Result<u32, PageReconstructError> {
|
||||
// Get current size and put rel creation if rel doesn't exist
|
||||
//
|
||||
// NOTE: we check the cache first even though get_rel_exists and get_rel_size would
|
||||
@@ -1574,13 +1593,14 @@ impl DatadirModification<'_> {
|
||||
.await?
|
||||
{
|
||||
// create it with 0 size initially, the logic below will extend it
|
||||
self.put_rel_creation(rel, 0, ctx).await?;
|
||||
self.put_rel_creation(rel, 0, ctx)
|
||||
.await
|
||||
.context("Relation Error")?;
|
||||
Ok(0)
|
||||
} else {
|
||||
Ok(self
|
||||
.tline
|
||||
self.tline
|
||||
.get_rel_size(rel, Version::Modified(self), ctx)
|
||||
.await?)
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1617,14 +1637,11 @@ impl DatadirModification<'_> {
|
||||
// TODO(vlad): remove this argument and replace the shard check with is_key_local
|
||||
shard: &ShardIdentity,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), WalIngestError> {
|
||||
) -> anyhow::Result<()> {
|
||||
let mut gaps_at_lsns = Vec::default();
|
||||
|
||||
for meta in batch.metadata.iter() {
|
||||
let key = Key::from_compact(meta.key());
|
||||
let (rel, blkno) = key
|
||||
.to_rel_block()
|
||||
.map_err(|_| WalIngestErrorKind::InvalidKey(key, meta.lsn()))?;
|
||||
let (rel, blkno) = Key::from_compact(meta.key()).to_rel_block()?;
|
||||
let new_nblocks = blkno + 1;
|
||||
|
||||
let old_nblocks = self.create_relation_if_required(rel, ctx).await?;
|
||||
@@ -1666,8 +1683,8 @@ impl DatadirModification<'_> {
|
||||
rel: RelTag,
|
||||
blknum: BlockNumber,
|
||||
rec: NeonWalRecord,
|
||||
) -> Result<(), WalIngestError> {
|
||||
ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);
|
||||
) -> anyhow::Result<()> {
|
||||
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
|
||||
self.put(rel_block_to_key(rel, blknum), Value::WalRecord(rec));
|
||||
Ok(())
|
||||
}
|
||||
@@ -1679,7 +1696,7 @@ impl DatadirModification<'_> {
|
||||
segno: u32,
|
||||
blknum: BlockNumber,
|
||||
rec: NeonWalRecord,
|
||||
) -> Result<(), WalIngestError> {
|
||||
) -> anyhow::Result<()> {
|
||||
if !self.tline.tenant_shard_id.is_shard_zero() {
|
||||
return Ok(());
|
||||
}
|
||||
@@ -1697,11 +1714,14 @@ impl DatadirModification<'_> {
|
||||
rel: RelTag,
|
||||
blknum: BlockNumber,
|
||||
img: Bytes,
|
||||
) -> Result<(), WalIngestError> {
|
||||
ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);
|
||||
) -> anyhow::Result<()> {
|
||||
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
|
||||
let key = rel_block_to_key(rel, blknum);
|
||||
if !key.is_valid_key_on_write_path() {
|
||||
Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?;
|
||||
anyhow::bail!(
|
||||
"the request contains data not supported by pageserver at {}",
|
||||
key
|
||||
);
|
||||
}
|
||||
self.put(rel_block_to_key(rel, blknum), Value::Image(img));
|
||||
Ok(())
|
||||
@@ -1713,12 +1733,15 @@ impl DatadirModification<'_> {
|
||||
segno: u32,
|
||||
blknum: BlockNumber,
|
||||
img: Bytes,
|
||||
) -> Result<(), WalIngestError> {
|
||||
) -> anyhow::Result<()> {
|
||||
assert!(self.tline.tenant_shard_id.is_shard_zero());
|
||||
|
||||
let key = slru_block_to_key(kind, segno, blknum);
|
||||
if !key.is_valid_key_on_write_path() {
|
||||
Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?;
|
||||
anyhow::bail!(
|
||||
"the request contains data not supported by pageserver at {}",
|
||||
key
|
||||
);
|
||||
}
|
||||
self.put(key, Value::Image(img));
|
||||
Ok(())
|
||||
@@ -1728,11 +1751,15 @@ impl DatadirModification<'_> {
|
||||
&mut self,
|
||||
rel: RelTag,
|
||||
blknum: BlockNumber,
|
||||
) -> Result<(), WalIngestError> {
|
||||
ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);
|
||||
) -> anyhow::Result<()> {
|
||||
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
|
||||
let key = rel_block_to_key(rel, blknum);
|
||||
if !key.is_valid_key_on_write_path() {
|
||||
Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?;
|
||||
anyhow::bail!(
|
||||
"the request contains data not supported by pageserver: {} @ {}",
|
||||
key,
|
||||
self.lsn
|
||||
);
|
||||
}
|
||||
|
||||
let batch = self
|
||||
@@ -1749,11 +1776,15 @@ impl DatadirModification<'_> {
|
||||
kind: SlruKind,
|
||||
segno: u32,
|
||||
blknum: BlockNumber,
|
||||
) -> Result<(), WalIngestError> {
|
||||
) -> anyhow::Result<()> {
|
||||
assert!(self.tline.tenant_shard_id.is_shard_zero());
|
||||
let key = slru_block_to_key(kind, segno, blknum);
|
||||
if !key.is_valid_key_on_write_path() {
|
||||
Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?;
|
||||
anyhow::bail!(
|
||||
"the request contains data not supported by pageserver: {} @ {}",
|
||||
key,
|
||||
self.lsn
|
||||
);
|
||||
}
|
||||
|
||||
let batch = self
|
||||
@@ -1801,10 +1832,8 @@ impl DatadirModification<'_> {
|
||||
dbnode: Oid,
|
||||
img: Bytes,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), WalIngestError> {
|
||||
let v2_enabled = self
|
||||
.maybe_enable_rel_size_v2()
|
||||
.map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?;
|
||||
) -> anyhow::Result<()> {
|
||||
let v2_enabled = self.maybe_enable_rel_size_v2()?;
|
||||
|
||||
// Add it to the directory (if it doesn't exist already)
|
||||
let buf = self.get(DBDIR_KEY, ctx).await?;
|
||||
@@ -1845,13 +1874,13 @@ impl DatadirModification<'_> {
|
||||
xid: u64,
|
||||
img: Bytes,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), WalIngestError> {
|
||||
) -> anyhow::Result<()> {
|
||||
// Add it to the directory entry
|
||||
let dirbuf = self.get(TWOPHASEDIR_KEY, ctx).await?;
|
||||
let newdirbuf = if self.tline.pg_version >= 17 {
|
||||
let mut dir = TwoPhaseDirectoryV17::des(&dirbuf)?;
|
||||
if !dir.xids.insert(xid) {
|
||||
Err(WalIngestErrorKind::FileAlreadyExists(xid))?;
|
||||
anyhow::bail!("twophase file for xid {} already exists", xid);
|
||||
}
|
||||
self.pending_directory_entries.push((
|
||||
DirectoryKind::TwoPhase,
|
||||
@@ -1862,7 +1891,7 @@ impl DatadirModification<'_> {
|
||||
let xid = xid as u32;
|
||||
let mut dir = TwoPhaseDirectory::des(&dirbuf)?;
|
||||
if !dir.xids.insert(xid) {
|
||||
Err(WalIngestErrorKind::FileAlreadyExists(xid.into()))?;
|
||||
anyhow::bail!("twophase file for xid {} already exists", xid);
|
||||
}
|
||||
self.pending_directory_entries.push((
|
||||
DirectoryKind::TwoPhase,
|
||||
@@ -1880,22 +1909,22 @@ impl DatadirModification<'_> {
|
||||
&mut self,
|
||||
origin_id: RepOriginId,
|
||||
origin_lsn: Lsn,
|
||||
) -> Result<(), WalIngestError> {
|
||||
) -> anyhow::Result<()> {
|
||||
let key = repl_origin_key(origin_id);
|
||||
self.put(key, Value::Image(origin_lsn.ser().unwrap().into()));
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn drop_replorigin(&mut self, origin_id: RepOriginId) -> Result<(), WalIngestError> {
|
||||
pub async fn drop_replorigin(&mut self, origin_id: RepOriginId) -> anyhow::Result<()> {
|
||||
self.set_replorigin(origin_id, Lsn::INVALID).await
|
||||
}
|
||||
|
||||
pub fn put_control_file(&mut self, img: Bytes) -> Result<(), WalIngestError> {
|
||||
pub fn put_control_file(&mut self, img: Bytes) -> anyhow::Result<()> {
|
||||
self.put(CONTROLFILE_KEY, Value::Image(img));
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn put_checkpoint(&mut self, img: Bytes) -> Result<(), WalIngestError> {
|
||||
pub fn put_checkpoint(&mut self, img: Bytes) -> anyhow::Result<()> {
|
||||
self.put(CHECKPOINT_KEY, Value::Image(img));
|
||||
Ok(())
|
||||
}
|
||||
@@ -1905,7 +1934,7 @@ impl DatadirModification<'_> {
|
||||
spcnode: Oid,
|
||||
dbnode: Oid,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), WalIngestError> {
|
||||
) -> anyhow::Result<()> {
|
||||
let total_blocks = self
|
||||
.tline
|
||||
.get_db_size(spcnode, dbnode, Version::Modified(self), ctx)
|
||||
@@ -1944,21 +1973,20 @@ impl DatadirModification<'_> {
|
||||
rel: RelTag,
|
||||
nblocks: BlockNumber,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), WalIngestError> {
|
||||
) -> Result<(), RelationError> {
|
||||
if rel.relnode == 0 {
|
||||
Err(WalIngestErrorKind::LogicalError(anyhow::anyhow!(
|
||||
"invalid relnode"
|
||||
)))?;
|
||||
return Err(RelationError::InvalidRelnode);
|
||||
}
|
||||
// It's possible that this is the first rel for this db in this
|
||||
// tablespace. Create the reldir entry for it if so.
|
||||
let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await?)?;
|
||||
let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await.context("read db")?)
|
||||
.context("deserialize db")?;
|
||||
|
||||
let dbdir_exists =
|
||||
if let hash_map::Entry::Vacant(e) = dbdir.dbdirs.entry((rel.spcnode, rel.dbnode)) {
|
||||
// Didn't exist. Update dbdir
|
||||
e.insert(false);
|
||||
let buf = DbDirectory::ser(&dbdir)?;
|
||||
let buf = DbDirectory::ser(&dbdir).context("serialize db")?;
|
||||
self.pending_directory_entries.push((
|
||||
DirectoryKind::Db,
|
||||
MetricsUpdate::Set(dbdir.dbdirs.len() as u64),
|
||||
@@ -1975,25 +2003,27 @@ impl DatadirModification<'_> {
|
||||
RelDirectory::default()
|
||||
} else {
|
||||
// reldir already exists, fetch it
|
||||
RelDirectory::des(&self.get(rel_dir_key, ctx).await?)?
|
||||
RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?)
|
||||
.context("deserialize db")?
|
||||
};
|
||||
|
||||
let v2_enabled = self
|
||||
.maybe_enable_rel_size_v2()
|
||||
.map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?;
|
||||
let v2_enabled = self.maybe_enable_rel_size_v2()?;
|
||||
|
||||
if v2_enabled {
|
||||
if rel_dir.rels.contains(&(rel.relnode, rel.forknum)) {
|
||||
Err(WalIngestErrorKind::RelationAlreadyExists(rel))?;
|
||||
return Err(RelationError::AlreadyExists);
|
||||
}
|
||||
let sparse_rel_dir_key =
|
||||
rel_tag_sparse_key(rel.spcnode, rel.dbnode, rel.relnode, rel.forknum);
|
||||
// check if the rel_dir_key exists in v2
|
||||
let val = self.sparse_get(sparse_rel_dir_key, ctx).await?;
|
||||
let val = self
|
||||
.sparse_get(sparse_rel_dir_key, ctx)
|
||||
.await
|
||||
.map_err(|e| RelationError::Other(e.into()))?;
|
||||
let val = RelDirExists::decode_option(val)
|
||||
.map_err(|_| WalIngestErrorKind::InvalidRelDirKey(sparse_rel_dir_key))?;
|
||||
.map_err(|_| RelationError::Other(anyhow::anyhow!("invalid reldir key")))?;
|
||||
if val == RelDirExists::Exists {
|
||||
Err(WalIngestErrorKind::RelationAlreadyExists(rel))?;
|
||||
return Err(RelationError::AlreadyExists);
|
||||
}
|
||||
self.put(
|
||||
sparse_rel_dir_key,
|
||||
@@ -2009,7 +2039,9 @@ impl DatadirModification<'_> {
|
||||
// will be key not found errors if we don't create an empty one for rel_size_v2.
|
||||
self.put(
|
||||
rel_dir_key,
|
||||
Value::Image(Bytes::from(RelDirectory::ser(&RelDirectory::default())?)),
|
||||
Value::Image(Bytes::from(
|
||||
RelDirectory::ser(&RelDirectory::default()).context("serialize")?,
|
||||
)),
|
||||
);
|
||||
}
|
||||
self.pending_directory_entries
|
||||
@@ -2017,7 +2049,7 @@ impl DatadirModification<'_> {
|
||||
} else {
|
||||
// Add the new relation to the rel directory entry, and write it back
|
||||
if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
|
||||
Err(WalIngestErrorKind::RelationAlreadyExists(rel))?;
|
||||
return Err(RelationError::AlreadyExists);
|
||||
}
|
||||
if !dbdir_exists {
|
||||
self.pending_directory_entries
|
||||
@@ -2027,7 +2059,9 @@ impl DatadirModification<'_> {
|
||||
.push((DirectoryKind::Rel, MetricsUpdate::Add(1)));
|
||||
self.put(
|
||||
rel_dir_key,
|
||||
Value::Image(Bytes::from(RelDirectory::ser(&rel_dir)?)),
|
||||
Value::Image(Bytes::from(
|
||||
RelDirectory::ser(&rel_dir).context("serialize")?,
|
||||
)),
|
||||
);
|
||||
}
|
||||
|
||||
@@ -2052,8 +2086,8 @@ impl DatadirModification<'_> {
|
||||
rel: RelTag,
|
||||
nblocks: BlockNumber,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), WalIngestError> {
|
||||
ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);
|
||||
) -> anyhow::Result<()> {
|
||||
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
|
||||
if self
|
||||
.tline
|
||||
.get_rel_exists(rel, Version::Modified(self), ctx)
|
||||
@@ -2083,8 +2117,8 @@ impl DatadirModification<'_> {
|
||||
rel: RelTag,
|
||||
nblocks: BlockNumber,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), WalIngestError> {
|
||||
ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);
|
||||
) -> anyhow::Result<()> {
|
||||
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
|
||||
|
||||
// Put size
|
||||
let size_key = rel_size_to_key(rel);
|
||||
@@ -2108,10 +2142,8 @@ impl DatadirModification<'_> {
|
||||
&mut self,
|
||||
drop_relations: HashMap<(u32, u32), Vec<RelTag>>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), WalIngestError> {
|
||||
let v2_enabled = self
|
||||
.maybe_enable_rel_size_v2()
|
||||
.map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?;
|
||||
) -> anyhow::Result<()> {
|
||||
let v2_enabled = self.maybe_enable_rel_size_v2()?;
|
||||
for ((spc_node, db_node), rel_tags) in drop_relations {
|
||||
let dir_key = rel_dir_to_key(spc_node, db_node);
|
||||
let buf = self.get(dir_key, ctx).await?;
|
||||
@@ -2131,7 +2163,7 @@ impl DatadirModification<'_> {
|
||||
let key =
|
||||
rel_tag_sparse_key(spc_node, db_node, rel_tag.relnode, rel_tag.forknum);
|
||||
let val = RelDirExists::decode_option(self.sparse_get(key, ctx).await?)
|
||||
.map_err(|_| WalIngestErrorKind::InvalidKey(key, self.lsn))?;
|
||||
.map_err(|_| RelationError::Other(anyhow::anyhow!("invalid reldir key")))?;
|
||||
if val == RelDirExists::Exists {
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::RelV2, MetricsUpdate::Sub(1)));
|
||||
@@ -2174,7 +2206,7 @@ impl DatadirModification<'_> {
|
||||
segno: u32,
|
||||
nblocks: BlockNumber,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), WalIngestError> {
|
||||
) -> anyhow::Result<()> {
|
||||
assert!(self.tline.tenant_shard_id.is_shard_zero());
|
||||
|
||||
// Add it to the directory entry
|
||||
@@ -2183,7 +2215,7 @@ impl DatadirModification<'_> {
|
||||
let mut dir = SlruSegmentDirectory::des(&buf)?;
|
||||
|
||||
if !dir.segments.insert(segno) {
|
||||
Err(WalIngestErrorKind::SlruAlreadyExists(kind, segno))?;
|
||||
anyhow::bail!("slru segment {kind:?}/{segno} already exists");
|
||||
}
|
||||
self.pending_directory_entries.push((
|
||||
DirectoryKind::SlruSegment(kind),
|
||||
@@ -2210,7 +2242,7 @@ impl DatadirModification<'_> {
|
||||
kind: SlruKind,
|
||||
segno: u32,
|
||||
nblocks: BlockNumber,
|
||||
) -> Result<(), WalIngestError> {
|
||||
) -> anyhow::Result<()> {
|
||||
assert!(self.tline.tenant_shard_id.is_shard_zero());
|
||||
|
||||
// Put size
|
||||
@@ -2226,7 +2258,7 @@ impl DatadirModification<'_> {
|
||||
kind: SlruKind,
|
||||
segno: u32,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), WalIngestError> {
|
||||
) -> anyhow::Result<()> {
|
||||
// Remove it from the directory entry
|
||||
let dir_key = slru_dir_to_key(kind);
|
||||
let buf = self.get(dir_key, ctx).await?;
|
||||
@@ -2251,7 +2283,7 @@ impl DatadirModification<'_> {
|
||||
}
|
||||
|
||||
/// Drop a relmapper file (pg_filenode.map)
|
||||
pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> Result<(), WalIngestError> {
|
||||
pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> anyhow::Result<()> {
|
||||
// TODO
|
||||
Ok(())
|
||||
}
|
||||
@@ -2261,7 +2293,7 @@ impl DatadirModification<'_> {
|
||||
&mut self,
|
||||
xid: u64,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), WalIngestError> {
|
||||
) -> anyhow::Result<()> {
|
||||
// Remove it from the directory entry
|
||||
let buf = self.get(TWOPHASEDIR_KEY, ctx).await?;
|
||||
let newdirbuf = if self.tline.pg_version >= 17 {
|
||||
@@ -2276,8 +2308,7 @@ impl DatadirModification<'_> {
|
||||
));
|
||||
Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?)
|
||||
} else {
|
||||
let xid: u32 = u32::try_from(xid)
|
||||
.map_err(|e| WalIngestErrorKind::LogicalError(anyhow::Error::from(e)))?;
|
||||
let xid: u32 = u32::try_from(xid)?;
|
||||
let mut dir = TwoPhaseDirectory::des(&buf)?;
|
||||
|
||||
if !dir.xids.remove(&xid) {
|
||||
@@ -2302,7 +2333,7 @@ impl DatadirModification<'_> {
|
||||
path: &str,
|
||||
content: &[u8],
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), WalIngestError> {
|
||||
) -> anyhow::Result<()> {
|
||||
let key = aux_file::encode_aux_file_key(path);
|
||||
// retrieve the key from the engine
|
||||
let old_val = match self.get(key, ctx).await {
|
||||
@@ -2311,7 +2342,7 @@ impl DatadirModification<'_> {
|
||||
Err(e) => return Err(e.into()),
|
||||
};
|
||||
let files: Vec<(&str, &[u8])> = if let Some(ref old_val) = old_val {
|
||||
aux_file::decode_file_value(old_val).map_err(WalIngestErrorKind::EncodeAuxFileError)?
|
||||
aux_file::decode_file_value(old_val)?
|
||||
} else {
|
||||
Vec::new()
|
||||
};
|
||||
@@ -2356,8 +2387,7 @@ impl DatadirModification<'_> {
|
||||
}
|
||||
(None, true) => warn!("removing non-existing aux file: {}", path),
|
||||
}
|
||||
let new_val = aux_file::encode_file_value(&new_files)
|
||||
.map_err(WalIngestErrorKind::EncodeAuxFileError)?;
|
||||
let new_val = aux_file::encode_file_value(&new_files)?;
|
||||
self.put(key, Value::Image(new_val.into()));
|
||||
|
||||
Ok(())
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -22,7 +22,6 @@ use bytes::{BufMut, BytesMut};
|
||||
use pageserver_api::models::ImageCompressionAlgorithm;
|
||||
use tokio::io::AsyncWriteExt;
|
||||
use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::warn;
|
||||
|
||||
use crate::context::RequestContext;
|
||||
@@ -37,63 +36,6 @@ pub struct CompressionInfo {
|
||||
pub compressed_size: Option<usize>,
|
||||
}
|
||||
|
||||
/// A blob header, with header+data length and compression info.
|
||||
///
|
||||
/// TODO: use this more widely, and add an encode() method too.
|
||||
/// TODO: document the header format.
|
||||
#[derive(Clone, Copy, Default)]
|
||||
pub struct Header {
|
||||
/// Size of the header itself in bytes; `Header::decode` produces either 1 or 4.
pub header_len: usize,
|
||||
/// Size of the blob data that follows the header, in bytes (header excluded).
pub data_len: usize,
|
||||
/// Bits of the first header byte selected by `LEN_COMPRESSION_BIT_MASK`;
/// `BYTE_UNCOMPRESSED` when the header is the 1-byte form.
pub compression_bits: u8,
|
||||
}
|
||||
|
||||
impl Header {
|
||||
/// Decodes a header from a byte slice.
|
||||
pub fn decode(bytes: &[u8]) -> Result<Self, std::io::Error> {
|
||||
let Some(&first_header_byte) = bytes.first() else {
|
||||
return Err(std::io::Error::new(
|
||||
std::io::ErrorKind::InvalidData,
|
||||
"zero-length blob header",
|
||||
));
|
||||
};
|
||||
|
||||
// If the first bit is 0, this is just a 1-byte length prefix up to 128 bytes.
|
||||
if first_header_byte < 0x80 {
|
||||
return Ok(Self {
|
||||
header_len: 1, // by definition
|
||||
data_len: first_header_byte as usize,
|
||||
compression_bits: BYTE_UNCOMPRESSED,
|
||||
});
|
||||
}
|
||||
|
||||
// Otherwise, this is a 4-byte header containing compression information and length.
|
||||
const HEADER_LEN: usize = 4;
|
||||
let mut header_buf: [u8; HEADER_LEN] = bytes[0..HEADER_LEN].try_into().map_err(|_| {
|
||||
std::io::Error::new(
|
||||
std::io::ErrorKind::InvalidData,
|
||||
format!("blob header too short: {bytes:?}"),
|
||||
)
|
||||
})?;
|
||||
|
||||
// TODO: verify the compression bits and convert to an enum.
|
||||
let compression_bits = header_buf[0] & LEN_COMPRESSION_BIT_MASK;
|
||||
header_buf[0] &= !LEN_COMPRESSION_BIT_MASK;
|
||||
let data_len = u32::from_be_bytes(header_buf) as usize;
|
||||
|
||||
Ok(Self {
|
||||
header_len: HEADER_LEN,
|
||||
data_len,
|
||||
compression_bits,
|
||||
})
|
||||
}
|
||||
|
||||
/// Returns the total header+data length.
|
||||
pub fn total_len(&self) -> usize {
|
||||
self.header_len + self.data_len
|
||||
}
|
||||
}
|
||||
|
||||
impl BlockCursor<'_> {
|
||||
/// Read a blob into a new buffer.
|
||||
pub async fn read_blob(
|
||||
@@ -227,13 +169,7 @@ pub struct BlobWriter<const BUFFERED: bool> {
|
||||
}
|
||||
|
||||
impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
||||
pub fn new(
|
||||
inner: VirtualFile,
|
||||
start_offset: u64,
|
||||
_gate: &utils::sync::gate::Gate,
|
||||
_cancel: CancellationToken,
|
||||
_ctx: &RequestContext,
|
||||
) -> Self {
|
||||
pub fn new(inner: VirtualFile, start_offset: u64) -> Self {
|
||||
Self {
|
||||
inner,
|
||||
offset: start_offset,
|
||||
@@ -446,34 +382,6 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
||||
};
|
||||
(srcbuf, res.map(|_| (offset, compression_info)))
|
||||
}
|
||||
|
||||
/// Writes a raw blob containing both header and data, returning its offset.
|
||||
pub(crate) async fn write_blob_raw<Buf: IoBuf + Send>(
|
||||
&mut self,
|
||||
raw_with_header: FullSlice<Buf>,
|
||||
ctx: &RequestContext,
|
||||
) -> (FullSlice<Buf>, Result<u64, Error>) {
|
||||
// Verify the header, to ensure we don't write invalid/corrupt data.
|
||||
let header = match Header::decode(&raw_with_header) {
|
||||
Ok(header) => header,
|
||||
Err(err) => return (raw_with_header, Err(err)),
|
||||
};
|
||||
if raw_with_header.len() != header.total_len() {
|
||||
let header_total_len = header.total_len();
|
||||
let raw_len = raw_with_header.len();
|
||||
return (
|
||||
raw_with_header,
|
||||
Err(std::io::Error::new(
|
||||
std::io::ErrorKind::InvalidData,
|
||||
format!("header length mismatch: {header_total_len} != {raw_len}"),
|
||||
)),
|
||||
);
|
||||
}
|
||||
|
||||
let offset = self.offset;
|
||||
let (raw_with_header, result) = self.write_all(raw_with_header, ctx).await;
|
||||
(raw_with_header, result.map(|_| offset))
|
||||
}
|
||||
}
|
||||
|
||||
impl BlobWriter<true> {
|
||||
@@ -524,14 +432,12 @@ pub(crate) mod tests {
|
||||
) -> Result<(Utf8TempDir, Utf8PathBuf, Vec<u64>), Error> {
|
||||
let temp_dir = camino_tempfile::tempdir()?;
|
||||
let pathbuf = temp_dir.path().join("file");
|
||||
let gate = utils::sync::gate::Gate::default();
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
// Write part (in block to drop the file)
|
||||
let mut offsets = Vec::new();
|
||||
{
|
||||
let file = VirtualFile::create(pathbuf.as_path(), ctx).await?;
|
||||
let mut wtr = BlobWriter::<BUFFERED>::new(file, 0, &gate, cancel.clone(), ctx);
|
||||
let mut wtr = BlobWriter::<BUFFERED>::new(file, 0);
|
||||
for blob in blobs.iter() {
|
||||
let (_, res) = if compression {
|
||||
let res = wtr
|
||||
|
||||
@@ -714,7 +714,7 @@ impl LayerMap {
|
||||
true
|
||||
}
|
||||
|
||||
pub fn iter_historic_layers(&self) -> impl ExactSizeIterator<Item = Arc<PersistentLayerDesc>> {
|
||||
pub fn iter_historic_layers(&self) -> impl '_ + Iterator<Item = Arc<PersistentLayerDesc>> {
|
||||
self.historic.iter()
|
||||
}
|
||||
|
||||
|
||||
@@ -504,7 +504,7 @@ impl<Value: Clone> BufferedHistoricLayerCoverage<Value> {
|
||||
}
|
||||
|
||||
/// Iterate all the layers
|
||||
pub fn iter(&self) -> impl ExactSizeIterator<Item = Value> {
|
||||
pub fn iter(&self) -> impl '_ + Iterator<Item = Value> {
|
||||
// NOTE we can actually perform this without rebuilding,
|
||||
// but it's not necessary for now.
|
||||
if !self.buffer.is_empty() {
|
||||
|
||||
@@ -564,9 +564,8 @@ mod tests {
|
||||
Lsn(0),
|
||||
Lsn(0),
|
||||
Lsn(0),
|
||||
// Updating this version to 17 will cause the test to fail at the
|
||||
// next assert_eq!().
|
||||
16,
|
||||
// Any version will do here, so use the default
|
||||
crate::DEFAULT_PG_VERSION,
|
||||
);
|
||||
let expected_bytes = vec![
|
||||
/* TimelineMetadataHeader */
|
||||
|
||||
@@ -52,9 +52,7 @@ use crate::tenant::config::{
|
||||
use crate::tenant::span::debug_assert_current_span_has_tenant_id;
|
||||
use crate::tenant::storage_layer::inmemory_layer;
|
||||
use crate::tenant::timeline::ShutdownMode;
|
||||
use crate::tenant::{
|
||||
AttachedTenantConf, GcError, LoadConfigError, SpawnMode, TenantShard, TenantState,
|
||||
};
|
||||
use crate::tenant::{AttachedTenantConf, GcError, LoadConfigError, SpawnMode, Tenant, TenantState};
|
||||
use crate::virtual_file::MaybeFatalIo;
|
||||
use crate::{InitializationOrder, TEMP_FILE_SUFFIX};
|
||||
|
||||
@@ -69,7 +67,7 @@ use crate::{InitializationOrder, TEMP_FILE_SUFFIX};
|
||||
/// having a properly acquired generation (Secondary doesn't need a generation)
|
||||
#[derive(Clone)]
|
||||
pub(crate) enum TenantSlot {
|
||||
Attached(Arc<TenantShard>),
|
||||
Attached(Arc<Tenant>),
|
||||
Secondary(Arc<SecondaryTenant>),
|
||||
/// In this state, other administrative operations acting on the TenantId should
|
||||
/// block, or return a retry indicator equivalent to HTTP 503.
|
||||
@@ -88,7 +86,7 @@ impl std::fmt::Debug for TenantSlot {
|
||||
|
||||
impl TenantSlot {
|
||||
/// Return the `Tenant` in this slot if attached, else None
|
||||
fn get_attached(&self) -> Option<&Arc<TenantShard>> {
|
||||
fn get_attached(&self) -> Option<&Arc<Tenant>> {
|
||||
match self {
|
||||
Self::Attached(t) => Some(t),
|
||||
Self::Secondary(_) => None,
|
||||
@@ -166,7 +164,7 @@ impl TenantStartupMode {
|
||||
/// Result type for looking up a TenantId to a specific shard
|
||||
pub(crate) enum ShardResolveResult {
|
||||
NotFound,
|
||||
Found(Arc<TenantShard>),
|
||||
Found(Arc<Tenant>),
|
||||
// Wait for this barrier, then query again
|
||||
InProgress(utils::completion::Barrier),
|
||||
}
|
||||
@@ -175,7 +173,7 @@ impl TenantsMap {
|
||||
/// Convenience function for typical usage, where we want to get a `Tenant` object, for
|
||||
/// working with attached tenants. If the TenantId is in the map but in Secondary state,
|
||||
/// None is returned.
|
||||
pub(crate) fn get(&self, tenant_shard_id: &TenantShardId) -> Option<&Arc<TenantShard>> {
|
||||
pub(crate) fn get(&self, tenant_shard_id: &TenantShardId) -> Option<&Arc<Tenant>> {
|
||||
match self {
|
||||
TenantsMap::Initializing => None,
|
||||
TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
|
||||
@@ -412,7 +410,7 @@ fn load_tenant_config(
|
||||
return None;
|
||||
}
|
||||
|
||||
Some(TenantShard::load_tenant_config(conf, &tenant_shard_id))
|
||||
Some(Tenant::load_tenant_config(conf, &tenant_shard_id))
|
||||
}
|
||||
|
||||
/// Initial stage of load: walk the local tenants directory, clean up any temp files,
|
||||
@@ -608,8 +606,7 @@ pub async fn init_tenant_mgr(
|
||||
// Presence of a generation number implies attachment: attach the tenant
|
||||
// if it wasn't already, and apply the generation number.
|
||||
config_write_futs.push(async move {
|
||||
let r =
|
||||
TenantShard::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await;
|
||||
let r = Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await;
|
||||
(tenant_shard_id, location_conf, r)
|
||||
});
|
||||
}
|
||||
@@ -697,7 +694,7 @@ fn tenant_spawn(
|
||||
init_order: Option<InitializationOrder>,
|
||||
mode: SpawnMode,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Arc<TenantShard>, GlobalShutDown> {
|
||||
) -> Result<Arc<Tenant>, GlobalShutDown> {
|
||||
// All these conditions should have been satisfied by our caller: the tenant dir exists, is a well formed
|
||||
// path, and contains a configuration file. Assertions that do synchronous I/O are limited to debug mode
|
||||
// to avoid impacting prod runtime performance.
|
||||
@@ -709,7 +706,7 @@ fn tenant_spawn(
|
||||
.unwrap()
|
||||
);
|
||||
|
||||
TenantShard::spawn(
|
||||
Tenant::spawn(
|
||||
conf,
|
||||
tenant_shard_id,
|
||||
resources,
|
||||
@@ -886,12 +883,12 @@ impl TenantManager {
|
||||
/// Gets the attached tenant from the in-memory data, erroring if it's absent, in secondary mode, or currently
|
||||
/// undergoing a state change (i.e. slot is InProgress).
|
||||
///
|
||||
/// The returned TenantShard is not guaranteed to be active: check its status after obtaining it, or
|
||||
/// use [`TenantShard::wait_to_become_active`] before using it if you will do I/O on it.
|
||||
/// The returned Tenant is not guaranteed to be active: check its status after obtaining it, or
|
||||
/// use [`Tenant::wait_to_become_active`] before using it if you will do I/O on it.
|
||||
pub(crate) fn get_attached_tenant_shard(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
) -> Result<Arc<TenantShard>, GetTenantError> {
|
||||
) -> Result<Arc<Tenant>, GetTenantError> {
|
||||
let locked = self.tenants.read().unwrap();
|
||||
|
||||
let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)?;
|
||||
@@ -940,12 +937,12 @@ impl TenantManager {
|
||||
flush: Option<Duration>,
|
||||
mut spawn_mode: SpawnMode,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Option<Arc<TenantShard>>, UpsertLocationError> {
|
||||
) -> Result<Option<Arc<Tenant>>, UpsertLocationError> {
|
||||
debug_assert_current_span_has_tenant_id();
|
||||
info!("configuring tenant location to state {new_location_config:?}");
|
||||
|
||||
enum FastPathModified {
|
||||
Attached(Arc<TenantShard>),
|
||||
Attached(Arc<Tenant>),
|
||||
Secondary(Arc<SecondaryTenant>),
|
||||
}
|
||||
|
||||
@@ -1002,13 +999,9 @@ impl TenantManager {
|
||||
// phase of writing config and/or waiting for flush, before returning.
|
||||
match fast_path_taken {
|
||||
Some(FastPathModified::Attached(tenant)) => {
|
||||
TenantShard::persist_tenant_config(
|
||||
self.conf,
|
||||
&tenant_shard_id,
|
||||
&new_location_config,
|
||||
)
|
||||
.await
|
||||
.fatal_err("write tenant shard config");
|
||||
Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
|
||||
.await
|
||||
.fatal_err("write tenant shard config");
|
||||
|
||||
// Transition to AttachedStale means we may well hold a valid generation
|
||||
// still, and have been requested to go stale as part of a migration. If
|
||||
@@ -1037,13 +1030,9 @@ impl TenantManager {
|
||||
return Ok(Some(tenant));
|
||||
}
|
||||
Some(FastPathModified::Secondary(_secondary_tenant)) => {
|
||||
TenantShard::persist_tenant_config(
|
||||
self.conf,
|
||||
&tenant_shard_id,
|
||||
&new_location_config,
|
||||
)
|
||||
.await
|
||||
.fatal_err("write tenant shard config");
|
||||
Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
|
||||
.await
|
||||
.fatal_err("write tenant shard config");
|
||||
|
||||
return Ok(None);
|
||||
}
|
||||
@@ -1133,7 +1122,7 @@ impl TenantManager {
|
||||
// Before activating either secondary or attached mode, persist the
|
||||
// configuration, so that on restart we will re-attach (or re-start
|
||||
// secondary) on the tenant.
|
||||
TenantShard::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
|
||||
Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
|
||||
.await
|
||||
.fatal_err("write tenant shard config");
|
||||
|
||||
@@ -1273,7 +1262,7 @@ impl TenantManager {
|
||||
|
||||
let tenant_path = self.conf.tenant_path(&tenant_shard_id);
|
||||
let timelines_path = self.conf.timelines_path(&tenant_shard_id);
|
||||
let config = TenantShard::load_tenant_config(self.conf, &tenant_shard_id)?;
|
||||
let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?;
|
||||
|
||||
if drop_cache {
|
||||
tracing::info!("Dropping local file cache");
|
||||
@@ -1308,7 +1297,7 @@ impl TenantManager {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn get_attached_active_tenant_shards(&self) -> Vec<Arc<TenantShard>> {
|
||||
pub(crate) fn get_attached_active_tenant_shards(&self) -> Vec<Arc<Tenant>> {
|
||||
let locked = self.tenants.read().unwrap();
|
||||
match &*locked {
|
||||
TenantsMap::Initializing => Vec::new(),
|
||||
@@ -1457,7 +1446,7 @@ impl TenantManager {
|
||||
#[instrument(skip_all, fields(tenant_id=%tenant.get_tenant_shard_id().tenant_id, shard_id=%tenant.get_tenant_shard_id().shard_slug(), new_shard_count=%new_shard_count.literal()))]
|
||||
pub(crate) async fn shard_split(
|
||||
&self,
|
||||
tenant: Arc<TenantShard>,
|
||||
tenant: Arc<Tenant>,
|
||||
new_shard_count: ShardCount,
|
||||
new_stripe_size: Option<ShardStripeSize>,
|
||||
ctx: &RequestContext,
|
||||
@@ -1487,7 +1476,7 @@ impl TenantManager {
|
||||
|
||||
pub(crate) async fn do_shard_split(
|
||||
&self,
|
||||
tenant: Arc<TenantShard>,
|
||||
tenant: Arc<Tenant>,
|
||||
new_shard_count: ShardCount,
|
||||
new_stripe_size: Option<ShardStripeSize>,
|
||||
ctx: &RequestContext,
|
||||
@@ -1714,7 +1703,7 @@ impl TenantManager {
|
||||
/// For each resident layer in the parent shard, we will hard link it into all of the child shards.
|
||||
async fn shard_split_hardlink(
|
||||
&self,
|
||||
parent_shard: &TenantShard,
|
||||
parent_shard: &Tenant,
|
||||
child_shards: Vec<TenantShardId>,
|
||||
) -> anyhow::Result<()> {
|
||||
debug_assert_current_span_has_tenant_id();
|
||||
@@ -1999,7 +1988,7 @@ impl TenantManager {
|
||||
}
|
||||
|
||||
let tenant_path = self.conf.tenant_path(&tenant_shard_id);
|
||||
let config = TenantShard::load_tenant_config(self.conf, &tenant_shard_id)
|
||||
let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)
|
||||
.map_err(|e| Error::DetachReparent(e.into()))?;
|
||||
|
||||
let shard_identity = config.shard;
|
||||
|
||||
@@ -133,7 +133,7 @@
|
||||
//! - Initiate upload queue with that [`IndexPart`].
|
||||
//! - Reschedule all lost operations by comparing the local filesystem state
|
||||
//! and remote state as per [`IndexPart`]. This is done in
|
||||
//! [`TenantShard::timeline_init_and_sync`].
|
||||
//! [`Tenant::timeline_init_and_sync`].
|
||||
//!
|
||||
//! Note that if we crash during file deletion between the index update
|
||||
//! that removes the file from the list of files, and deleting the remote file,
|
||||
@@ -171,7 +171,7 @@
|
||||
//! If no remote storage configuration is provided, the [`RemoteTimelineClient`] is
|
||||
//! not created and the uploads are skipped.
|
||||
//!
|
||||
//! [`TenantShard::timeline_init_and_sync`]: super::TenantShard::timeline_init_and_sync
|
||||
//! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync
|
||||
//! [`Timeline::load_layer_map`]: super::Timeline::load_layer_map
|
||||
|
||||
pub(crate) mod download;
|
||||
@@ -192,12 +192,11 @@ pub(crate) use download::{
|
||||
download_index_part, download_initdb_tar_zst, download_tenant_manifest, is_temp_download_file,
|
||||
list_remote_tenant_shards, list_remote_timelines,
|
||||
};
|
||||
use index::GcCompactionState;
|
||||
pub(crate) use index::LayerFileMetadata;
|
||||
use index::{EncryptionKey, EncryptionKeyId, EncryptionKeyPair, GcCompactionState, KeyVersion};
|
||||
use pageserver_api::models::{RelSizeMigration, TimelineArchivalState, TimelineVisibilityState};
|
||||
use pageserver_api::shard::{ShardIndex, TenantShardId};
|
||||
use regex::Regex;
|
||||
use remote_keys::NaiveKms;
|
||||
use remote_storage::{
|
||||
DownloadError, GenericRemoteStorage, ListingMode, RemotePath, TimeoutOrCancel,
|
||||
};
|
||||
@@ -368,10 +367,6 @@ pub(crate) struct RemoteTimelineClient {
|
||||
config: std::sync::RwLock<RemoteTimelineClientConfig>,
|
||||
|
||||
cancel: CancellationToken,
|
||||
|
||||
kms_impl: Option<NaiveKms>,
|
||||
|
||||
key_repo: std::sync::Mutex<HashMap<EncryptionKeyId, EncryptionKeyPair>>,
|
||||
}
|
||||
|
||||
impl Drop for RemoteTimelineClient {
|
||||
@@ -416,9 +411,6 @@ impl RemoteTimelineClient {
|
||||
)),
|
||||
config: std::sync::RwLock::new(RemoteTimelineClientConfig::from(location_conf)),
|
||||
cancel: CancellationToken::new(),
|
||||
// TODO: make this configurable
|
||||
kms_impl: Some(NaiveKms::new(tenant_shard_id.tenant_id.to_string())),
|
||||
key_repo: std::sync::Mutex::new(HashMap::new()),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -735,43 +727,9 @@ impl RemoteTimelineClient {
|
||||
reason: "no need for a downloads gauge",
|
||||
},
|
||||
);
|
||||
let key_pair = if let Some(ref key_id) = layer_metadata.encryption_key {
|
||||
let wrapped_key = {
|
||||
let mut queue = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = queue.initialized_mut().unwrap();
|
||||
let encryption_key_pair =
|
||||
upload_queue.dirty.keys.iter().find(|key| &key.id == key_id);
|
||||
if let Some(encryption_key_pair) = encryption_key_pair {
|
||||
// TODO: also check if we have uploaded the key yet; we should never use a key that is not persisted
|
||||
encryption_key_pair.clone()
|
||||
} else {
|
||||
return Err(DownloadError::Other(anyhow::anyhow!(
|
||||
"Encryption key pair not found in index_part.json"
|
||||
)));
|
||||
}
|
||||
};
|
||||
let Some(kms) = self.kms_impl.as_ref() else {
|
||||
return Err(DownloadError::Other(anyhow::anyhow!(
|
||||
"KMS not configured when downloading encrypted layer file"
|
||||
)));
|
||||
};
|
||||
let plain_key = kms
|
||||
.decrypt(&wrapped_key.key)
|
||||
.context("failed to decrypt encryption key")
|
||||
.map_err(DownloadError::Other)?;
|
||||
Some(EncryptionKeyPair::new(
|
||||
wrapped_key.id,
|
||||
plain_key,
|
||||
wrapped_key.key,
|
||||
))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
download::download_layer_file(
|
||||
self.conf,
|
||||
&self.storage_impl,
|
||||
key_pair.as_ref(),
|
||||
self.tenant_shard_id,
|
||||
self.timeline_id,
|
||||
layer_file_name,
|
||||
@@ -1292,14 +1250,6 @@ impl RemoteTimelineClient {
|
||||
upload_queue: &mut UploadQueueInitialized,
|
||||
layer: ResidentLayer,
|
||||
) {
|
||||
let key_pair = {
|
||||
if let Some(key_id) = layer.metadata().encryption_key {
|
||||
let guard = self.key_repo.lock().unwrap();
|
||||
Some(guard.get(&key_id).cloned().unwrap())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
};
|
||||
let metadata = layer.metadata();
|
||||
|
||||
upload_queue
|
||||
@@ -1314,7 +1264,7 @@ impl RemoteTimelineClient {
|
||||
"scheduled layer file upload {layer}",
|
||||
);
|
||||
|
||||
let op = UploadOp::UploadLayer(layer, metadata, key_pair, None);
|
||||
let op = UploadOp::UploadLayer(layer, metadata, None);
|
||||
self.metric_begin(&op);
|
||||
upload_queue.queued_operations.push_back(op);
|
||||
}
|
||||
@@ -1496,58 +1446,6 @@ impl RemoteTimelineClient {
|
||||
upload_queue.queued_operations.push_back(op);
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
fn is_kms_enabled(&self) -> bool {
|
||||
self.kms_impl.is_some()
|
||||
}
|
||||
|
||||
pub(crate) fn schedule_generate_encryption_key(
|
||||
self: &Arc<Self>,
|
||||
) -> Result<Option<EncryptionKeyPair>, NotInitialized> {
|
||||
let Some(kms_impl) = self.kms_impl.as_ref() else {
|
||||
return Ok(None);
|
||||
};
|
||||
|
||||
let plain_key = rand::random::<[u8; 32]>().to_vec(); // StdRng is cryptographically secure (?)
|
||||
let wrapped_key = kms_impl.encrypt(&plain_key).unwrap();
|
||||
|
||||
let mut guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut()?;
|
||||
|
||||
let last_key = upload_queue.dirty.keys.last();
|
||||
let this_key_version = if let Some(last_key) = last_key {
|
||||
let key_version = EncryptionKeyId {
|
||||
version: last_key.id.version.next(),
|
||||
generation: self.generation,
|
||||
};
|
||||
assert!(key_version > last_key.id); // ensure key version is strictly increasing; no dup key versions
|
||||
key_version
|
||||
} else {
|
||||
EncryptionKeyId {
|
||||
version: KeyVersion(1),
|
||||
generation: self.generation,
|
||||
}
|
||||
};
|
||||
|
||||
let key_pair = EncryptionKeyPair {
|
||||
id: this_key_version.clone(),
|
||||
plain_key: plain_key.clone(),
|
||||
wrapped_key,
|
||||
};
|
||||
|
||||
upload_queue.dirty.keys.push(EncryptionKey {
|
||||
key: plain_key,
|
||||
id: this_key_version,
|
||||
created_at: Utc::now().naive_utc(),
|
||||
});
|
||||
|
||||
self.key_repo.lock().unwrap().insert(this_key_version, key_pair);
|
||||
|
||||
self.schedule_index_upload(upload_queue);
|
||||
|
||||
Ok(Some(key_pair))
|
||||
}
|
||||
|
||||
/// Schedules a compaction update to the remote `index_part.json`.
|
||||
///
|
||||
/// `compacted_from` represent the L0 names which have been `compacted_to` L1 layers.
|
||||
@@ -1556,7 +1454,6 @@ impl RemoteTimelineClient {
|
||||
compacted_from: &[Layer],
|
||||
compacted_to: &[ResidentLayer],
|
||||
) -> Result<(), NotInitialized> {
|
||||
// Use the same key for all layers in a single compaction job
|
||||
let mut guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut()?;
|
||||
|
||||
@@ -1818,7 +1715,6 @@ impl RemoteTimelineClient {
|
||||
uploaded.local_path(),
|
||||
&remote_path,
|
||||
uploaded.metadata().file_size,
|
||||
None, // TODO(chi): support encryption for those layer files uploaded using this interface
|
||||
cancel,
|
||||
)
|
||||
.await
|
||||
@@ -1861,8 +1757,6 @@ impl RemoteTimelineClient {
|
||||
adopted_as.metadata().generation,
|
||||
);
|
||||
|
||||
// TODO: support encryption for those layer files uploaded using this interface
|
||||
|
||||
backoff::retry(
|
||||
|| async {
|
||||
upload::copy_timeline_layer(
|
||||
@@ -2083,7 +1977,7 @@ impl RemoteTimelineClient {
|
||||
|
||||
// Prepare upload.
|
||||
match &mut next_op {
|
||||
UploadOp::UploadLayer(layer, meta, _, mode) => {
|
||||
UploadOp::UploadLayer(layer, meta, mode) => {
|
||||
if upload_queue
|
||||
.recently_deleted
|
||||
.remove(&(layer.layer_desc().layer_name().clone(), meta.generation))
|
||||
@@ -2177,7 +2071,7 @@ impl RemoteTimelineClient {
|
||||
// Assert that we don't modify a layer that's referenced by the current index.
|
||||
if cfg!(debug_assertions) {
|
||||
let modified = match &task.op {
|
||||
UploadOp::UploadLayer(layer, layer_metadata, _, _) => {
|
||||
UploadOp::UploadLayer(layer, layer_metadata, _) => {
|
||||
vec![(layer.layer_desc().layer_name(), layer_metadata)]
|
||||
}
|
||||
UploadOp::Delete(delete) => {
|
||||
@@ -2199,7 +2093,7 @@ impl RemoteTimelineClient {
|
||||
}
|
||||
|
||||
let upload_result: anyhow::Result<()> = match &task.op {
|
||||
UploadOp::UploadLayer(layer, layer_metadata, encryption_key_pair, mode) => {
|
||||
UploadOp::UploadLayer(layer, layer_metadata, mode) => {
|
||||
// TODO: check if this mechanism can be removed now that can_bypass() performs
|
||||
// conflict checks during scheduling.
|
||||
if let Some(OpType::FlushDeletion) = mode {
|
||||
@@ -2280,7 +2174,6 @@ impl RemoteTimelineClient {
|
||||
local_path,
|
||||
&remote_path,
|
||||
layer_metadata.file_size,
|
||||
encryption_key_pair.clone(),
|
||||
&self.cancel,
|
||||
)
|
||||
.measure_remote_op(
|
||||
@@ -2431,7 +2324,7 @@ impl RemoteTimelineClient {
|
||||
upload_queue.inprogress_tasks.remove(&task.task_id);
|
||||
|
||||
let lsn_update = match task.op {
|
||||
UploadOp::UploadLayer(_, _, _, _) => None,
|
||||
UploadOp::UploadLayer(_, _, _) => None,
|
||||
UploadOp::UploadMetadata { ref uploaded } => {
|
||||
// the task id is reused as a monotonicity check for storing the "clean"
|
||||
// IndexPart.
|
||||
@@ -2510,7 +2403,7 @@ impl RemoteTimelineClient {
|
||||
)> {
|
||||
use RemoteTimelineClientMetricsCallTrackSize::DontTrackSize;
|
||||
let res = match op {
|
||||
UploadOp::UploadLayer(_, m, _, _) => (
|
||||
UploadOp::UploadLayer(_, m, _) => (
|
||||
RemoteOpFileKind::Layer,
|
||||
RemoteOpKind::Upload,
|
||||
RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size),
|
||||
@@ -2850,7 +2743,7 @@ mod tests {
|
||||
use crate::tenant::config::AttachmentMode;
|
||||
use crate::tenant::harness::{TIMELINE_ID, TenantHarness};
|
||||
use crate::tenant::storage_layer::layer::local_layer_path;
|
||||
use crate::tenant::{TenantShard, Timeline};
|
||||
use crate::tenant::{Tenant, Timeline};
|
||||
|
||||
pub(super) fn dummy_contents(name: &str) -> Vec<u8> {
|
||||
format!("contents for {name}").into()
|
||||
@@ -2894,10 +2787,6 @@ mod tests {
|
||||
for entry in std::fs::read_dir(remote_path).unwrap().flatten() {
|
||||
let entry_name = entry.file_name();
|
||||
let fname = entry_name.to_str().unwrap();
|
||||
if fname.ends_with(".metadata") || fname.ends_with(".enc") {
|
||||
// ignore metadata and encryption key files; should use local_fs APIs instead in the future
|
||||
continue;
|
||||
}
|
||||
found.push(String::from(fname));
|
||||
}
|
||||
found.sort();
|
||||
@@ -2907,7 +2796,7 @@ mod tests {
|
||||
|
||||
struct TestSetup {
|
||||
harness: TenantHarness,
|
||||
tenant: Arc<TenantShard>,
|
||||
tenant: Arc<Tenant>,
|
||||
timeline: Arc<Timeline>,
|
||||
tenant_ctx: RequestContext,
|
||||
}
|
||||
@@ -2951,8 +2840,6 @@ mod tests {
|
||||
)),
|
||||
config: std::sync::RwLock::new(RemoteTimelineClientConfig::from(&location_conf)),
|
||||
cancel: CancellationToken::new(),
|
||||
kms_impl: None,
|
||||
key_repo: std::sync::Mutex::new(HashMap::new()),
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
@@ -23,7 +23,7 @@ use utils::crashsafe::path_with_suffix_extension;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
use utils::{backoff, pausable_failpoint};
|
||||
|
||||
use super::index::{EncryptionKeyPair, IndexPart, LayerFileMetadata};
|
||||
use super::index::{IndexPart, LayerFileMetadata};
|
||||
use super::manifest::TenantManifest;
|
||||
use super::{
|
||||
FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, INITDB_PATH, parse_remote_index_path,
|
||||
@@ -51,7 +51,6 @@ use crate::virtual_file::{MaybeFatalIo, VirtualFile, on_fatal_io_error};
|
||||
pub async fn download_layer_file<'a>(
|
||||
conf: &'static PageServerConf,
|
||||
storage: &'a GenericRemoteStorage,
|
||||
key_pair: Option<&'a EncryptionKeyPair>,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
layer_file_name: &'a LayerName,
|
||||
@@ -87,16 +86,7 @@ pub async fn download_layer_file<'a>(
|
||||
|
||||
let bytes_amount = download_retry(
|
||||
|| async {
|
||||
download_object(
|
||||
storage,
|
||||
key_pair,
|
||||
&remote_path,
|
||||
&temp_file_path,
|
||||
gate,
|
||||
cancel,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
download_object(storage, &remote_path, &temp_file_path, gate, cancel, ctx).await
|
||||
},
|
||||
&format!("download {remote_path:?}"),
|
||||
cancel,
|
||||
@@ -155,7 +145,6 @@ pub async fn download_layer_file<'a>(
|
||||
/// The unlinking has _not_ been made durable.
|
||||
async fn download_object(
|
||||
storage: &GenericRemoteStorage,
|
||||
encryption_key_pair: Option<&EncryptionKeyPair>,
|
||||
src_path: &RemotePath,
|
||||
dst_path: &Utf8PathBuf,
|
||||
#[cfg_attr(target_os = "macos", allow(unused_variables))] gate: &utils::sync::gate::Gate,
|
||||
@@ -171,12 +160,9 @@ async fn download_object(
|
||||
.with_context(|| format!("create a destination file for layer '{dst_path}'"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let mut opts = DownloadOpts::default();
|
||||
if let Some(encryption_key_pair) = encryption_key_pair {
|
||||
opts.encryption_key = Some(encryption_key_pair.plain_key.to_vec());
|
||||
}
|
||||
|
||||
let download = storage.download(src_path, &opts, cancel).await?;
|
||||
let download = storage
|
||||
.download(src_path, &DownloadOpts::default(), cancel)
|
||||
.await?;
|
||||
|
||||
pausable_failpoint!("before-downloading-layer-stream-pausable");
|
||||
|
||||
@@ -466,7 +452,7 @@ async fn do_download_index_part(
|
||||
/// generation (normal case when migrating/restarting). Only if both of these return 404 do we fall back
|
||||
/// to listing objects.
|
||||
///
|
||||
/// * `my_generation`: the value of `[crate::tenant::TenantShard::generation]`
|
||||
/// * `my_generation`: the value of `[crate::tenant::Tenant::generation]`
|
||||
/// * `what`: for logging, what object are we downloading
|
||||
/// * `prefix`: when listing objects, use this prefix (i.e. the part of the object path before the generation)
|
||||
/// * `do_download`: a GET of the object in a particular generation, which should **retry indefinitely** unless
|
||||
|
||||
@@ -10,8 +10,6 @@ use pageserver_api::models::AuxFilePolicy;
|
||||
use pageserver_api::models::RelSizeMigration;
|
||||
use pageserver_api::shard::ShardIndex;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::base64::Base64;
|
||||
use serde_with::serde_as;
|
||||
use utils::id::TimelineId;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
@@ -116,70 +114,6 @@ pub struct IndexPart {
|
||||
/// The timestamp when the timeline was marked invisible in synthetic size calculations.
|
||||
#[serde(skip_serializing_if = "Option::is_none", default)]
|
||||
pub(crate) marked_invisible_at: Option<NaiveDateTime>,
|
||||
|
||||
/// The encryption key used to encrypt the timeline layer files.
|
||||
#[serde(skip_serializing_if = "Vec::is_empty", default)]
|
||||
pub(crate) keys: Vec<EncryptionKey>,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Ord, PartialOrd, Hash)]
|
||||
pub struct KeyVersion(pub u32);
|
||||
|
||||
impl KeyVersion {
|
||||
pub fn next(&self) -> Self {
|
||||
Self(self.0 + 1)
|
||||
}
|
||||
}
|
||||
|
||||
/// An identifier for an encryption key. The scope of the key is the timeline (TBD).
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Ord, PartialOrd, Hash)]
|
||||
pub struct EncryptionKeyId {
|
||||
pub version: KeyVersion,
|
||||
pub generation: Generation,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct EncryptionKeyPair {
|
||||
pub id: EncryptionKeyId,
|
||||
pub plain_key: Vec<u8>,
|
||||
pub wrapped_key: Vec<u8>,
|
||||
}
|
||||
|
||||
impl EncryptionKeyPair {
|
||||
pub fn new(id: EncryptionKeyId, plain_key: Vec<u8>, wrapped_key: Vec<u8>) -> Self {
|
||||
Self {
|
||||
id,
|
||||
plain_key,
|
||||
wrapped_key,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for EncryptionKeyPair {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
let display =
|
||||
base64::display::Base64Display::with_config(&self.wrapped_key, base64::STANDARD);
|
||||
struct DisplayAsDebug<T: std::fmt::Display>(T);
|
||||
impl<T: std::fmt::Display> std::fmt::Debug for DisplayAsDebug<T> {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", self.0)
|
||||
}
|
||||
}
|
||||
f.debug_struct("EncryptionKeyPair")
|
||||
.field("id", &self.id)
|
||||
.field("plain_key", &"<REDACTED>")
|
||||
.field("wrapped_key", &DisplayAsDebug(&display))
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
|
||||
pub struct EncryptionKey {
|
||||
#[serde_as(as = "Base64")]
|
||||
pub key: Vec<u8>,
|
||||
pub id: EncryptionKeyId,
|
||||
pub created_at: NaiveDateTime,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
|
||||
@@ -208,12 +142,10 @@ impl IndexPart {
|
||||
/// - 12: +l2_lsn
|
||||
/// - 13: +gc_compaction
|
||||
/// - 14: +marked_invisible_at
|
||||
/// - 15: +keys and encryption_key in layer_metadata
|
||||
const LATEST_VERSION: usize = 15;
|
||||
const LATEST_VERSION: usize = 14;
|
||||
|
||||
// Versions we may see when reading from a bucket.
|
||||
pub const KNOWN_VERSIONS: &'static [usize] =
|
||||
&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
|
||||
pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14];
|
||||
|
||||
pub const FILE_NAME: &'static str = "index_part.json";
|
||||
|
||||
@@ -233,7 +165,6 @@ impl IndexPart {
|
||||
l2_lsn: None,
|
||||
gc_compaction: None,
|
||||
marked_invisible_at: None,
|
||||
keys: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -274,16 +205,14 @@ impl IndexPart {
|
||||
/// Check for invariants in the index: this is useful when uploading an index to ensure that if
|
||||
/// we encounter a bug, we do not persist buggy metadata.
|
||||
pub(crate) fn validate(&self) -> Result<(), String> {
|
||||
// We have to disable this check: we might need to upload an empty index part with new keys, or new `reldirv2` flag.
|
||||
|
||||
// if self.import_pgdata.is_none()
|
||||
// && self.metadata.ancestor_timeline().is_none()
|
||||
// && self.layer_metadata.is_empty()
|
||||
// {
|
||||
// // Unless we're in the middle of a raw pgdata import, or this is a child timeline, the index must
|
||||
// // always have at least one layer.
|
||||
// return Err("Index has no ancestor and no layers".to_string());
|
||||
// }
|
||||
if self.import_pgdata.is_none()
|
||||
&& self.metadata.ancestor_timeline().is_none()
|
||||
&& self.layer_metadata.is_empty()
|
||||
{
|
||||
// Unless we're in the middle of a raw pgdata import, or this is a child timeline, the index must
|
||||
// always have at least one layer.
|
||||
return Err("Index has no ancestor and no layers".to_string());
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -293,7 +222,7 @@ impl IndexPart {
|
||||
///
|
||||
/// Fields have to be `Option`s because remote [`IndexPart`]s can be from different versions, which
|
||||
/// might have less or more metadata depending on whether we are upgrading or rolling back an upgrade.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
|
||||
pub struct LayerFileMetadata {
|
||||
pub file_size: u64,
|
||||
|
||||
@@ -304,9 +233,6 @@ pub struct LayerFileMetadata {
|
||||
#[serde(default = "ShardIndex::unsharded")]
|
||||
#[serde(skip_serializing_if = "ShardIndex::is_unsharded")]
|
||||
pub shard: ShardIndex,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none", default)]
|
||||
pub encryption_key: Option<EncryptionKeyId>,
|
||||
}
|
||||
|
||||
impl LayerFileMetadata {
|
||||
@@ -315,7 +241,6 @@ impl LayerFileMetadata {
|
||||
file_size,
|
||||
generation,
|
||||
shard,
|
||||
encryption_key: None,
|
||||
}
|
||||
}
|
||||
/// Helper to get both generation and file size in a tuple
|
||||
@@ -528,16 +453,14 @@ mod tests {
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
|
||||
file_size: 25600000,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded(),
|
||||
encryption_key: None,
|
||||
shard: ShardIndex::unsharded()
|
||||
}),
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
|
||||
// serde_json should always parse this but this might be a double with jq for
|
||||
// example.
|
||||
file_size: 9007199254741001,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded(),
|
||||
encryption_key: None,
|
||||
shard: ShardIndex::unsharded()
|
||||
})
|
||||
]),
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
@@ -552,7 +475,6 @@ mod tests {
|
||||
l2_lsn: None,
|
||||
gc_compaction: None,
|
||||
marked_invisible_at: None,
|
||||
keys: Vec::new(),
|
||||
};
|
||||
|
||||
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
|
||||
@@ -580,16 +502,14 @@ mod tests {
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
|
||||
file_size: 25600000,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded(),
|
||||
encryption_key: None,
|
||||
shard: ShardIndex::unsharded()
|
||||
}),
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
|
||||
// serde_json should always parse this but this might be a double with jq for
|
||||
// example.
|
||||
file_size: 9007199254741001,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded(),
|
||||
encryption_key: None,
|
||||
shard: ShardIndex::unsharded()
|
||||
})
|
||||
]),
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
@@ -604,7 +524,6 @@ mod tests {
|
||||
l2_lsn: None,
|
||||
gc_compaction: None,
|
||||
marked_invisible_at: None,
|
||||
keys: Vec::new(),
|
||||
};
|
||||
|
||||
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
|
||||
@@ -633,16 +552,14 @@ mod tests {
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
|
||||
file_size: 25600000,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded(),
|
||||
encryption_key: None,
|
||||
shard: ShardIndex::unsharded()
|
||||
}),
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
|
||||
// serde_json should always parse this but this might be a double with jq for
|
||||
// example.
|
||||
file_size: 9007199254741001,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded(),
|
||||
encryption_key: None,
|
||||
shard: ShardIndex::unsharded()
|
||||
})
|
||||
]),
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
@@ -657,7 +574,6 @@ mod tests {
|
||||
l2_lsn: None,
|
||||
gc_compaction: None,
|
||||
marked_invisible_at: None,
|
||||
keys: Vec::new(),
|
||||
};
|
||||
|
||||
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
|
||||
@@ -711,7 +627,6 @@ mod tests {
|
||||
l2_lsn: None,
|
||||
gc_compaction: None,
|
||||
marked_invisible_at: None,
|
||||
keys: Vec::new(),
|
||||
};
|
||||
|
||||
let empty_layers_parsed = IndexPart::from_json_bytes(empty_layers_json.as_bytes()).unwrap();
|
||||
@@ -738,16 +653,14 @@ mod tests {
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
|
||||
file_size: 25600000,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded(),
|
||||
encryption_key: None,
|
||||
shard: ShardIndex::unsharded()
|
||||
}),
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
|
||||
// serde_json should always parse this but this might be a double with jq for
|
||||
// example.
|
||||
file_size: 9007199254741001,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded(),
|
||||
encryption_key: None,
|
||||
shard: ShardIndex::unsharded()
|
||||
})
|
||||
]),
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
@@ -762,7 +675,6 @@ mod tests {
|
||||
l2_lsn: None,
|
||||
gc_compaction: None,
|
||||
marked_invisible_at: None,
|
||||
keys: Vec::new(),
|
||||
};
|
||||
|
||||
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
|
||||
@@ -791,13 +703,11 @@ mod tests {
|
||||
file_size: 23289856,
|
||||
generation: Generation::new(1),
|
||||
shard: ShardIndex::unsharded(),
|
||||
encryption_key: None,
|
||||
}),
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF499-00000000015A7619".parse().unwrap(), LayerFileMetadata {
|
||||
file_size: 1015808,
|
||||
generation: Generation::new(1),
|
||||
shard: ShardIndex::unsharded(),
|
||||
encryption_key: None,
|
||||
})
|
||||
]),
|
||||
disk_consistent_lsn: Lsn::from_str("0/15A7618").unwrap(),
|
||||
@@ -816,7 +726,6 @@ mod tests {
|
||||
l2_lsn: None,
|
||||
gc_compaction: None,
|
||||
marked_invisible_at: None,
|
||||
keys: Vec::new(),
|
||||
};
|
||||
|
||||
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
|
||||
@@ -847,16 +756,14 @@ mod tests {
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
|
||||
file_size: 25600000,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded(),
|
||||
encryption_key: None,
|
||||
shard: ShardIndex::unsharded()
|
||||
}),
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
|
||||
// serde_json should always parse this but this might be a double with jq for
|
||||
// example.
|
||||
file_size: 9007199254741001,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded(),
|
||||
encryption_key: None,
|
||||
shard: ShardIndex::unsharded()
|
||||
})
|
||||
]),
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
@@ -875,7 +782,6 @@ mod tests {
|
||||
l2_lsn: None,
|
||||
gc_compaction: None,
|
||||
marked_invisible_at: None,
|
||||
keys: Vec::new(),
|
||||
};
|
||||
|
||||
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
|
||||
@@ -909,14 +815,12 @@ mod tests {
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
|
||||
file_size: 25600000,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded(),
|
||||
encryption_key: None,
|
||||
shard: ShardIndex::unsharded()
|
||||
}),
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
|
||||
file_size: 9007199254741001,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded(),
|
||||
encryption_key: None,
|
||||
shard: ShardIndex::unsharded()
|
||||
})
|
||||
]),
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
@@ -939,7 +843,6 @@ mod tests {
|
||||
l2_lsn: None,
|
||||
gc_compaction: None,
|
||||
marked_invisible_at: None,
|
||||
keys: Vec::new(),
|
||||
};
|
||||
|
||||
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
|
||||
@@ -974,14 +877,12 @@ mod tests {
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
|
||||
file_size: 25600000,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded(),
|
||||
encryption_key: None,
|
||||
shard: ShardIndex::unsharded()
|
||||
}),
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
|
||||
file_size: 9007199254741001,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded(),
|
||||
encryption_key: None,
|
||||
shard: ShardIndex::unsharded()
|
||||
})
|
||||
]),
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
@@ -1004,7 +905,6 @@ mod tests {
|
||||
l2_lsn: None,
|
||||
gc_compaction: None,
|
||||
marked_invisible_at: None,
|
||||
keys: Vec::new(),
|
||||
};
|
||||
|
||||
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
|
||||
@@ -1041,14 +941,12 @@ mod tests {
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
|
||||
file_size: 25600000,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded(),
|
||||
encryption_key: None,
|
||||
shard: ShardIndex::unsharded()
|
||||
}),
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
|
||||
file_size: 9007199254741001,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded(),
|
||||
encryption_key: None,
|
||||
shard: ShardIndex::unsharded()
|
||||
})
|
||||
]),
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
@@ -1074,7 +972,6 @@ mod tests {
|
||||
l2_lsn: None,
|
||||
gc_compaction: None,
|
||||
marked_invisible_at: None,
|
||||
keys: Vec::new(),
|
||||
};
|
||||
|
||||
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
|
||||
@@ -1120,14 +1017,12 @@ mod tests {
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
|
||||
file_size: 25600000,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded(),
|
||||
encryption_key: None,
|
||||
shard: ShardIndex::unsharded()
|
||||
}),
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
|
||||
file_size: 9007199254741001,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded(),
|
||||
encryption_key: None,
|
||||
shard: ShardIndex::unsharded()
|
||||
})
|
||||
]),
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
@@ -1157,7 +1052,6 @@ mod tests {
|
||||
l2_lsn: None,
|
||||
gc_compaction: None,
|
||||
marked_invisible_at: None,
|
||||
keys: Vec::new(),
|
||||
};
|
||||
|
||||
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
|
||||
@@ -1204,14 +1098,12 @@ mod tests {
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
|
||||
file_size: 25600000,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded(),
|
||||
encryption_key: None,
|
||||
shard: ShardIndex::unsharded()
|
||||
}),
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
|
||||
file_size: 9007199254741001,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded(),
|
||||
encryption_key: None,
|
||||
shard: ShardIndex::unsharded()
|
||||
})
|
||||
]),
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
@@ -1241,7 +1133,6 @@ mod tests {
|
||||
l2_lsn: None,
|
||||
gc_compaction: None,
|
||||
marked_invisible_at: None,
|
||||
keys: Vec::new(),
|
||||
};
|
||||
|
||||
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
|
||||
@@ -1292,14 +1183,12 @@ mod tests {
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
|
||||
file_size: 25600000,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded(),
|
||||
encryption_key: None,
|
||||
shard: ShardIndex::unsharded()
|
||||
}),
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
|
||||
file_size: 9007199254741001,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded(),
|
||||
encryption_key: None,
|
||||
shard: ShardIndex::unsharded()
|
||||
})
|
||||
]),
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
@@ -1331,7 +1220,6 @@ mod tests {
|
||||
last_completed_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
}),
|
||||
marked_invisible_at: None,
|
||||
keys: Vec::new(),
|
||||
};
|
||||
|
||||
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
|
||||
@@ -1383,14 +1271,12 @@ mod tests {
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
|
||||
file_size: 25600000,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded(),
|
||||
encryption_key: None,
|
||||
shard: ShardIndex::unsharded()
|
||||
}),
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
|
||||
file_size: 9007199254741001,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded(),
|
||||
encryption_key: None,
|
||||
shard: ShardIndex::unsharded()
|
||||
})
|
||||
]),
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
@@ -1422,139 +1308,6 @@ mod tests {
|
||||
last_completed_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
}),
|
||||
marked_invisible_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
|
||||
keys: Vec::new(),
|
||||
};
|
||||
|
||||
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
|
||||
assert_eq!(part, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn v15_keys_are_parsed() {
|
||||
let example = r#"{
|
||||
"version": 15,
|
||||
"layer_metadata":{
|
||||
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000, "encryption_key": { "version": 1, "generation": 5 } },
|
||||
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001, "encryption_key": { "version": 2, "generation": 6 } }
|
||||
},
|
||||
"disk_consistent_lsn":"0/16960E8",
|
||||
"metadata": {
|
||||
"disk_consistent_lsn": "0/16960E8",
|
||||
"prev_record_lsn": "0/1696070",
|
||||
"ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e",
|
||||
"ancestor_lsn": "0/0",
|
||||
"latest_gc_cutoff_lsn": "0/1696070",
|
||||
"initdb_lsn": "0/1696070",
|
||||
"pg_version": 14
|
||||
},
|
||||
"gc_blocking": {
|
||||
"started_at": "2024-07-19T09:00:00.123",
|
||||
"reasons": ["DetachAncestor"]
|
||||
},
|
||||
"import_pgdata": {
|
||||
"V1": {
|
||||
"Done": {
|
||||
"idempotency_key": "specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5",
|
||||
"started_at": "2024-11-13T09:23:42.123",
|
||||
"finished_at": "2024-11-13T09:42:23.123"
|
||||
}
|
||||
}
|
||||
},
|
||||
"rel_size_migration": "legacy",
|
||||
"l2_lsn": "0/16960E8",
|
||||
"gc_compaction": {
|
||||
"last_completed_lsn": "0/16960E8"
|
||||
},
|
||||
"marked_invisible_at": "2023-07-31T09:00:00.123",
|
||||
"keys": [
|
||||
{
|
||||
"key": "dGVzdF9rZXk=",
|
||||
"id": {
|
||||
"version": 1,
|
||||
"generation": 5
|
||||
},
|
||||
"created_at": "2024-07-19T09:00:00.123"
|
||||
},
|
||||
{
|
||||
"key": "dGVzdF9rZXlfMg==",
|
||||
"id": {
|
||||
"version": 2,
|
||||
"generation": 6
|
||||
},
|
||||
"created_at": "2024-07-19T10:00:00.123"
|
||||
}
|
||||
]
|
||||
}"#;
|
||||
|
||||
let expected = IndexPart {
|
||||
version: 15,
|
||||
layer_metadata: HashMap::from([
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
|
||||
file_size: 25600000,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded(),
|
||||
encryption_key: Some(EncryptionKeyId {
|
||||
version: KeyVersion(1),
|
||||
generation: Generation::Valid(5),
|
||||
}),
|
||||
}),
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
|
||||
file_size: 9007199254741001,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded(),
|
||||
encryption_key: Some(EncryptionKeyId {
|
||||
version: KeyVersion(2),
|
||||
generation: Generation::Valid(6),
|
||||
}),
|
||||
})
|
||||
]),
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
metadata: TimelineMetadata::new(
|
||||
Lsn::from_str("0/16960E8").unwrap(),
|
||||
Some(Lsn::from_str("0/1696070").unwrap()),
|
||||
Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()),
|
||||
Lsn::INVALID,
|
||||
Lsn::from_str("0/1696070").unwrap(),
|
||||
Lsn::from_str("0/1696070").unwrap(),
|
||||
14,
|
||||
).with_recalculated_checksum().unwrap(),
|
||||
deleted_at: None,
|
||||
lineage: Default::default(),
|
||||
gc_blocking: Some(GcBlocking {
|
||||
started_at: parse_naive_datetime("2024-07-19T09:00:00.123000000"),
|
||||
reasons: enumset::EnumSet::from_iter([GcBlockingReason::DetachAncestor]),
|
||||
}),
|
||||
last_aux_file_policy: Default::default(),
|
||||
archived_at: None,
|
||||
import_pgdata: Some(import_pgdata::index_part_format::Root::V1(import_pgdata::index_part_format::V1::Done(import_pgdata::index_part_format::Done{
|
||||
started_at: parse_naive_datetime("2024-11-13T09:23:42.123000000"),
|
||||
finished_at: parse_naive_datetime("2024-11-13T09:42:23.123000000"),
|
||||
idempotency_key: import_pgdata::index_part_format::IdempotencyKey::new("specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5".to_string()),
|
||||
}))),
|
||||
rel_size_migration: Some(RelSizeMigration::Legacy),
|
||||
l2_lsn: Some("0/16960E8".parse::<Lsn>().unwrap()),
|
||||
gc_compaction: Some(GcCompactionState {
|
||||
last_completed_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
}),
|
||||
marked_invisible_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
|
||||
keys: vec![
|
||||
EncryptionKey {
|
||||
key: "test_key".as_bytes().to_vec(),
|
||||
id: EncryptionKeyId {
|
||||
version: KeyVersion(1),
|
||||
generation: Generation::Valid(5),
|
||||
},
|
||||
created_at: parse_naive_datetime("2024-07-19T09:00:00.123000000"),
|
||||
},
|
||||
EncryptionKey {
|
||||
key: "test_key_2".as_bytes().to_vec(),
|
||||
id: EncryptionKeyId {
|
||||
version: KeyVersion(2),
|
||||
generation: Generation::Valid(6),
|
||||
},
|
||||
created_at: parse_naive_datetime("2024-07-19T10:00:00.123000000"),
|
||||
}
|
||||
],
|
||||
};
|
||||
|
||||
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
|
||||
|
||||
@@ -17,7 +17,7 @@ use utils::id::{TenantId, TimelineId};
|
||||
use utils::{backoff, pausable_failpoint};
|
||||
|
||||
use super::Generation;
|
||||
use super::index::{EncryptionKeyPair, IndexPart};
|
||||
use super::index::IndexPart;
|
||||
use super::manifest::TenantManifest;
|
||||
use crate::tenant::remote_timeline_client::{
|
||||
remote_index_path, remote_initdb_archive_path, remote_initdb_preserved_archive_path,
|
||||
@@ -101,7 +101,6 @@ pub(super) async fn upload_timeline_layer<'a>(
|
||||
local_path: &'a Utf8Path,
|
||||
remote_path: &'a RemotePath,
|
||||
metadata_size: u64,
|
||||
encryption_key_pair: Option<EncryptionKeyPair>,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
fail_point!("before-upload-layer", |_| {
|
||||
@@ -145,14 +144,7 @@ pub(super) async fn upload_timeline_layer<'a>(
|
||||
let reader = tokio_util::io::ReaderStream::with_capacity(source_file, super::BUFFER_SIZE);
|
||||
|
||||
storage
|
||||
.upload_with_encryption(
|
||||
reader,
|
||||
fs_size,
|
||||
remote_path,
|
||||
None,
|
||||
encryption_key_pair.as_ref().map(|k| k.plain_key.as_slice()),
|
||||
cancel,
|
||||
)
|
||||
.upload(reader, fs_size, remote_path, None, cancel)
|
||||
.await
|
||||
.with_context(|| format!("upload layer from local path '{local_path}'"))
|
||||
}
|
||||
|
||||
@@ -1310,7 +1310,6 @@ impl<'a> TenantDownloader<'a> {
|
||||
let downloaded_bytes = download_layer_file(
|
||||
self.conf,
|
||||
self.remote_storage,
|
||||
None, // TODO: add encryption key pair
|
||||
*tenant_shard_id,
|
||||
*timeline_id,
|
||||
&layer.name,
|
||||
|
||||
@@ -21,7 +21,7 @@ use super::scheduler::{
|
||||
use super::{CommandRequest, SecondaryTenantError, UploadCommand};
|
||||
use crate::TEMP_FILE_SUFFIX;
|
||||
use crate::metrics::SECONDARY_MODE;
|
||||
use crate::tenant::TenantShard;
|
||||
use crate::tenant::Tenant;
|
||||
use crate::tenant::config::AttachmentMode;
|
||||
use crate::tenant::mgr::{GetTenantError, TenantManager};
|
||||
use crate::tenant::remote_timeline_client::remote_heatmap_path;
|
||||
@@ -74,7 +74,7 @@ impl RunningJob for WriteInProgress {
|
||||
}
|
||||
|
||||
struct UploadPending {
|
||||
tenant: Arc<TenantShard>,
|
||||
tenant: Arc<Tenant>,
|
||||
last_upload: Option<LastUploadState>,
|
||||
target_time: Option<Instant>,
|
||||
period: Option<Duration>,
|
||||
@@ -106,7 +106,7 @@ impl scheduler::Completion for WriteComplete {
|
||||
struct UploaderTenantState {
|
||||
// This Weak only exists to enable culling idle instances of this type
|
||||
// when the Tenant has been deallocated.
|
||||
tenant: Weak<TenantShard>,
|
||||
tenant: Weak<Tenant>,
|
||||
|
||||
/// Digest of the serialized heatmap that we last successfully uploaded
|
||||
last_upload_state: Option<LastUploadState>,
|
||||
@@ -357,7 +357,7 @@ struct LastUploadState {
|
||||
/// of the object we would have uploaded.
|
||||
async fn upload_tenant_heatmap(
|
||||
remote_storage: GenericRemoteStorage,
|
||||
tenant: &Arc<TenantShard>,
|
||||
tenant: &Arc<Tenant>,
|
||||
last_upload: Option<LastUploadState>,
|
||||
) -> Result<UploadHeatmapOutcome, UploadHeatmapError> {
|
||||
debug_assert_current_span_has_tenant_id();
|
||||
|
||||
@@ -360,7 +360,7 @@ where
|
||||
|
||||
/// Periodic execution phase: inspect all attached tenants and schedule any work they require.
|
||||
///
|
||||
/// The type in `tenants` should be a tenant-like structure, e.g. [`crate::tenant::TenantShard`] or [`crate::tenant::secondary::SecondaryTenant`]
|
||||
/// The type in `tenants` should be a tenant-like structure, e.g. [`crate::tenant::Tenant`] or [`crate::tenant::secondary::SecondaryTenant`]
|
||||
///
|
||||
/// This function resets the pending list: it is assumed that the caller may change their mind about
|
||||
/// which tenants need work between calls to schedule_iteration.
|
||||
|
||||
@@ -12,7 +12,7 @@ use tracing::*;
|
||||
use utils::id::TimelineId;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use super::{GcError, LogicalSizeCalculationCause, TenantShard};
|
||||
use super::{GcError, LogicalSizeCalculationCause, Tenant};
|
||||
use crate::context::RequestContext;
|
||||
use crate::pgdatadir_mapping::CalculateLogicalSizeError;
|
||||
use crate::tenant::{MaybeOffloaded, Timeline};
|
||||
@@ -156,7 +156,7 @@ pub struct TimelineInputs {
|
||||
/// initdb_lsn branchpoints* next_pitr_cutoff latest
|
||||
/// ```
|
||||
pub(super) async fn gather_inputs(
|
||||
tenant: &TenantShard,
|
||||
tenant: &Tenant,
|
||||
limit: &Arc<Semaphore>,
|
||||
max_retention_period: Option<u64>,
|
||||
logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>,
|
||||
|
||||
@@ -715,34 +715,13 @@ pub(crate) enum LayerId {
|
||||
}
|
||||
|
||||
/// Uniquely identify a layer visit by the layer
|
||||
/// and LSN range of the reads. Note that the end of the range is exclusive.
|
||||
///
|
||||
/// The layer itself is not enough since we may have different LSN lower
|
||||
/// bounds for delta layer reads. Scenarios where this can happen are:
|
||||
///
|
||||
/// 1. Layer overlaps: imagine an image layer inside an in-memory layer
|
||||
/// and a query that only partially hits the image layer. Part of the query
|
||||
/// needs to read the whole in-memory layer and the other part needs to read
|
||||
/// only up to the image layer. Hence, they'll have different LSN floor values
|
||||
/// for the read.
|
||||
///
|
||||
/// 2. Scattered reads: the read path supports starting at different LSNs. Imagine
|
||||
/// the start LSN for one range is inside a layer and the start LSN for another range
|
||||
/// is above the layer (includes all of it). Both ranges need to read the layer all the
|
||||
/// way to the end but starting at different points. Hence, they'll have different LSN
|
||||
/// ceil values.
|
||||
///
|
||||
/// The implication is that we might visit the same layer multiple times
|
||||
/// in order to read different LSN ranges from it. In practice, this isn't very concerning
|
||||
/// because:
|
||||
/// 1. Layer overlaps are rare and generally not intended
|
||||
/// 2. Scattered reads will stabilise after the first few layers provided their starting LSNs
|
||||
/// are grouped tightly enough (likely the case).
|
||||
/// and LSN floor (or start LSN) of the reads.
|
||||
/// The layer itself is not enough since we may
|
||||
/// have different LSN lower bounds for delta layer reads.
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Hash)]
|
||||
struct LayerToVisitId {
|
||||
/// The layer being visited.
layer_id: LayerId,
|
||||
/// Lower LSN bound of the read; filled from `lsn_range.start` where the id is built.
lsn_floor: Lsn,
|
||||
/// Upper LSN bound of the read; filled from `lsn_range.end` where the id is built.
lsn_ceil: Lsn,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Hash)]
|
||||
@@ -826,7 +805,6 @@ impl LayerFringe {
|
||||
let layer_to_visit_id = LayerToVisitId {
|
||||
layer_id: layer.id(),
|
||||
lsn_floor: lsn_range.start,
|
||||
lsn_ceil: lsn_range.end,
|
||||
};
|
||||
|
||||
let entry = self.visit_reads.entry(layer_to_visit_id.clone());
|
||||
|
||||
@@ -5,7 +5,6 @@ use std::sync::Arc;
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::key::{KEY_SIZE, Key};
|
||||
use pageserver_api::value::Value;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::id::TimelineId;
|
||||
use utils::lsn::Lsn;
|
||||
use utils::shard::TenantShardId;
|
||||
@@ -180,7 +179,7 @@ impl BatchLayerWriter {
|
||||
|
||||
/// An image writer that takes images and produces multiple image layers.
|
||||
#[must_use]
|
||||
pub struct SplitImageLayerWriter<'a> {
|
||||
pub struct SplitImageLayerWriter {
|
||||
inner: ImageLayerWriter,
|
||||
target_layer_size: u64,
|
||||
lsn: Lsn,
|
||||
@@ -189,12 +188,9 @@ pub struct SplitImageLayerWriter<'a> {
|
||||
tenant_shard_id: TenantShardId,
|
||||
batches: BatchLayerWriter,
|
||||
start_key: Key,
|
||||
gate: &'a utils::sync::gate::Gate,
|
||||
cancel: CancellationToken,
|
||||
}
|
||||
|
||||
impl<'a> SplitImageLayerWriter<'a> {
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
impl SplitImageLayerWriter {
|
||||
pub async fn new(
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
@@ -202,8 +198,6 @@ impl<'a> SplitImageLayerWriter<'a> {
|
||||
start_key: Key,
|
||||
lsn: Lsn,
|
||||
target_layer_size: u64,
|
||||
gate: &'a utils::sync::gate::Gate,
|
||||
cancel: CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Self> {
|
||||
Ok(Self {
|
||||
@@ -214,8 +208,6 @@ impl<'a> SplitImageLayerWriter<'a> {
|
||||
tenant_shard_id,
|
||||
&(start_key..Key::MAX),
|
||||
lsn,
|
||||
gate,
|
||||
cancel.clone(),
|
||||
ctx,
|
||||
)
|
||||
.await?,
|
||||
@@ -225,8 +217,6 @@ impl<'a> SplitImageLayerWriter<'a> {
|
||||
batches: BatchLayerWriter::new(conf).await?,
|
||||
lsn,
|
||||
start_key,
|
||||
gate,
|
||||
cancel,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -249,8 +239,6 @@ impl<'a> SplitImageLayerWriter<'a> {
|
||||
self.tenant_shard_id,
|
||||
&(key..Key::MAX),
|
||||
self.lsn,
|
||||
self.gate,
|
||||
self.cancel.clone(),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -303,7 +291,7 @@ impl<'a> SplitImageLayerWriter<'a> {
|
||||
/// into a single file. This behavior might change in the future. For reference, the legacy compaction algorithm
|
||||
/// will split them into multiple files based on size.
|
||||
#[must_use]
|
||||
pub struct SplitDeltaLayerWriter<'a> {
|
||||
pub struct SplitDeltaLayerWriter {
|
||||
inner: Option<(Key, DeltaLayerWriter)>,
|
||||
target_layer_size: u64,
|
||||
conf: &'static PageServerConf,
|
||||
@@ -312,19 +300,15 @@ pub struct SplitDeltaLayerWriter<'a> {
|
||||
lsn_range: Range<Lsn>,
|
||||
last_key_written: Key,
|
||||
batches: BatchLayerWriter,
|
||||
gate: &'a utils::sync::gate::Gate,
|
||||
cancel: CancellationToken,
|
||||
}
|
||||
|
||||
impl<'a> SplitDeltaLayerWriter<'a> {
|
||||
impl SplitDeltaLayerWriter {
|
||||
pub async fn new(
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_shard_id: TenantShardId,
|
||||
lsn_range: Range<Lsn>,
|
||||
target_layer_size: u64,
|
||||
gate: &'a utils::sync::gate::Gate,
|
||||
cancel: CancellationToken,
|
||||
) -> anyhow::Result<Self> {
|
||||
Ok(Self {
|
||||
target_layer_size,
|
||||
@@ -335,8 +319,6 @@ impl<'a> SplitDeltaLayerWriter<'a> {
|
||||
lsn_range,
|
||||
last_key_written: Key::MIN,
|
||||
batches: BatchLayerWriter::new(conf).await?,
|
||||
gate,
|
||||
cancel,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -362,8 +344,6 @@ impl<'a> SplitDeltaLayerWriter<'a> {
|
||||
self.tenant_shard_id,
|
||||
key,
|
||||
self.lsn_range.clone(),
|
||||
self.gate,
|
||||
self.cancel.clone(),
|
||||
ctx,
|
||||
)
|
||||
.await?,
|
||||
@@ -382,8 +362,6 @@ impl<'a> SplitDeltaLayerWriter<'a> {
|
||||
self.tenant_shard_id,
|
||||
key,
|
||||
self.lsn_range.clone(),
|
||||
self.gate,
|
||||
self.cancel.clone(),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -491,8 +469,6 @@ mod tests {
|
||||
get_key(0),
|
||||
Lsn(0x18),
|
||||
4 * 1024 * 1024,
|
||||
&tline.gate,
|
||||
tline.cancel.clone(),
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
@@ -504,8 +480,6 @@ mod tests {
|
||||
tenant.tenant_shard_id,
|
||||
Lsn(0x18)..Lsn(0x20),
|
||||
4 * 1024 * 1024,
|
||||
&tline.gate,
|
||||
tline.cancel.clone(),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
@@ -572,8 +546,6 @@ mod tests {
|
||||
get_key(0),
|
||||
Lsn(0x18),
|
||||
4 * 1024 * 1024,
|
||||
&tline.gate,
|
||||
tline.cancel.clone(),
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
@@ -584,8 +556,6 @@ mod tests {
|
||||
tenant.tenant_shard_id,
|
||||
Lsn(0x18)..Lsn(0x20),
|
||||
4 * 1024 * 1024,
|
||||
&tline.gate,
|
||||
tline.cancel.clone(),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
@@ -673,8 +643,6 @@ mod tests {
|
||||
get_key(0),
|
||||
Lsn(0x18),
|
||||
4 * 1024,
|
||||
&tline.gate,
|
||||
tline.cancel.clone(),
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
@@ -686,8 +654,6 @@ mod tests {
|
||||
tenant.tenant_shard_id,
|
||||
Lsn(0x18)..Lsn(0x20),
|
||||
4 * 1024,
|
||||
&tline.gate,
|
||||
tline.cancel.clone(),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
@@ -764,8 +730,6 @@ mod tests {
|
||||
tenant.tenant_shard_id,
|
||||
Lsn(0x10)..Lsn(N as u64 * 16 + 0x10),
|
||||
4 * 1024 * 1024,
|
||||
&tline.gate,
|
||||
tline.cancel.clone(),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
@@ -50,7 +50,6 @@ use rand::distributions::Alphanumeric;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::sync::OnceCell;
|
||||
use tokio_epoll_uring::IoBuf;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::bin_ser::BeSer;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
@@ -401,15 +400,12 @@ impl DeltaLayerWriterInner {
|
||||
///
|
||||
/// Start building a new delta layer.
|
||||
///
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
async fn new(
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_shard_id: TenantShardId,
|
||||
key_start: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
gate: &utils::sync::gate::Gate,
|
||||
cancel: CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Self> {
|
||||
// Create the file initially with a temporary filename. We don't know
|
||||
@@ -424,7 +420,7 @@ impl DeltaLayerWriterInner {
|
||||
let mut file = VirtualFile::create(&path, ctx).await?;
|
||||
// make room for the header block
|
||||
file.seek(SeekFrom::Start(PAGE_SZ as u64)).await?;
|
||||
let blob_writer = BlobWriter::new(file, PAGE_SZ as u64, gate, cancel, ctx);
|
||||
let blob_writer = BlobWriter::new(file, PAGE_SZ as u64);
|
||||
|
||||
// Initialize the b-tree index builder
|
||||
let block_buf = BlockBuf::new();
|
||||
@@ -632,15 +628,12 @@ impl DeltaLayerWriter {
|
||||
///
|
||||
/// Start building a new delta layer.
|
||||
///
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub async fn new(
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_shard_id: TenantShardId,
|
||||
key_start: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
gate: &utils::sync::gate::Gate,
|
||||
cancel: CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Self> {
|
||||
Ok(Self {
|
||||
@@ -651,8 +644,6 @@ impl DeltaLayerWriter {
|
||||
tenant_shard_id,
|
||||
key_start,
|
||||
lsn_range,
|
||||
gate,
|
||||
cancel,
|
||||
ctx,
|
||||
)
|
||||
.await?,
|
||||
@@ -1620,7 +1611,7 @@ pub(crate) mod test {
|
||||
use crate::tenant::harness::{TIMELINE_ID, TenantHarness};
|
||||
use crate::tenant::storage_layer::{Layer, ResidentLayer};
|
||||
use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner;
|
||||
use crate::tenant::{TenantShard, Timeline};
|
||||
use crate::tenant::{Tenant, Timeline};
|
||||
|
||||
/// Construct an index for a fictional delta layer and then
|
||||
/// traverse in order to plan vectored reads for a query. Finally,
|
||||
@@ -1894,8 +1885,6 @@ pub(crate) mod test {
|
||||
harness.tenant_shard_id,
|
||||
entries_meta.key_range.start,
|
||||
entries_meta.lsn_range.clone(),
|
||||
&timeline.gate,
|
||||
timeline.cancel.clone(),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -2090,8 +2079,6 @@ pub(crate) mod test {
|
||||
tenant.tenant_shard_id,
|
||||
Key::MIN,
|
||||
Lsn(0x11)..truncate_at,
|
||||
&branch.gate,
|
||||
branch.cancel.clone(),
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
@@ -2209,7 +2196,7 @@ pub(crate) mod test {
|
||||
}
|
||||
|
||||
pub(crate) async fn produce_delta_layer(
|
||||
tenant: &TenantShard,
|
||||
tenant: &Tenant,
|
||||
tline: &Arc<Timeline>,
|
||||
mut deltas: Vec<(Key, Lsn, Value)>,
|
||||
ctx: &RequestContext,
|
||||
@@ -2226,8 +2213,6 @@ pub(crate) mod test {
|
||||
tenant.tenant_shard_id,
|
||||
*key_start,
|
||||
(*lsn_min)..lsn_end,
|
||||
&tline.gate,
|
||||
tline.cancel.clone(),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
@@ -48,7 +48,6 @@ use rand::distributions::Alphanumeric;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::sync::OnceCell;
|
||||
use tokio_stream::StreamExt;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::bin_ser::BeSer;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
@@ -559,12 +558,11 @@ impl ImageLayerInner {
|
||||
let view = BufView::new_slice(&blobs_buf.buf);
|
||||
|
||||
for meta in blobs_buf.blobs.iter() {
|
||||
// Just read the raw header+data and pass it through to the target layer, without
|
||||
// decoding and recompressing it.
|
||||
let raw = meta.raw_with_header(&view);
|
||||
let img_buf = meta.read(&view).await?;
|
||||
|
||||
key_count += 1;
|
||||
writer
|
||||
.put_image_raw(meta.meta.key, raw.into_bytes(), ctx)
|
||||
.put_image(meta.meta.key, img_buf.into_bytes(), ctx)
|
||||
.await
|
||||
.context(format!("Storing key {}", meta.meta.key))?;
|
||||
}
|
||||
@@ -750,15 +748,12 @@ impl ImageLayerWriterInner {
|
||||
///
|
||||
/// Start building a new image layer.
|
||||
///
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
async fn new(
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_shard_id: TenantShardId,
|
||||
key_range: &Range<Key>,
|
||||
lsn: Lsn,
|
||||
gate: &utils::sync::gate::Gate,
|
||||
cancel: CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Self> {
|
||||
// Create the file initially with a temporary filename.
|
||||
@@ -785,7 +780,7 @@ impl ImageLayerWriterInner {
|
||||
};
|
||||
// make room for the header block
|
||||
file.seek(SeekFrom::Start(PAGE_SZ as u64)).await?;
|
||||
let blob_writer = BlobWriter::new(file, PAGE_SZ as u64, gate, cancel, ctx);
|
||||
let blob_writer = BlobWriter::new(file, PAGE_SZ as u64);
|
||||
|
||||
// Initialize the b-tree index builder
|
||||
let block_buf = BlockBuf::new();
|
||||
@@ -854,41 +849,6 @@ impl ImageLayerWriterInner {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Write the next image to the file, as a raw blob header and data.
|
||||
///
|
||||
/// The page versions must be appended in blknum order.
|
||||
///
/// `raw_with_header` is handed to the blob writer via `write_blob_raw`, and the offset
/// that call returns is recorded in the index (`self.tree`) under `key`.
///
/// Returns an error if `key` is not contained in `self.key_range`, or if the blob write
/// or the index append fails.
///
|
||||
async fn put_image_raw(
|
||||
&mut self,
|
||||
key: Key,
|
||||
raw_with_header: Bytes,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
// Reject keys that fall outside `self.key_range`.
ensure!(self.key_range.contains(&key));
|
||||
|
||||
// NB: we don't update the (un)compressed metrics, since we can't determine them without
|
||||
// decompressing the image. This seems okay.
|
||||
self.num_keys += 1;
|
||||
|
||||
// Pass the bytes through to the blob writer; `res` carries the offset that is used
// for the index entry below.
let (_, res) = self
|
||||
.blob_writer
|
||||
.write_blob_raw(raw_with_header.slice_len(), ctx)
|
||||
.await;
|
||||
let offset = res?;
|
||||
|
||||
// Serialize the key into a fixed KEY_SIZE buffer and append a key -> offset entry.
let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
|
||||
key.write_to_byte_slice(&mut keybuf);
|
||||
self.tree.append(&keybuf, offset)?;
|
||||
|
||||
// Only compiled with the "testing" feature: remember the most recently written key.
#[cfg(feature = "testing")]
|
||||
{
|
||||
self.last_written_key = key;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Finish writing the image layer.
|
||||
///
|
||||
@@ -924,13 +884,7 @@ impl ImageLayerWriterInner {
|
||||
crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES_CONSIDERED
|
||||
.inc_by(self.uncompressed_bytes_eligible);
|
||||
crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES_CHOSEN.inc_by(self.uncompressed_bytes_chosen);
|
||||
|
||||
// NB: filter() may pass through raw pages from a different layer, without looking at
|
||||
// whether these are compressed or not. We don't track metrics for these, so avoid
|
||||
// increasing `COMPRESSION_IMAGE_OUTPUT_BYTES` in this case too.
|
||||
if self.uncompressed_bytes > 0 {
|
||||
crate::metrics::COMPRESSION_IMAGE_OUTPUT_BYTES.inc_by(compressed_size);
|
||||
};
|
||||
crate::metrics::COMPRESSION_IMAGE_OUTPUT_BYTES.inc_by(compressed_size);
|
||||
|
||||
let mut file = self.blob_writer.into_inner();
|
||||
|
||||
@@ -1034,30 +988,18 @@ impl ImageLayerWriter {
|
||||
///
|
||||
/// Start building a new image layer.
|
||||
///
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub async fn new(
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_shard_id: TenantShardId,
|
||||
key_range: &Range<Key>,
|
||||
lsn: Lsn,
|
||||
gate: &utils::sync::gate::Gate,
|
||||
cancel: CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ImageLayerWriter> {
|
||||
Ok(Self {
|
||||
inner: Some(
|
||||
ImageLayerWriterInner::new(
|
||||
conf,
|
||||
timeline_id,
|
||||
tenant_shard_id,
|
||||
key_range,
|
||||
lsn,
|
||||
gate,
|
||||
cancel,
|
||||
ctx,
|
||||
)
|
||||
.await?,
|
||||
ImageLayerWriterInner::new(conf, timeline_id, tenant_shard_id, key_range, lsn, ctx)
|
||||
.await?,
|
||||
),
|
||||
})
|
||||
}
|
||||
@@ -1076,25 +1018,6 @@ impl ImageLayerWriter {
|
||||
self.inner.as_mut().unwrap().put_image(key, img, ctx).await
|
||||
}
|
||||
|
||||
///
|
||||
/// Write the next value to the file, as a raw header and data. This allows passing through a
|
||||
/// raw, potentially compressed image from a different layer file without recompressing it.
|
||||
///
|
||||
/// The page versions must be appended in blknum order.
|
||||
///
|
||||
pub async fn put_image_raw(
|
||||
&mut self,
|
||||
key: Key,
|
||||
raw_with_header: Bytes,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
self.inner
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
.put_image_raw(key, raw_with_header, ctx)
|
||||
.await
|
||||
}
|
||||
|
||||
/// Estimated size of the image layer.
|
||||
pub(crate) fn estimated_size(&self) -> u64 {
|
||||
let inner = self.inner.as_ref().unwrap();
|
||||
@@ -1228,7 +1151,7 @@ mod test {
|
||||
use crate::tenant::harness::{TIMELINE_ID, TenantHarness};
|
||||
use crate::tenant::storage_layer::{Layer, ResidentLayer};
|
||||
use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner;
|
||||
use crate::tenant::{TenantShard, Timeline};
|
||||
use crate::tenant::{Tenant, Timeline};
|
||||
|
||||
#[tokio::test]
|
||||
async fn image_layer_rewrite() {
|
||||
@@ -1269,7 +1192,7 @@ mod test {
|
||||
|
||||
// This key range contains several 0x8000 page stripes, only one of which belongs to shard zero
|
||||
let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
|
||||
let input_end = Key::from_hex("000000067f00000001000000ae0000002000").unwrap();
|
||||
let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap();
|
||||
let range = input_start..input_end;
|
||||
|
||||
// Build an image layer to filter
|
||||
@@ -1280,8 +1203,6 @@ mod test {
|
||||
harness.tenant_shard_id,
|
||||
&range,
|
||||
lsn,
|
||||
&timeline.gate,
|
||||
timeline.cancel.clone(),
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
@@ -1314,7 +1235,7 @@ mod test {
|
||||
let shard_identity = ShardIdentity::new(
|
||||
ShardNumber(shard_number),
|
||||
shard_count,
|
||||
ShardStripeSize(0x800),
|
||||
ShardStripeSize(0x8000),
|
||||
)
|
||||
.unwrap();
|
||||
let harness = TenantHarness::create_custom(
|
||||
@@ -1347,8 +1268,6 @@ mod test {
|
||||
harness.tenant_shard_id,
|
||||
&range,
|
||||
lsn,
|
||||
&timeline.gate,
|
||||
timeline.cancel.clone(),
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
@@ -1368,12 +1287,12 @@ mod test {
|
||||
|
||||
// This exact size and those below will need updating as/when the layer encoding changes, but
|
||||
// should be deterministic for a given version of the format, as we used no randomness generating the input.
|
||||
assert_eq!(original_size, 122880);
|
||||
assert_eq!(original_size, 1597440);
|
||||
|
||||
match shard_number {
|
||||
0 => {
|
||||
// We should have written out just one stripe for our shard identity
|
||||
assert_eq!(wrote_keys, 0x800);
|
||||
assert_eq!(wrote_keys, 0x8000);
|
||||
let replacement = replacement.unwrap();
|
||||
|
||||
// We should have dropped some of the data
|
||||
@@ -1381,7 +1300,7 @@ mod test {
|
||||
assert!(replacement.metadata().file_size > 0);
|
||||
|
||||
// Assert that we dropped ~3/4 of the data.
|
||||
assert_eq!(replacement.metadata().file_size, 49152);
|
||||
assert_eq!(replacement.metadata().file_size, 417792);
|
||||
}
|
||||
1 => {
|
||||
// Shard 1 has no keys in our input range
|
||||
@@ -1390,19 +1309,19 @@ mod test {
|
||||
}
|
||||
2 => {
|
||||
// Shard 2 has one stripe in the input range
|
||||
assert_eq!(wrote_keys, 0x800);
|
||||
assert_eq!(wrote_keys, 0x8000);
|
||||
let replacement = replacement.unwrap();
|
||||
assert!(replacement.metadata().file_size < original_size);
|
||||
assert!(replacement.metadata().file_size > 0);
|
||||
assert_eq!(replacement.metadata().file_size, 49152);
|
||||
assert_eq!(replacement.metadata().file_size, 417792);
|
||||
}
|
||||
3 => {
|
||||
// Shard 3 has two stripes in the input range
|
||||
assert_eq!(wrote_keys, 0x1000);
|
||||
assert_eq!(wrote_keys, 0x10000);
|
||||
let replacement = replacement.unwrap();
|
||||
assert!(replacement.metadata().file_size < original_size);
|
||||
assert!(replacement.metadata().file_size > 0);
|
||||
assert_eq!(replacement.metadata().file_size, 73728);
|
||||
assert_eq!(replacement.metadata().file_size, 811008);
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
@@ -1410,7 +1329,7 @@ mod test {
|
||||
}
|
||||
|
||||
async fn produce_image_layer(
|
||||
tenant: &TenantShard,
|
||||
tenant: &Tenant,
|
||||
tline: &Arc<Timeline>,
|
||||
mut images: Vec<(Key, Bytes)>,
|
||||
lsn: Lsn,
|
||||
@@ -1427,8 +1346,6 @@ mod test {
|
||||
tenant.tenant_shard_id,
|
||||
&key_range,
|
||||
lsn,
|
||||
&tline.gate,
|
||||
tline.cancel.clone(),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user