Compare commits

..

2 Commits

Author SHA1 Message Date
John Spray
ed3e3b6f61 pageserver: enable setting a target disk range 2023-10-25 14:39:12 +01:00
John Spray
098ef0956b pageserver: publish disk eviction status 2023-10-25 14:35:32 +01:00
222 changed files with 9612 additions and 12690 deletions

View File

@@ -22,11 +22,5 @@ platforms = [
# "x86_64-pc-windows-msvc", # "x86_64-pc-windows-msvc",
] ]
[final-excludes]
# vm_monitor benefits from the same Cargo.lock as the rest of our artifacts, but
# it is built primarly in separate repo neondatabase/autoscaling and thus is excluded
# from depending on workspace-hack because most of the dependencies are not used.
workspace-members = ["vm_monitor"]
# Write out exact versions rather than a semver range. (Defaults to false.) # Write out exact versions rather than a semver range. (Defaults to false.)
# exact-versions = true # exact-versions = true

View File

@@ -17,9 +17,8 @@ assignees: ''
## Implementation ideas ## Implementation ideas
```[tasklist] ## Tasks
### Tasks - [ ]
```
## Other related tasks and Epics ## Other related tasks and Epics

View File

@@ -5,6 +5,4 @@ self-hosted-runner:
- small - small
- us-east-2 - us-east-2
config-variables: config-variables:
- REMOTE_STORAGE_AZURE_CONTAINER
- REMOTE_STORAGE_AZURE_REGION
- SLACK_UPCOMING_RELEASE_CHANNEL_ID - SLACK_UPCOMING_RELEASE_CHANNEL_ID

View File

@@ -203,10 +203,6 @@ runs:
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
BASE_S3_URL: ${{ steps.generate-report.outputs.base-s3-url }} BASE_S3_URL: ${{ steps.generate-report.outputs.base-s3-url }}
run: | run: |
if [ ! -d "${WORKDIR}/report/data/test-cases" ]; then
exit 0
fi
export DATABASE_URL=${REGRESS_TEST_RESULT_CONNSTR_NEW} export DATABASE_URL=${REGRESS_TEST_RESULT_CONNSTR_NEW}
./scripts/pysync ./scripts/pysync

View File

@@ -340,11 +340,11 @@ jobs:
# Run separate tests for real Azure Blob Storage # Run separate tests for real Azure Blob Storage
# XXX: replace region with `eu-central-1`-like region # XXX: replace region with `eu-central-1`-like region
export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}" export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}"
export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}" export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}"
export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}" export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}" export REMOTE_STORAGE_AZURE_CONTAINER=neon-github-sandbox
export REMOTE_STORAGE_AZURE_REGION=eastus2
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure
@@ -433,7 +433,7 @@ jobs:
rerun_flaky: true rerun_flaky: true
pg_version: ${{ matrix.pg_version }} pg_version: ${{ matrix.pg_version }}
env: env:
TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
- name: Merge and upload coverage data - name: Merge and upload coverage data
@@ -468,7 +468,7 @@ jobs:
env: env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}" TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}"
# XXX: no coverage data handling here, since benchmarks are run on release builds, # XXX: no coverage data handling here, since benchmarks are run on release builds,
# while coverage is currently collected for the debug ones # while coverage is currently collected for the debug ones
@@ -723,7 +723,6 @@ jobs:
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
--context . --context .
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
--build-arg BUILD_TAG=${{ needs.tag.outputs.build-tag }}
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
--destination neondatabase/neon:${{needs.tag.outputs.build-tag}} --destination neondatabase/neon:${{needs.tag.outputs.build-tag}}
@@ -848,7 +847,7 @@ jobs:
run: run:
shell: sh -eu {0} shell: sh -eu {0}
env: env:
VM_BUILDER_VERSION: v0.18.5 VM_BUILDER_VERSION: v0.18.2
steps: steps:
- name: Checkout - name: Checkout

View File

@@ -2,7 +2,7 @@ name: Create Release Branch
on: on:
schedule: schedule:
- cron: '0 7 * * 5' - cron: '0 7 * * 2'
workflow_dispatch: workflow_dispatch:
jobs: jobs:

56
Cargo.lock generated
View File

@@ -170,12 +170,6 @@ dependencies = [
"backtrace", "backtrace",
] ]
[[package]]
name = "arc-swap"
version = "1.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bddcadddf5e9015d310179a59bb28c4d4b9920ad0f11e8e14dbadf654890c9a6"
[[package]] [[package]]
name = "archery" name = "archery"
version = "0.5.0" version = "0.5.0"
@@ -1615,6 +1609,16 @@ dependencies = [
"subtle", "subtle",
] ]
[[package]]
name = "ctor"
version = "0.1.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6d2301688392eb071b0bf1a37be05c469d3cc4dbbd95df672fe28ab021e6a096"
dependencies = [
"quote",
"syn 1.0.109",
]
[[package]] [[package]]
name = "ctr" name = "ctr"
version = "0.6.0" version = "0.6.0"
@@ -2710,10 +2714,11 @@ dependencies = [
[[package]] [[package]]
name = "log" name = "log"
version = "0.4.20" version = "0.4.17"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e"
dependencies = [ dependencies = [
"cfg-if",
"value-bag", "value-bag",
] ]
@@ -3556,7 +3561,7 @@ dependencies = [
[[package]] [[package]]
name = "postgres" name = "postgres"
version = "0.19.4" version = "0.19.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048" source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
dependencies = [ dependencies = [
"bytes", "bytes",
"fallible-iterator", "fallible-iterator",
@@ -3569,7 +3574,7 @@ dependencies = [
[[package]] [[package]]
name = "postgres-native-tls" name = "postgres-native-tls"
version = "0.5.0" version = "0.5.0"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048" source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
dependencies = [ dependencies = [
"native-tls", "native-tls",
"tokio", "tokio",
@@ -3580,7 +3585,7 @@ dependencies = [
[[package]] [[package]]
name = "postgres-protocol" name = "postgres-protocol"
version = "0.6.4" version = "0.6.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048" source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
dependencies = [ dependencies = [
"base64 0.20.0", "base64 0.20.0",
"byteorder", "byteorder",
@@ -3598,7 +3603,7 @@ dependencies = [
[[package]] [[package]]
name = "postgres-types" name = "postgres-types"
version = "0.2.4" version = "0.2.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048" source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
dependencies = [ dependencies = [
"bytes", "bytes",
"fallible-iterator", "fallible-iterator",
@@ -4064,7 +4069,6 @@ dependencies = [
"aws-config", "aws-config",
"aws-credential-types", "aws-credential-types",
"aws-sdk-s3", "aws-sdk-s3",
"aws-smithy-async",
"aws-smithy-http", "aws-smithy-http",
"aws-types", "aws-types",
"azure_core", "azure_core",
@@ -4426,7 +4430,6 @@ dependencies = [
"itertools", "itertools",
"pageserver", "pageserver",
"rand 0.8.5", "rand 0.8.5",
"remote_storage",
"reqwest", "reqwest",
"serde", "serde",
"serde_json", "serde_json",
@@ -4485,7 +4488,6 @@ dependencies = [
"tokio", "tokio",
"tokio-io-timeout", "tokio-io-timeout",
"tokio-postgres", "tokio-postgres",
"tokio-stream",
"toml_edit", "toml_edit",
"tracing", "tracing",
"url", "url",
@@ -4688,16 +4690,6 @@ dependencies = [
"serde_derive", "serde_derive",
] ]
[[package]]
name = "serde_assert"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eda563240c1288b044209be1f0d38bb4d15044fb3e00dc354fbc922ab4733e80"
dependencies = [
"hashbrown 0.13.2",
"serde",
]
[[package]] [[package]]
name = "serde_derive" name = "serde_derive"
version = "1.0.183" version = "1.0.183"
@@ -5415,7 +5407,7 @@ dependencies = [
[[package]] [[package]]
name = "tokio-postgres" name = "tokio-postgres"
version = "0.7.7" version = "0.7.7"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048" source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
dependencies = [ dependencies = [
"async-trait", "async-trait",
"byteorder", "byteorder",
@@ -5958,7 +5950,6 @@ name = "utils"
version = "0.1.0" version = "0.1.0"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"arc-swap",
"async-trait", "async-trait",
"bincode", "bincode",
"byteorder", "byteorder",
@@ -5985,7 +5976,6 @@ dependencies = [
"routerify", "routerify",
"sentry", "sentry",
"serde", "serde",
"serde_assert",
"serde_json", "serde_json",
"serde_with", "serde_with",
"signal-hook", "signal-hook",
@@ -6021,9 +6011,13 @@ checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d"
[[package]] [[package]]
name = "value-bag" name = "value-bag"
version = "1.4.2" version = "1.0.0-alpha.9"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4a72e1902dde2bd6441347de2b70b7f5d59bf157c6c62f0c44572607a1d55bbe" checksum = "2209b78d1249f7e6f3293657c9779fe31ced465df091bbd433a1cf88e916ec55"
dependencies = [
"ctor",
"version_check",
]
[[package]] [[package]]
name = "vcpkg" name = "vcpkg"
@@ -6056,6 +6050,7 @@ dependencies = [
"tokio-util", "tokio-util",
"tracing", "tracing",
"tracing-subscriber", "tracing-subscriber",
"workspace_hack",
] ]
[[package]] [[package]]
@@ -6483,7 +6478,6 @@ dependencies = [
"clap", "clap",
"clap_builder", "clap_builder",
"crossbeam-utils", "crossbeam-utils",
"dashmap",
"either", "either",
"fail", "fail",
"futures", "futures",

View File

@@ -36,7 +36,6 @@ license = "Apache-2.0"
## All dependency versions, used in the project ## All dependency versions, used in the project
[workspace.dependencies] [workspace.dependencies]
anyhow = { version = "1.0", features = ["backtrace"] } anyhow = { version = "1.0", features = ["backtrace"] }
arc-swap = "1.6"
async-compression = { version = "0.4.0", features = ["tokio", "gzip"] } async-compression = { version = "0.4.0", features = ["tokio", "gzip"] }
azure_core = "0.16" azure_core = "0.16"
azure_identity = "0.16" azure_identity = "0.16"
@@ -48,7 +47,6 @@ async-trait = "0.1"
aws-config = { version = "0.56", default-features = false, features=["rustls"] } aws-config = { version = "0.56", default-features = false, features=["rustls"] }
aws-sdk-s3 = "0.29" aws-sdk-s3 = "0.29"
aws-smithy-http = "0.56" aws-smithy-http = "0.56"
aws-smithy-async = { version = "0.56", default-features = false, features=["rt-tokio"] }
aws-credential-types = "0.56" aws-credential-types = "0.56"
aws-types = "0.56" aws-types = "0.56"
axum = { version = "0.6.20", features = ["ws"] } axum = { version = "0.6.20", features = ["ws"] }
@@ -67,7 +65,7 @@ comfy-table = "6.1"
const_format = "0.2" const_format = "0.2"
crc32c = "0.6" crc32c = "0.6"
crossbeam-utils = "0.8.5" crossbeam-utils = "0.8.5"
dashmap = { version = "5.5.0", features = ["raw-api"] } dashmap = "5.5.0"
either = "1.8" either = "1.8"
enum-map = "2.4.2" enum-map = "2.4.2"
enumset = "1.0.12" enumset = "1.0.12"
@@ -126,7 +124,6 @@ sentry = { version = "0.31", default-features = false, features = ["backtrace",
serde = { version = "1.0", features = ["derive"] } serde = { version = "1.0", features = ["derive"] }
serde_json = "1" serde_json = "1"
serde_with = "2.0" serde_with = "2.0"
serde_assert = "0.5.0"
sha2 = "0.10.2" sha2 = "0.10.2"
signal-hook = "0.3" signal-hook = "0.3"
smallvec = "1.11" smallvec = "1.11"
@@ -164,11 +161,11 @@ env_logger = "0.10"
log = "0.4" log = "0.4"
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" } postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" } postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" } postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" } postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" } tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
## Other git libraries ## Other git libraries
heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
@@ -205,7 +202,7 @@ tonic-build = "0.9"
# This is only needed for proxy's tests. # This is only needed for proxy's tests.
# TODO: we should probably fork `tokio-postgres-rustls` instead. # TODO: we should probably fork `tokio-postgres-rustls` instead.
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" } tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
################# Binary contents sections ################# Binary contents sections

View File

@@ -27,7 +27,6 @@ RUN set -e \
FROM $REPOSITORY/$IMAGE:$TAG AS build FROM $REPOSITORY/$IMAGE:$TAG AS build
WORKDIR /home/nonroot WORKDIR /home/nonroot
ARG GIT_VERSION=local ARG GIT_VERSION=local
ARG BUILD_TAG
# Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds. # Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds.
# Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations. # Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations.
@@ -79,9 +78,9 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pg_sni_router
COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin
COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/ COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/ COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/

View File

@@ -72,10 +72,6 @@ neon: postgres-headers walproposer-lib
# #
$(POSTGRES_INSTALL_DIR)/build/%/config.status: $(POSTGRES_INSTALL_DIR)/build/%/config.status:
+@echo "Configuring Postgres $* build" +@echo "Configuring Postgres $* build"
@test -s $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure || { \
echo "\nPostgres submodule not found in $(ROOT_PROJECT_DIR)/vendor/postgres-$*/, execute "; \
echo "'git submodule update --init --recursive --depth 2 --progress .' in project root.\n"; \
exit 1; }
mkdir -p $(POSTGRES_INSTALL_DIR)/build/$* mkdir -p $(POSTGRES_INSTALL_DIR)/build/$*
(cd $(POSTGRES_INSTALL_DIR)/build/$* && \ (cd $(POSTGRES_INSTALL_DIR)/build/$* && \
env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure \ env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure \

View File

@@ -156,7 +156,6 @@ fn main() -> Result<()> {
let path = Path::new(sp); let path = Path::new(sp);
let file = File::open(path)?; let file = File::open(path)?;
spec = Some(serde_json::from_reader(file)?); spec = Some(serde_json::from_reader(file)?);
live_config_allowed = true;
} else if let Some(id) = compute_id { } else if let Some(id) = compute_id {
if let Some(cp_base) = control_plane_uri { if let Some(cp_base) = control_plane_uri {
live_config_allowed = true; live_config_allowed = true;
@@ -278,26 +277,32 @@ fn main() -> Result<()> {
if #[cfg(target_os = "linux")] { if #[cfg(target_os = "linux")] {
use std::env; use std::env;
use tokio_util::sync::CancellationToken; use tokio_util::sync::CancellationToken;
let vm_monitor_addr = matches use tracing::warn;
.get_one::<String>("vm-monitor-addr") let vm_monitor_addr = matches.get_one::<String>("vm-monitor-addr");
.expect("--vm-monitor-addr should always be set because it has a default arg");
let file_cache_connstr = matches.get_one::<String>("filecache-connstr"); let file_cache_connstr = matches.get_one::<String>("filecache-connstr");
let cgroup = matches.get_one::<String>("cgroup"); let cgroup = matches.get_one::<String>("cgroup");
let file_cache_on_disk = matches.get_flag("file-cache-on-disk");
// Only make a runtime if we need to. // Only make a runtime if we need to.
// Note: it seems like you can make a runtime in an inner scope and // Note: it seems like you can make a runtime in an inner scope and
// if you start a task in it it won't be dropped. However, make it // if you start a task in it it won't be dropped. However, make it
// in the outermost scope just to be safe. // in the outermost scope just to be safe.
let rt = if env::var_os("AUTOSCALING").is_some() { let rt = match (env::var_os("AUTOSCALING"), vm_monitor_addr) {
Some( (None, None) => None,
(None, Some(_)) => {
warn!("--vm-monitor-addr option set but AUTOSCALING env var not present");
None
}
(Some(_), None) => {
panic!("AUTOSCALING env var present but --vm-monitor-addr option not set")
}
(Some(_), Some(_)) => Some(
tokio::runtime::Builder::new_multi_thread() tokio::runtime::Builder::new_multi_thread()
.worker_threads(4) .worker_threads(4)
.enable_all() .enable_all()
.build() .build()
.expect("failed to create tokio runtime for monitor") .expect("failed to create tokio runtime for monitor"),
) ),
} else {
None
}; };
// This token is used internally by the monitor to clean up all threads // This token is used internally by the monitor to clean up all threads
@@ -308,7 +313,8 @@ fn main() -> Result<()> {
Box::leak(Box::new(vm_monitor::Args { Box::leak(Box::new(vm_monitor::Args {
cgroup: cgroup.cloned(), cgroup: cgroup.cloned(),
pgconnstr: file_cache_connstr.cloned(), pgconnstr: file_cache_connstr.cloned(),
addr: vm_monitor_addr.clone(), addr: vm_monitor_addr.cloned().unwrap(),
file_cache_on_disk,
})), })),
token.clone(), token.clone(),
)) ))
@@ -480,8 +486,6 @@ fn cli() -> clap::Command {
.value_name("FILECACHE_CONNSTR"), .value_name("FILECACHE_CONNSTR"),
) )
.arg( .arg(
// DEPRECATED, NO LONGER DOES ANYTHING.
// See https://github.com/neondatabase/cloud/issues/7516
Arg::new("file-cache-on-disk") Arg::new("file-cache-on-disk")
.long("file-cache-on-disk") .long("file-cache-on-disk")
.action(clap::ArgAction::SetTrue), .action(clap::ArgAction::SetTrue),

View File

@@ -710,12 +710,8 @@ impl ComputeNode {
// `pg_ctl` for start / stop, so this just seems much easier to do as we already // `pg_ctl` for start / stop, so this just seems much easier to do as we already
// have opened connection to Postgres and superuser access. // have opened connection to Postgres and superuser access.
#[instrument(skip_all)] #[instrument(skip_all)]
fn pg_reload_conf(&self) -> Result<()> { fn pg_reload_conf(&self, client: &mut Client) -> Result<()> {
let pgctl_bin = Path::new(&self.pgbin).parent().unwrap().join("pg_ctl"); client.simple_query("SELECT pg_reload_conf()")?;
Command::new(pgctl_bin)
.args(["reload", "-D", &self.pgdata])
.output()
.expect("cannot run pg_ctl process");
Ok(()) Ok(())
} }
@@ -728,9 +724,9 @@ impl ComputeNode {
// Write new config // Write new config
let pgdata_path = Path::new(&self.pgdata); let pgdata_path = Path::new(&self.pgdata);
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec, None)?; config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec, None)?;
self.pg_reload_conf()?;
let mut client = Client::connect(self.connstr.as_str(), NoTls)?; let mut client = Client::connect(self.connstr.as_str(), NoTls)?;
self.pg_reload_conf(&mut client)?;
// Proceed with post-startup configuration. Note, that order of operations is important. // Proceed with post-startup configuration. Note, that order of operations is important.
// Disable DDL forwarding because control plane already knows about these roles/databases. // Disable DDL forwarding because control plane already knows about these roles/databases.

View File

@@ -78,7 +78,7 @@ use regex::Regex;
use remote_storage::*; use remote_storage::*;
use serde_json; use serde_json;
use std::io::Read; use std::io::Read;
use std::num::NonZeroUsize; use std::num::{NonZeroU32, NonZeroUsize};
use std::path::Path; use std::path::Path;
use std::str; use std::str;
use tar::Archive; use tar::Archive;
@@ -281,6 +281,8 @@ pub fn init_remote_storage(remote_ext_config: &str) -> anyhow::Result<GenericRem
max_keys_per_list_response: None, max_keys_per_list_response: None,
}; };
let config = RemoteStorageConfig { let config = RemoteStorageConfig {
max_concurrent_syncs: NonZeroUsize::new(100).expect("100 != 0"),
max_sync_errors: NonZeroU32::new(100).expect("100 != 0"),
storage: RemoteStorageKind::AwsS3(config), storage: RemoteStorageKind::AwsS3(config),
}; };
GenericRemoteStorage::from_config(&config) GenericRemoteStorage::from_config(&config)

View File

@@ -1,7 +1,7 @@
//!
//! Various tools and helpers to handle cluster / compute node (Postgres) //! Various tools and helpers to handle cluster / compute node (Postgres)
//! configuration. //! configuration.
#![deny(unsafe_code)] //!
#![deny(clippy::undocumented_unsafe_blocks)]
pub mod checker; pub mod checker;
pub mod config; pub mod config;
pub mod configurator; pub mod configurator;

View File

@@ -193,16 +193,11 @@ impl Escaping for PgIdent {
/// Build a list of existing Postgres roles /// Build a list of existing Postgres roles
pub fn get_existing_roles(xact: &mut Transaction<'_>) -> Result<Vec<Role>> { pub fn get_existing_roles(xact: &mut Transaction<'_>) -> Result<Vec<Role>> {
let postgres_roles = xact let postgres_roles = xact
.query( .query("SELECT rolname, rolpassword FROM pg_catalog.pg_authid", &[])?
"SELECT rolname, rolpassword, rolreplication, rolbypassrls FROM pg_catalog.pg_authid",
&[],
)?
.iter() .iter()
.map(|row| Role { .map(|row| Role {
name: row.get("rolname"), name: row.get("rolname"),
encrypted_password: row.get("rolpassword"), encrypted_password: row.get("rolpassword"),
replication: Some(row.get("rolreplication")),
bypassrls: Some(row.get("rolbypassrls")),
options: None, options: None,
}) })
.collect(); .collect();

View File

@@ -24,7 +24,7 @@ fn do_control_plane_request(
) -> Result<ControlPlaneSpecResponse, (bool, String)> { ) -> Result<ControlPlaneSpecResponse, (bool, String)> {
let resp = reqwest::blocking::Client::new() let resp = reqwest::blocking::Client::new()
.get(uri) .get(uri)
.header("Authorization", format!("Bearer {}", jwt)) .header("Authorization", jwt)
.send() .send()
.map_err(|e| { .map_err(|e| {
( (
@@ -68,7 +68,7 @@ pub fn get_spec_from_control_plane(
base_uri: &str, base_uri: &str,
compute_id: &str, compute_id: &str,
) -> Result<Option<ComputeSpec>> { ) -> Result<Option<ComputeSpec>> {
let cp_uri = format!("{base_uri}/compute/api/v2/computes/{compute_id}/spec"); let cp_uri = format!("{base_uri}/management/api/v2/computes/{compute_id}/spec");
let jwt: String = match std::env::var("NEON_CONTROL_PLANE_TOKEN") { let jwt: String = match std::env::var("NEON_CONTROL_PLANE_TOKEN") {
Ok(v) => v, Ok(v) => v,
Err(_) => "".to_string(), Err(_) => "".to_string(),
@@ -265,8 +265,6 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
let action = if let Some(r) = pg_role { let action = if let Some(r) = pg_role {
if (r.encrypted_password.is_none() && role.encrypted_password.is_some()) if (r.encrypted_password.is_none() && role.encrypted_password.is_some())
|| (r.encrypted_password.is_some() && role.encrypted_password.is_none()) || (r.encrypted_password.is_some() && role.encrypted_password.is_none())
|| !r.bypassrls.unwrap_or(false)
|| !r.replication.unwrap_or(false)
{ {
RoleAction::Update RoleAction::Update
} else if let Some(pg_pwd) = &r.encrypted_password { } else if let Some(pg_pwd) = &r.encrypted_password {
@@ -298,8 +296,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
match action { match action {
RoleAction::None => {} RoleAction::None => {}
RoleAction::Update => { RoleAction::Update => {
let mut query: String = let mut query: String = format!("ALTER ROLE {} ", name.pg_quote());
format!("ALTER ROLE {} BYPASSRLS REPLICATION", name.pg_quote());
query.push_str(&role.to_pg_options()); query.push_str(&role.to_pg_options());
xact.execute(query.as_str(), &[])?; xact.execute(query.as_str(), &[])?;
} }

View File

@@ -2,6 +2,7 @@ use crate::{background_process, local_env::LocalEnv};
use anyhow::anyhow; use anyhow::anyhow;
use camino::Utf8PathBuf; use camino::Utf8PathBuf;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use std::{path::PathBuf, process::Child}; use std::{path::PathBuf, process::Child};
use utils::id::{NodeId, TenantId}; use utils::id::{NodeId, TenantId};
@@ -13,10 +14,12 @@ pub struct AttachmentService {
const COMMAND: &str = "attachment_service"; const COMMAND: &str = "attachment_service";
#[serde_as]
#[derive(Serialize, Deserialize)] #[derive(Serialize, Deserialize)]
pub struct AttachHookRequest { pub struct AttachHookRequest {
#[serde_as(as = "DisplayFromStr")]
pub tenant_id: TenantId, pub tenant_id: TenantId,
pub node_id: Option<NodeId>, pub pageserver_id: Option<NodeId>,
} }
#[derive(Serialize, Deserialize)] #[derive(Serialize, Deserialize)]
@@ -82,7 +85,7 @@ impl AttachmentService {
.control_plane_api .control_plane_api
.clone() .clone()
.unwrap() .unwrap()
.join("attach-hook") .join("attach_hook")
.unwrap(); .unwrap();
let client = reqwest::blocking::ClientBuilder::new() let client = reqwest::blocking::ClientBuilder::new()
.build() .build()
@@ -90,7 +93,7 @@ impl AttachmentService {
let request = AttachHookRequest { let request = AttachHookRequest {
tenant_id, tenant_id,
node_id: Some(pageserver_id), pageserver_id: Some(pageserver_id),
}; };
let response = client.post(url).json(&request).send()?; let response = client.post(url).json(&request).send()?;

View File

@@ -262,7 +262,7 @@ where
P: Into<Utf8PathBuf>, P: Into<Utf8PathBuf>,
{ {
let path: Utf8PathBuf = path.into(); let path: Utf8PathBuf = path.into();
// SAFETY: // SAFETY
// pre_exec is marked unsafe because it runs between fork and exec. // pre_exec is marked unsafe because it runs between fork and exec.
// Why is that dangerous in various ways? // Why is that dangerous in various ways?
// Long answer: https://github.com/rust-lang/rust/issues/39575 // Long answer: https://github.com/rust-lang/rust/issues/39575

View File

@@ -12,7 +12,6 @@ use hyper::{Body, Request, Response};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
use std::{collections::HashMap, sync::Arc}; use std::{collections::HashMap, sync::Arc};
use utils::http::endpoint::request_span;
use utils::logging::{self, LogFormat}; use utils::logging::{self, LogFormat};
use utils::signals::{ShutdownSignals, Signal}; use utils::signals::{ShutdownSignals, Signal};
@@ -172,7 +171,7 @@ async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiE
state.generation += 1; state.generation += 1;
response.tenants.push(ReAttachResponseTenant { response.tenants.push(ReAttachResponseTenant {
id: *t, id: *t,
gen: state.generation, generation: state.generation,
}); });
} }
} }
@@ -218,31 +217,14 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
.tenants .tenants
.entry(attach_req.tenant_id) .entry(attach_req.tenant_id)
.or_insert_with(|| TenantState { .or_insert_with(|| TenantState {
pageserver: attach_req.node_id, pageserver: attach_req.pageserver_id,
generation: 0, generation: 0,
}); });
if let Some(attaching_pageserver) = attach_req.node_id.as_ref() { if attach_req.pageserver_id.is_some() {
tenant_state.generation += 1; tenant_state.generation += 1;
tracing::info!(
tenant_id = %attach_req.tenant_id,
ps_id = %attaching_pageserver,
generation = %tenant_state.generation,
"issuing",
);
} else if let Some(ps_id) = tenant_state.pageserver {
tracing::info!(
tenant_id = %attach_req.tenant_id,
%ps_id,
generation = %tenant_state.generation,
"dropping",
);
} else {
tracing::info!(
tenant_id = %attach_req.tenant_id,
"no-op: tenant already has no pageserver");
} }
tenant_state.pageserver = attach_req.node_id; tenant_state.pageserver = attach_req.pageserver_id;
let generation = tenant_state.generation; let generation = tenant_state.generation;
locked.save().await.map_err(ApiError::InternalServerError)?; locked.save().await.map_err(ApiError::InternalServerError)?;
@@ -250,7 +232,7 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
json_response( json_response(
StatusCode::OK, StatusCode::OK,
AttachHookResponse { AttachHookResponse {
gen: attach_req.node_id.map(|_| generation), gen: attach_req.pageserver_id.map(|_| generation),
}, },
) )
} }
@@ -258,9 +240,9 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
fn make_router(persistent_state: PersistentState) -> RouterBuilder<hyper::Body, ApiError> { fn make_router(persistent_state: PersistentState) -> RouterBuilder<hyper::Body, ApiError> {
endpoint::make_router() endpoint::make_router()
.data(Arc::new(State::new(persistent_state))) .data(Arc::new(State::new(persistent_state)))
.post("/re-attach", |r| request_span(r, handle_re_attach)) .post("/re-attach", handle_re_attach)
.post("/validate", |r| request_span(r, handle_validate)) .post("/validate", handle_validate)
.post("/attach-hook", |r| request_span(r, handle_attach_hook)) .post("/attach_hook", handle_attach_hook)
} }
#[tokio::main] #[tokio::main]

View File

@@ -798,24 +798,6 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
ep.start(&auth_token, safekeepers, remote_ext_config)?; ep.start(&auth_token, safekeepers, remote_ext_config)?;
} }
} }
"reconfigure" => {
let endpoint_id = sub_args
.get_one::<String>("endpoint_id")
.ok_or_else(|| anyhow!("No endpoint ID provided to reconfigure"))?;
let endpoint = cplane
.endpoints
.get(endpoint_id.as_str())
.with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
let pageserver_id =
if let Some(id_str) = sub_args.get_one::<String>("endpoint-pageserver-id") {
Some(NodeId(
id_str.parse().context("while parsing pageserver id")?,
))
} else {
None
};
endpoint.reconfigure(pageserver_id)?;
}
"stop" => { "stop" => {
let endpoint_id = sub_args let endpoint_id = sub_args
.get_one::<String>("endpoint_id") .get_one::<String>("endpoint_id")
@@ -1387,12 +1369,6 @@ fn cli() -> Command {
.arg(safekeepers_arg) .arg(safekeepers_arg)
.arg(remote_ext_config_args) .arg(remote_ext_config_args)
) )
.subcommand(Command::new("reconfigure")
.about("Reconfigure the endpoint")
.arg(endpoint_pageserver_id_arg)
.arg(endpoint_id_arg.clone())
.arg(tenant_id_arg.clone())
)
.subcommand( .subcommand(
Command::new("stop") Command::new("stop")
.arg(endpoint_id_arg) .arg(endpoint_id_arg)

View File

@@ -46,6 +46,7 @@ use std::time::Duration;
use anyhow::{anyhow, bail, Context, Result}; use anyhow::{anyhow, bail, Context, Result};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use utils::id::{NodeId, TenantId, TimelineId}; use utils::id::{NodeId, TenantId, TimelineId};
use crate::local_env::LocalEnv; use crate::local_env::LocalEnv;
@@ -56,10 +57,13 @@ use compute_api::responses::{ComputeState, ComputeStatus};
use compute_api::spec::{Cluster, ComputeMode, ComputeSpec}; use compute_api::spec::{Cluster, ComputeMode, ComputeSpec};
// contents of a endpoint.json file // contents of a endpoint.json file
#[serde_as]
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
pub struct EndpointConf { pub struct EndpointConf {
endpoint_id: String, endpoint_id: String,
#[serde_as(as = "DisplayFromStr")]
tenant_id: TenantId, tenant_id: TenantId,
#[serde_as(as = "DisplayFromStr")]
timeline_id: TimelineId, timeline_id: TimelineId,
mode: ComputeMode, mode: ComputeMode,
pg_port: u16, pg_port: u16,
@@ -410,32 +414,16 @@ impl Endpoint {
); );
} }
Ok(()) // Also wait for the compute_ctl process to die. It might have some cleanup
} // work to do after postgres stops, like syncing safekeepers, etc.
//
fn wait_for_compute_ctl_to_exit(&self) -> Result<()> {
// TODO use background_process::stop_process instead // TODO use background_process::stop_process instead
let pidfile_path = self.endpoint_path().join("compute_ctl.pid"); let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
let pid: u32 = std::fs::read_to_string(pidfile_path)?.parse()?; let pid: u32 = std::fs::read_to_string(pidfile_path)?.parse()?;
let pid = nix::unistd::Pid::from_raw(pid as i32); let pid = nix::unistd::Pid::from_raw(pid as i32);
crate::background_process::wait_until_stopped("compute_ctl", pid)?; crate::background_process::wait_until_stopped("compute_ctl", pid)?;
Ok(())
}
fn read_postgresql_conf(&self) -> Result<String> { Ok(())
// Slurp the endpoints/<endpoint id>/postgresql.conf file into
// memory. We will include it in the spec file that we pass to
// `compute_ctl`, and `compute_ctl` will write it to the postgresql.conf
// in the data directory.
let postgresql_conf_path = self.endpoint_path().join("postgresql.conf");
match std::fs::read(&postgresql_conf_path) {
Ok(content) => Ok(String::from_utf8(content)?),
Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok("".to_string()),
Err(e) => Err(anyhow::Error::new(e).context(format!(
"failed to read config file in {}",
postgresql_conf_path.to_str().unwrap()
))),
}
} }
pub fn start( pub fn start(
@@ -448,7 +436,21 @@ impl Endpoint {
anyhow::bail!("The endpoint is already running"); anyhow::bail!("The endpoint is already running");
} }
let postgresql_conf = self.read_postgresql_conf()?; // Slurp the endpoints/<endpoint id>/postgresql.conf file into
// memory. We will include it in the spec file that we pass to
// `compute_ctl`, and `compute_ctl` will write it to the postgresql.conf
// in the data directory.
let postgresql_conf_path = self.endpoint_path().join("postgresql.conf");
let postgresql_conf = match std::fs::read(&postgresql_conf_path) {
Ok(content) => String::from_utf8(content)?,
Err(e) if e.kind() == std::io::ErrorKind::NotFound => "".to_string(),
Err(e) => {
return Err(anyhow::Error::new(e).context(format!(
"failed to read config file in {}",
postgresql_conf_path.to_str().unwrap()
)))
}
};
// We always start the compute node from scratch, so if the Postgres // We always start the compute node from scratch, so if the Postgres
// data dir exists from a previous launch, remove it first. // data dir exists from a previous launch, remove it first.
@@ -619,61 +621,6 @@ impl Endpoint {
} }
} }
pub fn reconfigure(&self, pageserver_id: Option<NodeId>) -> Result<()> {
let mut spec: ComputeSpec = {
let spec_path = self.endpoint_path().join("spec.json");
let file = std::fs::File::open(spec_path)?;
serde_json::from_reader(file)?
};
let postgresql_conf = self.read_postgresql_conf()?;
spec.cluster.postgresql_conf = Some(postgresql_conf);
if let Some(pageserver_id) = pageserver_id {
let endpoint_config_path = self.endpoint_path().join("endpoint.json");
let mut endpoint_conf: EndpointConf = {
let file = std::fs::File::open(&endpoint_config_path)?;
serde_json::from_reader(file)?
};
endpoint_conf.pageserver_id = pageserver_id;
std::fs::write(
endpoint_config_path,
serde_json::to_string_pretty(&endpoint_conf)?,
)?;
let pageserver =
PageServerNode::from_env(&self.env, self.env.get_pageserver_conf(pageserver_id)?);
let ps_http_conf = &pageserver.pg_connection_config;
let (host, port) = (ps_http_conf.host(), ps_http_conf.port());
spec.pageserver_connstring = Some(format!("postgresql://no_user@{host}:{port}"));
}
let client = reqwest::blocking::Client::new();
let response = client
.post(format!(
"http://{}:{}/configure",
self.http_address.ip(),
self.http_address.port()
))
.body(format!(
"{{\"spec\":{}}}",
serde_json::to_string_pretty(&spec)?
))
.send()?;
let status = response.status();
if !(status.is_client_error() || status.is_server_error()) {
Ok(())
} else {
let url = response.url().to_owned();
let msg = match response.text() {
Ok(err_body) => format!("Error: {}", err_body),
Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
};
Err(anyhow::anyhow!(msg))
}
}
pub fn stop(&self, destroy: bool) -> Result<()> { pub fn stop(&self, destroy: bool) -> Result<()> {
// If we are going to destroy data directory, // If we are going to destroy data directory,
// use immediate shutdown mode, otherwise, // use immediate shutdown mode, otherwise,
@@ -682,25 +629,15 @@ impl Endpoint {
// Postgres is always started from scratch, so stop // Postgres is always started from scratch, so stop
// without destroy only used for testing and debugging. // without destroy only used for testing and debugging.
// //
self.pg_ctl(
if destroy {
&["-m", "immediate", "stop"]
} else {
&["stop"]
},
&None,
)?;
// Also wait for the compute_ctl process to die. It might have some cleanup
// work to do after postgres stops, like syncing safekeepers, etc.
//
self.wait_for_compute_ctl_to_exit()?;
if destroy { if destroy {
self.pg_ctl(&["-m", "immediate", "stop"], &None)?;
println!( println!(
"Destroying postgres data directory '{}'", "Destroying postgres data directory '{}'",
self.pgdata().to_str().unwrap() self.pgdata().to_str().unwrap()
); );
std::fs::remove_dir_all(self.endpoint_path())?; std::fs::remove_dir_all(self.endpoint_path())?;
} else {
self.pg_ctl(&["stop"], &None)?;
} }
Ok(()) Ok(())
} }

View File

@@ -1,10 +1,11 @@
//! Local control plane. //
//! // Local control plane.
//! Can start, configure and stop postgres instances running as a local processes. //
//! // Can start, configure and stop postgres instances running as a local processes.
//! Intended to be used in integration tests and in CLI tools for //
//! local installations. // Intended to be used in integration tests and in CLI tools for
#![deny(clippy::undocumented_unsafe_blocks)] // local installations.
//
pub mod attachment_service; pub mod attachment_service;
mod background_process; mod background_process;

View File

@@ -8,6 +8,7 @@ use anyhow::{bail, ensure, Context};
use postgres_backend::AuthType; use postgres_backend::AuthType;
use reqwest::Url; use reqwest::Url;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use std::collections::HashMap; use std::collections::HashMap;
use std::env; use std::env;
use std::fs; use std::fs;
@@ -32,6 +33,7 @@ pub const DEFAULT_PG_VERSION: u32 = 15;
// to 'neon_local init --config=<path>' option. See control_plane/simple.conf for // to 'neon_local init --config=<path>' option. See control_plane/simple.conf for
// an example. // an example.
// //
#[serde_as]
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
pub struct LocalEnv { pub struct LocalEnv {
// Base directory for all the nodes (the pageserver, safekeepers and // Base directory for all the nodes (the pageserver, safekeepers and
@@ -57,6 +59,7 @@ pub struct LocalEnv {
// Default tenant ID to use with the 'neon_local' command line utility, when // Default tenant ID to use with the 'neon_local' command line utility, when
// --tenant_id is not explicitly specified. // --tenant_id is not explicitly specified.
#[serde(default)] #[serde(default)]
#[serde_as(as = "Option<DisplayFromStr>")]
pub default_tenant_id: Option<TenantId>, pub default_tenant_id: Option<TenantId>,
// used to issue tokens during e.g pg start // used to issue tokens during e.g pg start
@@ -81,6 +84,7 @@ pub struct LocalEnv {
// A `HashMap<String, HashMap<TenantId, TimelineId>>` would be more appropriate here, // A `HashMap<String, HashMap<TenantId, TimelineId>>` would be more appropriate here,
// but deserialization into a generic toml object as `toml::Value::try_from` fails with an error. // but deserialization into a generic toml object as `toml::Value::try_from` fails with an error.
// https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table". // https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table".
#[serde_as(as = "HashMap<_, Vec<(DisplayFromStr, DisplayFromStr)>>")]
branch_name_mappings: HashMap<String, Vec<(TenantId, TimelineId)>>, branch_name_mappings: HashMap<String, Vec<(TenantId, TimelineId)>>,
} }

View File

@@ -1,108 +0,0 @@
# Updating Postgres
## Minor Versions
When upgrading to a new minor version of Postgres, please follow these steps:
_Example: 15.4 is the new minor version to upgrade to from 15.3._
1. Clone the Neon Postgres repository if you have not done so already.
```shell
git clone git@github.com:neondatabase/postgres.git
```
1. Add the Postgres upstream remote.
```shell
git remote add upstream https://git.postgresql.org/git/postgresql.git
```
1. Create a new branch based on the stable branch you are updating.
```shell
git checkout -b my-branch REL_15_STABLE_neon
```
1. Tag the last commit on the stable branch you are updating.
```shell
git tag REL_15_3_neon
```
1. Push the new tag to the Neon Postgres repository.
```shell
git push origin REL_15_3_neon
```
1. Find the release tags you're looking for. They are of the form `REL_X_Y`.
1. Rebase the branch you created on the tag and resolve any conflicts.
```shell
git fetch upstream REL_15_4
git rebase REL_15_4
```
1. Run the Postgres test suite to make sure our commits have not affected
Postgres in a negative way.
```shell
make check
# OR
meson test -C builddir
```
1. Push your branch to the Neon Postgres repository.
```shell
git push origin my-branch
```
1. Clone the Neon repository if you have not done so already.
```shell
git clone git@github.com:neondatabase/neon.git
```
1. Create a new branch.
1. Change the `revisions.json` file to point at the HEAD of your Postgres
branch.
1. Update the Git submodule.
```shell
git submodule set-branch --branch my-branch vendor/postgres-v15
git submodule update --remote vendor/postgres-v15
```
1. Run the Neon test suite to make sure that Neon is still good to go on this
minor Postgres release.
```shell
./scripts/poetry -k pg15
```
1. Commit your changes.
1. Create a pull request, and wait for CI to go green.
1. Force push the rebased Postgres branches into the Neon Postgres repository.
```shell
git push --force origin my-branch:REL_15_STABLE_neon
```
It may require disabling various branch protections.
1. Update your Neon PR to point at the branches.
```shell
git submodule set-branch --branch REL_15_STABLE_neon vendor/postgres-v15
git commit --amend --no-edit
git push --force origin
```
1. Merge the pull request after getting approval(s) and CI completion.

View File

@@ -1,5 +1,3 @@
#![deny(unsafe_code)]
#![deny(clippy::undocumented_unsafe_blocks)]
pub mod requests; pub mod requests;
pub mod responses; pub mod responses;
pub mod spec; pub mod spec;

View File

@@ -6,6 +6,7 @@
use std::collections::HashMap; use std::collections::HashMap;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use utils::id::{TenantId, TimelineId}; use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn; use utils::lsn::Lsn;
@@ -18,6 +19,7 @@ pub type PgIdent = String;
/// Cluster spec or configuration represented as an optional number of /// Cluster spec or configuration represented as an optional number of
/// delta operations + final cluster state description. /// delta operations + final cluster state description.
#[serde_as]
#[derive(Clone, Debug, Default, Deserialize, Serialize)] #[derive(Clone, Debug, Default, Deserialize, Serialize)]
pub struct ComputeSpec { pub struct ComputeSpec {
pub format_version: f32, pub format_version: f32,
@@ -48,12 +50,12 @@ pub struct ComputeSpec {
// these, and instead set the "neon.tenant_id", "neon.timeline_id", // these, and instead set the "neon.tenant_id", "neon.timeline_id",
// etc. GUCs in cluster.settings. TODO: Once the control plane has been // etc. GUCs in cluster.settings. TODO: Once the control plane has been
// updated to fill these fields, we can make these non optional. // updated to fill these fields, we can make these non optional.
#[serde_as(as = "Option<DisplayFromStr>")]
pub tenant_id: Option<TenantId>, pub tenant_id: Option<TenantId>,
#[serde_as(as = "Option<DisplayFromStr>")]
pub timeline_id: Option<TimelineId>, pub timeline_id: Option<TimelineId>,
#[serde_as(as = "Option<DisplayFromStr>")]
pub pageserver_connstring: Option<String>, pub pageserver_connstring: Option<String>,
#[serde(default)] #[serde(default)]
pub safekeeper_connstrings: Vec<String>, pub safekeeper_connstrings: Vec<String>,
@@ -138,13 +140,14 @@ impl RemoteExtSpec {
} }
} }
#[serde_as]
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Deserialize, Serialize)] #[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Deserialize, Serialize)]
pub enum ComputeMode { pub enum ComputeMode {
/// A read-write node /// A read-write node
#[default] #[default]
Primary, Primary,
/// A read-only node, pinned at a particular LSN /// A read-only node, pinned at a particular LSN
Static(Lsn), Static(#[serde_as(as = "DisplayFromStr")] Lsn),
/// A read-only node that follows the tip of the branch in hot standby mode /// A read-only node that follows the tip of the branch in hot standby mode
/// ///
/// Future versions may want to distinguish between replicas with hot standby /// Future versions may want to distinguish between replicas with hot standby
@@ -187,8 +190,6 @@ pub struct DeltaOp {
pub struct Role { pub struct Role {
pub name: PgIdent, pub name: PgIdent,
pub encrypted_password: Option<String>, pub encrypted_password: Option<String>,
pub replication: Option<bool>,
pub bypassrls: Option<bool>,
pub options: GenericOptions, pub options: GenericOptions,
} }

View File

@@ -1,6 +1,6 @@
//!
//! Shared code for consumption metics collection //! Shared code for consumption metics collection
#![deny(unsafe_code)] //!
#![deny(clippy::undocumented_unsafe_blocks)]
use chrono::{DateTime, Utc}; use chrono::{DateTime, Utc};
use rand::Rng; use rand::Rng;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};

View File

@@ -2,7 +2,6 @@
//! make sure that we use the same dep version everywhere. //! make sure that we use the same dep version everywhere.
//! Otherwise, we might not see all metrics registered via //! Otherwise, we might not see all metrics registered via
//! a default registry. //! a default registry.
#![deny(clippy::undocumented_unsafe_blocks)]
use once_cell::sync::Lazy; use once_cell::sync::Lazy;
use prometheus::core::{AtomicU64, Collector, GenericGauge, GenericGaugeVec}; use prometheus::core::{AtomicU64, Collector, GenericGauge, GenericGaugeVec};
pub use prometheus::opts; pub use prometheus::opts;
@@ -90,14 +89,14 @@ pub const DISK_WRITE_SECONDS_BUCKETS: &[f64] = &[
0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5, 0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5,
]; ];
pub fn set_build_info_metric(revision: &str, build_tag: &str) { pub fn set_build_info_metric(revision: &str) {
let metric = register_int_gauge_vec!( let metric = register_int_gauge_vec!(
"libmetrics_build_info", "libmetrics_build_info",
"Build/version information", "Build/version information",
&["revision", "build_tag"] &["revision"]
) )
.expect("Failed to register build info metric"); .expect("Failed to register build info metric");
metric.with_label_values(&[revision, build_tag]).set(1); metric.with_label_values(&[revision]).set(1);
} }
// Records I/O stats in a "cross-platform" way. // Records I/O stats in a "cross-platform" way.

View File

@@ -4,6 +4,7 @@
//! See docs/rfcs/025-generation-numbers.md //! See docs/rfcs/025-generation-numbers.md
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use utils::id::{NodeId, TenantId}; use utils::id::{NodeId, TenantId};
#[derive(Serialize, Deserialize)] #[derive(Serialize, Deserialize)]
@@ -11,10 +12,12 @@ pub struct ReAttachRequest {
pub node_id: NodeId, pub node_id: NodeId,
} }
#[serde_as]
#[derive(Serialize, Deserialize)] #[derive(Serialize, Deserialize)]
pub struct ReAttachResponseTenant { pub struct ReAttachResponseTenant {
#[serde_as(as = "DisplayFromStr")]
pub id: TenantId, pub id: TenantId,
pub gen: u32, pub generation: u32,
} }
#[derive(Serialize, Deserialize)] #[derive(Serialize, Deserialize)]
@@ -22,8 +25,10 @@ pub struct ReAttachResponse {
pub tenants: Vec<ReAttachResponseTenant>, pub tenants: Vec<ReAttachResponseTenant>,
} }
#[serde_as]
#[derive(Serialize, Deserialize)] #[derive(Serialize, Deserialize)]
pub struct ValidateRequestTenant { pub struct ValidateRequestTenant {
#[serde_as(as = "DisplayFromStr")]
pub id: TenantId, pub id: TenantId,
pub gen: u32, pub gen: u32,
} }
@@ -38,8 +43,10 @@ pub struct ValidateResponse {
pub tenants: Vec<ValidateResponseTenant>, pub tenants: Vec<ValidateResponseTenant>,
} }
#[serde_as]
#[derive(Serialize, Deserialize)] #[derive(Serialize, Deserialize)]
pub struct ValidateResponseTenant { pub struct ValidateResponseTenant {
#[serde_as(as = "DisplayFromStr")]
pub id: TenantId, pub id: TenantId,
pub valid: bool, pub valid: bool,
} }

View File

@@ -1,5 +1,3 @@
#![deny(unsafe_code)]
#![deny(clippy::undocumented_unsafe_blocks)]
use const_format::formatcp; use const_format::formatcp;
/// Public API types /// Public API types

View File

@@ -6,7 +6,7 @@ use std::{
use byteorder::{BigEndian, ReadBytesExt}; use byteorder::{BigEndian, ReadBytesExt};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use serde_with::serde_as; use serde_with::{serde_as, DisplayFromStr};
use strum_macros; use strum_macros;
use utils::{ use utils::{
completion, completion,
@@ -110,6 +110,7 @@ impl TenantState {
// So, return `Maybe` while Attaching, making Console wait for the attach task to finish. // So, return `Maybe` while Attaching, making Console wait for the attach task to finish.
Self::Attaching | Self::Activating(ActivatingFrom::Attaching) => Maybe, Self::Attaching | Self::Activating(ActivatingFrom::Attaching) => Maybe,
// tenant mgr startup distinguishes attaching from loading via marker file. // tenant mgr startup distinguishes attaching from loading via marker file.
// If it's loading, there is no attach marker file, i.e., attach had finished in the past.
Self::Loading | Self::Activating(ActivatingFrom::Loading) => Attached, Self::Loading | Self::Activating(ActivatingFrom::Loading) => Attached,
// We only reach Active after successful load / attach. // We only reach Active after successful load / attach.
// So, call atttachment status Attached. // So, call atttachment status Attached.
@@ -174,19 +175,25 @@ pub enum TimelineState {
Broken { reason: String, backtrace: String }, Broken { reason: String, backtrace: String },
} }
#[serde_as]
#[derive(Serialize, Deserialize)] #[derive(Serialize, Deserialize)]
pub struct TimelineCreateRequest { pub struct TimelineCreateRequest {
#[serde_as(as = "DisplayFromStr")]
pub new_timeline_id: TimelineId, pub new_timeline_id: TimelineId,
#[serde(default)] #[serde(default)]
#[serde_as(as = "Option<DisplayFromStr>")]
pub ancestor_timeline_id: Option<TimelineId>, pub ancestor_timeline_id: Option<TimelineId>,
#[serde(default)] #[serde(default)]
#[serde_as(as = "Option<DisplayFromStr>")]
pub ancestor_start_lsn: Option<Lsn>, pub ancestor_start_lsn: Option<Lsn>,
pub pg_version: Option<u32>, pub pg_version: Option<u32>,
} }
#[serde_as]
#[derive(Serialize, Deserialize, Debug)] #[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)] #[serde(deny_unknown_fields)]
pub struct TenantCreateRequest { pub struct TenantCreateRequest {
#[serde_as(as = "DisplayFromStr")]
pub new_tenant_id: TenantId, pub new_tenant_id: TenantId,
#[serde(default)] #[serde(default)]
#[serde(skip_serializing_if = "Option::is_none")] #[serde(skip_serializing_if = "Option::is_none")]
@@ -195,6 +202,7 @@ pub struct TenantCreateRequest {
pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
} }
#[serde_as]
#[derive(Deserialize, Debug)] #[derive(Deserialize, Debug)]
#[serde(deny_unknown_fields)] #[serde(deny_unknown_fields)]
pub struct TenantLoadRequest { pub struct TenantLoadRequest {
@@ -271,26 +279,31 @@ pub struct LocationConfig {
pub tenant_conf: TenantConfig, pub tenant_conf: TenantConfig,
} }
#[serde_as]
#[derive(Serialize, Deserialize)] #[derive(Serialize, Deserialize)]
#[serde(transparent)] #[serde(transparent)]
pub struct TenantCreateResponse(pub TenantId); pub struct TenantCreateResponse(#[serde_as(as = "DisplayFromStr")] pub TenantId);
#[derive(Serialize)] #[derive(Serialize)]
pub struct StatusResponse { pub struct StatusResponse {
pub id: NodeId, pub id: NodeId,
} }
#[serde_as]
#[derive(Serialize, Deserialize, Debug)] #[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)] #[serde(deny_unknown_fields)]
pub struct TenantLocationConfigRequest { pub struct TenantLocationConfigRequest {
#[serde_as(as = "DisplayFromStr")]
pub tenant_id: TenantId, pub tenant_id: TenantId,
#[serde(flatten)] #[serde(flatten)]
pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
} }
#[serde_as]
#[derive(Serialize, Deserialize, Debug)] #[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)] #[serde(deny_unknown_fields)]
pub struct TenantConfigRequest { pub struct TenantConfigRequest {
#[serde_as(as = "DisplayFromStr")]
pub tenant_id: TenantId, pub tenant_id: TenantId,
#[serde(flatten)] #[serde(flatten)]
pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
@@ -362,8 +375,10 @@ pub enum TenantAttachmentStatus {
Failed { reason: String }, Failed { reason: String },
} }
#[serde_as]
#[derive(Serialize, Deserialize, Clone)] #[derive(Serialize, Deserialize, Clone)]
pub struct TenantInfo { pub struct TenantInfo {
#[serde_as(as = "DisplayFromStr")]
pub id: TenantId, pub id: TenantId,
// NB: intentionally not part of OpenAPI, we don't want to commit to a specific set of TenantState's // NB: intentionally not part of OpenAPI, we don't want to commit to a specific set of TenantState's
pub state: TenantState, pub state: TenantState,
@@ -374,22 +389,33 @@ pub struct TenantInfo {
} }
/// This represents the output of the "timeline_detail" and "timeline_list" API calls. /// This represents the output of the "timeline_detail" and "timeline_list" API calls.
#[serde_as]
#[derive(Debug, Serialize, Deserialize, Clone)] #[derive(Debug, Serialize, Deserialize, Clone)]
pub struct TimelineInfo { pub struct TimelineInfo {
#[serde_as(as = "DisplayFromStr")]
pub tenant_id: TenantId, pub tenant_id: TenantId,
#[serde_as(as = "DisplayFromStr")]
pub timeline_id: TimelineId, pub timeline_id: TimelineId,
#[serde_as(as = "Option<DisplayFromStr>")]
pub ancestor_timeline_id: Option<TimelineId>, pub ancestor_timeline_id: Option<TimelineId>,
#[serde_as(as = "Option<DisplayFromStr>")]
pub ancestor_lsn: Option<Lsn>, pub ancestor_lsn: Option<Lsn>,
#[serde_as(as = "DisplayFromStr")]
pub last_record_lsn: Lsn, pub last_record_lsn: Lsn,
#[serde_as(as = "Option<DisplayFromStr>")]
pub prev_record_lsn: Option<Lsn>, pub prev_record_lsn: Option<Lsn>,
#[serde_as(as = "DisplayFromStr")]
pub latest_gc_cutoff_lsn: Lsn, pub latest_gc_cutoff_lsn: Lsn,
#[serde_as(as = "DisplayFromStr")]
pub disk_consistent_lsn: Lsn, pub disk_consistent_lsn: Lsn,
/// The LSN that we have succesfully uploaded to remote storage /// The LSN that we have succesfully uploaded to remote storage
#[serde_as(as = "DisplayFromStr")]
pub remote_consistent_lsn: Lsn, pub remote_consistent_lsn: Lsn,
/// The LSN that we are advertizing to safekeepers /// The LSN that we are advertizing to safekeepers
#[serde_as(as = "DisplayFromStr")]
pub remote_consistent_lsn_visible: Lsn, pub remote_consistent_lsn_visible: Lsn,
pub current_logical_size: Option<u64>, // is None when timeline is Unloaded pub current_logical_size: Option<u64>, // is None when timeline is Unloaded
@@ -401,6 +427,7 @@ pub struct TimelineInfo {
pub timeline_dir_layer_file_size_sum: Option<u64>, pub timeline_dir_layer_file_size_sum: Option<u64>,
pub wal_source_connstr: Option<String>, pub wal_source_connstr: Option<String>,
#[serde_as(as = "Option<DisplayFromStr>")]
pub last_received_msg_lsn: Option<Lsn>, pub last_received_msg_lsn: Option<Lsn>,
/// the timestamp (in microseconds) of the last received message /// the timestamp (in microseconds) of the last received message
pub last_received_msg_ts: Option<u128>, pub last_received_msg_ts: Option<u128>,
@@ -497,13 +524,23 @@ pub struct LayerAccessStats {
pub residence_events_history: HistoryBufferWithDropCounter<LayerResidenceEvent, 16>, pub residence_events_history: HistoryBufferWithDropCounter<LayerResidenceEvent, 16>,
} }
#[serde_as]
#[derive(Debug, Clone, Serialize)] #[derive(Debug, Clone, Serialize)]
#[serde(tag = "kind")] #[serde(tag = "kind")]
pub enum InMemoryLayerInfo { pub enum InMemoryLayerInfo {
Open { lsn_start: Lsn }, Open {
Frozen { lsn_start: Lsn, lsn_end: Lsn }, #[serde_as(as = "DisplayFromStr")]
lsn_start: Lsn,
},
Frozen {
#[serde_as(as = "DisplayFromStr")]
lsn_start: Lsn,
#[serde_as(as = "DisplayFromStr")]
lsn_end: Lsn,
},
} }
#[serde_as]
#[derive(Debug, Clone, Serialize)] #[derive(Debug, Clone, Serialize)]
#[serde(tag = "kind")] #[serde(tag = "kind")]
pub enum HistoricLayerInfo { pub enum HistoricLayerInfo {
@@ -511,7 +548,9 @@ pub enum HistoricLayerInfo {
layer_file_name: String, layer_file_name: String,
layer_file_size: u64, layer_file_size: u64,
#[serde_as(as = "DisplayFromStr")]
lsn_start: Lsn, lsn_start: Lsn,
#[serde_as(as = "DisplayFromStr")]
lsn_end: Lsn, lsn_end: Lsn,
remote: bool, remote: bool,
access_stats: LayerAccessStats, access_stats: LayerAccessStats,
@@ -520,6 +559,7 @@ pub enum HistoricLayerInfo {
layer_file_name: String, layer_file_name: String,
layer_file_size: u64, layer_file_size: u64,
#[serde_as(as = "DisplayFromStr")]
lsn_start: Lsn, lsn_start: Lsn,
remote: bool, remote: bool,
access_stats: LayerAccessStats, access_stats: LayerAccessStats,

View File

@@ -2,8 +2,6 @@
//! To use, create PostgresBackend and run() it, passing the Handler //! To use, create PostgresBackend and run() it, passing the Handler
//! implementation determining how to process the queries. Currently its API //! implementation determining how to process the queries. Currently its API
//! is rather narrow, but we can extend it once required. //! is rather narrow, but we can extend it once required.
#![deny(unsafe_code)]
#![deny(clippy::undocumented_unsafe_blocks)]
use anyhow::Context; use anyhow::Context;
use bytes::Bytes; use bytes::Bytes;
use futures::pin_mut; use futures::pin_mut;
@@ -17,7 +15,7 @@ use std::{fmt, io};
use std::{future::Future, str::FromStr}; use std::{future::Future, str::FromStr};
use tokio::io::{AsyncRead, AsyncWrite}; use tokio::io::{AsyncRead, AsyncWrite};
use tokio_rustls::TlsAcceptor; use tokio_rustls::TlsAcceptor;
use tracing::{debug, error, info, trace, warn}; use tracing::{debug, error, info, trace};
use pq_proto::framed::{ConnectionError, Framed, FramedReader, FramedWriter}; use pq_proto::framed::{ConnectionError, Framed, FramedReader, FramedWriter};
use pq_proto::{ use pq_proto::{
@@ -35,11 +33,6 @@ pub enum QueryError {
/// We were instructed to shutdown while processing the query /// We were instructed to shutdown while processing the query
#[error("Shutting down")] #[error("Shutting down")]
Shutdown, Shutdown,
/// Authentication failure
#[error("Unauthorized: {0}")]
Unauthorized(std::borrow::Cow<'static, str>),
#[error("Simulated Connection Error")]
SimulatedConnectionError,
/// Some other error /// Some other error
#[error(transparent)] #[error(transparent)]
Other(#[from] anyhow::Error), Other(#[from] anyhow::Error),
@@ -54,9 +47,8 @@ impl From<io::Error> for QueryError {
impl QueryError { impl QueryError {
pub fn pg_error_code(&self) -> &'static [u8; 5] { pub fn pg_error_code(&self) -> &'static [u8; 5] {
match self { match self {
Self::Disconnected(_) | Self::SimulatedConnectionError => b"08006", // connection failure Self::Disconnected(_) => b"08006", // connection failure
Self::Shutdown => SQLSTATE_ADMIN_SHUTDOWN, Self::Shutdown => SQLSTATE_ADMIN_SHUTDOWN,
Self::Unauthorized(_) => SQLSTATE_INTERNAL_ERROR,
Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error
} }
} }
@@ -250,7 +242,6 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> MaybeWriteOnly<IO> {
} }
} }
/// Cancellation safe as long as the underlying IO is cancellation safe.
async fn shutdown(&mut self) -> io::Result<()> { async fn shutdown(&mut self) -> io::Result<()> {
match self { match self {
MaybeWriteOnly::Full(framed) => framed.shutdown().await, MaybeWriteOnly::Full(framed) => framed.shutdown().await,
@@ -402,23 +393,13 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
shutdown_watcher: F, shutdown_watcher: F,
) -> Result<(), QueryError> ) -> Result<(), QueryError>
where where
F: Fn() -> S + Clone, F: Fn() -> S,
S: Future, S: Future,
{ {
let ret = self let ret = self.run_message_loop(handler, shutdown_watcher).await;
.run_message_loop(handler, shutdown_watcher.clone()) // socket might be already closed, e.g. if previously received error,
.await; // so ignore result.
self.framed.shutdown().await.ok();
tokio::select! {
_ = shutdown_watcher() => {
// do nothing; we most likely got already stopped by shutdown and will log it next.
}
_ = self.framed.shutdown() => {
// socket might be already closed, e.g. if previously received error,
// so ignore result.
},
}
match ret { match ret {
Ok(()) => Ok(()), Ok(()) => Ok(()),
Err(QueryError::Shutdown) => { Err(QueryError::Shutdown) => {
@@ -616,7 +597,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
if let Err(e) = handler.check_auth_jwt(self, jwt_response) { if let Err(e) = handler.check_auth_jwt(self, jwt_response) {
self.write_message_noflush(&BeMessage::ErrorResponse( self.write_message_noflush(&BeMessage::ErrorResponse(
&short_error(&e), &e.to_string(),
Some(e.pg_error_code()), Some(e.pg_error_code()),
))?; ))?;
return Err(e); return Err(e);
@@ -736,20 +717,12 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
trace!("got query {query_string:?}"); trace!("got query {query_string:?}");
if let Err(e) = handler.process_query(self, query_string).await { if let Err(e) = handler.process_query(self, query_string).await {
match e { log_query_error(query_string, &e);
QueryError::Shutdown => return Ok(ProcessMsgResult::Break), let short_error = short_error(&e);
QueryError::SimulatedConnectionError => { self.write_message_noflush(&BeMessage::ErrorResponse(
return Err(QueryError::SimulatedConnectionError) &short_error,
} Some(e.pg_error_code()),
e => { ))?;
log_query_error(query_string, &e);
let short_error = short_error(&e);
self.write_message_noflush(&BeMessage::ErrorResponse(
&short_error,
Some(e.pg_error_code()),
))?;
}
}
} }
self.write_message_noflush(&BeMessage::ReadyForQuery)?; self.write_message_noflush(&BeMessage::ReadyForQuery)?;
} }
@@ -975,8 +948,6 @@ pub fn short_error(e: &QueryError) -> String {
match e { match e {
QueryError::Disconnected(connection_error) => connection_error.to_string(), QueryError::Disconnected(connection_error) => connection_error.to_string(),
QueryError::Shutdown => "shutdown".to_string(), QueryError::Shutdown => "shutdown".to_string(),
QueryError::Unauthorized(_e) => "JWT authentication error".to_string(),
QueryError::SimulatedConnectionError => "simulated connection error".to_string(),
QueryError::Other(e) => format!("{e:#}"), QueryError::Other(e) => format!("{e:#}"),
} }
} }
@@ -993,15 +964,9 @@ fn log_query_error(query: &str, e: &QueryError) {
QueryError::Disconnected(other_connection_error) => { QueryError::Disconnected(other_connection_error) => {
error!("query handler for '{query}' failed with connection error: {other_connection_error:?}") error!("query handler for '{query}' failed with connection error: {other_connection_error:?}")
} }
QueryError::SimulatedConnectionError => {
error!("query handler for query '{query}' failed due to a simulated connection error")
}
QueryError::Shutdown => { QueryError::Shutdown => {
info!("query handler for '{query}' cancelled during tenant shutdown") info!("query handler for '{query}' cancelled during tenant shutdown")
} }
QueryError::Unauthorized(e) => {
warn!("query handler for '{query}' failed with authentication error: {e}");
}
QueryError::Other(e) => { QueryError::Other(e) => {
error!("query handler for '{query}' failed: {e:?}"); error!("query handler for '{query}' failed: {e:?}");
} }

View File

@@ -1,5 +1,3 @@
#![deny(unsafe_code)]
#![deny(clippy::undocumented_unsafe_blocks)]
use anyhow::{bail, Context}; use anyhow::{bail, Context};
use itertools::Itertools; use itertools::Itertools;
use std::borrow::Cow; use std::borrow::Cow;

View File

@@ -8,7 +8,6 @@
// modules included with the postgres_ffi macro depend on the types of the specific version's // modules included with the postgres_ffi macro depend on the types of the specific version's
// types, and trigger a too eager lint. // types, and trigger a too eager lint.
#![allow(clippy::duplicate_mod)] #![allow(clippy::duplicate_mod)]
#![deny(clippy::undocumented_unsafe_blocks)]
use bytes::Bytes; use bytes::Bytes;
use utils::bin_ser::SerializeError; use utils::bin_ser::SerializeError;
@@ -21,7 +20,6 @@ macro_rules! postgres_ffi {
pub mod bindings { pub mod bindings {
// bindgen generates bindings for a lot of stuff we don't need // bindgen generates bindings for a lot of stuff we don't need
#![allow(dead_code)] #![allow(dead_code)]
#![allow(clippy::undocumented_unsafe_blocks)]
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
include!(concat!( include!(concat!(

View File

@@ -14,7 +14,6 @@ macro_rules! xlog_utils_test {
($version:ident) => { ($version:ident) => {
#[path = "."] #[path = "."]
mod $version { mod $version {
#[allow(unused_imports)]
pub use postgres_ffi::$version::wal_craft_test_export::*; pub use postgres_ffi::$version::wal_craft_test_export::*;
#[allow(clippy::duplicate_mod)] #[allow(clippy::duplicate_mod)]
#[cfg(test)] #[cfg(test)]

View File

@@ -214,24 +214,27 @@ where
} }
} }
/// Cancellation safe as long as the AsyncWrite is cancellation safe.
async fn flush<S: AsyncWrite + Unpin>( async fn flush<S: AsyncWrite + Unpin>(
stream: &mut S, stream: &mut S,
write_buf: &mut BytesMut, write_buf: &mut BytesMut,
) -> Result<(), io::Error> { ) -> Result<(), io::Error> {
while write_buf.has_remaining() { while write_buf.has_remaining() {
let bytes_written = stream.write_buf(write_buf).await?; let bytes_written = stream.write(write_buf.chunk()).await?;
if bytes_written == 0 { if bytes_written == 0 {
return Err(io::Error::new( return Err(io::Error::new(
ErrorKind::WriteZero, ErrorKind::WriteZero,
"failed to write message", "failed to write message",
)); ));
} }
// The advanced part will be garbage collected, likely during shifting
// data left on next attempt to write to buffer when free space is not
// enough.
write_buf.advance(bytes_written);
} }
write_buf.clear();
stream.flush().await stream.flush().await
} }
/// Cancellation safe as long as the AsyncWrite is cancellation safe.
async fn shutdown<S: AsyncWrite + Unpin>( async fn shutdown<S: AsyncWrite + Unpin>(
stream: &mut S, stream: &mut S,
write_buf: &mut BytesMut, write_buf: &mut BytesMut,

View File

@@ -1,7 +1,6 @@
//! Postgres protocol messages serialization-deserialization. See //! Postgres protocol messages serialization-deserialization. See
//! <https://www.postgresql.org/docs/devel/protocol-message-formats.html> //! <https://www.postgresql.org/docs/devel/protocol-message-formats.html>
//! on message formats. //! on message formats.
#![deny(clippy::undocumented_unsafe_blocks)]
pub mod framed; pub mod framed;

View File

@@ -8,7 +8,6 @@ license.workspace = true
anyhow.workspace = true anyhow.workspace = true
async-trait.workspace = true async-trait.workspace = true
once_cell.workspace = true once_cell.workspace = true
aws-smithy-async.workspace = true
aws-smithy-http.workspace = true aws-smithy-http.workspace = true
aws-types.workspace = true aws-types.workspace = true
aws-config.workspace = true aws-config.workspace = true

View File

@@ -1,18 +1,21 @@
//! Azure Blob Storage wrapper //! Azure Blob Storage wrapper
use std::collections::HashMap;
use std::env; use std::env;
use std::num::NonZeroU32; use std::num::NonZeroU32;
use std::sync::Arc; use std::sync::Arc;
use std::{borrow::Cow, io::Cursor}; use std::{borrow::Cow, collections::HashMap, io::Cursor};
use super::REMOTE_STORAGE_PREFIX_SEPARATOR; use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
use anyhow::Result; use anyhow::Result;
use azure_core::request_options::{MaxResults, Metadata, Range}; use azure_core::request_options::{MaxResults, Metadata, Range};
use azure_core::Header;
use azure_identity::DefaultAzureCredential; use azure_identity::DefaultAzureCredential;
use azure_storage::StorageCredentials; use azure_storage::StorageCredentials;
use azure_storage_blobs::prelude::ClientBuilder; use azure_storage_blobs::prelude::ClientBuilder;
use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient}; use azure_storage_blobs::{
blob::operations::GetBlobBuilder,
prelude::{BlobClient, ContainerClient},
};
use futures_util::StreamExt; use futures_util::StreamExt;
use http_types::StatusCode; use http_types::StatusCode;
use tokio::io::AsyncRead; use tokio::io::AsyncRead;
@@ -20,8 +23,8 @@ use tracing::debug;
use crate::s3_bucket::RequestKind; use crate::s3_bucket::RequestKind;
use crate::{ use crate::{
AzureConfig, ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, AzureConfig, ConcurrencyLimiter, Download, DownloadError, RemotePath, RemoteStorage,
RemoteStorage, StorageMetadata, StorageMetadata,
}; };
pub struct AzureBlobStorage { pub struct AzureBlobStorage {
@@ -109,19 +112,16 @@ impl AzureBlobStorage {
async fn download_for_builder( async fn download_for_builder(
&self, &self,
metadata: StorageMetadata,
builder: GetBlobBuilder, builder: GetBlobBuilder,
) -> Result<Download, DownloadError> { ) -> Result<Download, DownloadError> {
let mut response = builder.into_stream(); let mut response = builder.into_stream();
let mut metadata = HashMap::new();
// TODO give proper streaming response instead of buffering into RAM // TODO give proper streaming response instead of buffering into RAM
// https://github.com/neondatabase/neon/issues/5563 // https://github.com/neondatabase/neon/issues/5563
let mut buf = Vec::new(); let mut buf = Vec::new();
while let Some(part) = response.next().await { while let Some(part) = response.next().await {
let part = part.map_err(to_download_error)?; let part = part.map_err(to_download_error)?;
if let Some(blob_meta) = part.blob.metadata {
metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned())));
}
let data = part let data = part
.data .data
.collect() .collect()
@@ -131,9 +131,28 @@ impl AzureBlobStorage {
} }
Ok(Download { Ok(Download {
download_stream: Box::pin(Cursor::new(buf)), download_stream: Box::pin(Cursor::new(buf)),
metadata: Some(StorageMetadata(metadata)), metadata: Some(metadata),
}) })
} }
// TODO get rid of this function once we have metadata included in the response
// https://github.com/Azure/azure-sdk-for-rust/issues/1439
async fn get_metadata(
&self,
blob_client: &BlobClient,
) -> Result<StorageMetadata, DownloadError> {
let builder = blob_client.get_metadata();
let response = builder.into_future().await.map_err(to_download_error)?;
let mut map = HashMap::new();
for md in response.metadata.iter() {
map.insert(
md.name().as_str().to_string(),
md.value().as_str().to_string(),
);
}
Ok(StorageMetadata(map))
}
async fn permit(&self, kind: RequestKind) -> tokio::sync::SemaphorePermit<'_> { async fn permit(&self, kind: RequestKind) -> tokio::sync::SemaphorePermit<'_> {
self.concurrency_limiter self.concurrency_limiter
@@ -165,11 +184,10 @@ fn to_download_error(error: azure_core::Error) -> DownloadError {
#[async_trait::async_trait] #[async_trait::async_trait]
impl RemoteStorage for AzureBlobStorage { impl RemoteStorage for AzureBlobStorage {
async fn list( async fn list_prefixes(
&self, &self,
prefix: Option<&RemotePath>, prefix: Option<&RemotePath>,
mode: ListingMode, ) -> Result<Vec<RemotePath>, DownloadError> {
) -> anyhow::Result<Listing, DownloadError> {
// get the passed prefix or if it is not set use prefix_in_bucket value // get the passed prefix or if it is not set use prefix_in_bucket value
let list_prefix = prefix let list_prefix = prefix
.map(|p| self.relative_path_to_name(p)) .map(|p| self.relative_path_to_name(p))
@@ -177,19 +195,16 @@ impl RemoteStorage for AzureBlobStorage {
.map(|mut p| { .map(|mut p| {
// required to end with a separator // required to end with a separator
// otherwise request will return only the entry of a prefix // otherwise request will return only the entry of a prefix
if matches!(mode, ListingMode::WithDelimiter) if !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
&& !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
{
p.push(REMOTE_STORAGE_PREFIX_SEPARATOR); p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
} }
p p
}); });
let mut builder = self.client.list_blobs(); let mut builder = self
.client
if let ListingMode::WithDelimiter = mode { .list_blobs()
builder = builder.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string()); .delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
}
if let Some(prefix) = list_prefix { if let Some(prefix) = list_prefix {
builder = builder.prefix(Cow::from(prefix.to_owned())); builder = builder.prefix(Cow::from(prefix.to_owned()));
@@ -200,23 +215,46 @@ impl RemoteStorage for AzureBlobStorage {
} }
let mut response = builder.into_stream(); let mut response = builder.into_stream();
let mut res = Listing::default(); let mut res = Vec::new();
while let Some(l) = response.next().await { while let Some(entry) = response.next().await {
let entry = l.map_err(to_download_error)?; let entry = entry.map_err(to_download_error)?;
let prefix_iter = entry let name_iter = entry
.blobs .blobs
.prefixes() .prefixes()
.map(|prefix| self.name_to_relative_path(&prefix.name)); .map(|prefix| self.name_to_relative_path(&prefix.name));
res.prefixes.extend(prefix_iter); res.extend(name_iter);
let blob_iter = entry
.blobs
.blobs()
.map(|k| self.name_to_relative_path(&k.name));
res.keys.extend(blob_iter);
} }
Ok(res) Ok(res)
} }
async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
let folder_name = folder
.map(|p| self.relative_path_to_name(p))
.or_else(|| self.prefix_in_container.clone());
let mut builder = self.client.list_blobs();
if let Some(folder_name) = folder_name {
builder = builder.prefix(Cow::from(folder_name.to_owned()));
}
if let Some(limit) = self.max_keys_per_list_response {
builder = builder.max_results(MaxResults::new(limit));
}
let mut response = builder.into_stream();
let mut res = Vec::new();
while let Some(l) = response.next().await {
let entry = l.map_err(anyhow::Error::new)?;
let name_iter = entry
.blobs
.blobs()
.map(|bl| self.name_to_relative_path(&bl.name));
res.extend(name_iter);
}
Ok(res)
}
async fn upload( async fn upload(
&self, &self,
mut from: impl AsyncRead + Unpin + Send + Sync + 'static, mut from: impl AsyncRead + Unpin + Send + Sync + 'static,
@@ -250,9 +288,11 @@ impl RemoteStorage for AzureBlobStorage {
let _permit = self.permit(RequestKind::Get).await; let _permit = self.permit(RequestKind::Get).await;
let blob_client = self.client.blob_client(self.relative_path_to_name(from)); let blob_client = self.client.blob_client(self.relative_path_to_name(from));
let metadata = self.get_metadata(&blob_client).await?;
let builder = blob_client.get(); let builder = blob_client.get();
self.download_for_builder(builder).await self.download_for_builder(metadata, builder).await
} }
async fn download_byte_range( async fn download_byte_range(
@@ -264,6 +304,8 @@ impl RemoteStorage for AzureBlobStorage {
let _permit = self.permit(RequestKind::Get).await; let _permit = self.permit(RequestKind::Get).await;
let blob_client = self.client.blob_client(self.relative_path_to_name(from)); let blob_client = self.client.blob_client(self.relative_path_to_name(from));
let metadata = self.get_metadata(&blob_client).await?;
let mut builder = blob_client.get(); let mut builder = blob_client.get();
if let Some(end_exclusive) = end_exclusive { if let Some(end_exclusive) = end_exclusive {
@@ -278,7 +320,7 @@ impl RemoteStorage for AzureBlobStorage {
builder = builder.range(Range::new(start_inclusive, end_exclusive)); builder = builder.range(Range::new(start_inclusive, end_exclusive));
} }
self.download_for_builder(builder).await self.download_for_builder(metadata, builder).await
} }
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> { async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {

View File

@@ -6,15 +6,19 @@
//! * [`s3_bucket`] uses AWS S3 bucket as an external storage //! * [`s3_bucket`] uses AWS S3 bucket as an external storage
//! * [`azure_blob`] allows to use Azure Blob storage as an external storage //! * [`azure_blob`] allows to use Azure Blob storage as an external storage
//! //!
#![deny(unsafe_code)]
#![deny(clippy::undocumented_unsafe_blocks)]
mod azure_blob; mod azure_blob;
mod local_fs; mod local_fs;
mod s3_bucket; mod s3_bucket;
mod simulate_failures; mod simulate_failures;
use std::{collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::Arc}; use std::{
collections::HashMap,
fmt::Debug,
num::{NonZeroU32, NonZeroUsize},
pin::Pin,
sync::Arc,
};
use anyhow::{bail, Context}; use anyhow::{bail, Context};
use camino::{Utf8Path, Utf8PathBuf}; use camino::{Utf8Path, Utf8PathBuf};
@@ -30,6 +34,12 @@ pub use self::{
}; };
use s3_bucket::RequestKind; use s3_bucket::RequestKind;
/// How many different timelines can be processed simultaneously when synchronizing layers with the remote storage.
/// During regular work, pageserver produces one layer file per timeline checkpoint, with bursts of concurrency
/// during start (where local and remote timelines are compared and initial sync tasks are scheduled) and timeline attach.
/// Both cases may trigger timeline download, that might download a lot of layers. This concurrency is limited by the clients internally, if needed.
pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS: usize = 50;
pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
/// Currently, sync happens with AWS S3, that has two limits on requests per second: /// Currently, sync happens with AWS S3, that has two limits on requests per second:
/// ~200 RPS for IAM services /// ~200 RPS for IAM services
/// <https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html> /// <https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html>
@@ -119,22 +129,6 @@ impl RemotePath {
} }
} }
/// We don't need callers to be able to pass arbitrary delimiters: just control
/// whether listings will use a '/' separator or not.
///
/// The WithDelimiter mode will populate `prefixes` and `keys` in the result. The
/// NoDelimiter mode will only populate `keys`.
pub enum ListingMode {
WithDelimiter,
NoDelimiter,
}
#[derive(Default)]
pub struct Listing {
pub prefixes: Vec<RemotePath>,
pub keys: Vec<RemotePath>,
}
/// Storage (potentially remote) API to manage its state. /// Storage (potentially remote) API to manage its state.
/// This storage tries to be unaware of any layered repository context, /// This storage tries to be unaware of any layered repository context,
/// providing basic CRUD operations for storage files. /// providing basic CRUD operations for storage files.
@@ -147,13 +141,8 @@ pub trait RemoteStorage: Send + Sync + 'static {
async fn list_prefixes( async fn list_prefixes(
&self, &self,
prefix: Option<&RemotePath>, prefix: Option<&RemotePath>,
) -> Result<Vec<RemotePath>, DownloadError> { ) -> Result<Vec<RemotePath>, DownloadError>;
let result = self
.list(prefix, ListingMode::WithDelimiter)
.await?
.prefixes;
Ok(result)
}
/// Lists all files in directory "recursively" /// Lists all files in directory "recursively"
/// (not really recursively, because AWS has a flat namespace) /// (not really recursively, because AWS has a flat namespace)
/// Note: This is subtely different than list_prefixes, /// Note: This is subtely different than list_prefixes,
@@ -165,16 +154,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
/// whereas, /// whereas,
/// list_prefixes("foo/bar/") = ["cat", "dog"] /// list_prefixes("foo/bar/") = ["cat", "dog"]
/// See `test_real_s3.rs` for more details. /// See `test_real_s3.rs` for more details.
async fn list_files(&self, prefix: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> { async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>>;
let result = self.list(prefix, ListingMode::NoDelimiter).await?.keys;
Ok(result)
}
async fn list(
&self,
prefix: Option<&RemotePath>,
_mode: ListingMode,
) -> anyhow::Result<Listing, DownloadError>;
/// Streams the local file contents into remote into the remote storage entry. /// Streams the local file contents into remote into the remote storage entry.
async fn upload( async fn upload(
@@ -225,9 +205,6 @@ pub enum DownloadError {
BadInput(anyhow::Error), BadInput(anyhow::Error),
/// The file was not found in the remote storage. /// The file was not found in the remote storage.
NotFound, NotFound,
/// A cancellation token aborted the download, typically during
/// tenant detach or process shutdown.
Cancelled,
/// The file was found in the remote storage, but the download failed. /// The file was found in the remote storage, but the download failed.
Other(anyhow::Error), Other(anyhow::Error),
} }
@@ -238,7 +215,6 @@ impl std::fmt::Display for DownloadError {
DownloadError::BadInput(e) => { DownloadError::BadInput(e) => {
write!(f, "Failed to download a remote file due to user input: {e}") write!(f, "Failed to download a remote file due to user input: {e}")
} }
DownloadError::Cancelled => write!(f, "Cancelled, shutting down"),
DownloadError::NotFound => write!(f, "No file found for the remote object id given"), DownloadError::NotFound => write!(f, "No file found for the remote object id given"),
DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e:?}"), DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e:?}"),
} }
@@ -258,19 +234,6 @@ pub enum GenericRemoteStorage {
} }
impl GenericRemoteStorage { impl GenericRemoteStorage {
pub async fn list(
&self,
prefix: Option<&RemotePath>,
mode: ListingMode,
) -> anyhow::Result<Listing, DownloadError> {
match self {
Self::LocalFs(s) => s.list(prefix, mode).await,
Self::AwsS3(s) => s.list(prefix, mode).await,
Self::AzureBlob(s) => s.list(prefix, mode).await,
Self::Unreliable(s) => s.list(prefix, mode).await,
}
}
// A function for listing all the files in a "directory" // A function for listing all the files in a "directory"
// Example: // Example:
// list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"] // list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"]
@@ -431,6 +394,10 @@ pub struct StorageMetadata(HashMap<String, String>);
/// External backup storage configuration, enough for creating a client for that storage. /// External backup storage configuration, enough for creating a client for that storage.
#[derive(Debug, Clone, PartialEq, Eq)] #[derive(Debug, Clone, PartialEq, Eq)]
pub struct RemoteStorageConfig { pub struct RemoteStorageConfig {
/// Max allowed number of concurrent sync operations between the API user and the remote storage.
pub max_concurrent_syncs: NonZeroUsize,
/// Max allowed errors before the sync task is considered failed and evicted.
pub max_sync_errors: NonZeroU32,
/// The storage connection configuration. /// The storage connection configuration.
pub storage: RemoteStorageKind, pub storage: RemoteStorageKind,
} }
@@ -526,6 +493,18 @@ impl RemoteStorageConfig {
let use_azure = container_name.is_some() && container_region.is_some(); let use_azure = container_name.is_some() && container_region.is_some();
let max_concurrent_syncs = NonZeroUsize::new(
parse_optional_integer("max_concurrent_syncs", toml)?
.unwrap_or(DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS),
)
.context("Failed to parse 'max_concurrent_syncs' as a positive integer")?;
let max_sync_errors = NonZeroU32::new(
parse_optional_integer("max_sync_errors", toml)?
.unwrap_or(DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS),
)
.context("Failed to parse 'max_sync_errors' as a positive integer")?;
let default_concurrency_limit = if use_azure { let default_concurrency_limit = if use_azure {
DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT
} else { } else {
@@ -607,7 +586,11 @@ impl RemoteStorageConfig {
} }
}; };
Ok(Some(RemoteStorageConfig { storage })) Ok(Some(RemoteStorageConfig {
max_concurrent_syncs,
max_sync_errors,
storage,
}))
} }
} }

View File

@@ -15,7 +15,7 @@ use tokio::{
use tracing::*; use tracing::*;
use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty}; use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};
use crate::{Download, DownloadError, Listing, ListingMode, RemotePath}; use crate::{Download, DownloadError, RemotePath};
use super::{RemoteStorage, StorageMetadata}; use super::{RemoteStorage, StorageMetadata};
@@ -75,7 +75,7 @@ impl LocalFs {
} }
#[cfg(test)] #[cfg(test)]
async fn list_all(&self) -> anyhow::Result<Vec<RemotePath>> { async fn list(&self) -> anyhow::Result<Vec<RemotePath>> {
Ok(get_all_files(&self.storage_root, true) Ok(get_all_files(&self.storage_root, true)
.await? .await?
.into_iter() .into_iter()
@@ -89,10 +89,52 @@ impl LocalFs {
}) })
.collect()) .collect())
} }
}
#[async_trait::async_trait]
impl RemoteStorage for LocalFs {
async fn list_prefixes(
&self,
prefix: Option<&RemotePath>,
) -> Result<Vec<RemotePath>, DownloadError> {
let path = match prefix {
Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
None => Cow::Borrowed(&self.storage_root),
};
let prefixes_to_filter = get_all_files(path.as_ref(), false)
.await
.map_err(DownloadError::Other)?;
let mut prefixes = Vec::with_capacity(prefixes_to_filter.len());
// filter out empty directories to mirror s3 behavior.
for prefix in prefixes_to_filter {
if prefix.is_dir()
&& is_directory_empty(&prefix)
.await
.map_err(DownloadError::Other)?
{
continue;
}
prefixes.push(
prefix
.strip_prefix(&self.storage_root)
.context("Failed to strip prefix")
.and_then(RemotePath::new)
.expect(
"We list files for storage root, hence should be able to remote the prefix",
),
)
}
Ok(prefixes)
}
// recursively lists all files in a directory, // recursively lists all files in a directory,
// mirroring the `list_files` for `s3_bucket` // mirroring the `list_files` for `s3_bucket`
async fn list_recursive(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> { async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
let full_path = match folder { let full_path = match folder {
Some(folder) => folder.with_base(&self.storage_root), Some(folder) => folder.with_base(&self.storage_root),
None => self.storage_root.clone(), None => self.storage_root.clone(),
@@ -144,70 +186,6 @@ impl LocalFs {
Ok(files) Ok(files)
} }
}
#[async_trait::async_trait]
impl RemoteStorage for LocalFs {
async fn list(
&self,
prefix: Option<&RemotePath>,
mode: ListingMode,
) -> Result<Listing, DownloadError> {
let mut result = Listing::default();
if let ListingMode::NoDelimiter = mode {
let keys = self
.list_recursive(prefix)
.await
.map_err(DownloadError::Other)?;
result.keys = keys
.into_iter()
.filter(|k| {
let path = k.with_base(&self.storage_root);
!path.is_dir()
})
.collect();
return Ok(result);
}
let path = match prefix {
Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
None => Cow::Borrowed(&self.storage_root),
};
let prefixes_to_filter = get_all_files(path.as_ref(), false)
.await
.map_err(DownloadError::Other)?;
// filter out empty directories to mirror s3 behavior.
for prefix in prefixes_to_filter {
if prefix.is_dir()
&& is_directory_empty(&prefix)
.await
.map_err(DownloadError::Other)?
{
continue;
}
let stripped = prefix
.strip_prefix(&self.storage_root)
.context("Failed to strip prefix")
.and_then(RemotePath::new)
.expect(
"We list files for storage root, hence should be able to remote the prefix",
);
if prefix.is_dir() {
result.prefixes.push(stripped);
} else {
result.keys.push(stripped);
}
}
Ok(result)
}
async fn upload( async fn upload(
&self, &self,
@@ -501,7 +479,7 @@ mod fs_tests {
let target_path_1 = upload_dummy_file(&storage, "upload_1", None).await?; let target_path_1 = upload_dummy_file(&storage, "upload_1", None).await?;
assert_eq!( assert_eq!(
storage.list_all().await?, storage.list().await?,
vec![target_path_1.clone()], vec![target_path_1.clone()],
"Should list a single file after first upload" "Should list a single file after first upload"
); );
@@ -689,7 +667,7 @@ mod fs_tests {
let upload_target = upload_dummy_file(&storage, upload_name, None).await?; let upload_target = upload_dummy_file(&storage, upload_name, None).await?;
storage.delete(&upload_target).await?; storage.delete(&upload_target).await?;
assert!(storage.list_all().await?.is_empty()); assert!(storage.list().await?.is_empty());
storage storage
.delete(&upload_target) .delete(&upload_target)
@@ -747,43 +725,6 @@ mod fs_tests {
Ok(()) Ok(())
} }
#[tokio::test]
async fn list() -> anyhow::Result<()> {
// No delimiter: should recursively list everything
let storage = create_storage()?;
let child = upload_dummy_file(&storage, "grandparent/parent/child", None).await?;
let uncle = upload_dummy_file(&storage, "grandparent/uncle", None).await?;
let listing = storage.list(None, ListingMode::NoDelimiter).await?;
assert!(listing.prefixes.is_empty());
assert_eq!(listing.keys, [uncle.clone(), child.clone()].to_vec());
// Delimiter: should only go one deep
let listing = storage.list(None, ListingMode::WithDelimiter).await?;
assert_eq!(
listing.prefixes,
[RemotePath::from_string("timelines").unwrap()].to_vec()
);
assert!(listing.keys.is_empty());
// Delimiter & prefix
let listing = storage
.list(
Some(&RemotePath::from_string("timelines/some_timeline/grandparent").unwrap()),
ListingMode::WithDelimiter,
)
.await?;
assert_eq!(
listing.prefixes,
[RemotePath::from_string("timelines/some_timeline/grandparent/parent").unwrap()]
.to_vec()
);
assert_eq!(listing.keys, [uncle.clone()].to_vec());
Ok(())
}
async fn upload_dummy_file( async fn upload_dummy_file(
storage: &LocalFs, storage: &LocalFs,
name: &str, name: &str,
@@ -836,7 +777,7 @@ mod fs_tests {
} }
async fn list_files_sorted(storage: &LocalFs) -> anyhow::Result<Vec<RemotePath>> { async fn list_files_sorted(storage: &LocalFs) -> anyhow::Result<Vec<RemotePath>> {
let mut files = storage.list_all().await?; let mut files = storage.list().await?;
files.sort_by(|a, b| a.0.cmp(&b.0)); files.sort_by(|a, b| a.0.cmp(&b.0));
Ok(files) Ok(files)
} }

View File

@@ -4,27 +4,23 @@
//! allowing multiple api users to independently work with the same S3 bucket, if //! allowing multiple api users to independently work with the same S3 bucket, if
//! their bucket prefixes are both specified and different. //! their bucket prefixes are both specified and different.
use std::{borrow::Cow, sync::Arc}; use std::borrow::Cow;
use anyhow::Context; use anyhow::Context;
use aws_config::{ use aws_config::{
environment::credentials::EnvironmentVariableCredentialsProvider, environment::credentials::EnvironmentVariableCredentialsProvider,
imds::credentials::ImdsCredentialsProvider, imds::credentials::ImdsCredentialsProvider, meta::credentials::CredentialsProviderChain,
meta::credentials::CredentialsProviderChain, provider_config::ProviderConfig, web_identity_token::WebIdentityTokenCredentialsProvider,
provider_config::ProviderConfig,
retry::{RetryConfigBuilder, RetryMode},
web_identity_token::WebIdentityTokenCredentialsProvider,
}; };
use aws_credential_types::cache::CredentialsCache; use aws_credential_types::cache::CredentialsCache;
use aws_sdk_s3::{ use aws_sdk_s3::{
config::{AsyncSleep, Config, Region, SharedAsyncSleep}, config::{Config, Region},
error::SdkError, error::SdkError,
operation::get_object::GetObjectError, operation::get_object::GetObjectError,
primitives::ByteStream, primitives::ByteStream,
types::{Delete, ObjectIdentifier}, types::{Delete, ObjectIdentifier},
Client, Client,
}; };
use aws_smithy_async::rt::sleep::TokioSleep;
use aws_smithy_http::body::SdkBody; use aws_smithy_http::body::SdkBody;
use hyper::Body; use hyper::Body;
use scopeguard::ScopeGuard; use scopeguard::ScopeGuard;
@@ -34,8 +30,8 @@ use tracing::debug;
use super::StorageMetadata; use super::StorageMetadata;
use crate::{ use crate::{
ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, ConcurrencyLimiter, Download, DownloadError, RemotePath, RemoteStorage, S3Config,
S3Config, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
}; };
pub(super) mod metrics; pub(super) mod metrics;
@@ -87,23 +83,10 @@ impl S3Bucket {
.or_else("imds", ImdsCredentialsProvider::builder().build()) .or_else("imds", ImdsCredentialsProvider::builder().build())
}; };
// AWS SDK requires us to specify how the RetryConfig should sleep when it wants to back off
let sleep_impl: Arc<dyn AsyncSleep> = Arc::new(TokioSleep::new());
// We do our own retries (see [`backoff::retry`]). However, for the AWS SDK to enable rate limiting in response to throttling
// responses (e.g. 429 on too many ListObjectsv2 requests), we must provide a retry config. We set it to use at most one
// attempt, and enable 'Adaptive' mode, which causes rate limiting to be enabled.
let mut retry_config = RetryConfigBuilder::new();
retry_config
.set_max_attempts(Some(1))
.set_mode(Some(RetryMode::Adaptive));
let mut config_builder = Config::builder() let mut config_builder = Config::builder()
.region(region) .region(region)
.credentials_cache(CredentialsCache::lazy()) .credentials_cache(CredentialsCache::lazy())
.credentials_provider(credentials_provider) .credentials_provider(credentials_provider);
.sleep_impl(SharedAsyncSleep::from(sleep_impl))
.retry_config(retry_config.build());
if let Some(custom_endpoint) = aws_config.endpoint.clone() { if let Some(custom_endpoint) = aws_config.endpoint.clone() {
config_builder = config_builder config_builder = config_builder
@@ -316,13 +299,13 @@ impl<S: AsyncRead> AsyncRead for TimedDownload<S> {
#[async_trait::async_trait] #[async_trait::async_trait]
impl RemoteStorage for S3Bucket { impl RemoteStorage for S3Bucket {
async fn list( /// See the doc for `RemoteStorage::list_prefixes`
/// Note: it wont include empty "directories"
async fn list_prefixes(
&self, &self,
prefix: Option<&RemotePath>, prefix: Option<&RemotePath>,
mode: ListingMode, ) -> Result<Vec<RemotePath>, DownloadError> {
) -> Result<Listing, DownloadError> {
let kind = RequestKind::List; let kind = RequestKind::List;
let mut result = Listing::default();
// get the passed prefix or if it is not set use prefix_in_bucket value // get the passed prefix or if it is not set use prefix_in_bucket value
let list_prefix = prefix let list_prefix = prefix
@@ -331,33 +314,28 @@ impl RemoteStorage for S3Bucket {
.map(|mut p| { .map(|mut p| {
// required to end with a separator // required to end with a separator
// otherwise request will return only the entry of a prefix // otherwise request will return only the entry of a prefix
if matches!(mode, ListingMode::WithDelimiter) if !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
&& !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
{
p.push(REMOTE_STORAGE_PREFIX_SEPARATOR); p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
} }
p p
}); });
let mut document_keys = Vec::new();
let mut continuation_token = None; let mut continuation_token = None;
loop { loop {
let _guard = self.permit(kind).await; let _guard = self.permit(kind).await;
let started_at = start_measuring_requests(kind); let started_at = start_measuring_requests(kind);
let mut request = self let fetch_response = self
.client .client
.list_objects_v2() .list_objects_v2()
.bucket(self.bucket_name.clone()) .bucket(self.bucket_name.clone())
.set_prefix(list_prefix.clone()) .set_prefix(list_prefix.clone())
.set_continuation_token(continuation_token) .set_continuation_token(continuation_token)
.set_max_keys(self.max_keys_per_list_response); .delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string())
.set_max_keys(self.max_keys_per_list_response)
if let ListingMode::WithDelimiter = mode {
request = request.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
}
let response = request
.send() .send()
.await .await
.context("Failed to list S3 prefixes") .context("Failed to list S3 prefixes")
@@ -367,35 +345,71 @@ impl RemoteStorage for S3Bucket {
metrics::BUCKET_METRICS metrics::BUCKET_METRICS
.req_seconds .req_seconds
.observe_elapsed(kind, &response, started_at); .observe_elapsed(kind, &fetch_response, started_at);
let response = response?; let fetch_response = fetch_response?;
let keys = response.contents().unwrap_or_default(); document_keys.extend(
let empty = Vec::new(); fetch_response
let prefixes = response.common_prefixes.as_ref().unwrap_or(&empty); .common_prefixes
.unwrap_or_default()
tracing::info!("list: {} prefixes, {} keys", prefixes.len(), keys.len()); .into_iter()
for object in keys {
let object_path = object.key().expect("response does not contain a key");
let remote_path = self.s3_object_to_relative_path(object_path);
result.keys.push(remote_path);
}
result.prefixes.extend(
prefixes
.iter()
.filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))), .filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))),
); );
continuation_token = match response.next_continuation_token { continuation_token = match fetch_response.next_continuation_token {
Some(new_token) => Some(new_token), Some(new_token) => Some(new_token),
None => break, None => break,
}; };
} }
Ok(result) Ok(document_keys)
}
/// See the doc for `RemoteStorage::list_files`
async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
let kind = RequestKind::List;
let folder_name = folder
.map(|p| self.relative_path_to_s3_object(p))
.or_else(|| self.prefix_in_bucket.clone());
// AWS may need to break the response into several parts
let mut continuation_token = None;
let mut all_files = vec![];
loop {
let _guard = self.permit(kind).await;
let started_at = start_measuring_requests(kind);
let response = self
.client
.list_objects_v2()
.bucket(self.bucket_name.clone())
.set_prefix(folder_name.clone())
.set_continuation_token(continuation_token)
.set_max_keys(self.max_keys_per_list_response)
.send()
.await
.context("Failed to list files in S3 bucket");
let started_at = ScopeGuard::into_inner(started_at);
metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, &response, started_at);
let response = response?;
for object in response.contents().unwrap_or_default() {
let object_path = object.key().expect("response does not contain a key");
let remote_path = self.s3_object_to_relative_path(object_path);
all_files.push(remote_path);
}
match response.next_continuation_token {
Some(new_token) => continuation_token = Some(new_token),
None => break,
}
}
Ok(all_files)
} }
async fn upload( async fn upload(

View File

@@ -5,9 +5,7 @@ use std::collections::hash_map::Entry;
use std::collections::HashMap; use std::collections::HashMap;
use std::sync::Mutex; use std::sync::Mutex;
use crate::{ use crate::{Download, DownloadError, RemotePath, RemoteStorage, StorageMetadata};
Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata,
};
pub struct UnreliableWrapper { pub struct UnreliableWrapper {
inner: crate::GenericRemoteStorage, inner: crate::GenericRemoteStorage,
@@ -97,15 +95,6 @@ impl RemoteStorage for UnreliableWrapper {
self.inner.list_files(folder).await self.inner.list_files(folder).await
} }
async fn list(
&self,
prefix: Option<&RemotePath>,
mode: ListingMode,
) -> Result<Listing, DownloadError> {
self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))?;
self.inner.list(prefix, mode).await
}
async fn upload( async fn upload(
&self, &self,
data: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static, data: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static,

View File

@@ -1,6 +1,6 @@
use std::collections::HashSet; use std::collections::HashSet;
use std::env; use std::env;
use std::num::NonZeroUsize; use std::num::{NonZeroU32, NonZeroUsize};
use std::ops::ControlFlow; use std::ops::ControlFlow;
use std::path::PathBuf; use std::path::PathBuf;
use std::sync::Arc; use std::sync::Arc;
@@ -469,6 +469,8 @@ fn create_azure_client(
let random = rand::thread_rng().gen::<u32>(); let random = rand::thread_rng().gen::<u32>();
let remote_storage_config = RemoteStorageConfig { let remote_storage_config = RemoteStorageConfig {
max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
max_sync_errors: NonZeroU32::new(5).unwrap(),
storage: RemoteStorageKind::AzureContainer(AzureConfig { storage: RemoteStorageKind::AzureContainer(AzureConfig {
container_name: remote_storage_azure_container, container_name: remote_storage_azure_container,
container_region: remote_storage_azure_region, container_region: remote_storage_azure_region,

View File

@@ -1,6 +1,6 @@
use std::collections::HashSet; use std::collections::HashSet;
use std::env; use std::env;
use std::num::NonZeroUsize; use std::num::{NonZeroU32, NonZeroUsize};
use std::ops::ControlFlow; use std::ops::ControlFlow;
use std::path::PathBuf; use std::path::PathBuf;
use std::sync::Arc; use std::sync::Arc;
@@ -396,6 +396,8 @@ fn create_s3_client(
let random = rand::thread_rng().gen::<u32>(); let random = rand::thread_rng().gen::<u32>();
let remote_storage_config = RemoteStorageConfig { let remote_storage_config = RemoteStorageConfig {
max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
max_sync_errors: NonZeroU32::new(5).unwrap(),
storage: RemoteStorageKind::AwsS3(S3Config { storage: RemoteStorageKind::AwsS3(S3Config {
bucket_name: remote_storage_s3_bucket, bucket_name: remote_storage_s3_bucket,
bucket_region: remote_storage_s3_region, bucket_region: remote_storage_s3_region,

View File

@@ -1,5 +1,3 @@
#![deny(unsafe_code)]
#![deny(clippy::undocumented_unsafe_blocks)]
use const_format::formatcp; use const_format::formatcp;
/// Public API types /// Public API types

View File

@@ -1,18 +1,23 @@
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use utils::{ use utils::{
id::{NodeId, TenantId, TimelineId}, id::{NodeId, TenantId, TimelineId},
lsn::Lsn, lsn::Lsn,
}; };
#[serde_as]
#[derive(Serialize, Deserialize)] #[derive(Serialize, Deserialize)]
pub struct TimelineCreateRequest { pub struct TimelineCreateRequest {
#[serde_as(as = "DisplayFromStr")]
pub tenant_id: TenantId, pub tenant_id: TenantId,
#[serde_as(as = "DisplayFromStr")]
pub timeline_id: TimelineId, pub timeline_id: TimelineId,
pub peer_ids: Option<Vec<NodeId>>, pub peer_ids: Option<Vec<NodeId>>,
pub pg_version: u32, pub pg_version: u32,
pub system_id: Option<u64>, pub system_id: Option<u64>,
pub wal_seg_size: Option<u32>, pub wal_seg_size: Option<u32>,
#[serde_as(as = "DisplayFromStr")]
pub commit_lsn: Lsn, pub commit_lsn: Lsn,
// If not passed, it is assigned to the beginning of commit_lsn segment. // If not passed, it is assigned to the beginning of commit_lsn segment.
pub local_start_lsn: Option<Lsn>, pub local_start_lsn: Option<Lsn>,
@@ -23,6 +28,7 @@ fn lsn_invalid() -> Lsn {
} }
/// Data about safekeeper's timeline, mirrors broker.proto. /// Data about safekeeper's timeline, mirrors broker.proto.
#[serde_as]
#[derive(Debug, Clone, Deserialize, Serialize)] #[derive(Debug, Clone, Deserialize, Serialize)]
pub struct SkTimelineInfo { pub struct SkTimelineInfo {
/// Term. /// Term.
@@ -30,19 +36,25 @@ pub struct SkTimelineInfo {
/// Term of the last entry. /// Term of the last entry.
pub last_log_term: Option<u64>, pub last_log_term: Option<u64>,
/// LSN of the last record. /// LSN of the last record.
#[serde_as(as = "DisplayFromStr")]
#[serde(default = "lsn_invalid")] #[serde(default = "lsn_invalid")]
pub flush_lsn: Lsn, pub flush_lsn: Lsn,
/// Up to which LSN safekeeper regards its WAL as committed. /// Up to which LSN safekeeper regards its WAL as committed.
#[serde_as(as = "DisplayFromStr")]
#[serde(default = "lsn_invalid")] #[serde(default = "lsn_invalid")]
pub commit_lsn: Lsn, pub commit_lsn: Lsn,
/// LSN up to which safekeeper has backed WAL. /// LSN up to which safekeeper has backed WAL.
#[serde_as(as = "DisplayFromStr")]
#[serde(default = "lsn_invalid")] #[serde(default = "lsn_invalid")]
pub backup_lsn: Lsn, pub backup_lsn: Lsn,
/// LSN of last checkpoint uploaded by pageserver. /// LSN of last checkpoint uploaded by pageserver.
#[serde_as(as = "DisplayFromStr")]
#[serde(default = "lsn_invalid")] #[serde(default = "lsn_invalid")]
pub remote_consistent_lsn: Lsn, pub remote_consistent_lsn: Lsn,
#[serde_as(as = "DisplayFromStr")]
#[serde(default = "lsn_invalid")] #[serde(default = "lsn_invalid")]
pub peer_horizon_lsn: Lsn, pub peer_horizon_lsn: Lsn,
#[serde_as(as = "DisplayFromStr")]
#[serde(default = "lsn_invalid")] #[serde(default = "lsn_invalid")]
pub local_start_lsn: Lsn, pub local_start_lsn: Lsn,
/// A connection string to use for WAL receiving. /// A connection string to use for WAL receiving.

View File

@@ -1,6 +1,4 @@
//! Synthetic size calculation //! Synthetic size calculation
#![deny(unsafe_code)]
#![deny(clippy::undocumented_unsafe_blocks)]
mod calculation; mod calculation;
pub mod svg; pub mod svg;

View File

@@ -32,8 +32,6 @@
//! .init(); //! .init();
//! } //! }
//! ``` //! ```
#![deny(unsafe_code)]
#![deny(clippy::undocumented_unsafe_blocks)]
use opentelemetry::sdk::Resource; use opentelemetry::sdk::Resource;
use opentelemetry::KeyValue; use opentelemetry::KeyValue;

View File

@@ -5,7 +5,6 @@ edition.workspace = true
license.workspace = true license.workspace = true
[dependencies] [dependencies]
arc-swap.workspace = true
sentry.workspace = true sentry.workspace = true
async-trait.workspace = true async-trait.workspace = true
anyhow.workspace = true anyhow.workspace = true
@@ -56,7 +55,6 @@ bytes.workspace = true
criterion.workspace = true criterion.workspace = true
hex-literal.workspace = true hex-literal.workspace = true
camino-tempfile.workspace = true camino-tempfile.workspace = true
serde_assert.workspace = true
[[bench]] [[bench]]
name = "benchmarks" name = "benchmarks"

View File

@@ -1,8 +1,7 @@
// For details about authentication see docs/authentication.md // For details about authentication see docs/authentication.md
use arc_swap::ArcSwap;
use serde; use serde;
use std::{borrow::Cow, fmt::Display, fs, sync::Arc}; use std::fs;
use anyhow::Result; use anyhow::Result;
use camino::Utf8Path; use camino::Utf8Path;
@@ -10,8 +9,9 @@ use jsonwebtoken::{
decode, encode, Algorithm, DecodingKey, EncodingKey, Header, TokenData, Validation, decode, encode, Algorithm, DecodingKey, EncodingKey, Header, TokenData, Validation,
}; };
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use crate::{http::error::ApiError, id::TenantId}; use crate::id::TenantId;
/// Algorithm to use. We require EdDSA. /// Algorithm to use. We require EdDSA.
const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA; const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA;
@@ -32,9 +32,11 @@ pub enum Scope {
} }
/// JWT payload. See docs/authentication.md for the format /// JWT payload. See docs/authentication.md for the format
#[serde_as]
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)] #[derive(Debug, Serialize, Deserialize, Clone, PartialEq)]
pub struct Claims { pub struct Claims {
#[serde(default)] #[serde(default)]
#[serde_as(as = "Option<DisplayFromStr>")]
pub tenant_id: Option<TenantId>, pub tenant_id: Option<TenantId>,
pub scope: Scope, pub scope: Scope,
} }
@@ -45,106 +47,31 @@ impl Claims {
} }
} }
pub struct SwappableJwtAuth(ArcSwap<JwtAuth>);
impl SwappableJwtAuth {
pub fn new(jwt_auth: JwtAuth) -> Self {
SwappableJwtAuth(ArcSwap::new(Arc::new(jwt_auth)))
}
pub fn swap(&self, jwt_auth: JwtAuth) {
self.0.swap(Arc::new(jwt_auth));
}
pub fn decode(&self, token: &str) -> std::result::Result<TokenData<Claims>, AuthError> {
self.0.load().decode(token)
}
}
impl std::fmt::Debug for SwappableJwtAuth {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "Swappable({:?})", self.0.load())
}
}
#[derive(Clone, PartialEq, Eq, Hash, Debug)]
pub struct AuthError(pub Cow<'static, str>);
impl Display for AuthError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.0)
}
}
impl From<AuthError> for ApiError {
fn from(_value: AuthError) -> Self {
// Don't pass on the value of the AuthError as a precautionary measure.
// Being intentionally vague in public error communication hurts debugability
// but it is more secure.
ApiError::Forbidden("JWT authentication error".to_string())
}
}
pub struct JwtAuth { pub struct JwtAuth {
decoding_keys: Vec<DecodingKey>, decoding_key: DecodingKey,
validation: Validation, validation: Validation,
} }
impl JwtAuth { impl JwtAuth {
pub fn new(decoding_keys: Vec<DecodingKey>) -> Self { pub fn new(decoding_key: DecodingKey) -> Self {
let mut validation = Validation::default(); let mut validation = Validation::default();
validation.algorithms = vec![STORAGE_TOKEN_ALGORITHM]; validation.algorithms = vec![STORAGE_TOKEN_ALGORITHM];
// The default 'required_spec_claims' is 'exp'. But we don't want to require // The default 'required_spec_claims' is 'exp'. But we don't want to require
// expiration. // expiration.
validation.required_spec_claims = [].into(); validation.required_spec_claims = [].into();
Self { Self {
decoding_keys, decoding_key,
validation, validation,
} }
} }
pub fn from_key_path(key_path: &Utf8Path) -> Result<Self> { pub fn from_key_path(key_path: &Utf8Path) -> Result<Self> {
let metadata = key_path.metadata()?; let public_key = fs::read(key_path)?;
let decoding_keys = if metadata.is_dir() { Ok(Self::new(DecodingKey::from_ed_pem(&public_key)?))
let mut keys = Vec::new();
for entry in fs::read_dir(key_path)? {
let path = entry?.path();
if !path.is_file() {
// Ignore directories (don't recurse)
continue;
}
let public_key = fs::read(path)?;
keys.push(DecodingKey::from_ed_pem(&public_key)?);
}
keys
} else if metadata.is_file() {
let public_key = fs::read(key_path)?;
vec![DecodingKey::from_ed_pem(&public_key)?]
} else {
anyhow::bail!("path is neither a directory or a file")
};
if decoding_keys.is_empty() {
anyhow::bail!("Configured for JWT auth with zero decoding keys. All JWT gated requests would be rejected.");
}
Ok(Self::new(decoding_keys))
} }
/// Attempt to decode the token with the internal decoding keys. pub fn decode(&self, token: &str) -> Result<TokenData<Claims>> {
/// Ok(decode(token, &self.decoding_key, &self.validation)?)
/// The function tries the stored decoding keys in succession,
/// and returns the first yielding a successful result.
/// If there is no working decoding key, it returns the last error.
pub fn decode(&self, token: &str) -> std::result::Result<TokenData<Claims>, AuthError> {
let mut res = None;
for decoding_key in &self.decoding_keys {
res = Some(decode(token, decoding_key, &self.validation));
if let Some(Ok(res)) = res {
return Ok(res);
}
}
if let Some(res) = res {
res.map_err(|e| AuthError(Cow::Owned(e.to_string())))
} else {
Err(AuthError(Cow::Borrowed("no JWT decoding keys configured")))
}
} }
} }
@@ -184,9 +111,9 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
"#; "#;
#[test] #[test]
fn test_decode() { fn test_decode() -> Result<(), anyhow::Error> {
let expected_claims = Claims { let expected_claims = Claims {
tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081").unwrap()), tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081")?),
scope: Scope::Tenant, scope: Scope::Tenant,
}; };
@@ -205,24 +132,28 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
let encoded_eddsa = "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJleHAiOjE3MDkyMDA4NzksImlhdCI6MTY3ODQ0MjQ3OX0.U3eA8j-uU-JnhzeO3EDHRuXLwkAUFCPxtGHEgw6p7Ccc3YRbFs2tmCdbD9PZEXP-XsxSeBQi1FY0YPcT3NXADw"; let encoded_eddsa = "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJleHAiOjE3MDkyMDA4NzksImlhdCI6MTY3ODQ0MjQ3OX0.U3eA8j-uU-JnhzeO3EDHRuXLwkAUFCPxtGHEgw6p7Ccc3YRbFs2tmCdbD9PZEXP-XsxSeBQi1FY0YPcT3NXADw";
// Check it can be validated with the public key // Check it can be validated with the public key
let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap()]); let auth = JwtAuth::new(DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519)?);
let claims_from_token = auth.decode(encoded_eddsa).unwrap().claims; let claims_from_token = auth.decode(encoded_eddsa)?.claims;
assert_eq!(claims_from_token, expected_claims); assert_eq!(claims_from_token, expected_claims);
Ok(())
} }
#[test] #[test]
fn test_encode() { fn test_encode() -> Result<(), anyhow::Error> {
let claims = Claims { let claims = Claims {
tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081").unwrap()), tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081")?),
scope: Scope::Tenant, scope: Scope::Tenant,
}; };
let encoded = encode_from_key_file(&claims, TEST_PRIV_KEY_ED25519).unwrap(); let encoded = encode_from_key_file(&claims, TEST_PRIV_KEY_ED25519)?;
// decode it back // decode it back
let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap()]); let auth = JwtAuth::new(DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519)?);
let decoded = auth.decode(&encoded).unwrap(); let decoded = auth.decode(&encoded)?;
assert_eq!(decoded.claims, claims); assert_eq!(decoded.claims, claims);
Ok(())
} }
} }

View File

@@ -7,7 +7,7 @@ use serde::{Deserialize, Serialize};
/// ///
/// See docs/rfcs/025-generation-numbers.md for detail on how generation /// See docs/rfcs/025-generation-numbers.md for detail on how generation
/// numbers are used. /// numbers are used.
#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)] #[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord)]
pub enum Generation { pub enum Generation {
// Generations with this magic value will not add a suffix to S3 keys, and will not // Generations with this magic value will not add a suffix to S3 keys, and will not
// be included in persisted index_part.json. This value is only to be used // be included in persisted index_part.json. This value is only to be used

View File

@@ -1,41 +0,0 @@
/// Useful type for asserting that expected bytes match reporting the bytes more readable
/// array-syntax compatible hex bytes.
///
/// # Usage
///
/// ```
/// use utils::Hex;
///
/// let actual = serialize_something();
/// let expected = [0x68, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64];
///
/// // the type implements PartialEq and on mismatch, both sides are printed in 16 wide multiline
/// // output suffixed with an array style length for easier comparisons.
/// assert_eq!(Hex(&actual), Hex(&expected));
///
/// // with `let expected = [0x68];` the error would had been:
/// // assertion `left == right` failed
/// // left: [0x68, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64; 11]
/// // right: [0x68; 1]
/// # fn serialize_something() -> Vec<u8> { "hello world".as_bytes().to_vec() }
/// ```
#[derive(PartialEq)]
pub struct Hex<'a>(pub &'a [u8]);
impl std::fmt::Debug for Hex<'_> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "[")?;
for (i, c) in self.0.chunks(16).enumerate() {
if i > 0 && !c.is_empty() {
writeln!(f, ", ")?;
}
for (j, b) in c.iter().enumerate() {
if j > 0 {
write!(f, ", ")?;
}
write!(f, "0x{b:02x}")?;
}
}
write!(f, "; {}]", self.0.len())
}
}

View File

@@ -1,4 +1,4 @@
use crate::auth::{AuthError, Claims, SwappableJwtAuth}; use crate::auth::{Claims, JwtAuth};
use crate::http::error::{api_error_handler, route_error_handler, ApiError}; use crate::http::error::{api_error_handler, route_error_handler, ApiError};
use anyhow::Context; use anyhow::Context;
use hyper::header::{HeaderName, AUTHORIZATION}; use hyper::header::{HeaderName, AUTHORIZATION};
@@ -14,11 +14,6 @@ use tracing::{self, debug, info, info_span, warn, Instrument};
use std::future::Future; use std::future::Future;
use std::str::FromStr; use std::str::FromStr;
use bytes::{Bytes, BytesMut};
use std::io::Write as _;
use tokio::sync::mpsc;
use tokio_stream::wrappers::ReceiverStream;
static SERVE_METRICS_COUNT: Lazy<IntCounter> = Lazy::new(|| { static SERVE_METRICS_COUNT: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!( register_int_counter!(
"libmetrics_metric_handler_requests_total", "libmetrics_metric_handler_requests_total",
@@ -151,89 +146,94 @@ impl Drop for RequestCancelled {
} }
} }
/// An [`std::io::Write`] implementation on top of a channel sending [`bytes::Bytes`] chunks.
pub struct ChannelWriter {
buffer: BytesMut,
pub tx: mpsc::Sender<std::io::Result<Bytes>>,
written: usize,
}
impl ChannelWriter {
pub fn new(buf_len: usize, tx: mpsc::Sender<std::io::Result<Bytes>>) -> Self {
assert_ne!(buf_len, 0);
ChannelWriter {
// split about half off the buffer from the start, because we flush depending on
// capacity. first flush will come sooner than without this, but now resizes will
// have better chance of picking up the "other" half. not guaranteed of course.
buffer: BytesMut::with_capacity(buf_len).split_off(buf_len / 2),
tx,
written: 0,
}
}
pub fn flush0(&mut self) -> std::io::Result<usize> {
let n = self.buffer.len();
if n == 0 {
return Ok(0);
}
tracing::trace!(n, "flushing");
let ready = self.buffer.split().freeze();
// not ideal to call from blocking code to block_on, but we are sure that this
// operation does not spawn_blocking other tasks
let res: Result<(), ()> = tokio::runtime::Handle::current().block_on(async {
self.tx.send(Ok(ready)).await.map_err(|_| ())?;
// throttle sending to allow reuse of our buffer in `write`.
self.tx.reserve().await.map_err(|_| ())?;
// now the response task has picked up the buffer and hopefully started
// sending it to the client.
Ok(())
});
if res.is_err() {
return Err(std::io::ErrorKind::BrokenPipe.into());
}
self.written += n;
Ok(n)
}
pub fn flushed_bytes(&self) -> usize {
self.written
}
}
impl std::io::Write for ChannelWriter {
fn write(&mut self, mut buf: &[u8]) -> std::io::Result<usize> {
let remaining = self.buffer.capacity() - self.buffer.len();
let out_of_space = remaining < buf.len();
let original_len = buf.len();
if out_of_space {
let can_still_fit = buf.len() - remaining;
self.buffer.extend_from_slice(&buf[..can_still_fit]);
buf = &buf[can_still_fit..];
self.flush0()?;
}
// assume that this will often under normal operation just move the pointer back to the
// beginning of allocation, because previous split off parts are already sent and
// dropped.
self.buffer.extend_from_slice(buf);
Ok(original_len)
}
fn flush(&mut self) -> std::io::Result<()> {
self.flush0().map(|_| ())
}
}
async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> { async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
use bytes::{Bytes, BytesMut};
use std::io::Write as _;
use tokio::sync::mpsc;
use tokio_stream::wrappers::ReceiverStream;
SERVE_METRICS_COUNT.inc(); SERVE_METRICS_COUNT.inc();
/// An [`std::io::Write`] implementation on top of a channel sending [`bytes::Bytes`] chunks.
struct ChannelWriter {
buffer: BytesMut,
tx: mpsc::Sender<std::io::Result<Bytes>>,
written: usize,
}
impl ChannelWriter {
fn new(buf_len: usize, tx: mpsc::Sender<std::io::Result<Bytes>>) -> Self {
assert_ne!(buf_len, 0);
ChannelWriter {
// split about half off the buffer from the start, because we flush depending on
// capacity. first flush will come sooner than without this, but now resizes will
// have better chance of picking up the "other" half. not guaranteed of course.
buffer: BytesMut::with_capacity(buf_len).split_off(buf_len / 2),
tx,
written: 0,
}
}
fn flush0(&mut self) -> std::io::Result<usize> {
let n = self.buffer.len();
if n == 0 {
return Ok(0);
}
tracing::trace!(n, "flushing");
let ready = self.buffer.split().freeze();
// not ideal to call from blocking code to block_on, but we are sure that this
// operation does not spawn_blocking other tasks
let res: Result<(), ()> = tokio::runtime::Handle::current().block_on(async {
self.tx.send(Ok(ready)).await.map_err(|_| ())?;
// throttle sending to allow reuse of our buffer in `write`.
self.tx.reserve().await.map_err(|_| ())?;
// now the response task has picked up the buffer and hopefully started
// sending it to the client.
Ok(())
});
if res.is_err() {
return Err(std::io::ErrorKind::BrokenPipe.into());
}
self.written += n;
Ok(n)
}
fn flushed_bytes(&self) -> usize {
self.written
}
}
impl std::io::Write for ChannelWriter {
fn write(&mut self, mut buf: &[u8]) -> std::io::Result<usize> {
let remaining = self.buffer.capacity() - self.buffer.len();
let out_of_space = remaining < buf.len();
let original_len = buf.len();
if out_of_space {
let can_still_fit = buf.len() - remaining;
self.buffer.extend_from_slice(&buf[..can_still_fit]);
buf = &buf[can_still_fit..];
self.flush0()?;
}
// assume that this will often under normal operation just move the pointer back to the
// beginning of allocation, because previous split off parts are already sent and
// dropped.
self.buffer.extend_from_slice(buf);
Ok(original_len)
}
fn flush(&mut self) -> std::io::Result<()> {
self.flush0().map(|_| ())
}
}
let started_at = std::time::Instant::now(); let started_at = std::time::Instant::now();
let (tx, rx) = mpsc::channel(1); let (tx, rx) = mpsc::channel(1);
@@ -389,7 +389,7 @@ fn parse_token(header_value: &str) -> Result<&str, ApiError> {
} }
pub fn auth_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>( pub fn auth_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
provide_auth: fn(&Request<Body>) -> Option<&SwappableJwtAuth>, provide_auth: fn(&Request<Body>) -> Option<&JwtAuth>,
) -> Middleware<B, ApiError> { ) -> Middleware<B, ApiError> {
Middleware::pre(move |req| async move { Middleware::pre(move |req| async move {
if let Some(auth) = provide_auth(&req) { if let Some(auth) = provide_auth(&req) {
@@ -400,11 +400,9 @@ pub fn auth_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
})?; })?;
let token = parse_token(header_value)?; let token = parse_token(header_value)?;
let data = auth.decode(token).map_err(|err| { let data = auth
warn!("Authentication error: {err}"); .decode(token)
// Rely on From<AuthError> for ApiError impl .map_err(|_| ApiError::Unauthorized("malformed jwt token".to_string()))?;
err
})?;
req.set_context(data.claims); req.set_context(data.claims);
} }
None => { None => {
@@ -452,11 +450,12 @@ where
pub fn check_permission_with( pub fn check_permission_with(
req: &Request<Body>, req: &Request<Body>,
check_permission: impl Fn(&Claims) -> Result<(), AuthError>, check_permission: impl Fn(&Claims) -> Result<(), anyhow::Error>,
) -> Result<(), ApiError> { ) -> Result<(), ApiError> {
match req.context::<Claims>() { match req.context::<Claims>() {
Some(claims) => Ok(check_permission(&claims) Some(claims) => {
.map_err(|_err| ApiError::Forbidden("JWT authentication error".to_string()))?), Ok(check_permission(&claims).map_err(|err| ApiError::Forbidden(err.to_string()))?)
}
None => Ok(()), // claims is None because auth is disabled None => Ok(()), // claims is None because auth is disabled
} }
} }

View File

@@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize};
use std::borrow::Cow; use std::borrow::Cow;
use std::error::Error as StdError; use std::error::Error as StdError;
use thiserror::Error; use thiserror::Error;
use tracing::{error, info, warn}; use tracing::{error, info};
#[derive(Debug, Error)] #[derive(Debug, Error)]
pub enum ApiError { pub enum ApiError {
@@ -118,9 +118,6 @@ pub fn api_error_handler(api_error: ApiError) -> Response<Body> {
// Print a stack trace for Internal Server errors // Print a stack trace for Internal Server errors
match api_error { match api_error {
ApiError::Forbidden(_) | ApiError::Unauthorized(_) => {
warn!("Error processing HTTP request: {api_error:#}")
}
ApiError::ResourceUnavailable(_) => info!("Error processing HTTP request: {api_error:#}"), ApiError::ResourceUnavailable(_) => info!("Error processing HTTP request: {api_error:#}"),
ApiError::NotFound(_) => info!("Error processing HTTP request: {api_error:#}"), ApiError::NotFound(_) => info!("Error processing HTTP request: {api_error:#}"),
ApiError::InternalServerError(_) => error!("Error processing HTTP request: {api_error:?}"), ApiError::InternalServerError(_) => error!("Error processing HTTP request: {api_error:?}"),

View File

@@ -3,7 +3,6 @@ use std::{fmt, str::FromStr};
use anyhow::Context; use anyhow::Context;
use hex::FromHex; use hex::FromHex;
use rand::Rng; use rand::Rng;
use serde::de::Visitor;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use thiserror::Error; use thiserror::Error;
@@ -18,74 +17,12 @@ pub enum IdError {
/// ///
/// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look /// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look
/// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`. /// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`.
#[derive(Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)] ///
/// Use `#[serde_as(as = "DisplayFromStr")]` to (de)serialize it as hex string instead: `ad50847381e248feaac9876cc71ae418`.
/// Check the `serde_with::serde_as` documentation for options for more complex types.
#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
struct Id([u8; 16]); struct Id([u8; 16]);
impl Serialize for Id {
    /// Human-readable formats (e.g. JSON) receive the hex-string form via the
    /// `Display` impl; binary formats receive the raw `[u8; 16]` array.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        match serializer.is_human_readable() {
            true => serializer.collect_str(self),
            false => self.0.serialize(serializer),
        }
    }
}
impl<'de> Deserialize<'de> for Id {
    /// Mirror of the `Serialize` impl: hex string for human-readable formats,
    /// fixed 16-byte sequence for binary formats.
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        struct IdVisitor {
            // Only affects the `expecting` error message; the accepted inputs
            // are determined by which visit_* method the deserializer calls.
            is_human_readable_deserializer: bool,
        }
        impl<'de> Visitor<'de> for IdVisitor {
            type Value = Id;
            fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
                if self.is_human_readable_deserializer {
                    formatter.write_str("value in form of hex string")
                } else {
                    formatter.write_str("value in form of integer array([u8; 16])")
                }
            }
            // Binary path: delegate to the derived [u8; 16] deserialization by
            // re-wrapping the sequence as a deserializer.
            fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
            where
                A: serde::de::SeqAccess<'de>,
            {
                let s = serde::de::value::SeqAccessDeserializer::new(seq);
                let id: [u8; 16] = Deserialize::deserialize(s)?;
                Ok(Id::from(id))
            }
            // Human-readable path: parse the hex string via FromStr.
            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
            where
                E: serde::de::Error,
            {
                Id::from_str(v).map_err(E::custom)
            }
        }
        if deserializer.is_human_readable() {
            deserializer.deserialize_str(IdVisitor {
                is_human_readable_deserializer: true,
            })
        } else {
            // deserialize_tuple(16, ...) matches how serde encodes [u8; 16].
            deserializer.deserialize_tuple(
                16,
                IdVisitor {
                    is_human_readable_deserializer: false,
                },
            )
        }
    }
}
impl Id { impl Id {
pub fn get_from_buf(buf: &mut impl bytes::Buf) -> Id { pub fn get_from_buf(buf: &mut impl bytes::Buf) -> Id {
let mut arr = [0u8; 16]; let mut arr = [0u8; 16];
@@ -120,8 +57,6 @@ impl Id {
chunk[0] = HEX[((b >> 4) & 0xf) as usize]; chunk[0] = HEX[((b >> 4) & 0xf) as usize];
chunk[1] = HEX[(b & 0xf) as usize]; chunk[1] = HEX[(b & 0xf) as usize];
} }
// SAFETY: vec constructed out of `HEX`, it can only be ascii
unsafe { String::from_utf8_unchecked(buf) } unsafe { String::from_utf8_unchecked(buf) }
} }
} }
@@ -373,112 +308,3 @@ impl fmt::Display for NodeId {
write!(f, "{}", self.0) write!(f, "{}", self.0)
} }
} }
#[cfg(test)]
mod tests {
    use serde_assert::{Deserializer, Serializer, Token, Tokens};
    use crate::bin_ser::BeSer;
    use super::*;
    // Binary (non-human-readable) serde: Id must encode as a 16-element tuple
    // of raw bytes, and decode back to the same value.
    #[test]
    fn test_id_serde_non_human_readable() {
        let original_id = Id([
            173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24,
        ]);
        let expected_tokens = Tokens(vec![
            Token::Tuple { len: 16 },
            Token::U8(173),
            Token::U8(80),
            Token::U8(132),
            Token::U8(115),
            Token::U8(129),
            Token::U8(226),
            Token::U8(72),
            Token::U8(254),
            Token::U8(170),
            Token::U8(201),
            Token::U8(135),
            Token::U8(108),
            Token::U8(199),
            Token::U8(26),
            Token::U8(228),
            Token::U8(24),
            Token::TupleEnd,
        ]);
        let serializer = Serializer::builder().is_human_readable(false).build();
        let serialized_tokens = original_id.serialize(&serializer).unwrap();
        assert_eq!(serialized_tokens, expected_tokens);
        let mut deserializer = Deserializer::builder()
            .is_human_readable(false)
            .tokens(serialized_tokens)
            .build();
        let deserialized_id = Id::deserialize(&mut deserializer).unwrap();
        assert_eq!(deserialized_id, original_id);
    }
    // Human-readable serde: Id must round-trip through its lowercase hex string.
    #[test]
    fn test_id_serde_human_readable() {
        let original_id = Id([
            173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24,
        ]);
        let expected_tokens = Tokens(vec![Token::Str(String::from(
            "ad50847381e248feaac9876cc71ae418",
        ))]);
        let serializer = Serializer::builder().is_human_readable(true).build();
        let serialized_tokens = original_id.serialize(&serializer).unwrap();
        assert_eq!(serialized_tokens, expected_tokens);
        let mut deserializer = Deserializer::builder()
            .is_human_readable(true)
            .tokens(Tokens(vec![Token::Str(String::from(
                "ad50847381e248feaac9876cc71ae418",
            ))]))
            .build();
        assert_eq!(Id::deserialize(&mut deserializer).unwrap(), original_id);
    }
    // Round-trip a 16-byte id type through the bincode helpers (`BeSer`),
    // asserting the on-wire bytes are exactly the raw array.
    macro_rules! roundtrip_type {
        ($type:ty, $expected_bytes:expr) => {{
            let expected_bytes: [u8; 16] = $expected_bytes;
            let original_id = <$type>::from(expected_bytes);
            let ser_bytes = original_id.ser().unwrap();
            assert_eq!(ser_bytes, expected_bytes);
            let des_id = <$type>::des(&ser_bytes).unwrap();
            assert_eq!(des_id, original_id);
        }};
    }
    #[test]
    fn test_id_bincode_serde() {
        let expected_bytes = [
            173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24,
        ];
        roundtrip_type!(Id, expected_bytes);
    }
    #[test]
    fn test_tenant_id_bincode_serde() {
        let expected_bytes = [
            173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24,
        ];
        roundtrip_type!(TenantId, expected_bytes);
    }
    #[test]
    fn test_timeline_id_bincode_serde() {
        let expected_bytes = [
            173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24,
        ];
        roundtrip_type!(TimelineId, expected_bytes);
    }
}

View File

@@ -1,6 +1,5 @@
//! `utils` is intended to be a place to put code that is shared //! `utils` is intended to be a place to put code that is shared
//! between other crates in this repository. //! between other crates in this repository.
#![deny(clippy::undocumented_unsafe_blocks)]
pub mod backoff; pub mod backoff;
@@ -25,10 +24,6 @@ pub mod auth;
// utility functions and helper traits for unified unique id generation/serialization etc. // utility functions and helper traits for unified unique id generation/serialization etc.
pub mod id; pub mod id;
mod hex;
pub use hex::Hex;
// http endpoint utils // http endpoint utils
pub mod http; pub mod http;
@@ -78,11 +73,6 @@ pub mod completion;
/// Reporting utilities /// Reporting utilities
pub mod error; pub mod error;
/// async timeout helper
pub mod timeout;
pub mod sync;
/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
/// ///
/// we have several cases: /// we have several cases:
@@ -138,21 +128,6 @@ macro_rules! project_git_version {
}; };
} }
/// This is a shortcut to embed build tag into binaries and avoid copying the same build script to all packages
///
/// Expands to `const $const_identifier: &str`, computed entirely at compile
/// time: `"build_tag-env:<BUILD_TAG>"` when the `BUILD_TAG` env var was set
/// during the build, or `"build_tag:"` when it was not.
#[macro_export]
macro_rules! project_build_tag {
    ($const_identifier:ident) => {
        // Fully-qualified `::core::...` paths keep the expansion independent of
        // whatever the call site has imported.
        const $const_identifier: &::core::primitive::str = {
            // `option_env!` reads BUILD_TAG at compile time; both match arms
            // produce the same `[&str; 2]` type so the block stays const.
            const __ARG: &[&::core::primitive::str; 2] = &match ::core::option_env!("BUILD_TAG") {
                ::core::option::Option::Some(x) => ["build_tag-env:", x],
                ::core::option::Option::None => ["build_tag:", ""],
            };
            // `concatcp!` is reached through this crate's `__const_format`
            // re-export, so callers need no direct `const_format` dependency.
            $crate::__const_format::concatcp!(__ARG[0], __ARG[1])
        };
    };
}
/// Re-export for `project_git_version` macro /// Re-export for `project_git_version` macro
#[doc(hidden)] #[doc(hidden)]
pub use const_format as __const_format; pub use const_format as __const_format;

View File

@@ -1,7 +1,7 @@
#![warn(missing_docs)] #![warn(missing_docs)]
use camino::Utf8Path; use camino::Utf8Path;
use serde::{de::Visitor, Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use std::fmt; use std::fmt;
use std::ops::{Add, AddAssign}; use std::ops::{Add, AddAssign};
use std::str::FromStr; use std::str::FromStr;
@@ -13,114 +13,10 @@ use crate::seqwait::MonotonicCounter;
pub const XLOG_BLCKSZ: u32 = 8192; pub const XLOG_BLCKSZ: u32 = 8192;
/// A Postgres LSN (Log Sequence Number), also known as an XLogRecPtr /// A Postgres LSN (Log Sequence Number), also known as an XLogRecPtr
#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash)] #[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash, Serialize, Deserialize)]
#[serde(transparent)]
pub struct Lsn(pub u64); pub struct Lsn(pub u64);
impl Serialize for Lsn {
    /// Human-readable serializers get the `{hi_hex}/{lo_hex}` display form;
    /// binary serializers get the raw `u64`.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        match serializer.is_human_readable() {
            true => serializer.collect_str(self),
            false => self.0.serialize(serializer),
        }
    }
}
impl<'de> Deserialize<'de> for Lsn {
    /// Mirror of the `Serialize` impl: `{hi_hex}/{lo_hex}` string for
    /// human-readable formats, plain `u64` for binary formats.
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        struct LsnVisitor {
            // Only affects the `expecting` error message; accepted inputs are
            // determined by which visit_* method gets called.
            is_human_readable_deserializer: bool,
        }
        impl<'de> Visitor<'de> for LsnVisitor {
            type Value = Lsn;
            fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
                if self.is_human_readable_deserializer {
                    formatter.write_str(
                        "value in form of hex string({upper_u32_hex}/{lower_u32_hex}) representing u64 integer",
                    )
                } else {
                    formatter.write_str("value in form of integer(u64)")
                }
            }
            // Binary path: the u64 is the LSN.
            fn visit_u64<E>(self, v: u64) -> Result<Self::Value, E>
            where
                E: serde::de::Error,
            {
                Ok(Lsn(v))
            }
            // Human-readable path: parse the "hi/lo" hex form via FromStr.
            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
            where
                E: serde::de::Error,
            {
                Lsn::from_str(v).map_err(|e| E::custom(e))
            }
        }
        if deserializer.is_human_readable() {
            deserializer.deserialize_str(LsnVisitor {
                is_human_readable_deserializer: true,
            })
        } else {
            deserializer.deserialize_u64(LsnVisitor {
                is_human_readable_deserializer: false,
            })
        }
    }
}
/// Allows (de)serialization of an `Lsn` always as `u64`.
///
/// ### Example
///
/// ```rust
/// # use serde::{Serialize, Deserialize};
/// use utils::lsn::Lsn;
///
/// #[derive(PartialEq, Serialize, Deserialize, Debug)]
/// struct Foo {
///     #[serde(with = "utils::lsn::serde_as_u64")]
///     always_u64: Lsn,
/// }
///
/// let orig = Foo { always_u64: Lsn(1234) };
///
/// let res = serde_json::to_string(&orig).unwrap();
/// assert_eq!(res, r#"{"always_u64":1234}"#);
///
/// let foo = serde_json::from_str::<Foo>(&res).unwrap();
/// assert_eq!(foo, orig);
/// ```
///
pub mod serde_as_u64 {
    use super::Lsn;
    /// Serializes the Lsn as u64 regardless of the format's human readability.
    ///
    /// Meant to be used via `#[serde(with = "...")]` or `#[serde(serialize_with = "...")]`.
    pub fn serialize<S: serde::Serializer>(lsn: &Lsn, serializer: S) -> Result<S::Ok, S::Error> {
        serde::Serialize::serialize(&lsn.0, serializer)
    }
    /// Deserializes the Lsn as u64 regardless of the format's human readability.
    ///
    /// Meant to be used via `#[serde(with = "...")]` or `#[serde(deserialize_with = "...")]`.
    pub fn deserialize<'de, D: serde::Deserializer<'de>>(deserializer: D) -> Result<Lsn, D::Error> {
        let raw = <u64 as serde::Deserialize>::deserialize(deserializer)?;
        Ok(Lsn(raw))
    }
}
/// We tried to parse an LSN from a string, but failed /// We tried to parse an LSN from a string, but failed
#[derive(Debug, PartialEq, Eq, thiserror::Error)] #[derive(Debug, PartialEq, Eq, thiserror::Error)]
#[error("LsnParseError")] #[error("LsnParseError")]
@@ -368,13 +264,8 @@ impl MonotonicCounter<Lsn> for RecordLsn {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use crate::bin_ser::BeSer;
use super::*; use super::*;
use serde::ser::Serialize;
use serde_assert::{Deserializer, Serializer, Token, Tokens};
#[test] #[test]
fn test_lsn_strings() { fn test_lsn_strings() {
assert_eq!("12345678/AAAA5555".parse(), Ok(Lsn(0x12345678AAAA5555))); assert_eq!("12345678/AAAA5555".parse(), Ok(Lsn(0x12345678AAAA5555)));
@@ -450,95 +341,4 @@ mod tests {
assert_eq!(lsn.fetch_max(Lsn(6000)), Lsn(5678)); assert_eq!(lsn.fetch_max(Lsn(6000)), Lsn(5678));
assert_eq!(lsn.fetch_max(Lsn(5000)), Lsn(6000)); assert_eq!(lsn.fetch_max(Lsn(5000)), Lsn(6000));
} }
#[test]
fn test_lsn_serde() {
    // NOTE(review): the `readable_`/`non_readable_` variable names below are
    // swapped relative to the is_human_readable flags they come from. The
    // assertions themselves are internally consistent, so this is naming only.
    let original_lsn = Lsn(0x0123456789abcdef);
    let expected_readable_tokens = Tokens(vec![Token::U64(0x0123456789abcdef)]);
    let expected_non_readable_tokens =
        Tokens(vec![Token::Str(String::from("1234567/89ABCDEF"))]);
    // Testing binary (is_human_readable == false) ser/de: plain u64
    let serializer = Serializer::builder().is_human_readable(false).build();
    let readable_ser_tokens = original_lsn.serialize(&serializer).unwrap();
    assert_eq!(readable_ser_tokens, expected_readable_tokens);
    let mut deserializer = Deserializer::builder()
        .is_human_readable(false)
        .tokens(readable_ser_tokens)
        .build();
    let des_lsn = Lsn::deserialize(&mut deserializer).unwrap();
    assert_eq!(des_lsn, original_lsn);
    // Testing human-readable (is_human_readable == true) ser/de: "hi/lo" hex string
    let serializer = Serializer::builder().is_human_readable(true).build();
    let non_readable_ser_tokens = original_lsn.serialize(&serializer).unwrap();
    assert_eq!(non_readable_ser_tokens, expected_non_readable_tokens);
    let mut deserializer = Deserializer::builder()
        .is_human_readable(true)
        .tokens(non_readable_ser_tokens)
        .build();
    let des_lsn = Lsn::deserialize(&mut deserializer).unwrap();
    assert_eq!(des_lsn, original_lsn);
    // Testing mismatching ser/de: tokens from one mode must fail in the other
    let serializer = Serializer::builder().is_human_readable(false).build();
    let non_readable_ser_tokens = original_lsn.serialize(&serializer).unwrap();
    let mut deserializer = Deserializer::builder()
        .is_human_readable(true)
        .tokens(non_readable_ser_tokens)
        .build();
    Lsn::deserialize(&mut deserializer).unwrap_err();
    let serializer = Serializer::builder().is_human_readable(true).build();
    let readable_ser_tokens = original_lsn.serialize(&serializer).unwrap();
    let mut deserializer = Deserializer::builder()
        .is_human_readable(false)
        .tokens(readable_ser_tokens)
        .build();
    Lsn::deserialize(&mut deserializer).unwrap_err();
}
#[test]
fn test_lsn_ensure_roundtrip() {
    // Non-human-readable serialize followed by deserialize must reproduce the value.
    let want = Lsn(0xaaaabbbb);
    let serializer = Serializer::builder().is_human_readable(false).build();
    let tokens = want.serialize(&serializer).unwrap();
    let mut deserializer = Deserializer::builder()
        .is_human_readable(false)
        .tokens(tokens)
        .build();
    let got = Lsn::deserialize(&mut deserializer).unwrap();
    assert_eq!(got, want);
}
#[test]
fn test_lsn_bincode_serde() {
    // BeSer writes the u64 most-significant byte first.
    let lsn = Lsn(0x0123456789abcdef);
    let encoded = lsn.ser().unwrap();
    assert_eq!(encoded, [0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef]);
    assert_eq!(Lsn::des(&encoded).unwrap(), lsn);
}
#[test]
fn test_lsn_bincode_ensure_roundtrip() {
    // BeSer encode/decode must be a lossless round trip.
    let want = Lsn(0x01_02_03_04_05_06_07_08);
    let encoded = want.ser().unwrap();
    assert_eq!(encoded, vec![0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08]);
    assert_eq!(Lsn::des(&encoded).unwrap(), want);
}
} }

View File

@@ -3,6 +3,7 @@ use std::time::{Duration, SystemTime};
use bytes::{Buf, BufMut, Bytes, BytesMut}; use bytes::{Buf, BufMut, Bytes, BytesMut};
use pq_proto::{read_cstr, PG_EPOCH}; use pq_proto::{read_cstr, PG_EPOCH};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use tracing::{trace, warn}; use tracing::{trace, warn};
use crate::lsn::Lsn; use crate::lsn::Lsn;
@@ -14,17 +15,21 @@ use crate::lsn::Lsn;
/// ///
/// serde Serialize is used only for human readable dump to json (e.g. in /// serde Serialize is used only for human readable dump to json (e.g. in
/// safekeepers debug_dump). /// safekeepers debug_dump).
#[serde_as]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub struct PageserverFeedback { pub struct PageserverFeedback {
/// Last known size of the timeline. Used to enforce timeline size limit. /// Last known size of the timeline. Used to enforce timeline size limit.
pub current_timeline_size: u64, pub current_timeline_size: u64,
/// LSN last received and ingested by the pageserver. Controls backpressure. /// LSN last received and ingested by the pageserver. Controls backpressure.
#[serde_as(as = "DisplayFromStr")]
pub last_received_lsn: Lsn, pub last_received_lsn: Lsn,
/// LSN up to which data is persisted by the pageserver to its local disc. /// LSN up to which data is persisted by the pageserver to its local disc.
/// Controls backpressure. /// Controls backpressure.
#[serde_as(as = "DisplayFromStr")]
pub disk_consistent_lsn: Lsn, pub disk_consistent_lsn: Lsn,
/// LSN up to which data is persisted by the pageserver on s3; safekeepers /// LSN up to which data is persisted by the pageserver on s3; safekeepers
/// consider WAL before it can be removed. /// consider WAL before it can be removed.
#[serde_as(as = "DisplayFromStr")]
pub remote_consistent_lsn: Lsn, pub remote_consistent_lsn: Lsn,
// Serialize with RFC3339 format. // Serialize with RFC3339 format.
#[serde(with = "serde_systemtime")] #[serde(with = "serde_systemtime")]

View File

@@ -125,9 +125,6 @@ where
// Wake everyone with an error. // Wake everyone with an error.
let mut internal = self.internal.lock().unwrap(); let mut internal = self.internal.lock().unwrap();
// Block any future waiters from starting
internal.shutdown = true;
// This will steal the entire waiters map. // This will steal the entire waiters map.
// When we drop it all waiters will be woken. // When we drop it all waiters will be woken.
mem::take(&mut internal.waiters) mem::take(&mut internal.waiters)

View File

@@ -1,7 +1,6 @@
/// Immediately terminate the calling process without calling /// Immediately terminate the calling process without calling
/// atexit callbacks, C runtime destructors etc. We mainly use /// atexit callbacks, C runtime destructors etc. We mainly use
/// this to protect coverage data from concurrent writes. /// this to protect coverage data from concurrent writes.
pub fn exit_now(code: u8) -> ! { pub fn exit_now(code: u8) {
// SAFETY: exiting is safe, the ffi is not safe
unsafe { nix::libc::_exit(code as _) }; unsafe { nix::libc::_exit(code as _) };
} }

View File

@@ -1,3 +0,0 @@
pub mod heavier_once_cell;
pub mod gate;

View File

@@ -1,158 +0,0 @@
use std::{sync::Arc, time::Duration};
/// Gates are a concurrency helper, primarily used for implementing safe shutdown.
///
/// Users of a resource call `enter()` to acquire a GateGuard, and the owner of
/// the resource calls `close()` when they want to ensure that all holders of guards
/// have released them, and that no future guards will be issued.
pub struct Gate {
    /// Each caller of enter() takes one unit from the semaphore. In close(), we
    /// take all the units to ensure all GateGuards are destroyed.
    sem: Arc<tokio::sync::Semaphore>,
    /// For observability only: a name that will be used to log warnings if a particular
    /// gate is holding up shutdown
    name: String,
}
/// RAII guard for a [`Gate`]: as long as this exists, calls to [`Gate::close`] will
/// not complete.
///
/// Holds one unit of the gate's semaphore; dropping the guard returns it.
#[derive(Debug)]
pub struct GateGuard(tokio::sync::OwnedSemaphorePermit);
/// Observability helper: drive `fut` to completion, logging a warning with the
/// gate's `name` and the elapsed time every `warn_period` until it finishes.
async fn warn_if_stuck<Fut: std::future::Future>(
    fut: Fut,
    name: &str,
    warn_period: std::time::Duration,
) -> <Fut as std::future::Future>::Output {
    let started = std::time::Instant::now();
    let mut fut = std::pin::pin!(fut);
    loop {
        // Poll for at most one warn_period; on timeout, warn and keep waiting.
        if let Ok(output) = tokio::time::timeout(warn_period, &mut fut).await {
            return output;
        }
        tracing::warn!(
            gate = name,
            elapsed_ms = started.elapsed().as_millis(),
            "still waiting, taking longer than expected..."
        );
    }
}
/// Error returned by [`Gate::enter`] once the gate has been closed.
#[derive(Debug)]
pub enum GateError {
    GateClosed,
}
impl Gate {
    /// Total number of semaphore units: each live GateGuard holds one, and
    /// `do_close` acquires all of them at once to prove no guards remain.
    const MAX_UNITS: u32 = u32::MAX;
    /// Create an open gate; `name` appears only in log messages.
    pub fn new(name: String) -> Self {
        Self {
            sem: Arc::new(tokio::sync::Semaphore::new(Self::MAX_UNITS as usize)),
            name,
        }
    }
    /// Acquire a guard that will prevent close() calls from completing. If close()
    /// was already called, this will return an error which should be interpreted
    /// as "shutting down".
    ///
    /// This function would typically be used from e.g. request handlers. While holding
    /// the guard returned from this function, it is important to respect a CancellationToken
    /// to avoid blocking close() indefinitely: typically types that contain a Gate will
    /// also contain a CancellationToken.
    pub fn enter(&self) -> Result<GateGuard, GateError> {
        // try_acquire_owned fails once the semaphore is closed (or while a
        // pending close() holds/queues for the units); either way, map to GateClosed.
        self.sem
            .clone()
            .try_acquire_owned()
            .map(GateGuard)
            .map_err(|_| GateError::GateClosed)
    }
    /// Types with a shutdown() method and a gate should call this method at the
    /// end of shutdown, to ensure that all GateGuard holders are done.
    ///
    /// This will wait for all guards to be destroyed. For this to complete promptly, it is
    /// important that the holders of such guards are respecting a CancellationToken which has
    /// been cancelled before entering this function.
    pub async fn close(&self) {
        // Warn once per second while close is stuck on outstanding guards.
        warn_if_stuck(self.do_close(), &self.name, Duration::from_millis(1000)).await
    }
    /// Check if [`Self::close()`] has finished waiting for all [`Self::enter()`] users to finish. This
    /// is usually analoguous for "Did shutdown finish?" for types that include a Gate, whereas checking
    /// the CancellationToken on such types is analogous to "Did shutdown start?"
    pub fn close_complete(&self) -> bool {
        self.sem.is_closed()
    }
    // Wait until every outstanding GateGuard is dropped, then close the
    // semaphore so no further guards can ever be issued.
    async fn do_close(&self) {
        tracing::debug!(gate = self.name, "Closing Gate...");
        match self.sem.acquire_many(Self::MAX_UNITS).await {
            Ok(_units) => {
                // While holding all units, close the semaphore. All subsequent calls to enter() will fail.
                self.sem.close();
            }
            Err(_) => {
                // Semaphore closed: we are the only function that can do this, so it indicates a double-call.
                // This is legal. Timeline::shutdown for example is not protected from being called more than
                // once.
                tracing::debug!(gate = self.name, "Double close")
            }
        }
        tracing::debug!(gate = self.name, "Closed Gate.")
    }
}
#[cfg(test)]
mod tests {
    use futures::FutureExt;
    use super::*;
    #[tokio::test]
    async fn test_idle_gate() {
        // Having taken no gates, we should not be blocked in close
        let gate = Gate::new("test".to_string());
        gate.close().await;
        // If a guard is dropped before entering, close should not be blocked
        let gate = Gate::new("test".to_string());
        let guard = gate.enter().unwrap();
        drop(guard);
        gate.close().await;
        // Entering a closed gate fails
        gate.enter().expect_err("enter should fail after close");
    }
    #[tokio::test]
    async fn test_busy_gate() {
        let gate = Gate::new("test".to_string());
        let guard = gate.enter().unwrap();
        // now_or_never polls close() exactly once without waiting on it
        let mut close_fut = std::pin::pin!(gate.close());
        // Close should be blocked
        assert!(close_fut.as_mut().now_or_never().is_none());
        // Attempting to enter() should fail, even though close isn't done yet.
        // (close()'s pending acquire_many is queued ahead of us, so try_acquire
        // cannot succeed — tokio semaphores hand out permits fairly.)
        gate.enter()
            .expect_err("enter should fail after entering close");
        drop(guard);
        // Guard is gone, close should finish
        assert!(close_fut.as_mut().now_or_never().is_some());
        // Attempting to enter() is still forbidden
        gate.enter().expect_err("enter should fail finishing close");
    }
}

View File

@@ -1,383 +0,0 @@
use std::sync::{
atomic::{AtomicUsize, Ordering},
Arc, Mutex, MutexGuard,
};
use tokio::sync::Semaphore;
/// Custom design like [`tokio::sync::OnceCell`] but using [`OwnedSemaphorePermit`] instead of
/// `SemaphorePermit`, allowing use of `take` which does not require holding an outer mutex guard
/// for the duration of initialization.
///
/// Has no unsafe, builds upon [`tokio::sync::Semaphore`] and [`std::sync::Mutex`].
///
/// [`OwnedSemaphorePermit`]: tokio::sync::OwnedSemaphorePermit
pub struct OnceCell<T> {
    /// Guards the value and its init semaphore (see `Inner` for the state encoding).
    inner: Mutex<Inner<T>>,
    /// Number of `get_or_init` callers currently queued for the init permit;
    /// exposed via `initializer_count`.
    initializers: AtomicUsize,
}
impl<T> Default for OnceCell<T> {
/// Create new uninitialized [`OnceCell`].
fn default() -> Self {
Self {
inner: Default::default(),
initializers: AtomicUsize::new(0),
}
}
}
/// Semaphore is the current state:
/// - open semaphore means the value is `None`, not yet initialized
/// - closed semaphore means the value has been initialized
#[derive(Debug)]
struct Inner<T> {
    // Arc so that `get_or_init` can clone it and wait without holding the Mutex.
    init_semaphore: Arc<Semaphore>,
    value: Option<T>,
}
impl<T> Default for Inner<T> {
fn default() -> Self {
Self {
init_semaphore: Arc::new(Semaphore::new(1)),
value: None,
}
}
}
impl<T> OnceCell<T> {
    /// Creates an already initialized `OnceCell` with the given value.
    pub fn new(value: T) -> Self {
        // A closed semaphore encodes the "initialized" state; see `Inner`.
        let sem = Semaphore::new(1);
        sem.close();
        Self {
            inner: Mutex::new(Inner {
                init_semaphore: Arc::new(sem),
                value: Some(value),
            }),
            initializers: AtomicUsize::new(0),
        }
    }
    /// Returns a guard to an existing initialized value, or uniquely initializes the value before
    /// returning the guard.
    ///
    /// Initializing might wait on any existing [`Guard::take_and_deinit`] deinitialization.
    ///
    /// Initialization is panic-safe and cancellation-safe.
    pub async fn get_or_init<F, Fut, E>(&self, factory: F) -> Result<Guard<'_, T>, E>
    where
        F: FnOnce(InitPermit) -> Fut,
        Fut: std::future::Future<Output = Result<(T, InitPermit), E>>,
    {
        // Fast path: already initialized. Otherwise clone the semaphore so we
        // do NOT hold the mutex while waiting for the init permit.
        let sem = {
            let guard = self.inner.lock().unwrap();
            if guard.value.is_some() {
                return Ok(Guard(guard));
            }
            guard.init_semaphore.clone()
        };
        let permit = {
            // increment the count for the duration of queued
            let _guard = CountWaitingInitializers::start(self);
            sem.acquire_owned().await
        };
        match permit {
            Ok(permit) => {
                // We won the right to initialize. The permit travels through the
                // factory; if the factory errors or its future is dropped
                // (panic/cancellation), the permit is dropped and the semaphore
                // unit is released, letting another caller attempt initialization.
                let permit = InitPermit(permit);
                let (value, _permit) = factory(permit).await?;
                let guard = self.inner.lock().unwrap();
                Ok(Self::set0(value, guard))
            }
            Err(_closed) => {
                // Semaphore was closed while we waited: another task completed
                // initialization (set0 closes the semaphore).
                let guard = self.inner.lock().unwrap();
                assert!(
                    guard.value.is_some(),
                    "semaphore got closed, must be initialized"
                );
                return Ok(Guard(guard));
            }
        }
    }
    /// Assuming a permit is held after previous call to [`Guard::take_and_deinit`], it can be used
    /// to complete initializing the inner value.
    ///
    /// # Panics
    ///
    /// If the inner has already been initialized.
    pub fn set(&self, value: T, _permit: InitPermit) -> Guard<'_, T> {
        let guard = self.inner.lock().unwrap();
        // cannot assert that this permit is for self.inner.semaphore, but we can assert it cannot
        // give more permits right now.
        if guard.init_semaphore.try_acquire().is_ok() {
            drop(guard);
            panic!("permit is of wrong origin");
        }
        Self::set0(value, guard)
    }
    // Store the value and close the semaphore; a closed semaphore is what
    // signals "initialized" to concurrent and future get_or_init callers.
    fn set0(value: T, mut guard: std::sync::MutexGuard<'_, Inner<T>>) -> Guard<'_, T> {
        if guard.value.is_some() {
            drop(guard);
            unreachable!("we won permit, must not be initialized");
        }
        guard.value = Some(value);
        guard.init_semaphore.close();
        Guard(guard)
    }
    /// Returns a guard to an existing initialized value, if any.
    pub fn get(&self) -> Option<Guard<'_, T>> {
        let guard = self.inner.lock().unwrap();
        if guard.value.is_some() {
            Some(Guard(guard))
        } else {
            None
        }
    }
    /// Return the number of [`Self::get_or_init`] calls waiting for initialization to complete.
    pub fn initializer_count(&self) -> usize {
        self.initializers.load(Ordering::Relaxed)
    }
}
/// DropGuard counter for queued tasks waiting to initialize, mainly accessible for the
/// initializing task for example at the end of initialization.
///
/// Increments `OnceCell::initializers` on construction, decrements on drop, so
/// the count stays accurate even if the waiting future is cancelled.
struct CountWaitingInitializers<'a, T>(&'a OnceCell<T>);
impl<'a, T> CountWaitingInitializers<'a, T> {
    fn start(target: &'a OnceCell<T>) -> Self {
        target.initializers.fetch_add(1, Ordering::Relaxed);
        CountWaitingInitializers(target)
    }
}
impl<'a, T> Drop for CountWaitingInitializers<'a, T> {
    fn drop(&mut self) {
        self.0.initializers.fetch_sub(1, Ordering::Relaxed);
    }
}
/// Uninteresting guard object to allow short-lived access to inspect or clone the held,
/// initialized value.
///
/// Wraps the cell's mutex guard, so holding it blocks all other access to the cell.
#[derive(Debug)]
pub struct Guard<'a, T>(MutexGuard<'a, Inner<T>>);
impl<T> std::ops::Deref for Guard<'_, T> {
    type Target = T;
    fn deref(&self) -> &Self::Target {
        self.0
            .value
            .as_ref()
            .expect("guard is not created unless value has been initialized")
    }
}
impl<T> std::ops::DerefMut for Guard<'_, T> {
    fn deref_mut(&mut self) -> &mut Self::Target {
        self.0
            .value
            .as_mut()
            .expect("guard is not created unless value has been initialized")
    }
}
impl<'a, T> Guard<'a, T> {
    /// Take the current value, and a new permit for it's deinitialization.
    ///
    /// The permit will be on a semaphore part of the new internal value, and any following
    /// [`OnceCell::get_or_init`] will wait on it to complete.
    pub fn take_and_deinit(&mut self) -> (T, InitPermit) {
        // Build a fresh uninitialized Inner, grab its single permit up front,
        // then swap it into the cell — the old Inner (with the value) comes out.
        let mut swapped = Inner::default();
        let permit = swapped
            .init_semaphore
            .clone()
            .try_acquire_owned()
            .expect("we just created this");
        std::mem::swap(&mut *self.0, &mut swapped);
        swapped
            .value
            .map(|v| (v, InitPermit(permit)))
            .expect("guard is not created unless value has been initialized")
    }
}
/// Type held by OnceCell (de)initializing task.
///
/// Wraps the single unit of the cell's init semaphore; dropping it without
/// completing initialization releases the unit so another task may try.
pub struct InitPermit(tokio::sync::OwnedSemaphorePermit);
#[cfg(test)]
mod tests {
use super::*;
use std::{
convert::Infallible,
sync::atomic::{AtomicUsize, Ordering},
time::Duration,
};
// 100 tasks race through get_or_init; exactly one factory may be invoked,
// exactly one future polled, and exactly one task may see its own value win.
#[tokio::test]
async fn many_initializers() {
    #[derive(Default, Debug)]
    struct Counters {
        factory_got_to_run: AtomicUsize,
        future_polled: AtomicUsize,
        winners: AtomicUsize,
    }
    let initializers = 100;
    let cell = Arc::new(OnceCell::default());
    let counters = Arc::new(Counters::default());
    // +1 so the spawner participates in the barrier and releases all tasks at once
    let barrier = Arc::new(tokio::sync::Barrier::new(initializers + 1));
    let mut js = tokio::task::JoinSet::new();
    for i in 0..initializers {
        js.spawn({
            let cell = cell.clone();
            let counters = counters.clone();
            let barrier = barrier.clone();
            async move {
                barrier.wait().await;
                let won = {
                    let g = cell
                        .get_or_init(|permit| {
                            counters.factory_got_to_run.fetch_add(1, Ordering::Relaxed);
                            async {
                                counters.future_polled.fetch_add(1, Ordering::Relaxed);
                                Ok::<_, Infallible>((i, permit))
                            }
                        })
                        .await
                        .unwrap();
                    // The stored value equals this task's index only for the winner.
                    *g == i
                };
                if won {
                    counters.winners.fetch_add(1, Ordering::Relaxed);
                }
            }
        });
    }
    barrier.wait().await;
    while let Some(next) = js.join_next().await {
        next.expect("no panics expected");
    }
    let mut counters = Arc::try_unwrap(counters).unwrap();
    assert_eq!(*counters.factory_got_to_run.get_mut(), 1);
    assert_eq!(*counters.future_polled.get_mut(), 1);
    assert_eq!(*counters.winners.get_mut(), 1);
}
#[tokio::test(start_paused = true)]
async fn reinit_waits_for_deinit() {
// with the tokio::time paused, we will "sleep" for 1s while holding the reinitialization
let sleep_for = Duration::from_secs(1);
let initial = 42;
let reinit = 1;
let cell = Arc::new(OnceCell::new(initial));
let deinitialization_started = Arc::new(tokio::sync::Barrier::new(2));
let jh = tokio::spawn({
let cell = cell.clone();
let deinitialization_started = deinitialization_started.clone();
async move {
let (answer, _permit) = cell.get().expect("initialized to value").take_and_deinit();
assert_eq!(answer, initial);
deinitialization_started.wait().await;
tokio::time::sleep(sleep_for).await;
}
});
deinitialization_started.wait().await;
let started_at = tokio::time::Instant::now();
cell.get_or_init(|permit| async { Ok::<_, Infallible>((reinit, permit)) })
.await
.unwrap();
let elapsed = started_at.elapsed();
assert!(
elapsed >= sleep_for,
"initialization should had taken at least the time time slept with permit"
);
jh.await.unwrap();
assert_eq!(*cell.get().unwrap(), reinit);
}
#[test]
fn reinit_with_deinit_permit() {
let cell = Arc::new(OnceCell::new(42));
let (mol, permit) = cell.get().unwrap().take_and_deinit();
cell.set(5, permit);
assert_eq!(*cell.get().unwrap(), 5);
let (five, permit) = cell.get().unwrap().take_and_deinit();
assert_eq!(5, five);
cell.set(mol, permit);
assert_eq!(*cell.get().unwrap(), 42);
}
#[tokio::test]
async fn initialization_attemptable_until_ok() {
let cell = OnceCell::default();
for _ in 0..10 {
cell.get_or_init(|_permit| async { Err("whatever error") })
.await
.unwrap_err();
}
let g = cell
.get_or_init(|permit| async { Ok::<_, Infallible>(("finally success", permit)) })
.await
.unwrap();
assert_eq!(*g, "finally success");
}
#[tokio::test]
async fn initialization_is_cancellation_safe() {
let cell = OnceCell::default();
let barrier = tokio::sync::Barrier::new(2);
let initializer = cell.get_or_init(|permit| async {
barrier.wait().await;
futures::future::pending::<()>().await;
Ok::<_, Infallible>(("never reached", permit))
});
tokio::select! {
_ = initializer => { unreachable!("cannot complete; stuck in pending().await") },
_ = barrier.wait() => {}
};
// now initializer is dropped
assert!(cell.get().is_none());
let g = cell
.get_or_init(|permit| async { Ok::<_, Infallible>(("now initialized", permit)) })
.await
.unwrap();
assert_eq!(*g, "now initialized");
}
}

View File

@@ -1,37 +0,0 @@
use std::time::Duration;
use tokio_util::sync::CancellationToken;
pub enum TimeoutCancellableError {
Timeout,
Cancelled,
}
/// Wrap [`tokio::time::timeout`] with a CancellationToken.
///
/// This wrapper is appropriate for any long running operation in a task
/// that ought to respect a CancellationToken (which means most tasks).
///
/// The only time you should use a bare tokio::timeout is when the future `F`
/// itself respects a CancellationToken: otherwise, always use this wrapper
/// with your CancellationToken to ensure that your task does not hold up
/// graceful shutdown.
pub async fn timeout_cancellable<F>(
duration: Duration,
cancel: &CancellationToken,
future: F,
) -> Result<F::Output, TimeoutCancellableError>
where
F: std::future::Future,
{
tokio::select!(
r = tokio::time::timeout(duration, future) => {
r.map_err(|_| TimeoutCancellableError::Timeout)
},
_ = cancel.cancelled() => {
Err(TimeoutCancellableError::Cancelled)
}
)
}

View File

@@ -19,12 +19,13 @@ inotify.workspace = true
serde.workspace = true serde.workspace = true
serde_json.workspace = true serde_json.workspace = true
sysinfo.workspace = true sysinfo.workspace = true
tokio = { workspace = true, features = ["rt-multi-thread"] } tokio.workspace = true
tokio-postgres.workspace = true tokio-postgres.workspace = true
tokio-stream.workspace = true tokio-stream.workspace = true
tokio-util.workspace = true tokio-util.workspace = true
tracing.workspace = true tracing.workspace = true
tracing-subscriber.workspace = true tracing-subscriber.workspace = true
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
[target.'cfg(target_os = "linux")'.dependencies] [target.'cfg(target_os = "linux")'.dependencies]
cgroups-rs = "0.3.3" cgroups-rs = "0.3.3"

View File

@@ -21,6 +21,11 @@ pub struct FileCacheState {
#[derive(Debug)] #[derive(Debug)]
pub struct FileCacheConfig { pub struct FileCacheConfig {
/// Whether the file cache is *actually* stored in memory (e.g. by writing to
/// a tmpfs or shmem file). If true, the size of the file cache will be counted against the
/// memory available for the cgroup.
pub(crate) in_memory: bool,
/// The size of the file cache, in terms of the size of the resource it consumes /// The size of the file cache, in terms of the size of the resource it consumes
/// (currently: only memory) /// (currently: only memory)
/// ///
@@ -54,9 +59,22 @@ pub struct FileCacheConfig {
spread_factor: f64, spread_factor: f64,
} }
impl Default for FileCacheConfig { impl FileCacheConfig {
fn default() -> Self { pub fn default_in_memory() -> Self {
Self { Self {
in_memory: true,
// 75 %
resource_multiplier: 0.75,
// 640 MiB; (512 + 128)
min_remaining_after_cache: NonZeroU64::new(640 * MiB).unwrap(),
// ensure any increase in file cache size is split 90-10 with 10% to other memory
spread_factor: 0.1,
}
}
pub fn default_on_disk() -> Self {
Self {
in_memory: false,
resource_multiplier: 0.75, resource_multiplier: 0.75,
// 256 MiB - lower than when in memory because overcommitting is safe; if we don't have // 256 MiB - lower than when in memory because overcommitting is safe; if we don't have
// memory, the kernel will just evict from its page cache, rather than e.g. killing // memory, the kernel will just evict from its page cache, rather than e.g. killing
@@ -65,9 +83,7 @@ impl Default for FileCacheConfig {
spread_factor: 0.1, spread_factor: 0.1,
} }
} }
}
impl FileCacheConfig {
/// Make sure fields of the config are consistent. /// Make sure fields of the config are consistent.
pub fn validate(&self) -> anyhow::Result<()> { pub fn validate(&self) -> anyhow::Result<()> {
// Single field validity // Single field validity

View File

@@ -1,5 +1,3 @@
#![deny(unsafe_code)]
#![deny(clippy::undocumented_unsafe_blocks)]
#![cfg(target_os = "linux")] #![cfg(target_os = "linux")]
use anyhow::Context; use anyhow::Context;
@@ -41,6 +39,16 @@ pub struct Args {
#[arg(short, long)] #[arg(short, long)]
pub pgconnstr: Option<String>, pub pgconnstr: Option<String>,
/// Flag to signal that the Postgres file cache is on disk (i.e. not in memory aside from the
/// kernel's page cache), and therefore should not count against available memory.
//
// NB: Ideally this flag would directly refer to whether the file cache is in memory (rather
// than a roundabout way, via whether it's on disk), but in order to be backwards compatible
// during the switch away from an in-memory file cache, we had to default to the previous
// behavior.
#[arg(long)]
pub file_cache_on_disk: bool,
/// The address we should listen on for connection requests. For the /// The address we should listen on for connection requests. For the
/// agent, this is 0.0.0.0:10301. For the informant, this is 127.0.0.1:10369. /// agent, this is 0.0.0.0:10301. For the informant, this is 127.0.0.1:10369.
#[arg(short, long)] #[arg(short, long)]

View File

@@ -156,7 +156,10 @@ impl Runner {
// memory limits. // memory limits.
if let Some(connstr) = &args.pgconnstr { if let Some(connstr) = &args.pgconnstr {
info!("initializing file cache"); info!("initializing file cache");
let config = FileCacheConfig::default(); let config = match args.file_cache_on_disk {
true => FileCacheConfig::default_on_disk(),
false => FileCacheConfig::default_in_memory(),
};
let mut file_cache = FileCacheState::new(connstr, config, token.clone()) let mut file_cache = FileCacheState::new(connstr, config, token.clone())
.await .await
@@ -184,7 +187,10 @@ impl Runner {
info!("file cache size actually got set to {actual_size}") info!("file cache size actually got set to {actual_size}")
} }
file_cache_disk_size = actual_size; if args.file_cache_on_disk {
file_cache_disk_size = actual_size;
}
state.filecache = Some(file_cache); state.filecache = Some(file_cache);
} }
@@ -233,11 +239,17 @@ impl Runner {
let requested_mem = target.mem; let requested_mem = target.mem;
let usable_system_memory = requested_mem.saturating_sub(self.config.sys_buffer_bytes); let usable_system_memory = requested_mem.saturating_sub(self.config.sys_buffer_bytes);
let expected_file_cache_size = self let (expected_file_cache_size, expected_file_cache_disk_size) = self
.filecache .filecache
.as_ref() .as_ref()
.map(|file_cache| file_cache.config.calculate_cache_size(usable_system_memory)) .map(|file_cache| {
.unwrap_or(0); let size = file_cache.config.calculate_cache_size(usable_system_memory);
match file_cache.config.in_memory {
true => (size, 0),
false => (size, size),
}
})
.unwrap_or((0, 0));
if let Some(cgroup) = &self.cgroup { if let Some(cgroup) = &self.cgroup {
let (last_time, last_history) = *cgroup.watcher.borrow(); let (last_time, last_history) = *cgroup.watcher.borrow();
@@ -261,7 +273,7 @@ impl Runner {
let new_threshold = self let new_threshold = self
.config .config
.cgroup_threshold(usable_system_memory, expected_file_cache_size); .cgroup_threshold(usable_system_memory, expected_file_cache_disk_size);
let current = last_history.avg_non_reclaimable; let current = last_history.avg_non_reclaimable;
@@ -288,10 +300,13 @@ impl Runner {
.set_file_cache_size(expected_file_cache_size) .set_file_cache_size(expected_file_cache_size)
.await .await
.context("failed to set file cache size")?; .context("failed to set file cache size")?;
file_cache_disk_size = actual_usage; if !file_cache.config.in_memory {
file_cache_disk_size = actual_usage;
}
let message = format!( let message = format!(
"set file cache size to {} MiB", "set file cache size to {} MiB (in memory = {})",
bytes_to_mebibytes(actual_usage), bytes_to_mebibytes(actual_usage),
file_cache.config.in_memory,
); );
info!("downscale: {message}"); info!("downscale: {message}");
status.push(message); status.push(message);
@@ -342,7 +357,9 @@ impl Runner {
.set_file_cache_size(expected_usage) .set_file_cache_size(expected_usage)
.await .await
.context("failed to set file cache size")?; .context("failed to set file cache size")?;
file_cache_disk_size = actual_usage; if !file_cache.config.in_memory {
file_cache_disk_size = actual_usage;
}
if actual_usage != expected_usage { if actual_usage != expected_usage {
warn!( warn!(

View File

@@ -188,7 +188,6 @@ extern "C" fn recovery_download(
} }
} }
#[allow(clippy::unnecessary_cast)]
extern "C" fn wal_read( extern "C" fn wal_read(
sk: *mut Safekeeper, sk: *mut Safekeeper,
buf: *mut ::std::os::raw::c_char, buf: *mut ::std::os::raw::c_char,
@@ -422,7 +421,6 @@ impl std::fmt::Display for Level {
} }
/// Take ownership of `Vec<u8>` from StringInfoData. /// Take ownership of `Vec<u8>` from StringInfoData.
#[allow(clippy::unnecessary_cast)]
pub(crate) fn take_vec_u8(pg: &mut StringInfoData) -> Option<Vec<u8>> { pub(crate) fn take_vec_u8(pg: &mut StringInfoData) -> Option<Vec<u8>> {
if pg.data.is_null() { if pg.data.is_null() {
return None; return None;

View File

@@ -186,7 +186,7 @@ impl Wrapper {
.unwrap() .unwrap()
.into_bytes_with_nul(); .into_bytes_with_nul();
assert!(safekeepers_list_vec.len() == safekeepers_list_vec.capacity()); assert!(safekeepers_list_vec.len() == safekeepers_list_vec.capacity());
let safekeepers_list = safekeepers_list_vec.as_mut_ptr() as *mut std::ffi::c_char; let safekeepers_list = safekeepers_list_vec.as_mut_ptr() as *mut i8;
let callback_data = Box::into_raw(Box::new(api)) as *mut ::std::os::raw::c_void; let callback_data = Box::into_raw(Box::new(api)) as *mut ::std::os::raw::c_void;

View File

@@ -1,21 +1,22 @@
use utils::auth::{AuthError, Claims, Scope}; use anyhow::{bail, Result};
use utils::auth::{Claims, Scope};
use utils::id::TenantId; use utils::id::TenantId;
pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<(), AuthError> { pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<()> {
match (&claims.scope, tenant_id) { match (&claims.scope, tenant_id) {
(Scope::Tenant, None) => Err(AuthError( (Scope::Tenant, None) => {
"Attempt to access management api with tenant scope. Permission denied".into(), bail!("Attempt to access management api with tenant scope. Permission denied")
)), }
(Scope::Tenant, Some(tenant_id)) => { (Scope::Tenant, Some(tenant_id)) => {
if claims.tenant_id.unwrap() != tenant_id { if claims.tenant_id.unwrap() != tenant_id {
return Err(AuthError("Tenant id mismatch. Permission denied".into())); bail!("Tenant id mismatch. Permission denied")
} }
Ok(()) Ok(())
} }
(Scope::PageServerApi, None) => Ok(()), // access to management api for PageServerApi scope (Scope::PageServerApi, None) => Ok(()), // access to management api for PageServerApi scope
(Scope::PageServerApi, Some(_)) => Ok(()), // access to tenant api using PageServerApi scope (Scope::PageServerApi, Some(_)) => Ok(()), // access to tenant api using PageServerApi scope
(Scope::SafekeeperData, _) => Err(AuthError( (Scope::SafekeeperData, _) => {
"SafekeeperData scope makes no sense for Pageserver".into(), bail!("SafekeeperData scope makes no sense for Pageserver")
)), }
} }
} }

View File

@@ -34,15 +34,11 @@ use postgres_backend::AuthType;
use utils::logging::TracingErrorLayerEnablement; use utils::logging::TracingErrorLayerEnablement;
use utils::signals::ShutdownSignals; use utils::signals::ShutdownSignals;
use utils::{ use utils::{
auth::{JwtAuth, SwappableJwtAuth}, auth::JwtAuth, logging, project_git_version, sentry_init::init_sentry, signals::Signal,
logging, project_build_tag, project_git_version,
sentry_init::init_sentry,
signals::Signal,
tcp_listener, tcp_listener,
}; };
project_git_version!(GIT_VERSION); project_git_version!(GIT_VERSION);
project_build_tag!(BUILD_TAG);
const PID_FILE_NAME: &str = "pageserver.pid"; const PID_FILE_NAME: &str = "pageserver.pid";
@@ -262,12 +258,11 @@ fn start_pageserver(
// A changed version string indicates changed software. // A changed version string indicates changed software.
// A changed launch timestamp indicates a pageserver restart. // A changed launch timestamp indicates a pageserver restart.
info!( info!(
"version: {} launch_timestamp: {} build_tag: {}", "version: {} launch_timestamp: {}",
version(), version(),
launch_ts.to_string(), launch_ts.to_string()
BUILD_TAG,
); );
set_build_info_metric(GIT_VERSION, BUILD_TAG); set_build_info_metric(GIT_VERSION);
set_launch_timestamp_metric(launch_ts); set_launch_timestamp_metric(launch_ts);
pageserver::preinitialize_metrics(); pageserver::preinitialize_metrics();
@@ -324,12 +319,13 @@ fn start_pageserver(
let http_auth; let http_auth;
let pg_auth; let pg_auth;
if conf.http_auth_type == AuthType::NeonJWT || conf.pg_auth_type == AuthType::NeonJWT { if conf.http_auth_type == AuthType::NeonJWT || conf.pg_auth_type == AuthType::NeonJWT {
// unwrap is ok because check is performed when creating config, so path is set and exists // unwrap is ok because check is performed when creating config, so path is set and file exists
let key_path = conf.auth_validation_public_key_path.as_ref().unwrap(); let key_path = conf.auth_validation_public_key_path.as_ref().unwrap();
info!("Loading public key(s) for verifying JWT tokens from {key_path:?}"); info!(
"Loading public key for verifying JWT tokens from {:#?}",
let jwt_auth = JwtAuth::from_key_path(key_path)?; key_path
let auth: Arc<SwappableJwtAuth> = Arc::new(SwappableJwtAuth::new(jwt_auth)); );
let auth: Arc<JwtAuth> = Arc::new(JwtAuth::from_key_path(key_path)?);
http_auth = match &conf.http_auth_type { http_auth = match &conf.http_auth_type {
AuthType::Trust => None, AuthType::Trust => None,
@@ -412,7 +408,7 @@ fn start_pageserver(
// Scan the local 'tenants/' directory and start loading the tenants // Scan the local 'tenants/' directory and start loading the tenants
let deletion_queue_client = deletion_queue.new_client(); let deletion_queue_client = deletion_queue.new_client();
let tenant_manager = BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr( BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
conf, conf,
TenantSharedResources { TenantSharedResources {
broker_client: broker_client.clone(), broker_client: broker_client.clone(),
@@ -422,7 +418,6 @@ fn start_pageserver(
order, order,
shutdown_pageserver.clone(), shutdown_pageserver.clone(),
))?; ))?;
let tenant_manager = Arc::new(tenant_manager);
BACKGROUND_RUNTIME.spawn({ BACKGROUND_RUNTIME.spawn({
let init_done_rx = init_done_rx; let init_done_rx = init_done_rx;
@@ -551,7 +546,6 @@ fn start_pageserver(
let router_state = Arc::new( let router_state = Arc::new(
http::routes::State::new( http::routes::State::new(
conf, conf,
tenant_manager,
http_auth.clone(), http_auth.clone(),
remote_storage.clone(), remote_storage.clone(),
broker_client.clone(), broker_client.clone(),

View File

@@ -33,7 +33,8 @@ use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig;
use crate::tenant::config::TenantConf; use crate::tenant::config::TenantConf;
use crate::tenant::config::TenantConfOpt; use crate::tenant::config::TenantConfOpt;
use crate::tenant::{ use crate::tenant::{
TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME, TENANTS_SEGMENT_NAME, TENANT_ATTACHING_MARKER_FILENAME, TENANT_DELETED_MARKER_FILE_NAME,
TIMELINES_SEGMENT_NAME,
}; };
use crate::{ use crate::{
IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_LOCATION_CONFIG_NAME, IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_LOCATION_CONFIG_NAME,
@@ -161,7 +162,7 @@ pub struct PageServerConf {
pub http_auth_type: AuthType, pub http_auth_type: AuthType,
/// authentication method for libpq connections from compute /// authentication method for libpq connections from compute
pub pg_auth_type: AuthType, pub pg_auth_type: AuthType,
/// Path to a file or directory containing public key(s) for verifying JWT tokens. /// Path to a file containing public key for verifying JWT tokens.
/// Used for both mgmt and compute auth, if enabled. /// Used for both mgmt and compute auth, if enabled.
pub auth_validation_public_key_path: Option<Utf8PathBuf>, pub auth_validation_public_key_path: Option<Utf8PathBuf>,
@@ -632,6 +633,11 @@ impl PageServerConf {
self.tenants_path().join(tenant_id.to_string()) self.tenants_path().join(tenant_id.to_string())
} }
pub fn tenant_attaching_mark_file_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
self.tenant_path(tenant_id)
.join(TENANT_ATTACHING_MARKER_FILENAME)
}
pub fn tenant_ignore_mark_file_path(&self, tenant_id: &TenantId) -> Utf8PathBuf { pub fn tenant_ignore_mark_file_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
self.tenant_path(tenant_id).join(IGNORED_TENANT_FILE_NAME) self.tenant_path(tenant_id).join(IGNORED_TENANT_FILE_NAME)
} }
@@ -1314,6 +1320,12 @@ broker_endpoint = '{broker_endpoint}'
assert_eq!( assert_eq!(
parsed_remote_storage_config, parsed_remote_storage_config,
RemoteStorageConfig { RemoteStorageConfig {
max_concurrent_syncs: NonZeroUsize::new(
remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS
)
.unwrap(),
max_sync_errors: NonZeroU32::new(remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS)
.unwrap(),
storage: RemoteStorageKind::LocalFs(local_storage_path.clone()), storage: RemoteStorageKind::LocalFs(local_storage_path.clone()),
}, },
"Remote storage config should correctly parse the local FS config and fill other storage defaults" "Remote storage config should correctly parse the local FS config and fill other storage defaults"
@@ -1374,6 +1386,8 @@ broker_endpoint = '{broker_endpoint}'
assert_eq!( assert_eq!(
parsed_remote_storage_config, parsed_remote_storage_config,
RemoteStorageConfig { RemoteStorageConfig {
max_concurrent_syncs,
max_sync_errors,
storage: RemoteStorageKind::AwsS3(S3Config { storage: RemoteStorageKind::AwsS3(S3Config {
bucket_name: bucket_name.clone(), bucket_name: bucket_name.clone(),
bucket_region: bucket_region.clone(), bucket_region: bucket_region.clone(),
@@ -1465,6 +1479,8 @@ threshold = "20m"
Some(DiskUsageEvictionTaskConfig { Some(DiskUsageEvictionTaskConfig {
max_usage_pct: Percent::new(80).unwrap(), max_usage_pct: Percent::new(80).unwrap(),
min_avail_bytes: 0, min_avail_bytes: 0,
target_avail_bytes: None,
target_usage_pct: None,
period: Duration::from_secs(10), period: Duration::from_secs(10),
#[cfg(feature = "testing")] #[cfg(feature = "testing")]
mock_statvfs: None, mock_statvfs: None,

View File

@@ -266,7 +266,7 @@ async fn calculate_synthetic_size_worker(
continue; continue;
} }
if let Ok(tenant) = mgr::get_tenant(tenant_id, true) { if let Ok(tenant) = mgr::get_tenant(tenant_id, true).await {
// TODO should we use concurrent_background_tasks_rate_limit() here, like the other background tasks? // TODO should we use concurrent_background_tasks_rate_limit() here, like the other background tasks?
// We can put in some prioritization for consumption metrics. // We can put in some prioritization for consumption metrics.
// Same for the loop that fetches computed metrics. // Same for the loop that fetches computed metrics.

View File

@@ -3,6 +3,7 @@ use anyhow::Context;
use chrono::{DateTime, Utc}; use chrono::{DateTime, Utc};
use consumption_metrics::EventType; use consumption_metrics::EventType;
use futures::stream::StreamExt; use futures::stream::StreamExt;
use serde_with::serde_as;
use std::{sync::Arc, time::SystemTime}; use std::{sync::Arc, time::SystemTime};
use utils::{ use utils::{
id::{TenantId, TimelineId}, id::{TenantId, TimelineId},
@@ -41,10 +42,13 @@ pub(super) enum Name {
/// ///
/// This is a denormalization done at the MetricsKey const methods; these should not be constructed /// This is a denormalization done at the MetricsKey const methods; these should not be constructed
/// elsewhere. /// elsewhere.
#[serde_with::serde_as]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)] #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
pub(crate) struct MetricsKey { pub(crate) struct MetricsKey {
#[serde_as(as = "serde_with::DisplayFromStr")]
pub(super) tenant_id: TenantId, pub(super) tenant_id: TenantId,
#[serde_as(as = "Option<serde_with::DisplayFromStr>")]
#[serde(skip_serializing_if = "Option::is_none")] #[serde(skip_serializing_if = "Option::is_none")]
pub(super) timeline_id: Option<TimelineId>, pub(super) timeline_id: Option<TimelineId>,
@@ -202,6 +206,7 @@ pub(super) async fn collect_all_metrics(
None None
} else { } else {
crate::tenant::mgr::get_tenant(id, true) crate::tenant::mgr::get_tenant(id, true)
.await
.ok() .ok()
.map(|tenant| (id, tenant)) .map(|tenant| (id, tenant))
} }

View File

@@ -1,4 +1,5 @@
use consumption_metrics::{Event, EventChunk, IdempotencyKey, CHUNK_SIZE}; use consumption_metrics::{Event, EventChunk, IdempotencyKey, CHUNK_SIZE};
use serde_with::serde_as;
use tokio_util::sync::CancellationToken; use tokio_util::sync::CancellationToken;
use tracing::Instrument; use tracing::Instrument;
@@ -6,9 +7,12 @@ use super::{metrics::Name, Cache, MetricsKey, RawMetric};
use utils::id::{TenantId, TimelineId}; use utils::id::{TenantId, TimelineId};
/// How the metrics from pageserver are identified. /// How the metrics from pageserver are identified.
#[serde_with::serde_as]
#[derive(serde::Serialize, serde::Deserialize, Debug, Clone, Copy, PartialEq)] #[derive(serde::Serialize, serde::Deserialize, Debug, Clone, Copy, PartialEq)]
struct Ids { struct Ids {
#[serde_as(as = "serde_with::DisplayFromStr")]
pub(super) tenant_id: TenantId, pub(super) tenant_id: TenantId,
#[serde_as(as = "Option<serde_with::DisplayFromStr>")]
#[serde(skip_serializing_if = "Option::is_none")] #[serde(skip_serializing_if = "Option::is_none")]
pub(super) timeline_id: Option<TimelineId>, pub(super) timeline_id: Option<TimelineId>,
} }

View File

@@ -57,10 +57,7 @@ impl ControlPlaneClient {
if let Some(jwt) = &conf.control_plane_api_token { if let Some(jwt) = &conf.control_plane_api_token {
let mut headers = hyper::HeaderMap::new(); let mut headers = hyper::HeaderMap::new();
headers.insert( headers.insert("Authorization", jwt.get_contents().parse().unwrap());
"Authorization",
format!("Bearer {}", jwt.get_contents()).parse().unwrap(),
);
client = client.default_headers(headers); client = client.default_headers(headers);
} }
@@ -147,7 +144,7 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
Ok(response Ok(response
.tenants .tenants
.into_iter() .into_iter()
.map(|t| (t.id, Generation::new(t.gen))) .map(|t| (t.id, Generation::new(t.generation)))
.collect::<HashMap<_, _>>()) .collect::<HashMap<_, _>>())
} }

View File

@@ -10,7 +10,6 @@ use crate::control_plane_client::ControlPlaneGenerationsApi;
use crate::metrics; use crate::metrics;
use crate::tenant::remote_timeline_client::remote_layer_path; use crate::tenant::remote_timeline_client::remote_layer_path;
use crate::tenant::remote_timeline_client::remote_timeline_path; use crate::tenant::remote_timeline_client::remote_timeline_path;
use crate::virtual_file::MaybeFatalIo;
use crate::virtual_file::VirtualFile; use crate::virtual_file::VirtualFile;
use anyhow::Context; use anyhow::Context;
use camino::Utf8PathBuf; use camino::Utf8PathBuf;
@@ -18,6 +17,7 @@ use hex::FromHex;
use remote_storage::{GenericRemoteStorage, RemotePath}; use remote_storage::{GenericRemoteStorage, RemotePath};
use serde::Deserialize; use serde::Deserialize;
use serde::Serialize; use serde::Serialize;
use serde_with::serde_as;
use thiserror::Error; use thiserror::Error;
use tokio; use tokio;
use tokio_util::sync::CancellationToken; use tokio_util::sync::CancellationToken;
@@ -214,6 +214,7 @@ where
/// during recovery as startup. /// during recovery as startup.
const TEMP_SUFFIX: &str = "tmp"; const TEMP_SUFFIX: &str = "tmp";
#[serde_as]
#[derive(Debug, Serialize, Deserialize)] #[derive(Debug, Serialize, Deserialize)]
struct DeletionList { struct DeletionList {
/// Serialization version, for future use /// Serialization version, for future use
@@ -242,6 +243,7 @@ struct DeletionList {
validated: bool, validated: bool,
} }
#[serde_as]
#[derive(Debug, Serialize, Deserialize)] #[derive(Debug, Serialize, Deserialize)]
struct DeletionHeader { struct DeletionHeader {
/// Serialization version, for future use /// Serialization version, for future use
@@ -269,9 +271,7 @@ impl DeletionHeader {
let temp_path = path_with_suffix_extension(&header_path, TEMP_SUFFIX); let temp_path = path_with_suffix_extension(&header_path, TEMP_SUFFIX);
VirtualFile::crashsafe_overwrite(&header_path, &temp_path, &header_bytes) VirtualFile::crashsafe_overwrite(&header_path, &temp_path, &header_bytes)
.await .await
.maybe_fatal_err("save deletion header")?; .map_err(Into::into)
Ok(())
} }
} }
@@ -360,7 +360,6 @@ impl DeletionList {
let bytes = serde_json::to_vec(self).expect("Failed to serialize deletion list"); let bytes = serde_json::to_vec(self).expect("Failed to serialize deletion list");
VirtualFile::crashsafe_overwrite(&path, &temp_path, &bytes) VirtualFile::crashsafe_overwrite(&path, &temp_path, &bytes)
.await .await
.maybe_fatal_err("save deletion list")
.map_err(Into::into) .map_err(Into::into)
} }
} }
@@ -893,6 +892,14 @@ mod test {
std::fs::create_dir_all(remote_fs_dir)?; std::fs::create_dir_all(remote_fs_dir)?;
let remote_fs_dir = harness.conf.workdir.join("remote_fs").canonicalize_utf8()?; let remote_fs_dir = harness.conf.workdir.join("remote_fs").canonicalize_utf8()?;
let storage_config = RemoteStorageConfig { let storage_config = RemoteStorageConfig {
max_concurrent_syncs: std::num::NonZeroUsize::new(
remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS,
)
.unwrap(),
max_sync_errors: std::num::NonZeroU32::new(
remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS,
)
.unwrap(),
storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()), storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
}; };
let storage = GenericRemoteStorage::from_config(&storage_config).unwrap(); let storage = GenericRemoteStorage::from_config(&storage_config).unwrap();

View File

@@ -55,24 +55,21 @@ impl Deleter {
/// Wrap the remote `delete_objects` with a failpoint /// Wrap the remote `delete_objects` with a failpoint
async fn remote_delete(&self) -> Result<(), anyhow::Error> { async fn remote_delete(&self) -> Result<(), anyhow::Error> {
fail::fail_point!("deletion-queue-before-execute", |_| {
info!("Skipping execution, failpoint set");
metrics::DELETION_QUEUE
.remote_errors
.with_label_values(&["failpoint"])
.inc();
Err(anyhow::anyhow!("failpoint hit"))
});
// A backoff::retry is used here for two reasons: // A backoff::retry is used here for two reasons:
// - To provide a backoff rather than busy-polling the API on errors // - To provide a backoff rather than busy-polling the API on errors
// - To absorb transient 429/503 conditions without hitting our error // - To absorb transient 429/503 conditions without hitting our error
// logging path for issues deleting objects. // logging path for issues deleting objects.
backoff::retry( backoff::retry(
|| async { || async { self.remote_storage.delete_objects(&self.accumulator).await },
fail::fail_point!("deletion-queue-before-execute", |_| {
info!("Skipping execution, failpoint set");
metrics::DELETION_QUEUE
.remote_errors
.with_label_values(&["failpoint"])
.inc();
Err(anyhow::anyhow!("failpoint: deletion-queue-before-execute"))
});
self.remote_storage.delete_objects(&self.accumulator).await
},
|_| false, |_| false,
3, 3,
10, 10,

View File

@@ -34,8 +34,6 @@ use crate::deletion_queue::TEMP_SUFFIX;
use crate::metrics; use crate::metrics;
use crate::tenant::remote_timeline_client::remote_layer_path; use crate::tenant::remote_timeline_client::remote_layer_path;
use crate::tenant::storage_layer::LayerFileName; use crate::tenant::storage_layer::LayerFileName;
use crate::virtual_file::on_fatal_io_error;
use crate::virtual_file::MaybeFatalIo;
// The number of keys in a DeletionList before we will proactively persist it // The number of keys in a DeletionList before we will proactively persist it
// (without reaching a flush deadline). This aims to deliver objects of the order // (without reaching a flush deadline). This aims to deliver objects of the order
@@ -197,7 +195,7 @@ impl ListWriter {
debug!("Deletion header {header_path} not found, first start?"); debug!("Deletion header {header_path} not found, first start?");
Ok(None) Ok(None)
} else { } else {
on_fatal_io_error(&e, "reading deletion header"); Err(anyhow::anyhow!(e))
} }
} }
} }
@@ -218,9 +216,16 @@ impl ListWriter {
self.pending.sequence = validated_sequence + 1; self.pending.sequence = validated_sequence + 1;
let deletion_directory = self.conf.deletion_prefix(); let deletion_directory = self.conf.deletion_prefix();
let mut dir = tokio::fs::read_dir(&deletion_directory) let mut dir = match tokio::fs::read_dir(&deletion_directory).await {
.await Ok(d) => d,
.fatal_err("read deletion directory"); Err(e) => {
warn!("Failed to open deletion list directory {deletion_directory}: {e:#}");
// Give up: if we can't read the deletion list directory, we probably can't
// write lists into it later, so the queue won't work.
return Err(e.into());
}
};
let list_name_pattern = let list_name_pattern =
Regex::new("(?<sequence>[a-zA-Z0-9]{16})-(?<version>[a-zA-Z0-9]{2}).list").unwrap(); Regex::new("(?<sequence>[a-zA-Z0-9]{16})-(?<version>[a-zA-Z0-9]{2}).list").unwrap();
@@ -228,7 +233,7 @@ impl ListWriter {
let temp_extension = format!(".{TEMP_SUFFIX}"); let temp_extension = format!(".{TEMP_SUFFIX}");
let header_path = self.conf.deletion_header_path(); let header_path = self.conf.deletion_header_path();
let mut seqs: Vec<u64> = Vec::new(); let mut seqs: Vec<u64> = Vec::new();
while let Some(dentry) = dir.next_entry().await.fatal_err("read deletion dentry") { while let Some(dentry) = dir.next_entry().await? {
let file_name = dentry.file_name(); let file_name = dentry.file_name();
let dentry_str = file_name.to_string_lossy(); let dentry_str = file_name.to_string_lossy();
@@ -241,9 +246,11 @@ impl ListWriter {
info!("Cleaning up temporary file {dentry_str}"); info!("Cleaning up temporary file {dentry_str}");
let absolute_path = let absolute_path =
deletion_directory.join(dentry.file_name().to_str().expect("non-Unicode path")); deletion_directory.join(dentry.file_name().to_str().expect("non-Unicode path"));
tokio::fs::remove_file(&absolute_path) if let Err(e) = tokio::fs::remove_file(&absolute_path).await {
.await // Non-fatal error: we will just leave the file behind but not
.fatal_err("delete temp file"); // try and load it.
warn!("Failed to clean up temporary file {absolute_path}: {e:#}");
}
continue; continue;
} }
@@ -283,9 +290,7 @@ impl ListWriter {
for s in seqs { for s in seqs {
let list_path = self.conf.deletion_list_path(s); let list_path = self.conf.deletion_list_path(s);
let list_bytes = tokio::fs::read(&list_path) let list_bytes = tokio::fs::read(&list_path).await?;
.await
.fatal_err("read deletion list");
let mut deletion_list = match serde_json::from_slice::<DeletionList>(&list_bytes) { let mut deletion_list = match serde_json::from_slice::<DeletionList>(&list_bytes) {
Ok(l) => l, Ok(l) => l,
@@ -344,7 +349,7 @@ impl ListWriter {
info!("Started deletion frontend worker"); info!("Started deletion frontend worker");
// Synchronous, but we only do it once per process lifetime so it's tolerable // Synchronous, but we only do it once per process lifetime so it's tolerable
if let Err(e) = create_dir_all(self.conf.deletion_prefix()) { if let Err(e) = create_dir_all(&self.conf.deletion_prefix()) {
tracing::error!( tracing::error!(
"Failed to create deletion list directory {}, deletions will not be executed ({e})", "Failed to create deletion list directory {}, deletions will not be executed ({e})",
self.conf.deletion_prefix(), self.conf.deletion_prefix(),

View File

@@ -28,7 +28,6 @@ use crate::config::PageServerConf;
use crate::control_plane_client::ControlPlaneGenerationsApi; use crate::control_plane_client::ControlPlaneGenerationsApi;
use crate::control_plane_client::RetryForeverError; use crate::control_plane_client::RetryForeverError;
use crate::metrics; use crate::metrics;
use crate::virtual_file::MaybeFatalIo;
use super::deleter::DeleterMessage; use super::deleter::DeleterMessage;
use super::DeletionHeader; use super::DeletionHeader;
@@ -288,9 +287,16 @@ where
async fn cleanup_lists(&mut self, list_paths: Vec<Utf8PathBuf>) { async fn cleanup_lists(&mut self, list_paths: Vec<Utf8PathBuf>) {
for list_path in list_paths { for list_path in list_paths {
debug!("Removing deletion list {list_path}"); debug!("Removing deletion list {list_path}");
tokio::fs::remove_file(&list_path)
.await if let Err(e) = tokio::fs::remove_file(&list_path).await {
.fatal_err("remove deletion list"); // Unexpected: we should have permissions and nothing else should
// be touching these files. We will leave the file behind. Subsequent
// pageservers will try and load it again: hopefully whatever storage
// issue (probably permissions) has been fixed by then.
tracing::error!("Failed to delete {list_path}: {e:#}");
metrics::DELETION_QUEUE.unexpected_errors.inc();
break;
}
} }
} }

View File

@@ -60,27 +60,47 @@ use utils::serde_percent::Percent;
use crate::{ use crate::{
config::PageServerConf, config::PageServerConf,
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
tenant::{ tenant::{self, storage_layer::PersistentLayer, timeline::EvictionError, Timeline},
self,
storage_layer::{AsLayerDesc, EvictionError, Layer},
Timeline,
},
}; };
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct DiskUsageEvictionTaskConfig { pub struct DiskUsageEvictionTaskConfig {
pub max_usage_pct: Percent, pub max_usage_pct: Percent,
pub min_avail_bytes: u64, pub min_avail_bytes: u64,
// Control how far we will go when evicting: when usage exceeds max_usage_pct or min_avail_bytes,
// we will keep evicting layers until we reach the target. The resulting disk usage should look
// like a sawtooth bouncing between the upper max/min line and the lower target line.
#[serde(default)]
pub target_usage_pct: Option<Percent>,
#[serde(default)]
pub target_avail_bytes: Option<u64>,
#[serde(with = "humantime_serde")] #[serde(with = "humantime_serde")]
pub period: Duration, pub period: Duration,
#[cfg(feature = "testing")] #[cfg(feature = "testing")]
pub mock_statvfs: Option<crate::statvfs::mock::Behavior>, pub mock_statvfs: Option<crate::statvfs::mock::Behavior>,
} }
#[derive(Default)]
enum Status {
/// We are within disk limits, and not currently doing any eviction
#[default]
Idle,
/// Disk limits have been exceeded: we will evict soon
UnderPressure,
/// We are currently doing an eviction pass.
Evicting,
}
#[derive(Default)] #[derive(Default)]
pub struct State { pub struct State {
/// Exclude http requests and background task from running at the same time. /// Exclude http requests and background task from running at the same time.
mutex: tokio::sync::Mutex<()>, mutex: tokio::sync::Mutex<()>,
/// Publish the current status of eviction work, for visibility to other subsystems
/// that modify their behavior if disk pressure is high or if eviction is going on.
status: std::sync::RwLock<Status>,
} }
pub fn launch_disk_usage_global_eviction_task( pub fn launch_disk_usage_global_eviction_task(
@@ -112,7 +132,7 @@ pub fn launch_disk_usage_global_eviction_task(
_ = background_jobs_barrier.wait() => { } _ = background_jobs_barrier.wait() => { }
}; };
disk_usage_eviction_task(&state, task_config, &storage, &conf.tenants_path(), cancel) disk_usage_eviction_task(&state, task_config, storage, &conf.tenants_path(), cancel)
.await; .await;
Ok(()) Ok(())
}, },
@@ -125,7 +145,7 @@ pub fn launch_disk_usage_global_eviction_task(
async fn disk_usage_eviction_task( async fn disk_usage_eviction_task(
state: &State, state: &State,
task_config: &DiskUsageEvictionTaskConfig, task_config: &DiskUsageEvictionTaskConfig,
_storage: &GenericRemoteStorage, storage: GenericRemoteStorage,
tenants_dir: &Utf8Path, tenants_dir: &Utf8Path,
cancel: CancellationToken, cancel: CancellationToken,
) { ) {
@@ -149,8 +169,14 @@ async fn disk_usage_eviction_task(
let start = Instant::now(); let start = Instant::now();
async { async {
let res = let res = disk_usage_eviction_task_iteration(
disk_usage_eviction_task_iteration(state, task_config, tenants_dir, &cancel).await; state,
task_config,
&storage,
tenants_dir,
&cancel,
)
.await;
match res { match res {
Ok(()) => {} Ok(()) => {}
@@ -174,25 +200,34 @@ async fn disk_usage_eviction_task(
} }
pub trait Usage: Clone + Copy + std::fmt::Debug { pub trait Usage: Clone + Copy + std::fmt::Debug {
fn has_pressure(&self) -> bool; fn pressure(&self) -> f64;
fn over_pressure(&self) -> bool;
fn no_pressure(&self) -> bool;
fn add_available_bytes(&mut self, bytes: u64); fn add_available_bytes(&mut self, bytes: u64);
} }
async fn disk_usage_eviction_task_iteration( async fn disk_usage_eviction_task_iteration(
state: &State, state: &State,
task_config: &DiskUsageEvictionTaskConfig, task_config: &DiskUsageEvictionTaskConfig,
storage: &GenericRemoteStorage,
tenants_dir: &Utf8Path, tenants_dir: &Utf8Path,
cancel: &CancellationToken, cancel: &CancellationToken,
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
let usage_pre = filesystem_level_usage::get(tenants_dir, task_config) let usage_pre = filesystem_level_usage::get(tenants_dir, task_config)
.context("get filesystem-level disk usage before evictions")?; .context("get filesystem-level disk usage before evictions")?;
let res = disk_usage_eviction_task_iteration_impl(state, usage_pre, cancel).await;
if usage_pre.over_pressure() {
*state.status.write().unwrap() = Status::Evicting;
}
let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await;
match res { match res {
Ok(outcome) => { Ok(outcome) => {
debug!(?outcome, "disk_usage_eviction_iteration finished"); debug!(?outcome, "disk_usage_eviction_iteration finished");
match outcome { let new_status = match outcome {
IterationOutcome::NoPressure | IterationOutcome::Cancelled => { IterationOutcome::NoPressure | IterationOutcome::Cancelled => {
// nothing to do, select statement below will handle things // nothing to do, select statement below will handle things
Status::Idle
} }
IterationOutcome::Finished(outcome) => { IterationOutcome::Finished(outcome) => {
// Verify with statvfs whether we made any real progress // Verify with statvfs whether we made any real progress
@@ -202,21 +237,30 @@ async fn disk_usage_eviction_task_iteration(
debug!(?after, "disk usage"); debug!(?after, "disk usage");
if after.has_pressure() { if after.over_pressure() {
// Don't bother doing an out-of-order iteration here now. // Don't bother doing an out-of-order iteration here now.
// In practice, the task period is set to a value in the tens-of-seconds range, // In practice, the task period is set to a value in the tens-of-seconds range,
// which will cause another iteration to happen soon enough. // which will cause another iteration to happen soon enough.
// TODO: deltas between the three different usages would be helpful, // TODO: deltas between the three different usages would be helpful,
// consider MiB, GiB, TiB // consider MiB, GiB, TiB
warn!(?outcome, ?after, "disk usage still high"); warn!(?outcome, ?after, "disk usage still high");
Status::UnderPressure
} else { } else {
info!(?outcome, ?after, "disk usage pressure relieved"); info!(?outcome, ?after, "disk usage pressure relieved");
Status::Idle
} }
} }
} };
*state.status.write().unwrap() = new_status;
} }
Err(e) => { Err(e) => {
error!("disk_usage_eviction_iteration failed: {:#}", e); error!("disk_usage_eviction_iteration failed: {:#}", e);
*state.status.write().unwrap() = if usage_pre.over_pressure() {
Status::UnderPressure
} else {
Status::Idle
};
} }
} }
@@ -270,6 +314,7 @@ struct LayerCount {
pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>( pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
state: &State, state: &State,
storage: &GenericRemoteStorage,
usage_pre: U, usage_pre: U,
cancel: &CancellationToken, cancel: &CancellationToken,
) -> anyhow::Result<IterationOutcome<U>> { ) -> anyhow::Result<IterationOutcome<U>> {
@@ -281,8 +326,10 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
debug!(?usage_pre, "disk usage"); debug!(?usage_pre, "disk usage");
if !usage_pre.has_pressure() { if !usage_pre.over_pressure() {
return Ok(IterationOutcome::NoPressure); return Ok(IterationOutcome::NoPressure);
} else {
*state.status.write().unwrap() = Status::Evicting;
} }
warn!( warn!(
@@ -326,12 +373,11 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
// If we get far enough in the list that we start to evict layers that are below // If we get far enough in the list that we start to evict layers that are below
// the tenant's min-resident-size threshold, print a warning, and memorize the disk // the tenant's min-resident-size threshold, print a warning, and memorize the disk
// usage at that point, in 'usage_planned_min_resident_size_respecting'. // usage at that point, in 'usage_planned_min_resident_size_respecting'.
let mut batched: HashMap<_, Vec<_>> = HashMap::new(); let mut batched: HashMap<_, Vec<Arc<dyn PersistentLayer>>> = HashMap::new();
let mut warned = None; let mut warned = None;
let mut usage_planned = usage_pre; let mut usage_planned = usage_pre;
let mut max_batch_size = 0;
for (i, (partition, candidate)) in candidates.into_iter().enumerate() { for (i, (partition, candidate)) in candidates.into_iter().enumerate() {
if !usage_planned.has_pressure() { if usage_planned.no_pressure() {
debug!( debug!(
no_candidates_evicted = i, no_candidates_evicted = i,
"took enough candidates for pressure to be relieved" "took enough candidates for pressure to be relieved"
@@ -346,18 +392,10 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
usage_planned.add_available_bytes(candidate.layer.layer_desc().file_size); usage_planned.add_available_bytes(candidate.layer.layer_desc().file_size);
// FIXME: batching makes no sense anymore because of no layermap locking, should just spawn batched
// tasks to evict all seen layers until we have evicted enough .entry(TimelineKey(candidate.timeline))
.or_default()
let batch = batched.entry(TimelineKey(candidate.timeline)).or_default(); .push(candidate.layer);
// semaphore will later be used to limit eviction concurrency, and we can express at
// most u32 number of permits. unlikely we would have u32::MAX layers to be evicted,
// but fail gracefully by not making batches larger.
if batch.len() < u32::MAX as usize {
batch.push(candidate.layer);
max_batch_size = max_batch_size.max(batch.len());
}
} }
let usage_planned = match warned { let usage_planned = match warned {
@@ -374,101 +412,69 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
// phase2: evict victims batched by timeline // phase2: evict victims batched by timeline
let mut js = tokio::task::JoinSet::new(); // After the loop, `usage_assumed` is the post-eviction usage,
// according to internal accounting.
// ratelimit to 1k files or any higher max batch size let mut usage_assumed = usage_pre;
let limit = Arc::new(tokio::sync::Semaphore::new(1000.max(max_batch_size))); let mut evictions_failed = LayerCount::default();
for (timeline, batch) in batched { for (timeline, batch) in batched {
let tenant_id = timeline.tenant_id; let tenant_id = timeline.tenant_id;
let timeline_id = timeline.timeline_id; let timeline_id = timeline.timeline_id;
let batch_size = let batch_size = batch.len();
u32::try_from(batch.len()).expect("batch size limited to u32::MAX during partitioning");
// I dislike naming of `available_permits` but it means current total amount of permits
// because permits can be added
assert!(batch_size as usize <= limit.available_permits());
debug!(%timeline_id, "evicting batch for timeline"); debug!(%timeline_id, "evicting batch for timeline");
let evict = { async {
let limit = limit.clone(); let results = timeline.evict_layers(storage, &batch, cancel.clone()).await;
let cancel = cancel.clone();
async move {
let mut evicted_bytes = 0;
let mut evictions_failed = LayerCount::default();
let Ok(_permit) = limit.acquire_many_owned(batch_size).await else { match results {
// semaphore closing means cancelled Err(e) => {
return (evicted_bytes, evictions_failed); warn!("failed to evict batch: {:#}", e);
}; }
Ok(results) => {
let results = timeline.evict_layers(&batch).await; assert_eq!(results.len(), batch.len());
for (result, layer) in results.into_iter().zip(batch.iter()) {
match results { let file_size = layer.layer_desc().file_size;
Ok(results) => { match result {
assert_eq!(results.len(), batch.len()); Some(Ok(())) => {
for (result, layer) in results.into_iter().zip(batch.iter()) { usage_assumed.add_available_bytes(file_size);
let file_size = layer.layer_desc().file_size; }
match result { Some(Err(EvictionError::CannotEvictRemoteLayer)) => {
Some(Ok(())) => { unreachable!("get_local_layers_for_disk_usage_eviction finds only local layers")
evicted_bytes += file_size; }
} Some(Err(EvictionError::FileNotFound)) => {
Some(Err(EvictionError::NotFound | EvictionError::Downloaded)) => { evictions_failed.file_sizes += file_size;
evictions_failed.file_sizes += file_size; evictions_failed.count += 1;
evictions_failed.count += 1; }
} Some(Err(
None => { e @ EvictionError::LayerNotFound(_)
assert!(cancel.is_cancelled()); | e @ EvictionError::StatFailed(_),
} )) => {
let e = utils::error::report_compact_sources(&e);
warn!(%layer, "failed to evict layer: {e}");
evictions_failed.file_sizes += file_size;
evictions_failed.count += 1;
}
Some(Err(EvictionError::MetadataInconsistency(detail))) => {
warn!(%layer, "failed to evict layer: {detail}");
evictions_failed.file_sizes += file_size;
evictions_failed.count += 1;
}
None => {
assert!(cancel.is_cancelled());
return;
} }
} }
} }
Err(e) => {
warn!("failed to evict batch: {:#}", e);
}
} }
(evicted_bytes, evictions_failed)
} }
} }
.instrument(tracing::info_span!("evict_batch", %tenant_id, %timeline_id, batch_size)); .instrument(tracing::info_span!("evict_batch", %tenant_id, %timeline_id, batch_size))
.await;
js.spawn(evict); if cancel.is_cancelled() {
// spwaning multiple thousands of these is essentially blocking, so give already spawned a
// chance of making progress
tokio::task::yield_now().await;
}
let join_all = async move {
// After the evictions, `usage_assumed` is the post-eviction usage,
// according to internal accounting.
let mut usage_assumed = usage_pre;
let mut evictions_failed = LayerCount::default();
while let Some(res) = js.join_next().await {
match res {
Ok((evicted_bytes, failed)) => {
usage_assumed.add_available_bytes(evicted_bytes);
evictions_failed.file_sizes += failed.file_sizes;
evictions_failed.count += failed.count;
}
Err(je) if je.is_cancelled() => unreachable!("not used"),
Err(je) if je.is_panic() => { /* already logged */ }
Err(je) => tracing::error!("unknown JoinError: {je:?}"),
}
}
(usage_assumed, evictions_failed)
};
let (usage_assumed, evictions_failed) = tokio::select! {
tuple = join_all => { tuple },
_ = cancel.cancelled() => {
// close the semaphore to stop any pending acquires
limit.close();
return Ok(IterationOutcome::Cancelled); return Ok(IterationOutcome::Cancelled);
} }
}; }
Ok(IterationOutcome::Finished(IterationOutcomeFinished { Ok(IterationOutcome::Finished(IterationOutcomeFinished {
before: usage_pre, before: usage_pre,
@@ -483,7 +489,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
#[derive(Clone)] #[derive(Clone)]
struct EvictionCandidate { struct EvictionCandidate {
timeline: Arc<Timeline>, timeline: Arc<Timeline>,
layer: Layer, layer: Arc<dyn PersistentLayer>,
last_activity_ts: SystemTime, last_activity_ts: SystemTime,
} }
@@ -545,7 +551,7 @@ async fn collect_eviction_candidates(
if cancel.is_cancelled() { if cancel.is_cancelled() {
return Ok(EvictionCandidates::Cancelled); return Ok(EvictionCandidates::Cancelled);
} }
let tenant = match tenant::mgr::get_tenant(*tenant_id, true) { let tenant = match tenant::mgr::get_tenant(*tenant_id, true).await {
Ok(tenant) => tenant, Ok(tenant) => tenant,
Err(e) => { Err(e) => {
// this can happen if tenant has lifecycle transition after we fetched it // this can happen if tenant has lifecycle transition after we fetched it
@@ -554,11 +560,6 @@ async fn collect_eviction_candidates(
} }
}; };
if tenant.cancel.is_cancelled() {
info!(%tenant_id, "Skipping tenant for eviction, it is shutting down");
continue;
}
// collect layers from all timelines in this tenant // collect layers from all timelines in this tenant
// //
// If one of the timelines becomes `!is_active()` during the iteration, // If one of the timelines becomes `!is_active()` during the iteration,
@@ -686,22 +687,57 @@ mod filesystem_level_usage {
} }
impl super::Usage for Usage<'_> { impl super::Usage for Usage<'_> {
fn has_pressure(&self) -> bool { /// Does the pressure exceed 1.0, i.e. has the disk usage exceeded upper bounds?
let usage_pct = ///
(100.0 * (1.0 - ((self.avail_bytes as f64) / (self.total_bytes as f64)))) as u64; /// This is the condition for starting eviction.
fn over_pressure(&self) -> bool {
self.pressure() >= 1.0
}
let pressures = [ /// Is the pressure <0, ie.. has disk usage gone below the target bound?
( ///
"min_avail_bytes", /// This is the condition for dropping out of eviction.
self.avail_bytes < self.config.min_avail_bytes, fn no_pressure(&self) -> bool {
), self.pressure() <= 0.0
( }
"max_usage_pct",
usage_pct >= self.config.max_usage_pct.get() as u64,
),
];
pressures.into_iter().any(|(_, has_pressure)| has_pressure) fn pressure(&self) -> f64 {
let max_usage = std::cmp::min(
self.total_bytes - self.config.min_avail_bytes,
(self.total_bytes as f64 * (self.config.max_usage_pct.get() as f64 / 100.0)) as u64,
);
let mut target_usage = max_usage;
if let Some(target_avail_bytes) = self.config.target_avail_bytes {
target_usage = std::cmp::min(target_usage, self.total_bytes - target_avail_bytes);
}
if let Some(target_usage_pct) = self.config.target_usage_pct {
target_usage = std::cmp::min(
target_usage,
(self.total_bytes as f64 * (target_usage_pct.get() as f64 / 100.0)) as u64,
);
};
let usage = self.total_bytes - self.avail_bytes;
eprintln!(
"pressure: {} {}, current {}",
target_usage, max_usage, usage
);
if target_usage == max_usage {
// We are configured with a zero sized range: treat anything at+beyond limit as pressure 1.0, else 0.0
if usage >= max_usage {
1.0
} else {
0.0
}
} else if usage <= target_usage {
// No pressure.
0.0
} else {
// We are above target: pressure is the ratio of how much we exceed target to the size of the gap
let range_size = (max_usage - target_usage) as f64;
(usage - target_usage) as f64 / range_size
}
} }
fn add_available_bytes(&mut self, bytes: u64) { fn add_available_bytes(&mut self, bytes: u64) {
@@ -755,6 +791,8 @@ mod filesystem_level_usage {
config: &DiskUsageEvictionTaskConfig { config: &DiskUsageEvictionTaskConfig {
max_usage_pct: Percent::new(85).unwrap(), max_usage_pct: Percent::new(85).unwrap(),
min_avail_bytes: 0, min_avail_bytes: 0,
target_avail_bytes: None,
target_usage_pct: None,
period: Duration::MAX, period: Duration::MAX,
#[cfg(feature = "testing")] #[cfg(feature = "testing")]
mock_statvfs: None, mock_statvfs: None,
@@ -763,24 +801,24 @@ mod filesystem_level_usage {
avail_bytes: 0, avail_bytes: 0,
}; };
assert!(usage.has_pressure(), "expected pressure at 100%"); assert!(usage.over_pressure(), "expected pressure at 100%");
usage.add_available_bytes(14_000); usage.add_available_bytes(14_000);
assert!(usage.has_pressure(), "expected pressure at 86%"); assert!(usage.over_pressure(), "expected pressure at 86%");
usage.add_available_bytes(999); usage.add_available_bytes(999);
assert!(usage.has_pressure(), "expected pressure at 85.001%"); assert!(usage.over_pressure(), "expected pressure at 85.001%");
usage.add_available_bytes(1); usage.add_available_bytes(1);
assert!(usage.has_pressure(), "expected pressure at precisely 85%"); assert!(usage.over_pressure(), "expected pressure at precisely 85%");
usage.add_available_bytes(1); usage.add_available_bytes(1);
assert!(!usage.has_pressure(), "no pressure at 84.999%"); assert!(!usage.over_pressure(), "no pressure at 84.999%");
usage.add_available_bytes(999); usage.add_available_bytes(999);
assert!(!usage.has_pressure(), "no pressure at 84%"); assert!(!usage.over_pressure(), "no pressure at 84%");
usage.add_available_bytes(16_000); usage.add_available_bytes(16_000);
assert!(!usage.has_pressure()); assert!(!usage.over_pressure());
} }
} }

View File

@@ -52,31 +52,6 @@ paths:
schema: schema:
type: object type: object
/v1/reload_auth_validation_keys:
post:
description: Reloads the JWT public keys from their pre-configured location on disk.
responses:
"200":
description: The reload completed successfully.
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error (also hits if no keys were found)
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
/v1/tenant/{tenant_id}: /v1/tenant/{tenant_id}:
parameters: parameters:
- name: tenant_id - name: tenant_id
@@ -352,8 +327,7 @@ paths:
in: query in: query
required: true required: true
schema: schema:
type: string type: integer
format: hex
description: A LSN to get the timestamp description: A LSN to get the timestamp
responses: responses:
"200": "200":
@@ -418,19 +392,13 @@ paths:
type: string type: string
format: date-time format: date-time
description: A timestamp to get the LSN description: A timestamp to get the LSN
- name: version
in: query
required: false
schema:
type: integer
description: The version of the endpoint to use
responses: responses:
"200": "200":
description: OK description: OK
content: content:
application/json: application/json:
schema: schema:
$ref: "#/components/schemas/LsnByTimestampResponse" type: string
"400": "400":
description: Error when no tenant id found in path, no timeline id or invalid timestamp description: Error when no tenant id found in path, no timeline id or invalid timestamp
content: content:
@@ -595,17 +563,7 @@ paths:
schema: schema:
$ref: "#/components/schemas/NotFoundError" $ref: "#/components/schemas/NotFoundError"
"409": "409":
description: | description: Tenant download is already in progress
The tenant is already known to Pageserver in some way,
and hence this `/attach` call has been rejected.
Some examples of how this can happen:
- tenant was created on this pageserver
- tenant attachment was started by an earlier call to `/attach`.
Callers should poll the tenant status's `attachment_status` field,
like for status 202. See the longer description for `POST /attach`
for details.
content: content:
application/json: application/json:
schema: schema:
@@ -749,12 +707,6 @@ paths:
Errors if the tenant is absent on disk, already present in memory or fails to schedule its load. Errors if the tenant is absent on disk, already present in memory or fails to schedule its load.
Scheduling a load does not mean that the tenant would load successfully, check tenant status to ensure load correctness. Scheduling a load does not mean that the tenant would load successfully, check tenant status to ensure load correctness.
requestBody:
required: false
content:
application/json:
schema:
$ref: "#/components/schemas/TenantLoadRequest"
responses: responses:
"202": "202":
description: Tenant scheduled to load successfully description: Tenant scheduled to load successfully
@@ -1245,15 +1197,6 @@ components:
new_tenant_id: new_tenant_id:
type: string type: string
format: hex format: hex
generation:
type: integer
description: Attachment generation number.
TenantLoadRequest:
type: object
properties:
generation:
type: integer
description: Attachment generation number.
TenantAttachRequest: TenantAttachRequest:
type: object type: object
required: required:
@@ -1441,19 +1384,6 @@ components:
type: string type: string
format: hex format: hex
LsnByTimestampResponse:
type: object
required:
- lsn
- kind
properties:
lsn:
type: string
format: hex
kind:
type: string
enum: [past, present, future, nodata]
Error: Error:
type: object type: object
required: required:

View File

@@ -8,7 +8,7 @@ use std::sync::Arc;
use anyhow::{anyhow, Context, Result}; use anyhow::{anyhow, Context, Result};
use futures::TryFutureExt; use futures::TryFutureExt;
use humantime::format_rfc3339; use humantime::format_rfc3339;
use hyper::header; use hyper::header::CONTENT_TYPE;
use hyper::StatusCode; use hyper::StatusCode;
use hyper::{Body, Request, Response, Uri}; use hyper::{Body, Request, Response, Uri};
use metrics::launch_timestamp::LaunchTimestamp; use metrics::launch_timestamp::LaunchTimestamp;
@@ -20,7 +20,6 @@ use remote_storage::GenericRemoteStorage;
use tenant_size_model::{SizeResult, StorageModel}; use tenant_size_model::{SizeResult, StorageModel};
use tokio_util::sync::CancellationToken; use tokio_util::sync::CancellationToken;
use tracing::*; use tracing::*;
use utils::auth::JwtAuth;
use utils::http::endpoint::request_span; use utils::http::endpoint::request_span;
use utils::http::json::json_request_or_empty_body; use utils::http::json::json_request_or_empty_body;
use utils::http::request::{get_request_param, must_get_query_param, parse_query_param}; use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
@@ -36,8 +35,7 @@ use crate::pgdatadir_mapping::LsnForTimestamp;
use crate::task_mgr::TaskKind; use crate::task_mgr::TaskKind;
use crate::tenant::config::{LocationConf, TenantConfOpt}; use crate::tenant::config::{LocationConf, TenantConfOpt};
use crate::tenant::mgr::{ use crate::tenant::mgr::{
GetTenantError, SetNewTenantConfigError, TenantManager, TenantMapError, TenantMapInsertError, GetTenantError, SetNewTenantConfigError, TenantMapInsertError, TenantStateError,
TenantSlotError, TenantSlotUpsertError, TenantStateError,
}; };
use crate::tenant::size::ModelInputs; use crate::tenant::size::ModelInputs;
use crate::tenant::storage_layer::LayerAccessStatsReset; use crate::tenant::storage_layer::LayerAccessStatsReset;
@@ -46,7 +44,7 @@ use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, TenantSha
use crate::{config::PageServerConf, tenant::mgr}; use crate::{config::PageServerConf, tenant::mgr};
use crate::{disk_usage_eviction_task, tenant}; use crate::{disk_usage_eviction_task, tenant};
use utils::{ use utils::{
auth::SwappableJwtAuth, auth::JwtAuth,
generation::Generation, generation::Generation,
http::{ http::{
endpoint::{self, attach_openapi_ui, auth_middleware, check_permission_with}, endpoint::{self, attach_openapi_ui, auth_middleware, check_permission_with},
@@ -64,8 +62,7 @@ use super::models::ConfigureFailpointsRequest;
pub struct State { pub struct State {
conf: &'static PageServerConf, conf: &'static PageServerConf,
tenant_manager: Arc<TenantManager>, auth: Option<Arc<JwtAuth>>,
auth: Option<Arc<SwappableJwtAuth>>,
allowlist_routes: Vec<Uri>, allowlist_routes: Vec<Uri>,
remote_storage: Option<GenericRemoteStorage>, remote_storage: Option<GenericRemoteStorage>,
broker_client: storage_broker::BrokerClientChannel, broker_client: storage_broker::BrokerClientChannel,
@@ -76,8 +73,7 @@ pub struct State {
impl State { impl State {
pub fn new( pub fn new(
conf: &'static PageServerConf, conf: &'static PageServerConf,
tenant_manager: Arc<TenantManager>, auth: Option<Arc<JwtAuth>>,
auth: Option<Arc<SwappableJwtAuth>>,
remote_storage: Option<GenericRemoteStorage>, remote_storage: Option<GenericRemoteStorage>,
broker_client: storage_broker::BrokerClientChannel, broker_client: storage_broker::BrokerClientChannel,
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>, disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
@@ -89,7 +85,6 @@ impl State {
.collect::<Vec<_>>(); .collect::<Vec<_>>();
Ok(Self { Ok(Self {
conf, conf,
tenant_manager,
auth, auth,
allowlist_routes, allowlist_routes,
remote_storage, remote_storage,
@@ -151,59 +146,28 @@ impl From<PageReconstructError> for ApiError {
impl From<TenantMapInsertError> for ApiError { impl From<TenantMapInsertError> for ApiError {
fn from(tmie: TenantMapInsertError) -> ApiError { fn from(tmie: TenantMapInsertError) -> ApiError {
match tmie { match tmie {
TenantMapInsertError::SlotError(e) => e.into(), TenantMapInsertError::StillInitializing | TenantMapInsertError::ShuttingDown => {
TenantMapInsertError::SlotUpsertError(e) => e.into(), ApiError::ResourceUnavailable(format!("{tmie}").into())
}
TenantMapInsertError::TenantAlreadyExists(id, state) => {
ApiError::Conflict(format!("tenant {id} already exists, state: {state:?}"))
}
TenantMapInsertError::TenantExistsSecondary(id) => {
ApiError::Conflict(format!("tenant {id} already exists as secondary"))
}
TenantMapInsertError::Other(e) => ApiError::InternalServerError(e), TenantMapInsertError::Other(e) => ApiError::InternalServerError(e),
} }
} }
} }
impl From<TenantSlotError> for ApiError {
fn from(e: TenantSlotError) -> ApiError {
use TenantSlotError::*;
match e {
NotFound(tenant_id) => {
ApiError::NotFound(anyhow::anyhow!("NotFound: tenant {tenant_id}").into())
}
e @ (AlreadyExists(_, _) | Conflict(_)) => ApiError::Conflict(format!("{e}")),
InProgress => {
ApiError::ResourceUnavailable("Tenant is being modified concurrently".into())
}
MapState(e) => e.into(),
}
}
}
impl From<TenantSlotUpsertError> for ApiError {
fn from(e: TenantSlotUpsertError) -> ApiError {
use TenantSlotUpsertError::*;
match e {
InternalError(e) => ApiError::InternalServerError(anyhow::anyhow!("{e}")),
MapState(e) => e.into(),
}
}
}
impl From<TenantMapError> for ApiError {
fn from(e: TenantMapError) -> ApiError {
use TenantMapError::*;
match e {
StillInitializing | ShuttingDown => {
ApiError::ResourceUnavailable(format!("{e}").into())
}
}
}
}
impl From<TenantStateError> for ApiError { impl From<TenantStateError> for ApiError {
fn from(tse: TenantStateError) -> ApiError { fn from(tse: TenantStateError) -> ApiError {
match tse { match tse {
TenantStateError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
TenantStateError::IsStopping(_) => { TenantStateError::IsStopping(_) => {
ApiError::ResourceUnavailable("Tenant is stopping".into()) ApiError::ResourceUnavailable("Tenant is stopping".into())
} }
TenantStateError::SlotError(e) => e.into(), _ => ApiError::InternalServerError(anyhow::Error::new(tse)),
TenantStateError::SlotUpsertError(e) => e.into(),
TenantStateError::Other(e) => ApiError::InternalServerError(anyhow!(e)),
} }
} }
} }
@@ -224,7 +188,6 @@ impl From<GetTenantError> for ApiError {
// (We can produce this variant only in `mgr::get_tenant(..., active=true)` calls). // (We can produce this variant only in `mgr::get_tenant(..., active=true)` calls).
ApiError::ResourceUnavailable("Tenant not yet active".into()) ApiError::ResourceUnavailable("Tenant not yet active".into())
} }
GetTenantError::MapState(e) => ApiError::ResourceUnavailable(format!("{e}").into()),
} }
} }
} }
@@ -279,9 +242,6 @@ impl From<crate::tenant::delete::DeleteTenantError> for ApiError {
Get(g) => ApiError::from(g), Get(g) => ApiError::from(g),
e @ AlreadyInProgress => ApiError::Conflict(e.to_string()), e @ AlreadyInProgress => ApiError::Conflict(e.to_string()),
Timeline(t) => ApiError::from(t), Timeline(t) => ApiError::from(t),
NotAttached => ApiError::NotFound(anyhow::anyhow!("Tenant is not attached").into()),
SlotError(e) => e.into(),
SlotUpsertError(e) => e.into(),
Other(o) => ApiError::InternalServerError(o), Other(o) => ApiError::InternalServerError(o),
e @ InvalidState(_) => ApiError::PreconditionFailed(e.to_string().into_boxed_str()), e @ InvalidState(_) => ApiError::PreconditionFailed(e.to_string().into_boxed_str()),
} }
@@ -303,7 +263,11 @@ async fn build_timeline_info(
// we're executing this function, we will outlive the timeline on-disk state. // we're executing this function, we will outlive the timeline on-disk state.
info.current_logical_size_non_incremental = Some( info.current_logical_size_non_incremental = Some(
timeline timeline
.get_current_logical_size_non_incremental(info.last_record_lsn, ctx) .get_current_logical_size_non_incremental(
info.last_record_lsn,
CancellationToken::new(),
ctx,
)
.await?, .await?,
); );
} }
@@ -389,32 +353,6 @@ async fn status_handler(
json_response(StatusCode::OK, StatusResponse { id: config.id }) json_response(StatusCode::OK, StatusResponse { id: config.id })
} }
async fn reload_auth_validation_keys_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
check_permission(&request, None)?;
let config = get_config(&request);
let state = get_state(&request);
let Some(shared_auth) = &state.auth else {
return json_response(StatusCode::BAD_REQUEST, ());
};
// unwrap is ok because check is performed when creating config, so path is set and exists
let key_path = config.auth_validation_public_key_path.as_ref().unwrap();
info!("Reloading public key(s) for verifying JWT tokens from {key_path:?}");
match JwtAuth::from_key_path(key_path) {
Ok(new_auth) => {
shared_auth.swap(new_auth);
json_response(StatusCode::OK, ())
}
Err(e) => {
warn!("Error reloading public keys from {key_path:?}: {e:}");
json_response(StatusCode::INTERNAL_SERVER_ERROR, ())
}
}
}
async fn timeline_create_handler( async fn timeline_create_handler(
mut request: Request<Body>, mut request: Request<Body>,
_cancel: CancellationToken, _cancel: CancellationToken,
@@ -430,7 +368,7 @@ async fn timeline_create_handler(
let state = get_state(&request); let state = get_state(&request);
async { async {
let tenant = mgr::get_tenant(tenant_id, true)?; let tenant = mgr::get_tenant(tenant_id, true).await?;
match tenant.create_timeline( match tenant.create_timeline(
new_timeline_id, new_timeline_id,
request_data.ancestor_timeline_id.map(TimelineId::from), request_data.ancestor_timeline_id.map(TimelineId::from),
@@ -458,9 +396,6 @@ async fn timeline_create_handler(
Err(e @ tenant::CreateTimelineError::AncestorNotActive) => { Err(e @ tenant::CreateTimelineError::AncestorNotActive) => {
json_response(StatusCode::SERVICE_UNAVAILABLE, HttpErrorBody::from_msg(e.to_string())) json_response(StatusCode::SERVICE_UNAVAILABLE, HttpErrorBody::from_msg(e.to_string()))
} }
Err(tenant::CreateTimelineError::ShuttingDown) => {
json_response(StatusCode::SERVICE_UNAVAILABLE,HttpErrorBody::from_msg("tenant shutting down".to_string()))
}
Err(tenant::CreateTimelineError::Other(err)) => Err(ApiError::InternalServerError(err)), Err(tenant::CreateTimelineError::Other(err)) => Err(ApiError::InternalServerError(err)),
} }
} }
@@ -480,7 +415,7 @@ async fn timeline_list_handler(
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let response_data = async { let response_data = async {
let tenant = mgr::get_tenant(tenant_id, true)?; let tenant = mgr::get_tenant(tenant_id, true).await?;
let timelines = tenant.list_timelines(); let timelines = tenant.list_timelines();
let mut response_data = Vec::with_capacity(timelines.len()); let mut response_data = Vec::with_capacity(timelines.len());
@@ -519,7 +454,7 @@ async fn timeline_detail_handler(
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let timeline_info = async { let timeline_info = async {
let tenant = mgr::get_tenant(tenant_id, true)?; let tenant = mgr::get_tenant(tenant_id, true).await?;
let timeline = tenant let timeline = tenant
.get_timeline(timeline_id, false) .get_timeline(timeline_id, false)
@@ -549,8 +484,6 @@ async fn get_lsn_by_timestamp_handler(
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?; check_permission(&request, Some(tenant_id))?;
let version: Option<u8> = parse_query_param(&request, "version")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let timestamp_raw = must_get_query_param(&request, "timestamp")?; let timestamp_raw = must_get_query_param(&request, "timestamp")?;
let timestamp = humantime::parse_rfc3339(&timestamp_raw) let timestamp = humantime::parse_rfc3339(&timestamp_raw)
@@ -562,30 +495,13 @@ async fn get_lsn_by_timestamp_handler(
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?; let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
let result = timeline.find_lsn_for_timestamp(timestamp_pg, &ctx).await?; let result = timeline.find_lsn_for_timestamp(timestamp_pg, &ctx).await?;
if version.unwrap_or(0) > 1 { let result = match result {
#[derive(serde::Serialize)] LsnForTimestamp::Present(lsn) => format!("{lsn}"),
struct Result { LsnForTimestamp::Future(_lsn) => "future".into(),
lsn: Lsn, LsnForTimestamp::Past(_lsn) => "past".into(),
kind: &'static str, LsnForTimestamp::NoData(_lsn) => "nodata".into(),
} };
let (lsn, kind) = match result { json_response(StatusCode::OK, result)
LsnForTimestamp::Present(lsn) => (lsn, "present"),
LsnForTimestamp::Future(lsn) => (lsn, "future"),
LsnForTimestamp::Past(lsn) => (lsn, "past"),
LsnForTimestamp::NoData(lsn) => (lsn, "nodata"),
};
json_response(StatusCode::OK, Result { lsn, kind })
} else {
// FIXME: this is a temporary crutch not to break backwards compatibility
// See https://github.com/neondatabase/neon/pull/5608
let result = match result {
LsnForTimestamp::Present(lsn) => format!("{lsn}"),
LsnForTimestamp::Future(_lsn) => "future".into(),
LsnForTimestamp::Past(_lsn) => "past".into(),
LsnForTimestamp::NoData(_lsn) => "nodata".into(),
};
json_response(StatusCode::OK, result)
}
} }
async fn get_timestamp_of_lsn_handler( async fn get_timestamp_of_lsn_handler(
@@ -775,7 +691,7 @@ async fn tenant_status(
check_permission(&request, Some(tenant_id))?; check_permission(&request, Some(tenant_id))?;
let tenant_info = async { let tenant_info = async {
let tenant = mgr::get_tenant(tenant_id, false)?; let tenant = mgr::get_tenant(tenant_id, false).await?;
// Calculate total physical size of all timelines // Calculate total physical size of all timelines
let mut current_physical_size = 0; let mut current_physical_size = 0;
@@ -838,7 +754,7 @@ async fn tenant_size_handler(
let headers = request.headers(); let headers = request.headers();
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let tenant = mgr::get_tenant(tenant_id, true)?; let tenant = mgr::get_tenant(tenant_id, true).await?;
// this can be long operation // this can be long operation
let inputs = tenant let inputs = tenant
@@ -851,10 +767,6 @@ async fn tenant_size_handler(
.map_err(ApiError::InternalServerError)?; .map_err(ApiError::InternalServerError)?;
let mut sizes = None; let mut sizes = None;
let accepts_html = headers
.get(header::ACCEPT)
.map(|v| v == "text/html")
.unwrap_or_default();
if !inputs_only.unwrap_or(false) { if !inputs_only.unwrap_or(false) {
let storage_model = inputs let storage_model = inputs
.calculate_model() .calculate_model()
@@ -862,19 +774,21 @@ async fn tenant_size_handler(
let size = storage_model.calculate(); let size = storage_model.calculate();
// If request header expects html, return html // If request header expects html, return html
if accepts_html { if headers["Accept"] == "text/html" {
return synthetic_size_html_response(inputs, storage_model, size); return synthetic_size_html_response(inputs, storage_model, size);
} }
sizes = Some(size); sizes = Some(size);
} else if accepts_html { } else if headers["Accept"] == "text/html" {
return Err(ApiError::BadRequest(anyhow!( return Err(ApiError::BadRequest(anyhow!(
"inputs_only parameter is incompatible with html output request" "inputs_only parameter is incompatible with html output request"
))); )));
} }
/// The type resides in the pageserver not to expose `ModelInputs`. /// The type resides in the pageserver not to expose `ModelInputs`.
#[serde_with::serde_as]
#[derive(serde::Serialize)] #[derive(serde::Serialize)]
struct TenantHistorySize { struct TenantHistorySize {
#[serde_as(as = "serde_with::DisplayFromStr")]
id: TenantId, id: TenantId,
/// Size is a mixture of WAL and logical size, so the unit is bytes. /// Size is a mixture of WAL and logical size, so the unit is bytes.
/// ///
@@ -1015,7 +929,7 @@ fn synthetic_size_html_response(
pub fn html_response(status: StatusCode, data: String) -> Result<Response<Body>, ApiError> { pub fn html_response(status: StatusCode, data: String) -> Result<Response<Body>, ApiError> {
let response = Response::builder() let response = Response::builder()
.status(status) .status(status)
.header(header::CONTENT_TYPE, "text/html") .header(hyper::header::CONTENT_TYPE, "text/html")
.body(Body::from(data.as_bytes().to_vec())) .body(Body::from(data.as_bytes().to_vec()))
.map_err(|e| ApiError::InternalServerError(e.into()))?; .map_err(|e| ApiError::InternalServerError(e.into()))?;
Ok(response) Ok(response)
@@ -1095,7 +1009,7 @@ async fn get_tenant_config_handler(
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?; check_permission(&request, Some(tenant_id))?;
let tenant = mgr::get_tenant(tenant_id, false)?; let tenant = mgr::get_tenant(tenant_id, false).await?;
let response = HashMap::from([ let response = HashMap::from([
( (
@@ -1154,7 +1068,7 @@ async fn put_tenant_location_config_handler(
.await .await
{ {
match e { match e {
TenantStateError::SlotError(TenantSlotError::NotFound(_)) => { TenantStateError::NotFound(_) => {
// This API is idempotent: a NotFound on a detach is fine. // This API is idempotent: a NotFound on a detach is fine.
} }
_ => return Err(e.into()), _ => return Err(e.into()),
@@ -1166,14 +1080,20 @@ async fn put_tenant_location_config_handler(
let location_conf = let location_conf =
LocationConf::try_from(&request_data.config).map_err(ApiError::BadRequest)?; LocationConf::try_from(&request_data.config).map_err(ApiError::BadRequest)?;
state mgr::upsert_location(
.tenant_manager state.conf,
.upsert_location(tenant_id, location_conf, &ctx) tenant_id,
.await location_conf,
// TODO: badrequest assumes the caller was asking for something unreasonable, but in state.broker_client.clone(),
// principle we might have hit something like concurrent API calls to the same tenant, state.remote_storage.clone(),
// which is not a 400 but a 409. state.deletion_queue_client.clone(),
.map_err(ApiError::BadRequest)?; &ctx,
)
.await
// TODO: badrequest assumes the caller was asking for something unreasonable, but in
// principle we might have hit something like concurrent API calls to the same tenant,
// which is not a 400 but a 409.
.map_err(ApiError::BadRequest)?;
json_response(StatusCode::OK, ()) json_response(StatusCode::OK, ())
} }
@@ -1186,6 +1106,7 @@ async fn handle_tenant_break(
let tenant_id: TenantId = parse_request_param(&r, "tenant_id")?; let tenant_id: TenantId = parse_request_param(&r, "tenant_id")?;
let tenant = crate::tenant::mgr::get_tenant(tenant_id, true) let tenant = crate::tenant::mgr::get_tenant(tenant_id, true)
.await
.map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?; .map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?;
tenant.set_broken("broken from test".to_owned()).await; tenant.set_broken("broken from test".to_owned()).await;
@@ -1258,7 +1179,7 @@ async fn timeline_compact_handler(
timeline timeline
.compact(&cancel, &ctx) .compact(&cancel, &ctx)
.await .await
.map_err(|e| ApiError::InternalServerError(e.into()))?; .map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, ()) json_response(StatusCode::OK, ())
} }
.instrument(info_span!("manual_compaction", %tenant_id, %timeline_id)) .instrument(info_span!("manual_compaction", %tenant_id, %timeline_id))
@@ -1283,7 +1204,7 @@ async fn timeline_checkpoint_handler(
timeline timeline
.compact(&cancel, &ctx) .compact(&cancel, &ctx)
.await .await
.map_err(|e| ApiError::InternalServerError(e.into()))?; .map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, ()) json_response(StatusCode::OK, ())
} }
@@ -1389,7 +1310,7 @@ async fn getpage_at_lsn_handler(
Result::<_, ApiError>::Ok( Result::<_, ApiError>::Ok(
Response::builder() Response::builder()
.status(StatusCode::OK) .status(StatusCode::OK)
.header(header::CONTENT_TYPE, "application/octet-stream") .header(CONTENT_TYPE, "application/octet-stream")
.body(hyper::Body::from(page)) .body(hyper::Body::from(page))
.unwrap(), .unwrap(),
) )
@@ -1490,7 +1411,7 @@ async fn active_timeline_of_active_tenant(
tenant_id: TenantId, tenant_id: TenantId,
timeline_id: TimelineId, timeline_id: TimelineId,
) -> Result<Arc<Timeline>, ApiError> { ) -> Result<Arc<Timeline>, ApiError> {
let tenant = mgr::get_tenant(tenant_id, true)?; let tenant = mgr::get_tenant(tenant_id, true).await?;
tenant tenant
.get_timeline(timeline_id, true) .get_timeline(timeline_id, true)
.map_err(|e| ApiError::NotFound(e.into())) .map_err(|e| ApiError::NotFound(e.into()))
@@ -1531,10 +1452,22 @@ async fn disk_usage_eviction_run(
} }
impl crate::disk_usage_eviction_task::Usage for Usage { impl crate::disk_usage_eviction_task::Usage for Usage {
fn has_pressure(&self) -> bool { fn over_pressure(&self) -> bool {
self.config.evict_bytes > self.freed_bytes self.config.evict_bytes > self.freed_bytes
} }
fn no_pressure(&self) -> bool {
!self.over_pressure()
}
fn pressure(&self) -> f64 {
if self.over_pressure() {
1.0
} else {
0.0
}
}
fn add_available_bytes(&mut self, bytes: u64) { fn add_available_bytes(&mut self, bytes: u64) {
self.freed_bytes += bytes; self.freed_bytes += bytes;
} }
@@ -1553,11 +1486,11 @@ async fn disk_usage_eviction_run(
let state = get_state(&r); let state = get_state(&r);
if state.remote_storage.as_ref().is_none() { let Some(storage) = state.remote_storage.clone() else {
return Err(ApiError::InternalServerError(anyhow::anyhow!( return Err(ApiError::InternalServerError(anyhow::anyhow!(
"remote storage not configured, cannot run eviction iteration" "remote storage not configured, cannot run eviction iteration"
))); )));
} };
let state = state.disk_usage_eviction_state.clone(); let state = state.disk_usage_eviction_state.clone();
@@ -1575,6 +1508,7 @@ async fn disk_usage_eviction_run(
async move { async move {
let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl( let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
&state, &state,
&storage,
usage, usage,
&child_cancel, &child_cancel,
) )
@@ -1667,8 +1601,6 @@ where
); );
match handle.await { match handle.await {
// TODO: never actually return Err from here, always Ok(...) so that we can log
// spanned errors. Call api_error_handler instead and return appropriate Body.
Ok(result) => result, Ok(result) => result,
Err(e) => { Err(e) => {
// The handler task panicked. We have a global panic handler that logs the // The handler task panicked. We have a global panic handler that logs the
@@ -1717,7 +1649,7 @@ where
pub fn make_router( pub fn make_router(
state: Arc<State>, state: Arc<State>,
launch_ts: &'static LaunchTimestamp, launch_ts: &'static LaunchTimestamp,
auth: Option<Arc<SwappableJwtAuth>>, auth: Option<Arc<JwtAuth>>,
) -> anyhow::Result<RouterBuilder<hyper::Body, ApiError>> { ) -> anyhow::Result<RouterBuilder<hyper::Body, ApiError>> {
let spec = include_bytes!("openapi_spec.yml"); let spec = include_bytes!("openapi_spec.yml");
let mut router = attach_openapi_ui(endpoint::make_router(), spec, "/swagger.yml", "/v1/doc"); let mut router = attach_openapi_ui(endpoint::make_router(), spec, "/swagger.yml", "/v1/doc");
@@ -1746,9 +1678,6 @@ pub fn make_router(
.put("/v1/failpoints", |r| { .put("/v1/failpoints", |r| {
testing_api_handler("manage failpoints", r, failpoints_handler) testing_api_handler("manage failpoints", r, failpoints_handler)
}) })
.post("/v1/reload_auth_validation_keys", |r| {
api_handler(r, reload_auth_validation_keys_handler)
})
.get("/v1/tenant", |r| api_handler(r, tenant_list_handler)) .get("/v1/tenant", |r| api_handler(r, tenant_list_handler))
.post("/v1/tenant", |r| api_handler(r, tenant_create_handler)) .post("/v1/tenant", |r| api_handler(r, tenant_create_handler))
.get("/v1/tenant/:tenant_id", |r| api_handler(r, tenant_status)) .get("/v1/tenant/:tenant_id", |r| api_handler(r, tenant_status))

View File

@@ -1,5 +1,3 @@
#![deny(clippy::undocumented_unsafe_blocks)]
mod auth; mod auth;
pub mod basebackup; pub mod basebackup;
pub mod config; pub mod config;
@@ -63,6 +61,14 @@ pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_cod
) )
.await; .await;
// Shut down any page service tasks.
timed(
task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None),
"shutdown PageRequestHandlers",
Duration::from_secs(1),
)
.await;
// Shut down all the tenants. This flushes everything to disk and kills // Shut down all the tenants. This flushes everything to disk and kills
// the checkpoint and GC tasks. // the checkpoint and GC tasks.
timed( timed(
@@ -72,15 +78,6 @@ pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_cod
) )
.await; .await;
// Shut down any page service tasks: any in-progress work for particular timelines or tenants
// should already have been canclled via mgr::shutdown_all_tenants
timed(
task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None),
"shutdown PageRequestHandlers",
Duration::from_secs(1),
)
.await;
// Best effort to persist any outstanding deletions, to avoid leaking objects // Best effort to persist any outstanding deletions, to avoid leaking objects
if let Some(mut deletion_queue) = deletion_queue { if let Some(mut deletion_queue) = deletion_queue {
deletion_queue.shutdown(Duration::from_secs(5)).await; deletion_queue.shutdown(Duration::from_secs(5)).await;
@@ -152,10 +149,6 @@ fn ends_with_suffix(path: &Utf8Path, suffix: &str) -> bool {
} }
} }
// FIXME: DO NOT ADD new query methods like this, which will have a next step of parsing timelineid
// from the directory name. Instead create type "UninitMark(TimelineId)" and only parse it once
// from the name.
pub fn is_uninit_mark(path: &Utf8Path) -> bool { pub fn is_uninit_mark(path: &Utf8Path) -> bool {
ends_with_suffix(path, TIMELINE_UNINIT_MARK_SUFFIX) ends_with_suffix(path, TIMELINE_UNINIT_MARK_SUFFIX)
} }

View File

@@ -962,32 +962,6 @@ static REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER: Lazy<IntCounterVec> = Lazy
.expect("failed to define a metric") .expect("failed to define a metric")
}); });
pub(crate) struct TenantManagerMetrics {
pub(crate) tenant_slots: UIntGauge,
pub(crate) tenant_slot_writes: IntCounter,
pub(crate) unexpected_errors: IntCounter,
}
pub(crate) static TENANT_MANAGER: Lazy<TenantManagerMetrics> = Lazy::new(|| {
TenantManagerMetrics {
tenant_slots: register_uint_gauge!(
"pageserver_tenant_manager_slots",
"How many slots currently exist, including all attached, secondary and in-progress operations",
)
.expect("failed to define a metric"),
tenant_slot_writes: register_int_counter!(
"pageserver_tenant_manager_slot_writes",
"Writes to a tenant slot, including all of create/attach/detach/delete"
)
.expect("failed to define a metric"),
unexpected_errors: register_int_counter!(
"pageserver_tenant_manager_unexpected_errors_total",
"Number of unexpected conditions encountered: nonzero value indicates a non-fatal bug."
)
.expect("failed to define a metric"),
}
});
pub(crate) struct DeletionQueueMetrics { pub(crate) struct DeletionQueueMetrics {
pub(crate) keys_submitted: IntCounter, pub(crate) keys_submitted: IntCounter,
pub(crate) keys_dropped: IntCounter, pub(crate) keys_dropped: IntCounter,
@@ -1225,6 +1199,15 @@ pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
.expect("failed to define a metric") .expect("failed to define a metric")
}); });
pub(crate) static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_wal_redo_wait_seconds",
"Time spent waiting for access to the Postgres WAL redo process",
redo_histogram_time_buckets!(),
)
.expect("failed to define a metric")
});
pub(crate) static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| { pub(crate) static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
register_histogram!( register_histogram!(
"pageserver_wal_redo_records_histogram", "pageserver_wal_redo_records_histogram",
@@ -1405,23 +1388,28 @@ impl TimelineMetrics {
} }
} }
pub(crate) fn record_new_file_metrics(&self, sz: u64) { pub fn record_new_file_metrics(&self, sz: u64) {
self.resident_physical_size_add(sz); self.resident_physical_size_add(sz);
self.num_persistent_files_created.inc_by(1); self.num_persistent_files_created.inc_by(1);
self.persistent_bytes_written.inc_by(sz); self.persistent_bytes_written.inc_by(sz);
} }
pub(crate) fn resident_physical_size_sub(&self, sz: u64) { pub fn resident_physical_size_sub(&self, sz: u64) {
self.resident_physical_size_gauge.sub(sz); self.resident_physical_size_gauge.sub(sz);
crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(sz); crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(sz);
} }
pub(crate) fn resident_physical_size_add(&self, sz: u64) { pub fn resident_physical_size_add(&self, sz: u64) {
self.resident_physical_size_gauge.add(sz); self.resident_physical_size_gauge.add(sz);
crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.add(sz); crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.add(sz);
} }
pub(crate) fn resident_physical_size_get(&self) -> u64 { pub fn resident_physical_size_set(&self, sz: u64) {
self.resident_physical_size_gauge.set(sz);
crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.set(sz);
}
pub fn resident_physical_size_get(&self) -> u64 {
self.resident_physical_size_gauge.get() self.resident_physical_size_gauge.get()
} }
} }
@@ -1901,9 +1889,6 @@ pub fn preinitialize_metrics() {
// Deletion queue stats // Deletion queue stats
Lazy::force(&DELETION_QUEUE); Lazy::force(&DELETION_QUEUE);
// Tenant manager stats
Lazy::force(&TENANT_MANAGER);
// countervecs // countervecs
[&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT] [&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
.into_iter() .into_iter()
@@ -1919,6 +1904,7 @@ pub fn preinitialize_metrics() {
&READ_NUM_FS_LAYERS, &READ_NUM_FS_LAYERS,
&WAIT_LSN_TIME, &WAIT_LSN_TIME,
&WAL_REDO_TIME, &WAL_REDO_TIME,
&WAL_REDO_WAIT_TIME,
&WAL_REDO_RECORDS_HISTOGRAM, &WAL_REDO_RECORDS_HISTOGRAM,
&WAL_REDO_BYTES_HISTOGRAM, &WAL_REDO_BYTES_HISTOGRAM,
] ]

View File

@@ -40,7 +40,7 @@ use tracing::field;
use tracing::*; use tracing::*;
use utils::id::ConnectionId; use utils::id::ConnectionId;
use utils::{ use utils::{
auth::{Claims, Scope, SwappableJwtAuth}, auth::{Claims, JwtAuth, Scope},
id::{TenantId, TimelineId}, id::{TenantId, TimelineId},
lsn::Lsn, lsn::Lsn,
simple_rcu::RcuReadGuard, simple_rcu::RcuReadGuard,
@@ -55,20 +55,16 @@ use crate::metrics;
use crate::metrics::LIVE_CONNECTIONS_COUNT; use crate::metrics::LIVE_CONNECTIONS_COUNT;
use crate::task_mgr; use crate::task_mgr;
use crate::task_mgr::TaskKind; use crate::task_mgr::TaskKind;
use crate::tenant;
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::mgr; use crate::tenant::mgr;
use crate::tenant::mgr::get_active_tenant_with_timeout; use crate::tenant::mgr::GetTenantError;
use crate::tenant::mgr::GetActiveTenantError; use crate::tenant::{Tenant, Timeline};
use crate::tenant::Timeline;
use crate::trace::Tracer; use crate::trace::Tracer;
use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
use postgres_ffi::BLCKSZ; use postgres_ffi::BLCKSZ;
// How long we may block waiting for a [`TenantSlot::InProgress`]` and/or a [`Tenant`] which
// is not yet in state [`TenantState::Active`].
const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000);
/// Read the end of a tar archive. /// Read the end of a tar archive.
/// ///
/// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each. /// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each.
@@ -122,7 +118,7 @@ async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()
pub async fn libpq_listener_main( pub async fn libpq_listener_main(
conf: &'static PageServerConf, conf: &'static PageServerConf,
broker_client: storage_broker::BrokerClientChannel, broker_client: storage_broker::BrokerClientChannel,
auth: Option<Arc<SwappableJwtAuth>>, auth: Option<Arc<JwtAuth>>,
listener: TcpListener, listener: TcpListener,
auth_type: AuthType, auth_type: AuthType,
listener_ctx: RequestContext, listener_ctx: RequestContext,
@@ -190,7 +186,7 @@ pub async fn libpq_listener_main(
async fn page_service_conn_main( async fn page_service_conn_main(
conf: &'static PageServerConf, conf: &'static PageServerConf,
broker_client: storage_broker::BrokerClientChannel, broker_client: storage_broker::BrokerClientChannel,
auth: Option<Arc<SwappableJwtAuth>>, auth: Option<Arc<JwtAuth>>,
socket: tokio::net::TcpStream, socket: tokio::net::TcpStream,
auth_type: AuthType, auth_type: AuthType,
connection_ctx: RequestContext, connection_ctx: RequestContext,
@@ -218,34 +214,22 @@ async fn page_service_conn_main(
// no write timeout is used, because the kernel is assumed to error writes after some time. // no write timeout is used, because the kernel is assumed to error writes after some time.
let mut socket = tokio_io_timeout::TimeoutReader::new(socket); let mut socket = tokio_io_timeout::TimeoutReader::new(socket);
let default_timeout_ms = 10 * 60 * 1000; // 10 minutes by default // timeout should be lower, but trying out multiple days for
let socket_timeout_ms = (|| { // <https://github.com/neondatabase/neon/issues/4205>
fail::fail_point!("simulated-bad-compute-connection", |avg_timeout_ms| { socket.set_timeout(Some(std::time::Duration::from_secs(60 * 60 * 24 * 3)));
// Exponential distribution for simulating
// poor network conditions, expect about avg_timeout_ms to be around 15
// in tests
if let Some(avg_timeout_ms) = avg_timeout_ms {
let avg = avg_timeout_ms.parse::<i64>().unwrap() as f32;
let u = rand::random::<f32>();
((1.0 - u).ln() / (-avg)) as u64
} else {
default_timeout_ms
}
});
default_timeout_ms
})();
// A timeout here does not mean the client died, it can happen if it's just idle for
// a while: we will tear down this PageServerHandler and instantiate a new one if/when
// they reconnect.
socket.set_timeout(Some(std::time::Duration::from_millis(socket_timeout_ms)));
let socket = std::pin::pin!(socket); let socket = std::pin::pin!(socket);
// XXX: pgbackend.run() should take the connection_ctx, // XXX: pgbackend.run() should take the connection_ctx,
// and create a child per-query context when it invokes process_query. // and create a child per-query context when it invokes process_query.
// But it's in a shared crate, so, we store connection_ctx inside PageServerHandler // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
// and create the per-query context in process_query ourselves. // and create the per-query context in process_query ourselves.
let mut conn_handler = PageServerHandler::new(conf, broker_client, auth, connection_ctx); let mut conn_handler = PageServerHandler::new(
conf,
broker_client,
auth,
connection_ctx,
task_mgr::shutdown_token(),
);
let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?; let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;
match pgbackend match pgbackend
@@ -271,7 +255,7 @@ async fn page_service_conn_main(
struct PageServerHandler { struct PageServerHandler {
_conf: &'static PageServerConf, _conf: &'static PageServerConf,
broker_client: storage_broker::BrokerClientChannel, broker_client: storage_broker::BrokerClientChannel,
auth: Option<Arc<SwappableJwtAuth>>, auth: Option<Arc<JwtAuth>>,
claims: Option<Claims>, claims: Option<Claims>,
/// The context created for the lifetime of the connection /// The context created for the lifetime of the connection
@@ -279,14 +263,19 @@ struct PageServerHandler {
/// For each query received over the connection, /// For each query received over the connection,
/// `process_query` creates a child context from this one. /// `process_query` creates a child context from this one.
connection_ctx: RequestContext, connection_ctx: RequestContext,
/// A token that should fire when the tenant transitions from
/// attached state, or when the pageserver is shutting down.
cancel: CancellationToken,
} }
impl PageServerHandler { impl PageServerHandler {
pub fn new( pub fn new(
conf: &'static PageServerConf, conf: &'static PageServerConf,
broker_client: storage_broker::BrokerClientChannel, broker_client: storage_broker::BrokerClientChannel,
auth: Option<Arc<SwappableJwtAuth>>, auth: Option<Arc<JwtAuth>>,
connection_ctx: RequestContext, connection_ctx: RequestContext,
cancel: CancellationToken,
) -> Self { ) -> Self {
PageServerHandler { PageServerHandler {
_conf: conf, _conf: conf,
@@ -294,6 +283,7 @@ impl PageServerHandler {
auth, auth,
claims: None, claims: None,
connection_ctx, connection_ctx,
cancel,
} }
} }
@@ -301,11 +291,7 @@ impl PageServerHandler {
/// this rather than naked flush() in order to shut down promptly. Without this, we would /// this rather than naked flush() in order to shut down promptly. Without this, we would
/// block shutdown of a tenant if a postgres client was failing to consume bytes we send /// block shutdown of a tenant if a postgres client was failing to consume bytes we send
/// in the flush. /// in the flush.
async fn flush_cancellable<IO>( async fn flush_cancellable<IO>(&self, pgb: &mut PostgresBackend<IO>) -> Result<(), QueryError>
&self,
pgb: &mut PostgresBackend<IO>,
cancel: &CancellationToken,
) -> Result<(), QueryError>
where where
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
{ {
@@ -313,7 +299,7 @@ impl PageServerHandler {
flush_r = pgb.flush() => { flush_r = pgb.flush() => {
Ok(flush_r?) Ok(flush_r?)
}, },
_ = cancel.cancelled() => { _ = self.cancel.cancelled() => {
Err(QueryError::Shutdown) Err(QueryError::Shutdown)
} }
) )
@@ -322,7 +308,6 @@ impl PageServerHandler {
fn copyin_stream<'a, IO>( fn copyin_stream<'a, IO>(
&'a self, &'a self,
pgb: &'a mut PostgresBackend<IO>, pgb: &'a mut PostgresBackend<IO>,
cancel: &'a CancellationToken,
) -> impl Stream<Item = io::Result<Bytes>> + 'a ) -> impl Stream<Item = io::Result<Bytes>> + 'a
where where
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
@@ -332,7 +317,7 @@ impl PageServerHandler {
let msg = tokio::select! { let msg = tokio::select! {
biased; biased;
_ = cancel.cancelled() => { _ = self.cancel.cancelled() => {
// We were requested to shut down. // We were requested to shut down.
let msg = "pageserver is shutting down"; let msg = "pageserver is shutting down";
let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, None)); let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, None));
@@ -372,7 +357,7 @@ impl PageServerHandler {
let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg))); let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
// error can't happen here, ErrorResponse serialization should be always ok // error can't happen here, ErrorResponse serialization should be always ok
pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?; pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
self.flush_cancellable(pgb, cancel).await.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?; self.flush_cancellable(pgb).await.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?; Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
} }
Err(QueryError::Disconnected(ConnectionError::Io(io_error))) => { Err(QueryError::Disconnected(ConnectionError::Io(io_error))) => {
@@ -399,13 +384,12 @@ impl PageServerHandler {
{ {
debug_assert_current_span_has_tenant_and_timeline_id(); debug_assert_current_span_has_tenant_and_timeline_id();
// NOTE: pagerequests handler exits when connection is closed,
// so there is no need to reset the association
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
// Make request tracer if needed // Make request tracer if needed
let tenant = mgr::get_active_tenant_with_timeout( let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
tenant_id,
ACTIVE_TENANT_TIMEOUT,
&task_mgr::shutdown_token(),
)
.await?;
let mut tracer = if tenant.get_trace_read_requests() { let mut tracer = if tenant.get_trace_read_requests() {
let connection_id = ConnectionId::generate(); let connection_id = ConnectionId::generate();
let path = tenant let path = tenant
@@ -421,14 +405,9 @@ impl PageServerHandler {
.get_timeline(timeline_id, true) .get_timeline(timeline_id, true)
.map_err(|e| anyhow::anyhow!(e))?; .map_err(|e| anyhow::anyhow!(e))?;
// Avoid starting new requests if the timeline has already started shutting down,
// and block timeline shutdown until this request is complete, or drops out due
// to cancellation.
let _timeline_guard = timeline.gate.enter().map_err(|_| QueryError::Shutdown)?;
// switch client to COPYBOTH // switch client to COPYBOTH
pgb.write_message_noflush(&BeMessage::CopyBothResponse)?; pgb.write_message_noflush(&BeMessage::CopyBothResponse)?;
self.flush_cancellable(pgb, &timeline.cancel).await?; self.flush_cancellable(pgb).await?;
let metrics = metrics::SmgrQueryTimePerTimeline::new(&tenant_id, &timeline_id); let metrics = metrics::SmgrQueryTimePerTimeline::new(&tenant_id, &timeline_id);
@@ -436,7 +415,7 @@ impl PageServerHandler {
let msg = tokio::select! { let msg = tokio::select! {
biased; biased;
_ = timeline.cancel.cancelled() => { _ = self.cancel.cancelled() => {
// We were requested to shut down. // We were requested to shut down.
info!("shutdown request received in page handler"); info!("shutdown request received in page handler");
return Err(QueryError::Shutdown) return Err(QueryError::Shutdown)
@@ -511,24 +490,9 @@ impl PageServerHandler {
} }
}; };
if let Err(e) = &response {
// Requests may fail as soon as we are Stopping, even if the Timeline's cancellation token wasn't fired yet,
// because wait_lsn etc will drop out
// is_stopping(): [`Timeline::flush_and_shutdown`] has entered
// is_canceled(): [`Timeline::shutdown`]` has entered
if timeline.cancel.is_cancelled() || timeline.is_stopping() {
// If we fail to fulfil a request during shutdown, which may be _because_ of
// shutdown, then do not send the error to the client. Instead just drop the
// connection.
span.in_scope(|| info!("dropped response during shutdown: {e:#}"));
return Err(QueryError::Shutdown);
}
}
let response = response.unwrap_or_else(|e| { let response = response.unwrap_or_else(|e| {
// print the all details to the log with {:#}, but for the client the // print the all details to the log with {:#}, but for the client the
// error message is enough. Do not log if shutting down, as the anyhow::Error // error message is enough
// here includes cancellation which is not an error.
span.in_scope(|| error!("error reading relation or page version: {:#}", e)); span.in_scope(|| error!("error reading relation or page version: {:#}", e));
PagestreamBeMessage::Error(PagestreamErrorResponse { PagestreamBeMessage::Error(PagestreamErrorResponse {
message: e.to_string(), message: e.to_string(),
@@ -536,7 +500,7 @@ impl PageServerHandler {
}); });
pgb.write_message_noflush(&BeMessage::CopyData(&response.serialize()))?; pgb.write_message_noflush(&BeMessage::CopyData(&response.serialize()))?;
self.flush_cancellable(pgb, &timeline.cancel).await?; self.flush_cancellable(pgb).await?;
} }
Ok(()) Ok(())
} }
@@ -558,14 +522,10 @@ impl PageServerHandler {
{ {
debug_assert_current_span_has_tenant_and_timeline_id(); debug_assert_current_span_has_tenant_and_timeline_id();
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
// Create empty timeline // Create empty timeline
info!("creating new timeline"); info!("creating new timeline");
let tenant = get_active_tenant_with_timeout( let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
tenant_id,
ACTIVE_TENANT_TIMEOUT,
&task_mgr::shutdown_token(),
)
.await?;
let timeline = tenant let timeline = tenant
.create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx) .create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)
.await?; .await?;
@@ -583,9 +543,9 @@ impl PageServerHandler {
// Import basebackup provided via CopyData // Import basebackup provided via CopyData
info!("importing basebackup"); info!("importing basebackup");
pgb.write_message_noflush(&BeMessage::CopyInResponse)?; pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
self.flush_cancellable(pgb, &tenant.cancel).await?; self.flush_cancellable(pgb).await?;
let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &tenant.cancel))); let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb)));
timeline timeline
.import_basebackup_from_tar( .import_basebackup_from_tar(
&mut copyin_reader, &mut copyin_reader,
@@ -622,10 +582,9 @@ impl PageServerHandler {
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
{ {
debug_assert_current_span_has_tenant_and_timeline_id(); debug_assert_current_span_has_tenant_and_timeline_id();
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
let timeline = self let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
.get_active_tenant_timeline(tenant_id, timeline_id)
.await?;
let last_record_lsn = timeline.get_last_record_lsn(); let last_record_lsn = timeline.get_last_record_lsn();
if last_record_lsn != start_lsn { if last_record_lsn != start_lsn {
return Err(QueryError::Other( return Err(QueryError::Other(
@@ -639,8 +598,8 @@ impl PageServerHandler {
// Import wal provided via CopyData // Import wal provided via CopyData
info!("importing wal"); info!("importing wal");
pgb.write_message_noflush(&BeMessage::CopyInResponse)?; pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
self.flush_cancellable(pgb, &timeline.cancel).await?; self.flush_cancellable(pgb).await?;
let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &timeline.cancel))); let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb)));
import_wal_from_tar(&timeline, &mut copyin_reader, start_lsn, end_lsn, &ctx).await?; import_wal_from_tar(&timeline, &mut copyin_reader, start_lsn, end_lsn, &ctx).await?;
info!("wal import complete"); info!("wal import complete");
@@ -833,9 +792,7 @@ impl PageServerHandler {
let started = std::time::Instant::now(); let started = std::time::Instant::now();
// check that the timeline exists // check that the timeline exists
let timeline = self let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
.get_active_tenant_timeline(tenant_id, timeline_id)
.await?;
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
if let Some(lsn) = lsn { if let Some(lsn) = lsn {
// Backup was requested at a particular LSN. Wait for it to arrive. // Backup was requested at a particular LSN. Wait for it to arrive.
@@ -850,7 +807,7 @@ impl PageServerHandler {
// switch client to COPYOUT // switch client to COPYOUT
pgb.write_message_noflush(&BeMessage::CopyOutResponse)?; pgb.write_message_noflush(&BeMessage::CopyOutResponse)?;
self.flush_cancellable(pgb, &timeline.cancel).await?; self.flush_cancellable(pgb).await?;
// Send a tarball of the latest layer on the timeline. Compress if not // Send a tarball of the latest layer on the timeline. Compress if not
// fullbackup. TODO Compress in that case too (tests need to be updated) // fullbackup. TODO Compress in that case too (tests need to be updated)
@@ -902,7 +859,7 @@ impl PageServerHandler {
} }
pgb.write_message_noflush(&BeMessage::CopyDone)?; pgb.write_message_noflush(&BeMessage::CopyDone)?;
self.flush_cancellable(pgb, &timeline.cancel).await?; self.flush_cancellable(pgb).await?;
let basebackup_after = started let basebackup_after = started
.elapsed() .elapsed()
@@ -920,7 +877,7 @@ impl PageServerHandler {
// when accessing management api supply None as an argument // when accessing management api supply None as an argument
// when using to authorize tenant pass corresponding tenant id // when using to authorize tenant pass corresponding tenant id
fn check_permission(&self, tenant_id: Option<TenantId>) -> Result<(), QueryError> { fn check_permission(&self, tenant_id: Option<TenantId>) -> anyhow::Result<()> {
if self.auth.is_none() { if self.auth.is_none() {
// auth is set to Trust, nothing to check so just return ok // auth is set to Trust, nothing to check so just return ok
return Ok(()); return Ok(());
@@ -932,26 +889,7 @@ impl PageServerHandler {
.claims .claims
.as_ref() .as_ref()
.expect("claims presence already checked"); .expect("claims presence already checked");
check_permission(claims, tenant_id).map_err(|e| QueryError::Unauthorized(e.0)) check_permission(claims, tenant_id)
}
/// Shorthand for getting a reference to a Timeline of an Active tenant.
async fn get_active_tenant_timeline(
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> Result<Arc<Timeline>, GetActiveTimelineError> {
let tenant = get_active_tenant_with_timeout(
tenant_id,
ACTIVE_TENANT_TIMEOUT,
&task_mgr::shutdown_token(),
)
.await
.map_err(GetActiveTimelineError::Tenant)?;
let timeline = tenant
.get_timeline(timeline_id, true)
.map_err(|e| GetActiveTimelineError::Timeline(anyhow::anyhow!(e)))?;
Ok(timeline)
} }
} }
@@ -971,17 +909,16 @@ where
.auth .auth
.as_ref() .as_ref()
.unwrap() .unwrap()
.decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?) .decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)?;
.map_err(|e| QueryError::Unauthorized(e.0))?;
if matches!(data.claims.scope, Scope::Tenant) && data.claims.tenant_id.is_none() { if matches!(data.claims.scope, Scope::Tenant) && data.claims.tenant_id.is_none() {
return Err(QueryError::Unauthorized( return Err(QueryError::Other(anyhow::anyhow!(
"jwt token scope is Tenant, but tenant id is missing".into(), "jwt token scope is Tenant, but tenant id is missing"
)); )));
} }
debug!( info!(
"jwt scope check succeeded for scope: {:#?} by tenant id: {:?}", "jwt auth succeeded for scope: {:#?} by tenant id: {:?}",
data.claims.scope, data.claims.tenant_id, data.claims.scope, data.claims.tenant_id,
); );
@@ -1003,13 +940,9 @@ where
pgb: &mut PostgresBackend<IO>, pgb: &mut PostgresBackend<IO>,
query_string: &str, query_string: &str,
) -> Result<(), QueryError> { ) -> Result<(), QueryError> {
fail::fail_point!("simulated-bad-compute-connection", |_| {
info!("Hit failpoint for bad connection");
Err(QueryError::SimulatedConnectionError)
});
let ctx = self.connection_ctx.attached_child(); let ctx = self.connection_ctx.attached_child();
debug!("process query {query_string:?}"); debug!("process query {query_string:?}");
if query_string.starts_with("pagestream ") { if query_string.starts_with("pagestream ") {
let (_, params_raw) = query_string.split_at("pagestream ".len()); let (_, params_raw) = query_string.split_at("pagestream ".len());
let params = params_raw.split(' ').collect::<Vec<_>>(); let params = params_raw.split(' ').collect::<Vec<_>>();
@@ -1115,9 +1048,7 @@ where
.record("timeline_id", field::display(timeline_id)); .record("timeline_id", field::display(timeline_id));
self.check_permission(Some(tenant_id))?; self.check_permission(Some(tenant_id))?;
let timeline = self let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
.get_active_tenant_timeline(tenant_id, timeline_id)
.await?;
let end_of_timeline = timeline.get_last_record_rlsn(); let end_of_timeline = timeline.get_last_record_rlsn();
@@ -1301,12 +1232,7 @@ where
self.check_permission(Some(tenant_id))?; self.check_permission(Some(tenant_id))?;
let tenant = get_active_tenant_with_timeout( let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
tenant_id,
ACTIVE_TENANT_TIMEOUT,
&task_mgr::shutdown_token(),
)
.await?;
pgb.write_message_noflush(&BeMessage::RowDescription(&[ pgb.write_message_noflush(&BeMessage::RowDescription(&[
RowDescriptor::int8_col(b"checkpoint_distance"), RowDescriptor::int8_col(b"checkpoint_distance"),
RowDescriptor::int8_col(b"checkpoint_timeout"), RowDescriptor::int8_col(b"checkpoint_timeout"),
@@ -1352,16 +1278,67 @@ where
} }
} }
#[derive(thiserror::Error, Debug)]
enum GetActiveTenantError {
#[error(
"Timed out waiting {wait_time:?} for tenant active state. Latest state: {latest_state:?}"
)]
WaitForActiveTimeout {
latest_state: TenantState,
wait_time: Duration,
},
#[error(transparent)]
NotFound(GetTenantError),
#[error(transparent)]
WaitTenantActive(tenant::WaitToBecomeActiveError),
}
impl From<GetActiveTenantError> for QueryError { impl From<GetActiveTenantError> for QueryError {
fn from(e: GetActiveTenantError) -> Self { fn from(e: GetActiveTenantError) -> Self {
match e { match e {
GetActiveTenantError::WaitForActiveTimeout { .. } => QueryError::Disconnected( GetActiveTenantError::WaitForActiveTimeout { .. } => QueryError::Disconnected(
ConnectionError::Io(io::Error::new(io::ErrorKind::TimedOut, e.to_string())), ConnectionError::Io(io::Error::new(io::ErrorKind::TimedOut, e.to_string())),
), ),
GetActiveTenantError::WillNotBecomeActive(TenantState::Stopping { .. }) => { GetActiveTenantError::WaitTenantActive(e) => QueryError::Other(anyhow::Error::new(e)),
QueryError::Shutdown GetActiveTenantError::NotFound(e) => QueryError::Other(anyhow::Error::new(e)),
}
}
}
/// Get active tenant.
///
/// If the tenant is Loading, waits for it to become Active, for up to 30 s. That
/// ensures that queries don't fail immediately after pageserver startup, because
/// all tenants are still loading.
async fn get_active_tenant_with_timeout(
tenant_id: TenantId,
_ctx: &RequestContext, /* require get a context to support cancellation in the future */
) -> Result<Arc<Tenant>, GetActiveTenantError> {
let tenant = match mgr::get_tenant(tenant_id, false).await {
Ok(tenant) => tenant,
Err(e @ GetTenantError::NotFound(_)) => return Err(GetActiveTenantError::NotFound(e)),
Err(GetTenantError::NotActive(_)) => {
unreachable!("we're calling get_tenant with active_only=false")
}
Err(GetTenantError::Broken(_)) => {
unreachable!("we're calling get_tenant with active_only=false")
}
};
let wait_time = Duration::from_secs(30);
match tokio::time::timeout(wait_time, tenant.wait_to_become_active()).await {
Ok(Ok(())) => Ok(tenant),
// no .context(), the error message is good enough and some tests depend on it
Ok(Err(e)) => Err(GetActiveTenantError::WaitTenantActive(e)),
Err(_) => {
let latest_state = tenant.current_state();
if latest_state == TenantState::Active {
Ok(tenant)
} else {
Err(GetActiveTenantError::WaitForActiveTimeout {
latest_state,
wait_time,
})
} }
e => QueryError::Other(anyhow::anyhow!(e)),
} }
} }
} }
@@ -1382,3 +1359,18 @@ impl From<GetActiveTimelineError> for QueryError {
} }
} }
} }
/// Shorthand for getting a reference to a Timeline of an Active tenant.
async fn get_active_tenant_timeline(
tenant_id: TenantId,
timeline_id: TimelineId,
ctx: &RequestContext,
) -> Result<Arc<Timeline>, GetActiveTimelineError> {
let tenant = get_active_tenant_with_timeout(tenant_id, ctx)
.await
.map_err(GetActiveTimelineError::Tenant)?;
let timeline = tenant
.get_timeline(timeline_id, true)
.map_err(|e| GetActiveTimelineError::Timeline(anyhow::anyhow!(e)))?;
Ok(timeline)
}

View File

@@ -21,6 +21,7 @@ use serde::{Deserialize, Serialize};
use std::collections::{hash_map, HashMap, HashSet}; use std::collections::{hash_map, HashMap, HashSet};
use std::ops::ControlFlow; use std::ops::ControlFlow;
use std::ops::Range; use std::ops::Range;
use tokio_util::sync::CancellationToken;
use tracing::{debug, trace, warn}; use tracing::{debug, trace, warn};
use utils::{bin_ser::BeSer, lsn::Lsn}; use utils::{bin_ser::BeSer, lsn::Lsn};
@@ -43,17 +44,6 @@ pub enum CalculateLogicalSizeError {
Other(#[from] anyhow::Error), Other(#[from] anyhow::Error),
} }
impl From<PageReconstructError> for CalculateLogicalSizeError {
fn from(pre: PageReconstructError) -> Self {
match pre {
PageReconstructError::AncestorStopping(_) | PageReconstructError::Cancelled => {
Self::Cancelled
}
_ => Self::Other(pre.into()),
}
}
}
#[derive(Debug, thiserror::Error)] #[derive(Debug, thiserror::Error)]
pub enum RelationError { pub enum RelationError {
#[error("Relation Already Exists")] #[error("Relation Already Exists")]
@@ -562,8 +552,7 @@ impl Timeline {
Err(e) => Err(PageReconstructError::from(e)), Err(e) => Err(PageReconstructError::from(e)),
}, },
Err(e) => { Err(e) => {
// This is expected: historical databases do not have the key. warn!("Failed to get info about AUX files: {}", e);
debug!("Failed to get info about AUX files: {}", e);
Ok(HashMap::new()) Ok(HashMap::new())
} }
} }
@@ -577,22 +566,30 @@ impl Timeline {
pub async fn get_current_logical_size_non_incremental( pub async fn get_current_logical_size_non_incremental(
&self, &self,
lsn: Lsn, lsn: Lsn,
cancel: CancellationToken,
ctx: &RequestContext, ctx: &RequestContext,
) -> Result<u64, CalculateLogicalSizeError> { ) -> Result<u64, CalculateLogicalSizeError> {
crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id(); crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
// Fetch list of database dirs and iterate them // Fetch list of database dirs and iterate them
let buf = self.get(DBDIR_KEY, lsn, ctx).await?; let buf = self.get(DBDIR_KEY, lsn, ctx).await.context("read dbdir")?;
let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?; let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?;
let mut total_size: u64 = 0; let mut total_size: u64 = 0;
for (spcnode, dbnode) in dbdir.dbdirs.keys() { for (spcnode, dbnode) in dbdir.dbdirs.keys() {
for rel in self.list_rels(*spcnode, *dbnode, lsn, ctx).await? { for rel in self
if self.cancel.is_cancelled() { .list_rels(*spcnode, *dbnode, lsn, ctx)
.await
.context("list rels")?
{
if cancel.is_cancelled() {
return Err(CalculateLogicalSizeError::Cancelled); return Err(CalculateLogicalSizeError::Cancelled);
} }
let relsize_key = rel_size_to_key(rel); let relsize_key = rel_size_to_key(rel);
let mut buf = self.get(relsize_key, lsn, ctx).await?; let mut buf = self
.get(relsize_key, lsn, ctx)
.await
.with_context(|| format!("read relation size of {rel:?}"))?;
let relsize = buf.get_u32_le(); let relsize = buf.get_u32_le();
total_size += relsize as u64; total_size += relsize as u64;
@@ -678,9 +675,8 @@ impl Timeline {
result.add_key(CONTROLFILE_KEY); result.add_key(CONTROLFILE_KEY);
result.add_key(CHECKPOINT_KEY); result.add_key(CHECKPOINT_KEY);
if self.get(AUX_FILES_KEY, lsn, ctx).await.is_ok() { result.add_key(AUX_FILES_KEY);
result.add_key(AUX_FILES_KEY);
}
Ok(result.to_keyspace()) Ok(result.to_keyspace())
} }
@@ -1205,8 +1201,7 @@ impl<'a> DatadirModification<'a> {
let mut dir = match self.get(AUX_FILES_KEY, ctx).await { let mut dir = match self.get(AUX_FILES_KEY, ctx).await {
Ok(buf) => AuxFilesDirectory::des(&buf)?, Ok(buf) => AuxFilesDirectory::des(&buf)?,
Err(e) => { Err(e) => {
// This is expected: historical databases do not have the key. warn!("Failed to get info about AUX files: {}", e);
debug!("Failed to get info about AUX files: {}", e);
AuxFilesDirectory { AuxFilesDirectory {
files: HashMap::new(), files: HashMap::new(),
} }

View File

@@ -299,6 +299,10 @@ pub enum TaskKind {
#[derive(Default)] #[derive(Default)]
struct MutableTaskState { struct MutableTaskState {
/// Tenant and timeline that this task is associated with.
tenant_id: Option<TenantId>,
timeline_id: Option<TimelineId>,
/// Handle for waiting for the task to exit. It can be None, if the /// Handle for waiting for the task to exit. It can be None, if the
/// the task has already exited. /// the task has already exited.
join_handle: Option<JoinHandle<()>>, join_handle: Option<JoinHandle<()>>,
@@ -315,11 +319,6 @@ struct PageServerTask {
// To request task shutdown, just cancel this token. // To request task shutdown, just cancel this token.
cancel: CancellationToken, cancel: CancellationToken,
/// Tasks may optionally be launched for a particular tenant/timeline, enabling
/// later cancelling tasks for that tenant/timeline in [`shutdown_tasks`]
tenant_id: Option<TenantId>,
timeline_id: Option<TimelineId>,
mutable: Mutex<MutableTaskState>, mutable: Mutex<MutableTaskState>,
} }
@@ -345,9 +344,11 @@ where
kind, kind,
name: name.to_string(), name: name.to_string(),
cancel: cancel.clone(), cancel: cancel.clone(),
tenant_id, mutable: Mutex::new(MutableTaskState {
timeline_id, tenant_id,
mutable: Mutex::new(MutableTaskState { join_handle: None }), timeline_id,
join_handle: None,
}),
}); });
TASKS.lock().unwrap().insert(task_id, Arc::clone(&task)); TASKS.lock().unwrap().insert(task_id, Arc::clone(&task));
@@ -417,6 +418,8 @@ async fn task_finish(
let mut shutdown_process = false; let mut shutdown_process = false;
{ {
let task_mut = task.mutable.lock().unwrap();
match result { match result {
Ok(Ok(())) => { Ok(Ok(())) => {
debug!("Task '{}' exited normally", task_name); debug!("Task '{}' exited normally", task_name);
@@ -425,13 +428,13 @@ async fn task_finish(
if shutdown_process_on_error { if shutdown_process_on_error {
error!( error!(
"Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}", "Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}",
task_name, task.tenant_id, task.timeline_id, err task_name, task_mut.tenant_id, task_mut.timeline_id, err
); );
shutdown_process = true; shutdown_process = true;
} else { } else {
error!( error!(
"Task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}", "Task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}",
task_name, task.tenant_id, task.timeline_id, err task_name, task_mut.tenant_id, task_mut.timeline_id, err
); );
} }
} }
@@ -439,13 +442,13 @@ async fn task_finish(
if shutdown_process_on_error { if shutdown_process_on_error {
error!( error!(
"Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}", "Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}",
task_name, task.tenant_id, task.timeline_id, err task_name, task_mut.tenant_id, task_mut.timeline_id, err
); );
shutdown_process = true; shutdown_process = true;
} else { } else {
error!( error!(
"Task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}", "Task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}",
task_name, task.tenant_id, task.timeline_id, err task_name, task_mut.tenant_id, task_mut.timeline_id, err
); );
} }
} }
@@ -457,6 +460,17 @@ async fn task_finish(
} }
} }
// expected to be called from the task of the given id.
pub fn associate_with(tenant_id: Option<TenantId>, timeline_id: Option<TimelineId>) {
CURRENT_TASK.with(|ct| {
let mut task_mut = ct.mutable.lock().unwrap();
task_mut.tenant_id = tenant_id;
task_mut.timeline_id = timeline_id;
});
}
/// Is there a task running that matches the criteria
/// Signal and wait for tasks to shut down. /// Signal and wait for tasks to shut down.
/// ///
/// ///
@@ -479,16 +493,17 @@ pub async fn shutdown_tasks(
{ {
let tasks = TASKS.lock().unwrap(); let tasks = TASKS.lock().unwrap();
for task in tasks.values() { for task in tasks.values() {
let task_mut = task.mutable.lock().unwrap();
if (kind.is_none() || Some(task.kind) == kind) if (kind.is_none() || Some(task.kind) == kind)
&& (tenant_id.is_none() || task.tenant_id == tenant_id) && (tenant_id.is_none() || task_mut.tenant_id == tenant_id)
&& (timeline_id.is_none() || task.timeline_id == timeline_id) && (timeline_id.is_none() || task_mut.timeline_id == timeline_id)
{ {
task.cancel.cancel(); task.cancel.cancel();
victim_tasks.push(( victim_tasks.push((
Arc::clone(task), Arc::clone(task),
task.kind, task.kind,
task.tenant_id, task_mut.tenant_id,
task.timeline_id, task_mut.timeline_id,
)); ));
} }
} }

File diff suppressed because it is too large Load Diff

View File

@@ -327,7 +327,7 @@ mod tests {
let mut sz: u16 = rng.gen(); let mut sz: u16 = rng.gen();
// Make 50% of the arrays small // Make 50% of the arrays small
if rng.gen() { if rng.gen() {
sz &= 63; sz |= 63;
} }
random_array(sz.into()) random_array(sz.into())
}) })

View File

@@ -3,10 +3,10 @@ use std::sync::Arc;
use anyhow::Context; use anyhow::Context;
use camino::{Utf8Path, Utf8PathBuf}; use camino::{Utf8Path, Utf8PathBuf};
use pageserver_api::models::TenantState; use pageserver_api::models::TenantState;
use remote_storage::{GenericRemoteStorage, RemotePath}; use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
use tokio::sync::OwnedMutexGuard; use tokio::sync::OwnedMutexGuard;
use tokio_util::sync::CancellationToken; use tokio_util::sync::CancellationToken;
use tracing::{error, instrument, warn, Instrument, Span}; use tracing::{error, info, instrument, warn, Instrument, Span};
use utils::{ use utils::{
backoff, completion, crashsafe, fs_ext, backoff, completion, crashsafe, fs_ext,
@@ -21,33 +21,26 @@ use crate::{
}; };
use super::{ use super::{
mgr::{GetTenantError, TenantSlotError, TenantSlotUpsertError, TenantsMap}, mgr::{GetTenantError, TenantsMap},
remote_timeline_client::{FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD}, remote_timeline_client::{FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD},
span, span,
timeline::delete::DeleteTimelineFlow, timeline::delete::DeleteTimelineFlow,
tree_sort_timelines, DeleteTimelineError, Tenant, TenantPreload, tree_sort_timelines, DeleteTimelineError, Tenant,
}; };
const SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS: u32 = 3;
#[derive(Debug, thiserror::Error)] #[derive(Debug, thiserror::Error)]
pub(crate) enum DeleteTenantError { pub(crate) enum DeleteTenantError {
#[error("GetTenant {0}")] #[error("GetTenant {0}")]
Get(#[from] GetTenantError), Get(#[from] GetTenantError),
#[error("Tenant not attached")]
NotAttached,
#[error("Invalid state {0}. Expected Active or Broken")] #[error("Invalid state {0}. Expected Active or Broken")]
InvalidState(TenantState), InvalidState(TenantState),
#[error("Tenant deletion is already in progress")] #[error("Tenant deletion is already in progress")]
AlreadyInProgress, AlreadyInProgress,
#[error("Tenant map slot error {0}")]
SlotError(#[from] TenantSlotError),
#[error("Tenant map slot upsert error {0}")]
SlotUpsertError(#[from] TenantSlotUpsertError),
#[error("Timeline {0}")] #[error("Timeline {0}")]
Timeline(#[from] DeleteTimelineError), Timeline(#[from] DeleteTimelineError),
@@ -67,7 +60,7 @@ fn remote_tenant_delete_mark_path(
.context("Failed to strip workdir prefix") .context("Failed to strip workdir prefix")
.and_then(RemotePath::new) .and_then(RemotePath::new)
.context("tenant path")?; .context("tenant path")?;
Ok(tenant_remote_path.join(Utf8Path::new("timelines/deleted"))) Ok(tenant_remote_path.join(Utf8Path::new("deleted")))
} }
async fn create_remote_delete_mark( async fn create_remote_delete_mark(
@@ -157,8 +150,7 @@ async fn ensure_timelines_dir_empty(timelines_path: &Utf8Path) -> Result<(), Del
// Assert timelines dir is empty. // Assert timelines dir is empty.
if !fs_ext::is_directory_empty(timelines_path).await? { if !fs_ext::is_directory_empty(timelines_path).await? {
// Display first 10 items in directory // Display first 10 items in directory
let list = fs_ext::list_dir(timelines_path).await.context("list_dir")?; let list = &fs_ext::list_dir(timelines_path).await.context("list_dir")?[..10];
let list = &list.into_iter().take(10).collect::<Vec<_>>();
return Err(DeleteTenantError::Other(anyhow::anyhow!( return Err(DeleteTenantError::Other(anyhow::anyhow!(
"Timelines directory is not empty after all timelines deletion: {list:?}" "Timelines directory is not empty after all timelines deletion: {list:?}"
))); )));
@@ -247,6 +239,32 @@ async fn cleanup_remaining_fs_traces(
Ok(()) Ok(())
} }
pub(crate) async fn remote_delete_mark_exists(
conf: &PageServerConf,
tenant_id: &TenantId,
remote_storage: &GenericRemoteStorage,
) -> anyhow::Result<bool> {
// If remote storage is there we rely on it
let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_id).context("path")?;
let result = backoff::retry(
|| async { remote_storage.download(&remote_mark_path).await },
|e| matches!(e, DownloadError::NotFound),
SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
"fetch_tenant_deletion_mark",
// TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
)
.await;
match result {
Ok(_) => Ok(true),
Err(DownloadError::NotFound) => Ok(false),
Err(e) => Err(anyhow::anyhow!(e)).context("remote_delete_mark_exists")?,
}
}
/// Orchestrates tenant shut down of all tasks, removes its in-memory structures, /// Orchestrates tenant shut down of all tasks, removes its in-memory structures,
/// and deletes its data from both disk and s3. /// and deletes its data from both disk and s3.
/// The sequence of steps: /// The sequence of steps:
@@ -258,9 +276,10 @@ async fn cleanup_remaining_fs_traces(
/// 6. Remove remote mark /// 6. Remove remote mark
/// 7. Cleanup remaining fs traces, tenant dir, config, timelines dir, local delete mark /// 7. Cleanup remaining fs traces, tenant dir, config, timelines dir, local delete mark
/// It is resumable from any step in case a crash/restart occurs. /// It is resumable from any step in case a crash/restart occurs.
/// There are two entrypoints to the process: /// There are three entrypoints to the process:
/// 1. [`DeleteTenantFlow::run`] this is the main one called by a management api handler. /// 1. [`DeleteTenantFlow::run`] this is the main one called by a management api handler.
/// 2. [`DeleteTenantFlow::resume_from_attach`] is called when deletion is resumed tenant is found to be deleted during attach process. /// 2. [`DeleteTenantFlow::resume_from_load`] is called during restarts when local or remote deletion marks are still there.
/// 3. [`DeleteTenantFlow::resume_from_attach`] is called when deletion is resumed tenant is found to be deleted during attach process.
/// Note the only other place that messes around timeline delete mark is the `Tenant::spawn_load` function. /// Note the only other place that messes around timeline delete mark is the `Tenant::spawn_load` function.
#[derive(Default)] #[derive(Default)]
pub enum DeleteTenantFlow { pub enum DeleteTenantFlow {
@@ -282,12 +301,12 @@ impl DeleteTenantFlow {
pub(crate) async fn run( pub(crate) async fn run(
conf: &'static PageServerConf, conf: &'static PageServerConf,
remote_storage: Option<GenericRemoteStorage>, remote_storage: Option<GenericRemoteStorage>,
tenants: &'static std::sync::RwLock<TenantsMap>, tenants: &'static tokio::sync::RwLock<TenantsMap>,
tenant: Arc<Tenant>, tenant_id: TenantId,
) -> Result<(), DeleteTenantError> { ) -> Result<(), DeleteTenantError> {
span::debug_assert_current_span_has_tenant_id(); span::debug_assert_current_span_has_tenant_id();
let mut guard = Self::prepare(&tenant).await?; let (tenant, mut guard) = Self::prepare(tenants, tenant_id).await?;
if let Err(e) = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await { if let Err(e) = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await {
tenant.set_broken(format!("{e:#}")).await; tenant.set_broken(format!("{e:#}")).await;
@@ -359,7 +378,7 @@ impl DeleteTenantFlow {
pub(crate) async fn should_resume_deletion( pub(crate) async fn should_resume_deletion(
conf: &'static PageServerConf, conf: &'static PageServerConf,
remote_mark_exists: bool, remote_storage: Option<&GenericRemoteStorage>,
tenant: &Tenant, tenant: &Tenant,
) -> Result<Option<DeletionGuard>, DeleteTenantError> { ) -> Result<Option<DeletionGuard>, DeleteTenantError> {
let acquire = |t: &Tenant| { let acquire = |t: &Tenant| {
@@ -370,25 +389,66 @@ impl DeleteTenantFlow {
) )
}; };
if remote_mark_exists {
return Ok(acquire(tenant));
}
let tenant_id = tenant.tenant_id; let tenant_id = tenant.tenant_id;
// Check local mark first, if its there there is no need to go to s3 to check whether remote one exists. // Check local mark first, if its there there is no need to go to s3 to check whether remote one exists.
if conf.tenant_deleted_mark_file_path(&tenant_id).exists() { if conf.tenant_deleted_mark_file_path(&tenant_id).exists() {
return Ok(acquire(tenant));
}
let remote_storage = match remote_storage {
Some(remote_storage) => remote_storage,
None => return Ok(None),
};
if remote_delete_mark_exists(conf, &tenant_id, remote_storage).await? {
Ok(acquire(tenant)) Ok(acquire(tenant))
} else { } else {
Ok(None) Ok(None)
} }
} }
pub(crate) async fn resume_from_load(
guard: DeletionGuard,
tenant: &Arc<Tenant>,
init_order: Option<&InitializationOrder>,
tenants: &'static tokio::sync::RwLock<TenantsMap>,
ctx: &RequestContext,
) -> Result<(), DeleteTenantError> {
let (_, progress) = completion::channel();
tenant
.set_stopping(progress, true, false)
.await
.expect("cant be stopping or broken");
// Do not consume valuable resources during the load phase, continue deletion once init phase is complete.
let background_jobs_can_start = init_order.as_ref().map(|x| &x.background_jobs_can_start);
if let Some(background) = background_jobs_can_start {
info!("waiting for backgound jobs barrier");
background.clone().wait().await;
info!("ready for backgound jobs barrier");
}
// Tenant may not be loadable if we fail late in cleanup_remaining_fs_traces (e g remove timelines dir)
let timelines_path = tenant.conf.timelines_path(&tenant.tenant_id);
if timelines_path.exists() {
tenant.load(init_order, None, ctx).await.context("load")?;
}
Self::background(
guard,
tenant.conf,
tenant.remote_storage.clone(),
tenants,
tenant,
)
.await
}
pub(crate) async fn resume_from_attach( pub(crate) async fn resume_from_attach(
guard: DeletionGuard, guard: DeletionGuard,
tenant: &Arc<Tenant>, tenant: &Arc<Tenant>,
preload: Option<TenantPreload>, tenants: &'static tokio::sync::RwLock<TenantsMap>,
tenants: &'static std::sync::RwLock<TenantsMap>,
init_order: Option<InitializationOrder>,
ctx: &RequestContext, ctx: &RequestContext,
) -> Result<(), DeleteTenantError> { ) -> Result<(), DeleteTenantError> {
let (_, progress) = completion::channel(); let (_, progress) = completion::channel();
@@ -399,7 +459,7 @@ impl DeleteTenantFlow {
.expect("cant be stopping or broken"); .expect("cant be stopping or broken");
tenant tenant
.attach(init_order, preload, ctx) .attach(ctx, super::AttachMarkerMode::Expect)
.await .await
.context("attach")?; .context("attach")?;
@@ -414,8 +474,15 @@ impl DeleteTenantFlow {
} }
async fn prepare( async fn prepare(
tenant: &Arc<Tenant>, tenants: &tokio::sync::RwLock<TenantsMap>,
) -> Result<tokio::sync::OwnedMutexGuard<Self>, DeleteTenantError> { tenant_id: TenantId,
) -> Result<(Arc<Tenant>, tokio::sync::OwnedMutexGuard<Self>), DeleteTenantError> {
let m = tenants.read().await;
let tenant = m
.get(&tenant_id)
.ok_or(GetTenantError::NotFound(tenant_id))?;
// FIXME: unsure about active only. Our init jobs may not be cancellable properly, // FIXME: unsure about active only. Our init jobs may not be cancellable properly,
// so at least for now allow deletions only for active tenants. TODO recheck // so at least for now allow deletions only for active tenants. TODO recheck
// Broken and Stopping is needed for retries. // Broken and Stopping is needed for retries.
@@ -449,14 +516,14 @@ impl DeleteTenantFlow {
))); )));
} }
Ok(guard) Ok((Arc::clone(tenant), guard))
} }
fn schedule_background( fn schedule_background(
guard: OwnedMutexGuard<Self>, guard: OwnedMutexGuard<Self>,
conf: &'static PageServerConf, conf: &'static PageServerConf,
remote_storage: Option<GenericRemoteStorage>, remote_storage: Option<GenericRemoteStorage>,
tenants: &'static std::sync::RwLock<TenantsMap>, tenants: &'static tokio::sync::RwLock<TenantsMap>,
tenant: Arc<Tenant>, tenant: Arc<Tenant>,
) { ) {
let tenant_id = tenant.tenant_id; let tenant_id = tenant.tenant_id;
@@ -489,7 +556,7 @@ impl DeleteTenantFlow {
mut guard: OwnedMutexGuard<Self>, mut guard: OwnedMutexGuard<Self>,
conf: &PageServerConf, conf: &PageServerConf,
remote_storage: Option<GenericRemoteStorage>, remote_storage: Option<GenericRemoteStorage>,
tenants: &'static std::sync::RwLock<TenantsMap>, tenants: &'static tokio::sync::RwLock<TenantsMap>,
tenant: &Arc<Tenant>, tenant: &Arc<Tenant>,
) -> Result<(), DeleteTenantError> { ) -> Result<(), DeleteTenantError> {
// Tree sort timelines, schedule delete for them. Mention retries from the console side. // Tree sort timelines, schedule delete for them. Mention retries from the console side.
@@ -537,18 +604,10 @@ impl DeleteTenantFlow {
.await .await
.context("cleanup_remaining_fs_traces")?; .context("cleanup_remaining_fs_traces")?;
{ let mut locked = tenants.write().await;
let mut locked = tenants.write().unwrap(); if locked.remove(&tenant.tenant_id).is_none() {
if locked.remove(&tenant.tenant_id).is_none() { warn!("Tenant got removed from tenants map during deletion");
warn!("Tenant got removed from tenants map during deletion"); };
};
// FIXME: we should not be modifying this from outside of mgr.rs.
// This will go away when we simplify deletion (https://github.com/neondatabase/neon/issues/5080)
crate::metrics::TENANT_MANAGER
.tenant_slots
.set(locked.len() as u64);
}
*guard = Self::Finished; *guard = Self::Finished;

File diff suppressed because it is too large Load Diff

View File

@@ -639,10 +639,147 @@ impl LayerMap {
} }
println!("historic_layers:"); println!("historic_layers:");
for desc in self.iter_historic_layers() { for layer in self.iter_historic_layers() {
desc.dump(); layer.dump(verbose, ctx)?;
} }
println!("End dump LayerMap"); println!("End dump LayerMap");
Ok(()) Ok(())
} }
} }
#[cfg(test)]
mod tests {
use super::LayerMap;
use crate::tenant::storage_layer::LayerFileName;
use std::str::FromStr;
use std::sync::Arc;
mod l0_delta_layers_updated {
use crate::tenant::{
storage_layer::{AsLayerDesc, PersistentLayerDesc},
timeline::layer_manager::LayerFileManager,
};
use super::*;
struct LayerObject(PersistentLayerDesc);
impl AsLayerDesc for LayerObject {
fn layer_desc(&self) -> &PersistentLayerDesc {
&self.0
}
}
impl LayerObject {
fn new(desc: PersistentLayerDesc) -> Self {
LayerObject(desc)
}
}
type TestLayerFileManager = LayerFileManager<LayerObject>;
#[test]
fn for_full_range_delta() {
// l0_delta_layers are used by compaction, and should observe all buffered updates
l0_delta_layers_updated_scenario(
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69",
true
)
}
#[test]
fn for_non_full_range_delta() {
// has minimal uncovered areas compared to l0_delta_layers_updated_on_insert_replace_remove_for_full_range_delta
l0_delta_layers_updated_scenario(
"000000000000000000000000000000000001-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFE__0000000053423C21-0000000053424D69",
// because not full range
false
)
}
#[test]
fn for_image() {
l0_delta_layers_updated_scenario(
"000000000000000000000000000000000000-000000000000000000000000000000010000__0000000053424D69",
// code only checks if it is a full range layer, doesn't care about images, which must
// mean we should in practice never have full range images
false
)
}
#[test]
fn replacing_missing_l0_is_notfound() {
// original impl had an oversight, and L0 was an anyhow::Error. anyhow::Error should
// however only happen for precondition failures.
let layer = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69";
let layer = LayerFileName::from_str(layer).unwrap();
let layer = PersistentLayerDesc::from(layer);
// same skeletan construction; see scenario below
let not_found = Arc::new(LayerObject::new(layer.clone()));
let new_version = Arc::new(LayerObject::new(layer));
// after the immutable storage state refactor, the replace operation
// will not use layer map any more. We keep it here for consistency in test cases
// and can remove it in the future.
let _map = LayerMap::default();
let mut mapping = TestLayerFileManager::new();
mapping
.replace_and_verify(not_found, new_version)
.unwrap_err();
}
fn l0_delta_layers_updated_scenario(layer_name: &str, expected_l0: bool) {
let name = LayerFileName::from_str(layer_name).unwrap();
let skeleton = PersistentLayerDesc::from(name);
let remote = Arc::new(LayerObject::new(skeleton.clone()));
let downloaded = Arc::new(LayerObject::new(skeleton));
let mut map = LayerMap::default();
let mut mapping = LayerFileManager::new();
// two disjoint Arcs in different lifecycle phases. even if it seems they must be the
// same layer, we use LayerMap::compare_arced_layers as the identity of layers.
assert_eq!(remote.layer_desc(), downloaded.layer_desc());
let expected_in_counts = (1, usize::from(expected_l0));
map.batch_update()
.insert_historic(remote.layer_desc().clone());
mapping.insert(remote.clone());
assert_eq!(
count_layer_in(&map, remote.layer_desc()),
expected_in_counts
);
mapping
.replace_and_verify(remote, downloaded.clone())
.expect("name derived attributes are the same");
assert_eq!(
count_layer_in(&map, downloaded.layer_desc()),
expected_in_counts
);
map.batch_update().remove_historic(downloaded.layer_desc());
assert_eq!(count_layer_in(&map, downloaded.layer_desc()), (0, 0));
}
fn count_layer_in(map: &LayerMap, layer: &PersistentLayerDesc) -> (usize, usize) {
let historic = map
.iter_historic_layers()
.filter(|x| x.key() == layer.key())
.count();
let l0s = map
.get_level0_deltas()
.expect("why does this return a result");
let l0 = l0s.iter().filter(|x| x.key() == layer.key()).count();
(historic, l0)
}
}
}

View File

@@ -406,123 +406,4 @@ mod tests {
METADATA_OLD_FORMAT_VERSION, METADATA_FORMAT_VERSION METADATA_OLD_FORMAT_VERSION, METADATA_FORMAT_VERSION
); );
} }
#[test]
fn test_metadata_bincode_serde() {
let original_metadata = TimelineMetadata::new(
Lsn(0x200),
Some(Lsn(0x100)),
Some(TIMELINE_ID),
Lsn(0),
Lsn(0),
Lsn(0),
// Any version will do here, so use the default
crate::DEFAULT_PG_VERSION,
);
let metadata_bytes = original_metadata
.to_bytes()
.expect("Cannot create bytes array from metadata");
let metadata_bincode_be_bytes = original_metadata
.ser()
.expect("Cannot serialize the metadata");
// 8 bytes for the length of the vector
assert_eq!(metadata_bincode_be_bytes.len(), 8 + metadata_bytes.len());
let expected_bincode_bytes = {
let mut temp = vec![];
let len_bytes = metadata_bytes.len().to_be_bytes();
temp.extend_from_slice(&len_bytes);
temp.extend_from_slice(&metadata_bytes);
temp
};
assert_eq!(metadata_bincode_be_bytes, expected_bincode_bytes);
let deserialized_metadata = TimelineMetadata::des(&metadata_bincode_be_bytes).unwrap();
// Deserialized metadata has the metadata header, which is different from the serialized one.
// Reference: TimelineMetaData::to_bytes()
let expected_metadata = {
let mut temp_metadata = original_metadata;
let body_bytes = temp_metadata
.body
.ser()
.expect("Cannot serialize the metadata body");
let metadata_size = METADATA_HDR_SIZE + body_bytes.len();
let hdr = TimelineMetadataHeader {
size: metadata_size as u16,
format_version: METADATA_FORMAT_VERSION,
checksum: crc32c::crc32c(&body_bytes),
};
temp_metadata.hdr = hdr;
temp_metadata
};
assert_eq!(deserialized_metadata, expected_metadata);
}
#[test]
fn test_metadata_bincode_serde_ensure_roundtrip() {
let original_metadata = TimelineMetadata::new(
Lsn(0x200),
Some(Lsn(0x100)),
Some(TIMELINE_ID),
Lsn(0),
Lsn(0),
Lsn(0),
// Any version will do here, so use the default
crate::DEFAULT_PG_VERSION,
);
let expected_bytes = vec![
/* bincode length encoding bytes */
0, 0, 0, 0, 0, 0, 2, 0, // 8 bytes for the length of the serialized vector
/* TimelineMetadataHeader */
4, 37, 101, 34, 0, 70, 0, 4, // checksum, size, format_version (4 + 2 + 2)
/* TimelineMetadataBodyV2 */
0, 0, 0, 0, 0, 0, 2, 0, // disk_consistent_lsn (8 bytes)
1, 0, 0, 0, 0, 0, 0, 1, 0, // prev_record_lsn (9 bytes)
1, 17, 34, 51, 68, 85, 102, 119, 136, 17, 34, 51, 68, 85, 102, 119,
136, // ancestor_timeline (17 bytes)
0, 0, 0, 0, 0, 0, 0, 0, // ancestor_lsn (8 bytes)
0, 0, 0, 0, 0, 0, 0, 0, // latest_gc_cutoff_lsn (8 bytes)
0, 0, 0, 0, 0, 0, 0, 0, // initdb_lsn (8 bytes)
0, 0, 0, 15, // pg_version (4 bytes)
/* padding bytes */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
];
let metadata_ser_bytes = original_metadata.ser().unwrap();
assert_eq!(metadata_ser_bytes, expected_bytes);
let expected_metadata = {
let mut temp_metadata = original_metadata;
let body_bytes = temp_metadata
.body
.ser()
.expect("Cannot serialize the metadata body");
let metadata_size = METADATA_HDR_SIZE + body_bytes.len();
let hdr = TimelineMetadataHeader {
size: metadata_size as u16,
format_version: METADATA_FORMAT_VERSION,
checksum: crc32c::crc32c(&body_bytes),
};
temp_metadata.hdr = hdr;
temp_metadata
};
let des_metadata = TimelineMetadata::des(&metadata_ser_bytes).unwrap();
assert_eq!(des_metadata, expected_metadata);
}
} }

File diff suppressed because it is too large Load Diff

View File

@@ -57,7 +57,8 @@ pub fn par_fsync(paths: &[Utf8PathBuf]) -> io::Result<()> {
fsync_in_thread_pool(paths) fsync_in_thread_pool(paths)
} }
/// Parallel fsync asynchronously. /// Parallel fsync asynchronously. If number of files are less than PARALLEL_PATH_THRESHOLD, fsync is done in the current
/// execution thread. Otherwise, we will spawn_blocking and run it in tokio.
pub async fn par_fsync_async(paths: &[Utf8PathBuf]) -> io::Result<()> { pub async fn par_fsync_async(paths: &[Utf8PathBuf]) -> io::Result<()> {
const MAX_CONCURRENT_FSYNC: usize = 64; const MAX_CONCURRENT_FSYNC: usize = 64;
let mut next = paths.iter().peekable(); let mut next = paths.iter().peekable();

View File

@@ -167,15 +167,39 @@
//! - download their remote [`IndexPart`]s //! - download their remote [`IndexPart`]s
//! - create `Timeline` struct and a `RemoteTimelineClient` //! - create `Timeline` struct and a `RemoteTimelineClient`
//! - initialize the client's upload queue with its `IndexPart` //! - initialize the client's upload queue with its `IndexPart`
//! - create [`RemoteLayer`](super::storage_layer::RemoteLayer) instances
//! for layers that are referenced by `IndexPart` but not present locally
//! - schedule uploads for layers that are only present locally. //! - schedule uploads for layers that are only present locally.
//! - if the remote `IndexPart`'s metadata was newer than the metadata in
//! the local filesystem, write the remote metadata to the local filesystem
//! - After the above is done for each timeline, open the tenant for business by //! - After the above is done for each timeline, open the tenant for business by
//! transitioning it from `TenantState::Attaching` to `TenantState::Active` state. //! transitioning it from `TenantState::Attaching` to `TenantState::Active` state.
//! This starts the timelines' WAL-receivers and the tenant's GC & Compaction loops. //! This starts the timelines' WAL-receivers and the tenant's GC & Compaction loops.
//! //!
//! We keep track of the fact that a client is in `Attaching` state in a marker
//! file on the local disk. This is critical because, when we restart the pageserver,
//! we do not want to do the `List timelines` step for each tenant that has already
//! been successfully attached (for performance & cost reasons).
//! Instead, for a tenant without the attach marker file, we assume that the
//! local state is in sync or ahead of the remote state. This includes the list
//! of all of the tenant's timelines, which is particularly critical to be up-to-date:
//! if there's a timeline on the remote that the pageserver doesn't know about,
//! the GC will not consider its branch point, leading to data loss.
//! So, for a tenant with the attach marker file, we know that we do not yet have
//! persisted all the remote timeline's metadata files locally. To exclude the
//! risk above, we re-run the procedure for such tenants
//!
//! # Operating Without Remote Storage //! # Operating Without Remote Storage
//! //!
//! If no remote storage configuration is provided, the [`RemoteTimelineClient`] is //! If no remote storage configuration is provided, the [`RemoteTimelineClient`] is
//! not created and the uploads are skipped. //! not created and the uploads are skipped.
//! Theoretically, it should be ok to remove and re-add remote storage configuration to
//! the pageserver config at any time, since it doesn't make a difference to
//! [`Timeline::load_layer_map`].
//! Of course, the remote timeline dir must not change while we have de-configured
//! remote storage, i.e., the pageserver must remain the owner of the given prefix
//! in remote storage.
//! But note that we don't test any of this right now.
//! //!
//! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync //! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync
//! [`Timeline::load_layer_map`]: super::Timeline::load_layer_map //! [`Timeline::load_layer_map`]: super::Timeline::load_layer_map
@@ -187,7 +211,8 @@ mod upload;
use anyhow::Context; use anyhow::Context;
use camino::Utf8Path; use camino::Utf8Path;
use chrono::{NaiveDateTime, Utc}; use chrono::{NaiveDateTime, Utc};
// re-export these
pub use download::{is_temp_download_file, list_remote_timelines};
use scopeguard::ScopeGuard; use scopeguard::ScopeGuard;
use tokio_util::sync::CancellationToken; use tokio_util::sync::CancellationToken;
use utils::backoff::{ use utils::backoff::{
@@ -212,7 +237,7 @@ use crate::metrics::{
}; };
use crate::task_mgr::shutdown_token; use crate::task_mgr::shutdown_token;
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::storage_layer::AsLayerDesc; use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
use crate::tenant::upload_queue::Delete; use crate::tenant::upload_queue::Delete;
use crate::tenant::TIMELINES_SEGMENT_NAME; use crate::tenant::TIMELINES_SEGMENT_NAME;
use crate::{ use crate::{
@@ -230,13 +255,10 @@ use utils::id::{TenantId, TimelineId};
use self::index::IndexPart; use self::index::IndexPart;
use super::storage_layer::{Layer, LayerFileName, ResidentLayer}; use super::storage_layer::LayerFileName;
use super::upload_queue::SetDeletedFlagProgress; use super::upload_queue::SetDeletedFlagProgress;
use super::Generation; use super::Generation;
pub(crate) use download::{is_temp_download_file, list_remote_timelines};
pub(crate) use index::LayerFileMetadata;
// Occasional network issues and such can cause remote operations to fail, and // Occasional network issues and such can cause remote operations to fail, and
// that's expected. If a download fails, we log it at info-level, and retry. // that's expected. If a download fails, we log it at info-level, and retry.
// But after FAILED_DOWNLOAD_WARN_THRESHOLD retries, we start to log it at WARN // But after FAILED_DOWNLOAD_WARN_THRESHOLD retries, we start to log it at WARN
@@ -446,10 +468,7 @@ impl RemoteTimelineClient {
// //
/// Download index file /// Download index file
pub async fn download_index_file( pub async fn download_index_file(&self) -> Result<MaybeDeletedIndexPart, DownloadError> {
&self,
cancel: CancellationToken,
) -> Result<MaybeDeletedIndexPart, DownloadError> {
let _unfinished_gauge_guard = self.metrics.call_begin( let _unfinished_gauge_guard = self.metrics.call_begin(
&RemoteOpFileKind::Index, &RemoteOpFileKind::Index,
&RemoteOpKind::Download, &RemoteOpKind::Download,
@@ -463,7 +482,6 @@ impl RemoteTimelineClient {
&self.tenant_id, &self.tenant_id,
&self.timeline_id, &self.timeline_id,
self.generation, self.generation,
cancel,
) )
.measure_remote_op( .measure_remote_op(
self.tenant_id, self.tenant_id,
@@ -609,203 +627,101 @@ impl RemoteTimelineClient {
/// ///
/// Launch an upload operation in the background. /// Launch an upload operation in the background.
/// ///
pub(crate) fn schedule_layer_file_upload( pub fn schedule_layer_file_upload(
self: &Arc<Self>, self: &Arc<Self>,
layer: ResidentLayer, layer_file_name: &LayerFileName,
layer_metadata: &LayerFileMetadata,
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
let mut guard = self.upload_queue.lock().unwrap(); let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?; let upload_queue = guard.initialized_mut()?;
self.schedule_layer_file_upload0(upload_queue, layer);
self.launch_queued_tasks(upload_queue);
Ok(())
}
fn schedule_layer_file_upload0(
self: &Arc<Self>,
upload_queue: &mut UploadQueueInitialized,
layer: ResidentLayer,
) {
let metadata = layer.metadata();
upload_queue upload_queue
.latest_files .latest_files
.insert(layer.layer_desc().filename(), metadata.clone()); .insert(layer_file_name.clone(), layer_metadata.clone());
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1; upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
info!("scheduled layer file upload {layer}"); let op = UploadOp::UploadLayer(layer_file_name.clone(), layer_metadata.clone());
let op = UploadOp::UploadLayer(layer, metadata);
self.calls_unfinished_metric_begin(&op); self.calls_unfinished_metric_begin(&op);
upload_queue.queued_operations.push_back(op); upload_queue.queued_operations.push_back(op);
info!("scheduled layer file upload {layer_file_name}");
// Launch the task immediately, if possible
self.launch_queued_tasks(upload_queue);
Ok(())
} }
/// Launch a delete operation in the background. /// Launch a delete operation in the background.
/// ///
/// The operation does not modify local filesystem state. /// The operation does not modify local state but assumes the local files have already been
/// deleted, and is used to mirror those changes to remote.
/// ///
/// Note: This schedules an index file upload before the deletions. The /// Note: This schedules an index file upload before the deletions. The
/// deletion won't actually be performed, until all previously scheduled /// deletion won't actually be performed, until any previously scheduled
/// upload operations, and the index file upload, have completed /// upload operations, and the index file upload, have completed
/// successfully. /// successfully.
pub fn schedule_layer_file_deletion( pub fn schedule_layer_file_deletion(
self: &Arc<Self>, self: &Arc<Self>,
names: &[LayerFileName], names: Vec<LayerFileName>,
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
let mut guard = self.upload_queue.lock().unwrap(); let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?; let upload_queue = guard.initialized_mut()?;
let with_generations =
self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names.iter().cloned());
self.schedule_deletion_of_unlinked0(upload_queue, with_generations);
// Launch the tasks immediately, if possible
self.launch_queued_tasks(upload_queue);
Ok(())
}
/// Unlinks the layer files from `index_part.json` but does not yet schedule deletion for the
/// layer files, leaving them dangling.
///
/// The files will be leaked in remote storage unless [`Self::schedule_deletion_of_unlinked`]
/// is invoked on them.
pub(crate) fn schedule_gc_update(self: &Arc<Self>, gc_layers: &[Layer]) -> anyhow::Result<()> {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
// just forget the return value; after uploading the next index_part.json, we can consider
// the layer files as "dangling". this is fine, at worst case we create work for the
// scrubber.
let names = gc_layers.iter().map(|x| x.layer_desc().filename());
self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names);
self.launch_queued_tasks(upload_queue);
Ok(())
}
/// Update the remote index file, removing the to-be-deleted files from the index,
/// allowing scheduling of actual deletions later.
fn schedule_unlinking_of_layers_from_index_part0<I>(
self: &Arc<Self>,
upload_queue: &mut UploadQueueInitialized,
names: I,
) -> Vec<(LayerFileName, Generation)>
where
I: IntoIterator<Item = LayerFileName>,
{
// Deleting layers doesn't affect the values stored in TimelineMetadata, // Deleting layers doesn't affect the values stored in TimelineMetadata,
// so we don't need update it. Just serialize it. // so we don't need update it. Just serialize it.
let metadata = upload_queue.latest_metadata.clone(); let metadata = upload_queue.latest_metadata.clone();
// Decorate our list of names with each name's generation, dropping // Update the remote index file, removing the to-be-deleted files from the index,
// names that are unexpectedly missing from our metadata. // before deleting the actual files.
let with_generations: Vec<_> = names //
.into_iter() // Once we start removing files from upload_queue.latest_files, there's
.filter_map(|name| { // no going back! Otherwise, some of the files would already be removed
let meta = upload_queue.latest_files.remove(&name); // from latest_files, but not yet scheduled for deletion. Use a closure
// to syntactically forbid ? or bail! calls here.
let no_bail_here = || {
// Decorate our list of names with each name's generation, dropping
// makes that are unexpectedly missing from our metadata.
let with_generations: Vec<_> = names
.into_iter()
.filter_map(|name| {
// Remove from latest_files, learning the file's remote generation in the process
let meta = upload_queue.latest_files.remove(&name);
if let Some(meta) = meta { if let Some(meta) = meta {
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1; upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
Some((name, meta.generation)) Some((name, meta.generation))
} else { } else {
// This can only happen if we forgot to to schedule the file upload // This can only happen if we forgot to to schedule the file upload
// before scheduling the delete. Log it because it is a rare/strange // before scheduling the delete. Log it because it is a rare/strange
// situation, and in case something is misbehaving, we'd like to know which // situation, and in case something is misbehaving, we'd like to know which
// layers experienced this. // layers experienced this.
info!("Deleting layer {name} not found in latest_files list, never uploaded?"); info!(
None "Deleting layer {name} not found in latest_files list, never uploaded?"
} );
}) None
.collect(); }
})
.collect();
#[cfg(feature = "testing")] if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
for (name, gen) in &with_generations { self.schedule_index_upload(upload_queue, metadata);
if let Some(unexpected) = upload_queue.dangling_files.insert(name.to_owned(), *gen) {
if &unexpected == gen {
tracing::error!("{name} was unlinked twice with same generation");
} else {
tracing::error!("{name} was unlinked twice with different generations {gen:?} and {unexpected:?}");
}
} }
}
// after unlinking files from the upload_queue.latest_files we must always schedule an for (name, gen) in &with_generations {
// index_part update, because that needs to be uploaded before we can actually delete the info!("scheduling deletion of layer {}{}", name, gen.get_suffix());
// files.
if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
self.schedule_index_upload(upload_queue, metadata);
}
with_generations
}
/// Schedules deletion for layer files which have previously been unlinked from the
/// `index_part.json` with [`Self::schedule_gc_update`] or [`Self::schedule_compaction_update`].
pub(crate) fn schedule_deletion_of_unlinked(
self: &Arc<Self>,
layers: Vec<(LayerFileName, Generation)>,
) -> anyhow::Result<()> {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
self.schedule_deletion_of_unlinked0(upload_queue, layers);
self.launch_queued_tasks(upload_queue);
Ok(())
}
fn schedule_deletion_of_unlinked0(
self: &Arc<Self>,
upload_queue: &mut UploadQueueInitialized,
with_generations: Vec<(LayerFileName, Generation)>,
) {
for (name, gen) in &with_generations {
info!("scheduling deletion of layer {}{}", name, gen.get_suffix());
}
#[cfg(feature = "testing")]
for (name, gen) in &with_generations {
match upload_queue.dangling_files.remove(name) {
Some(same) if &same == gen => { /* expected */ }
Some(other) => {
tracing::error!("{name} was unlinked with {other:?} but deleted with {gen:?}");
}
None => {
tracing::error!("{name} was unlinked but was not dangling");
}
} }
}
// schedule the actual deletions // schedule the actual deletions
let op = UploadOp::Delete(Delete { let op = UploadOp::Delete(Delete {
layers: with_generations, layers: with_generations,
}); });
self.calls_unfinished_metric_begin(&op); self.calls_unfinished_metric_begin(&op);
upload_queue.queued_operations.push_back(op); upload_queue.queued_operations.push_back(op);
}
/// Schedules a compaction update to the remote `index_part.json`.
///
/// `compacted_from` represent the L0 names which have been `compacted_to` L1 layers.
pub(crate) fn schedule_compaction_update(
self: &Arc<Self>,
compacted_from: &[Layer],
compacted_to: &[ResidentLayer],
) -> anyhow::Result<()> {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
for layer in compacted_to {
self.schedule_layer_file_upload0(upload_queue, layer.clone());
}
let names = compacted_from.iter().map(|x| x.layer_desc().filename());
self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names);
self.launch_queued_tasks(upload_queue);
// Launch the tasks immediately, if possible
self.launch_queued_tasks(upload_queue);
};
no_bail_here();
Ok(()) Ok(())
} }
@@ -1177,12 +1093,16 @@ impl RemoteTimelineClient {
} }
let upload_result: anyhow::Result<()> = match &task.op { let upload_result: anyhow::Result<()> = match &task.op {
UploadOp::UploadLayer(ref layer, ref layer_metadata) => { UploadOp::UploadLayer(ref layer_file_name, ref layer_metadata) => {
let path = layer.local_path(); let path = self
.conf
.timeline_path(&self.tenant_id, &self.timeline_id)
.join(layer_file_name.file_name());
upload::upload_timeline_layer( upload::upload_timeline_layer(
self.conf, self.conf,
&self.storage_impl, &self.storage_impl,
path, &path,
layer_metadata, layer_metadata,
self.generation, self.generation,
) )
@@ -1456,8 +1376,6 @@ impl RemoteTimelineClient {
num_inprogress_deletions: 0, num_inprogress_deletions: 0,
inprogress_tasks: HashMap::default(), inprogress_tasks: HashMap::default(),
queued_operations: VecDeque::default(), queued_operations: VecDeque::default(),
#[cfg(feature = "testing")]
dangling_files: HashMap::default(),
}; };
let upload_queue = std::mem::replace( let upload_queue = std::mem::replace(
@@ -1501,6 +1419,13 @@ impl RemoteTimelineClient {
} }
} }
} }
pub(crate) fn get_layer_metadata(
&self,
name: &LayerFileName,
) -> anyhow::Result<Option<LayerFileMetadata>> {
self.upload_queue.lock().unwrap().get_layer_metadata(name)
}
} }
pub fn remote_timelines_path(tenant_id: &TenantId) -> RemotePath { pub fn remote_timelines_path(tenant_id: &TenantId) -> RemotePath {
@@ -1542,7 +1467,7 @@ pub fn remote_index_path(
} }
/// Given the key of an index, parse out the generation part of the name /// Given the key of an index, parse out the generation part of the name
pub fn parse_remote_index_path(path: RemotePath) -> Option<Generation> { pub(crate) fn parse_remote_index_path(path: RemotePath) -> Option<Generation> {
let file_name = match path.get_path().file_name() { let file_name = match path.get_path().file_name() {
Some(f) => f, Some(f) => f,
None => { None => {
@@ -1588,7 +1513,6 @@ mod tests {
context::RequestContext, context::RequestContext,
tenant::{ tenant::{
harness::{TenantHarness, TIMELINE_ID}, harness::{TenantHarness, TIMELINE_ID},
storage_layer::Layer,
Generation, Tenant, Timeline, Generation, Tenant, Timeline,
}, },
DEFAULT_PG_VERSION, DEFAULT_PG_VERSION,
@@ -1731,11 +1655,7 @@ mod tests {
let client = timeline.remote_client.as_ref().unwrap(); let client = timeline.remote_client.as_ref().unwrap();
// Download back the index.json, and check that the list of files is correct // Download back the index.json, and check that the list of files is correct
let initial_index_part = match client let initial_index_part = match client.download_index_file().await.unwrap() {
.download_index_file(CancellationToken::new())
.await
.unwrap()
{
MaybeDeletedIndexPart::IndexPart(index_part) => index_part, MaybeDeletedIndexPart::IndexPart(index_part) => index_part,
MaybeDeletedIndexPart::Deleted(_) => panic!("unexpectedly got deleted index part"), MaybeDeletedIndexPart::Deleted(_) => panic!("unexpectedly got deleted index part"),
}; };
@@ -1761,29 +1681,32 @@ mod tests {
let generation = harness.generation; let generation = harness.generation;
// Create a couple of dummy files, schedule upload for them // Create a couple of dummy files, schedule upload for them
let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
let layer_file_name_2: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D9-00000000016B5A52".parse().unwrap();
let layer_file_name_3: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59DA-00000000016B5A53".parse().unwrap();
let content_1 = dummy_contents("foo");
let content_2 = dummy_contents("bar");
let content_3 = dummy_contents("baz");
let layers = [ for (filename, content) in [
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), dummy_contents("foo")), (&layer_file_name_1, &content_1),
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D9-00000000016B5A52".parse().unwrap(), dummy_contents("bar")), (&layer_file_name_2, &content_2),
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59DA-00000000016B5A53".parse().unwrap(), dummy_contents("baz")) (&layer_file_name_3, &content_3),
] ] {
.into_iter() std::fs::write(timeline_path.join(filename.file_name()), content).unwrap();
.map(|(name, contents): (LayerFileName, Vec<u8>)| { }
std::fs::write(timeline_path.join(name.file_name()), &contents).unwrap();
Layer::for_resident(
harness.conf,
&timeline,
name,
LayerFileMetadata::new(contents.len() as u64, generation),
)
}).collect::<Vec<_>>();
client client
.schedule_layer_file_upload(layers[0].clone()) .schedule_layer_file_upload(
&layer_file_name_1,
&LayerFileMetadata::new(content_1.len() as u64, generation),
)
.unwrap(); .unwrap();
client client
.schedule_layer_file_upload(layers[1].clone()) .schedule_layer_file_upload(
&layer_file_name_2,
&LayerFileMetadata::new(content_2.len() as u64, generation),
)
.unwrap(); .unwrap();
// Check that they are started immediately, not queued // Check that they are started immediately, not queued
@@ -1824,11 +1747,7 @@ mod tests {
} }
// Download back the index.json, and check that the list of files is correct // Download back the index.json, and check that the list of files is correct
let index_part = match client let index_part = match client.download_index_file().await.unwrap() {
.download_index_file(CancellationToken::new())
.await
.unwrap()
{
MaybeDeletedIndexPart::IndexPart(index_part) => index_part, MaybeDeletedIndexPart::IndexPart(index_part) => index_part,
MaybeDeletedIndexPart::Deleted(_) => panic!("unexpectedly got deleted index part"), MaybeDeletedIndexPart::Deleted(_) => panic!("unexpectedly got deleted index part"),
}; };
@@ -1841,42 +1760,38 @@ mod tests {
.collect(), .collect(),
&[ &[
&initial_layer.file_name(), &initial_layer.file_name(),
&layers[0].layer_desc().filename().file_name(), &layer_file_name_1.file_name(),
&layers[1].layer_desc().filename().file_name(), &layer_file_name_2.file_name(),
], ],
); );
assert_eq!(index_part.metadata, metadata); assert_eq!(index_part.metadata, metadata);
// Schedule upload and then a deletion. Check that the deletion is queued // Schedule upload and then a deletion. Check that the deletion is queued
client client
.schedule_layer_file_upload(layers[2].clone()) .schedule_layer_file_upload(
&layer_file_name_3,
&LayerFileMetadata::new(content_3.len() as u64, generation),
)
.unwrap(); .unwrap();
// this is no longer consistent with how deletion works with Layer::drop, but in this test
// keep using schedule_layer_file_deletion because we don't have a way to wait for the
// spawn_blocking started by the drop.
client client
.schedule_layer_file_deletion(&[layers[0].layer_desc().filename()]) .schedule_layer_file_deletion([layer_file_name_1.clone()].to_vec())
.unwrap(); .unwrap();
{ {
let mut guard = client.upload_queue.lock().unwrap(); let mut guard = client.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut().unwrap(); let upload_queue = guard.initialized_mut().unwrap();
// Deletion schedules upload of the index file, and the file deletion itself // Deletion schedules upload of the index file, and the file deletion itself
assert_eq!(upload_queue.queued_operations.len(), 2); assert!(upload_queue.queued_operations.len() == 2);
assert_eq!(upload_queue.inprogress_tasks.len(), 1); assert!(upload_queue.inprogress_tasks.len() == 1);
assert_eq!(upload_queue.num_inprogress_layer_uploads, 1); assert!(upload_queue.num_inprogress_layer_uploads == 1);
assert_eq!(upload_queue.num_inprogress_deletions, 0); assert!(upload_queue.num_inprogress_deletions == 0);
assert_eq!( assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 0);
upload_queue.latest_files_changes_since_metadata_upload_scheduled,
0
);
} }
assert_remote_files( assert_remote_files(
&[ &[
&initial_layer.file_name(), &initial_layer.file_name(),
&layers[0].layer_desc().filename().file_name(), &layer_file_name_1.file_name(),
&layers[1].layer_desc().filename().file_name(), &layer_file_name_2.file_name(),
"index_part.json", "index_part.json",
], ],
&remote_timeline_dir, &remote_timeline_dir,
@@ -1890,8 +1805,8 @@ mod tests {
assert_remote_files( assert_remote_files(
&[ &[
&initial_layer.file_name(), &initial_layer.file_name(),
&layers[1].layer_desc().filename().file_name(), &layer_file_name_2.file_name(),
&layers[2].layer_desc().filename().file_name(), &layer_file_name_3.file_name(),
"index_part.json", "index_part.json",
], ],
&remote_timeline_dir, &remote_timeline_dir,
@@ -1920,13 +1835,6 @@ mod tests {
) )
.unwrap(); .unwrap();
let layer_file_1 = Layer::for_resident(
harness.conf,
&timeline,
layer_file_name_1.clone(),
LayerFileMetadata::new(content_1.len() as u64, harness.generation),
);
#[derive(Debug, PartialEq, Clone, Copy)] #[derive(Debug, PartialEq, Clone, Copy)]
struct BytesStartedFinished { struct BytesStartedFinished {
started: Option<usize>, started: Option<usize>,
@@ -1962,7 +1870,10 @@ mod tests {
let actual_a = get_bytes_started_stopped(); let actual_a = get_bytes_started_stopped();
client client
.schedule_layer_file_upload(layer_file_1.clone()) .schedule_layer_file_upload(
&layer_file_name_1,
&LayerFileMetadata::new(content_1.len() as u64, harness.generation),
)
.unwrap(); .unwrap();
let actual_b = get_bytes_started_stopped(); let actual_b = get_bytes_started_stopped();
@@ -2027,7 +1938,7 @@ mod tests {
let client = test_state.build_client(get_generation); let client = test_state.build_client(get_generation);
let download_r = client let download_r = client
.download_index_file(CancellationToken::new()) .download_index_file()
.await .await
.expect("download should always succeed"); .expect("download should always succeed");
assert!(matches!(download_r, MaybeDeletedIndexPart::IndexPart(_))); assert!(matches!(download_r, MaybeDeletedIndexPart::IndexPart(_)));

View File

@@ -18,8 +18,8 @@ use crate::config::PageServerConf;
use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path}; use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
use crate::tenant::storage_layer::LayerFileName; use crate::tenant::storage_layer::LayerFileName;
use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::Generation; use crate::tenant::{Generation, TENANT_DELETED_MARKER_FILE_NAME};
use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode}; use remote_storage::{DownloadError, GenericRemoteStorage};
use utils::crashsafe::path_with_suffix_extension; use utils::crashsafe::path_with_suffix_extension;
use utils::id::{TenantId, TimelineId}; use utils::id::{TenantId, TimelineId};
@@ -170,43 +170,53 @@ pub fn is_temp_download_file(path: &Utf8Path) -> bool {
pub async fn list_remote_timelines( pub async fn list_remote_timelines(
storage: &GenericRemoteStorage, storage: &GenericRemoteStorage,
tenant_id: TenantId, tenant_id: TenantId,
cancel: CancellationToken, ) -> anyhow::Result<HashSet<TimelineId>> {
) -> anyhow::Result<(HashSet<TimelineId>, HashSet<String>)> {
let remote_path = remote_timelines_path(&tenant_id); let remote_path = remote_timelines_path(&tenant_id);
fail::fail_point!("storage-sync-list-remote-timelines", |_| { fail::fail_point!("storage-sync-list-remote-timelines", |_| {
anyhow::bail!("storage-sync-list-remote-timelines"); anyhow::bail!("storage-sync-list-remote-timelines");
}); });
let listing = download_retry_forever( let timelines = download_retry(
|| storage.list(Some(&remote_path), ListingMode::WithDelimiter), || storage.list_prefixes(Some(&remote_path)),
&format!("list timelines for {tenant_id}"), &format!("list prefixes for {tenant_id}"),
cancel,
) )
.await?; .await?;
let mut timeline_ids = HashSet::new(); if timelines.is_empty() {
let mut other_prefixes = HashSet::new(); anyhow::bail!("no timelines found on the remote storage")
}
let mut timeline_ids = HashSet::new();
for timeline_remote_storage_key in timelines {
if timeline_remote_storage_key.object_name() == Some(TENANT_DELETED_MARKER_FILE_NAME) {
// A `deleted` key within `timelines/` is a marker file, not a timeline. Ignore it.
// This code will be removed in https://github.com/neondatabase/neon/pull/5580
continue;
}
for timeline_remote_storage_key in listing.prefixes {
let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| { let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| {
anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_id}") anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_id}")
})?; })?;
match object_name.parse::<TimelineId>() { let timeline_id: TimelineId = object_name
Ok(t) => timeline_ids.insert(t), .parse()
Err(_) => other_prefixes.insert(object_name.to_string()), .with_context(|| format!("parse object name into timeline id '{object_name}'"))?;
};
// list_prefixes is assumed to return unique names. Ensure this here.
// NB: it's safer to bail out than warn-log this because the pageserver
// needs to absolutely know about _all_ timelines that exist, so that
// GC knows all the branchpoints. If we skipped over a timeline instead,
// GC could delete a layer that's still needed by that timeline.
anyhow::ensure!(
!timeline_ids.contains(&timeline_id),
"list_prefixes contains duplicate timeline id {timeline_id}"
);
timeline_ids.insert(timeline_id);
} }
for key in listing.keys { Ok(timeline_ids)
let object_name = key
.object_name()
.ok_or_else(|| anyhow::anyhow!("object name for key {key}"))?;
other_prefixes.insert(object_name.to_string());
}
Ok((timeline_ids, other_prefixes))
} }
async fn do_download_index_part( async fn do_download_index_part(
@@ -214,11 +224,10 @@ async fn do_download_index_part(
tenant_id: &TenantId, tenant_id: &TenantId,
timeline_id: &TimelineId, timeline_id: &TimelineId,
index_generation: Generation, index_generation: Generation,
cancel: CancellationToken,
) -> Result<IndexPart, DownloadError> { ) -> Result<IndexPart, DownloadError> {
let remote_path = remote_index_path(tenant_id, timeline_id, index_generation); let remote_path = remote_index_path(tenant_id, timeline_id, index_generation);
let index_part_bytes = download_retry_forever( let index_part_bytes = download_retry(
|| async { || async {
let mut index_part_download = storage.download(&remote_path).await?; let mut index_part_download = storage.download(&remote_path).await?;
@@ -233,7 +242,6 @@ async fn do_download_index_part(
Ok(index_part_bytes) Ok(index_part_bytes)
}, },
&format!("download {remote_path:?}"), &format!("download {remote_path:?}"),
cancel,
) )
.await?; .await?;
@@ -255,28 +263,19 @@ pub(super) async fn download_index_part(
tenant_id: &TenantId, tenant_id: &TenantId,
timeline_id: &TimelineId, timeline_id: &TimelineId,
my_generation: Generation, my_generation: Generation,
cancel: CancellationToken,
) -> Result<IndexPart, DownloadError> { ) -> Result<IndexPart, DownloadError> {
debug_assert_current_span_has_tenant_and_timeline_id(); debug_assert_current_span_has_tenant_and_timeline_id();
if my_generation.is_none() { if my_generation.is_none() {
// Operating without generations: just fetch the generation-less path // Operating without generations: just fetch the generation-less path
return do_download_index_part(storage, tenant_id, timeline_id, my_generation, cancel) return do_download_index_part(storage, tenant_id, timeline_id, my_generation).await;
.await;
} }
// Stale case: If we were intentionally attached in a stale generation, there may already be a remote // Stale case: If we were intentionally attached in a stale generation, there may already be a remote
// index in our generation. // index in our generation.
// //
// This is an optimization to avoid doing the listing for the general case below. // This is an optimization to avoid doing the listing for the general case below.
let res = do_download_index_part( let res = do_download_index_part(storage, tenant_id, timeline_id, my_generation).await;
storage,
tenant_id,
timeline_id,
my_generation,
cancel.clone(),
)
.await;
match res { match res {
Ok(index_part) => { Ok(index_part) => {
tracing::debug!( tracing::debug!(
@@ -296,14 +295,8 @@ pub(super) async fn download_index_part(
// we want to find the most recent index from a previous generation. // we want to find the most recent index from a previous generation.
// //
// This is an optimization to avoid doing the listing for the general case below. // This is an optimization to avoid doing the listing for the general case below.
let res = do_download_index_part( let res =
storage, do_download_index_part(storage, tenant_id, timeline_id, my_generation.previous()).await;
tenant_id,
timeline_id,
my_generation.previous(),
cancel.clone(),
)
.await;
match res { match res {
Ok(index_part) => { Ok(index_part) => {
tracing::debug!("Found index_part from previous generation"); tracing::debug!("Found index_part from previous generation");
@@ -347,14 +340,13 @@ pub(super) async fn download_index_part(
match max_previous_generation { match max_previous_generation {
Some(g) => { Some(g) => {
tracing::debug!("Found index_part in generation {g:?}"); tracing::debug!("Found index_part in generation {g:?}");
do_download_index_part(storage, tenant_id, timeline_id, g, cancel).await do_download_index_part(storage, tenant_id, timeline_id, g).await
} }
None => { None => {
// Migration from legacy pre-generation state: we have a generation but no prior // Migration from legacy pre-generation state: we have a generation but no prior
// attached pageservers did. Try to load from a no-generation path. // attached pageservers did. Try to load from a no-generation path.
tracing::info!("No index_part.json* found"); tracing::info!("No index_part.json* found");
do_download_index_part(storage, tenant_id, timeline_id, Generation::none(), cancel) do_download_index_part(storage, tenant_id, timeline_id, Generation::none()).await
.await
} }
} }
} }
@@ -384,23 +376,3 @@ where
) )
.await .await
} }
async fn download_retry_forever<T, O, F>(
op: O,
description: &str,
cancel: CancellationToken,
) -> Result<T, DownloadError>
where
O: FnMut() -> F,
F: Future<Output = Result<T, DownloadError>>,
{
backoff::retry(
op,
|e| matches!(e, DownloadError::BadInput(_) | DownloadError::NotFound),
FAILED_DOWNLOAD_WARN_THRESHOLD,
u32::MAX,
description,
backoff::Cancel::new(cancel, || DownloadError::Cancelled),
)
.await
}

Some files were not shown because too many files have changed in this diff Show More