diff --git a/.config/hakari.toml b/.config/hakari.toml index 15b939e86f..9913ecc9c0 100644 --- a/.config/hakari.toml +++ b/.config/hakari.toml @@ -22,5 +22,11 @@ platforms = [ # "x86_64-pc-windows-msvc", ] +[final-excludes] +# vm_monitor benefits from the same Cargo.lock as the rest of our artifacts, but +# it is built primarly in separate repo neondatabase/autoscaling and thus is excluded +# from depending on workspace-hack because most of the dependencies are not used. +workspace-members = ["vm_monitor"] + # Write out exact versions rather than a semver range. (Defaults to false.) # exact-versions = true diff --git a/.github/ISSUE_TEMPLATE/epic-template.md b/.github/ISSUE_TEMPLATE/epic-template.md index 7707e0aa67..019e6e7345 100644 --- a/.github/ISSUE_TEMPLATE/epic-template.md +++ b/.github/ISSUE_TEMPLATE/epic-template.md @@ -17,8 +17,9 @@ assignees: '' ## Implementation ideas -## Tasks -- [ ] +```[tasklist] +### Tasks +``` ## Other related tasks and Epics diff --git a/.github/actionlint.yml b/.github/actionlint.yml index e2ece5f230..362480f256 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -1,5 +1,7 @@ self-hosted-runner: labels: + - arm64 + - dev - gen3 - large - small diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index e0f8afc954..ff7d8c1a62 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -172,10 +172,10 @@ jobs: # https://github.com/EmbarkStudios/cargo-deny - name: Check rust licenses/bans/advisories/sources if: ${{ !cancelled() }} - run: cargo deny check + run: cargo deny check --hide-inclusion-graph build-neon: - needs: [ check-permissions ] + needs: [ check-permissions, tag ] runs-on: [ self-hosted, gen3, large ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned @@ -187,6 +187,7 @@ jobs: env: BUILD_TYPE: ${{ matrix.build_type }} GIT_VERSION: ${{ github.event.pull_request.head.sha || github.sha }} + BUILD_TAG: ${{ needs.tag.outputs.build-tag }} steps: - name: Fix git ownership @@ -723,6 +724,7 @@ jobs: --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . 
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} + --build-arg BUILD_TAG=${{ needs.tag.outputs.build-tag }} --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} --destination neondatabase/neon:${{needs.tag.outputs.build-tag}} diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index d7f5295c5b..0d7db8dfbc 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -21,7 +21,10 @@ env: jobs: check-macos-build: - if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') + if: | + contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') || + contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || + github.ref_name == 'main' timeout-minutes: 90 runs-on: macos-latest @@ -112,8 +115,182 @@ jobs: - name: Check that no warnings are produced run: ./run_clippy.sh + check-linux-arm-build: + timeout-minutes: 90 + runs-on: [ self-hosted, dev, arm64 ] + + env: + # Use release build only, to have less debug info around + # Hence keeping target/ (and general cache size) smaller + BUILD_TYPE: release + CARGO_FEATURES: --features testing + CARGO_FLAGS: --locked --release + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} + + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + options: --init + + steps: + - name: Fix git ownership + run: | + # Workaround for `fatal: detected dubious ownership in repository at ...` + # + # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers + # Ref https://github.com/actions/checkout/issues/785 + # + git config --global --add safe.directory ${{ github.workspace }} + git config --global --add safe.directory ${GITHUB_WORKSPACE} + + - name: Checkout + uses: actions/checkout@v4 + with: + submodules: true + fetch-depth: 1 + + - name: Set pg 14 revision for caching + id: pg_v14_rev + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT + + - name: Set pg 15 revision for caching + id: pg_v15_rev + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT + + - name: Set pg 16 revision for caching + id: pg_v16_rev + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT + + - name: Set env variables + run: | + echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo" >> $GITHUB_ENV + + - name: Cache postgres v14 build + id: cache_pg_14 + uses: actions/cache@v3 + with: + path: pg_install/v14 + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + + - name: Cache postgres v15 build + id: cache_pg_15 + uses: actions/cache@v3 + with: + path: pg_install/v15 + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + + - name: Cache postgres v16 build + id: cache_pg_16 + uses: actions/cache@v3 + with: + path: pg_install/v16 + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + + - name: Build postgres v14 + if: steps.cache_pg_14.outputs.cache-hit != 'true' + run: mold -run make postgres-v14 -j$(nproc) + + - name: Build postgres 
v15 + if: steps.cache_pg_15.outputs.cache-hit != 'true' + run: mold -run make postgres-v15 -j$(nproc) + + - name: Build postgres v16 + if: steps.cache_pg_16.outputs.cache-hit != 'true' + run: mold -run make postgres-v16 -j$(nproc) + + - name: Build neon extensions + run: mold -run make neon-pg-ext -j$(nproc) + + - name: Build walproposer-lib + run: mold -run make walproposer-lib -j$(nproc) + + - name: Run cargo build + run: | + mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests + + - name: Run cargo test + run: | + cargo test $CARGO_FLAGS $CARGO_FEATURES + + # Run separate tests for real S3 + export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty + export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev + export REMOTE_STORAGE_S3_REGION=eu-central-1 + # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now + cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3 + + # Run separate tests for real Azure Blob Storage + # XXX: replace region with `eu-central-1`-like region + export ENABLE_REAL_AZURE_REMOTE_STORAGE=y + export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}" + export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}" + export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}" + export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}" + # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now + cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure + + check-codestyle-rust-arm: + timeout-minutes: 90 + runs-on: [ self-hosted, dev, arm64 ] + + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + options: --init + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + submodules: true + fetch-depth: 1 + + # Some of our rust modules use FFI and need those to be checked + - name: Get postgres headers + run: make postgres-headers -j$(nproc) + + # cargo hack runs the given cargo subcommand (clippy in this case) for all feature combinations. + # This will catch compiler & clippy warnings in all feature combinations. + # TODO: use cargo hack for build and test as well, but, that's quite expensive. 
+ # NB: keep clippy args in sync with ./run_clippy.sh + - run: | + CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")" + if [ "$CLIPPY_COMMON_ARGS" = "" ]; then + echo "No clippy args found in .neon_clippy_args" + exit 1 + fi + echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV + - name: Run cargo clippy (debug) + run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS + - name: Run cargo clippy (release) + run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS + + - name: Check documentation generation + run: cargo doc --workspace --no-deps --document-private-items + env: + RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links" + + # Use `${{ !cancelled() }}` to run quck tests after the longer clippy run + - name: Check formatting + if: ${{ !cancelled() }} + run: cargo fmt --all -- --check + + # https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci + - name: Check rust dependencies + if: ${{ !cancelled() }} + run: | + cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date + cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack + + # https://github.com/EmbarkStudios/cargo-deny + - name: Check rust licenses/bans/advisories/sources + if: ${{ !cancelled() }} + run: cargo deny check + gather-rust-build-stats: - if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') + if: | + contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') || + contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || + github.ref_name == 'main' runs-on: [ self-hosted, gen3, large ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned diff --git a/Cargo.lock b/Cargo.lock index a7e88688ce..4cada013d7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -23,69 +23,16 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" -[[package]] -name = "aead" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fc95d1bdb8e6666b2b217308eeeb09f2d6728d104be3e31916cc74d15420331" -dependencies = [ - "generic-array", -] - -[[package]] -name = "aes" -version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "884391ef1066acaa41e766ba8f596341b96e93ce34f9a43e7d24bf0a0eaf0561" -dependencies = [ - "aes-soft", - "aesni", - "cipher", -] - -[[package]] -name = "aes-gcm" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5278b5fabbb9bd46e24aa69b2fdea62c99088e0a950a9be40e3e0101298f88da" -dependencies = [ - "aead", - "aes", - "cipher", - "ctr", - "ghash", - "subtle", -] - -[[package]] -name = "aes-soft" -version = "0.6.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be14c7498ea50828a38d0e24a765ed2effe92a705885b57d029cd67d45744072" -dependencies = [ - "cipher", - "opaque-debug", -] - -[[package]] -name = "aesni" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea2e11f5e94c2f7d386164cc2aa1f97823fed6f259e486940a71c174dd01b0ce" -dependencies = [ - "cipher", - "opaque-debug", -] - [[package]] name = "ahash" -version = "0.8.3" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"2c99f64d1e06488f620f932677e24bc6e2897582980441ae90a671415bd7ec2f" +checksum = "cd7d5a2cecb58716e47d67d5703a249964b14c7be1ec3cad3affc295b2d1c35d" dependencies = [ "cfg-if", "once_cell", "version_check", + "zerocopy", ] [[package]] @@ -170,6 +117,12 @@ dependencies = [ "backtrace", ] +[[package]] +name = "arc-swap" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bddcadddf5e9015d310179a59bb28c4d4b9920ad0f11e8e14dbadf654890c9a6" + [[package]] name = "archery" version = "0.5.0" @@ -192,7 +145,7 @@ dependencies = [ "num-traits", "rusticata-macros", "thiserror", - "time 0.3.21", + "time", ] [[package]] @@ -242,55 +195,6 @@ dependencies = [ "tokio", ] -[[package]] -name = "async-executor" -version = "1.5.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c1da3ae8dabd9c00f453a329dfe1fb28da3c0a72e2478cdcd93171740c20499" -dependencies = [ - "async-lock", - "async-task", - "concurrent-queue", - "fastrand 2.0.0", - "futures-lite", - "slab", -] - -[[package]] -name = "async-global-executor" -version = "2.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1b6f5d7df27bd294849f8eec66ecfc63d11814df7a4f5d74168a2394467b776" -dependencies = [ - "async-channel", - "async-executor", - "async-io", - "async-lock", - "blocking", - "futures-lite", - "once_cell", -] - -[[package]] -name = "async-io" -version = "1.13.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fc5b45d93ef0529756f812ca52e44c221b35341892d3dcc34132ac02f3dd2af" -dependencies = [ - "async-lock", - "autocfg", - "cfg-if", - "concurrent-queue", - "futures-lite", - "log", - "parking", - "polling", - "rustix 0.37.25", - "slab", - "socket2 0.4.9", - "waker-fn", -] - [[package]] name = "async-lock" version = "2.8.0" @@ -300,32 +204,6 @@ dependencies = [ "event-listener", ] -[[package]] -name = "async-std" -version = "1.12.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62565bb4402e926b29953c785397c6dc0391b7b446e45008b0049eb43cec6f5d" -dependencies = [ - "async-channel", - "async-global-executor", - "async-io", - "async-lock", - "crossbeam-utils", - "futures-channel", - "futures-core", - "futures-io", - "futures-lite", - "gloo-timers", - "kv-log-macro", - "log", - "memchr", - "once_cell", - "pin-project-lite", - "pin-utils", - "slab", - "wasm-bindgen-futures", -] - [[package]] name = "async-stream" version = "0.3.5" @@ -348,12 +226,6 @@ dependencies = [ "syn 2.0.28", ] -[[package]] -name = "async-task" -version = "4.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9441c6b2fe128a7c2bf680a44c34d0df31ce09e5b7e401fcca3faa483dbc921" - [[package]] name = "async-trait" version = "0.1.68" @@ -374,12 +246,6 @@ dependencies = [ "critical-section", ] -[[package]] -name = "atomic-waker" -version = "1.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" - [[package]] name = "autocfg" version = "1.1.0" @@ -409,7 +275,7 @@ dependencies = [ "http", "hyper", "ring", - "time 0.3.21", + "time", "tokio", "tower", "tracing", @@ -562,13 +428,13 @@ dependencies = [ "bytes", "form_urlencoded", "hex", - "hmac 0.12.1", + "hmac", "http", "once_cell", "percent-encoding", "regex", - "sha2 0.10.6", - "time 0.3.21", + "sha2", + "time", "tracing", ] @@ -600,8 +466,8 @@ dependencies = [ "http-body", "md-5", "pin-project-lite", - "sha1 0.10.5", - "sha2 0.10.6", + "sha1", + "sha2", 
"tracing", ] @@ -746,7 +612,7 @@ dependencies = [ "num-integer", "ryu", "serde", - "time 0.3.21", + "time", ] [[package]] @@ -770,7 +636,7 @@ dependencies = [ "aws-smithy-http", "aws-smithy-types", "http", - "rustc_version 0.4.0", + "rustc_version", "tracing", ] @@ -800,7 +666,7 @@ dependencies = [ "serde_json", "serde_path_to_error", "serde_urlencoded", - "sha1 0.10.5", + "sha1", "sync_wrapper", "tokio", "tokio-tungstenite", @@ -845,10 +711,10 @@ dependencies = [ "quick-xml", "rand 0.8.5", "reqwest", - "rustc_version 0.4.0", + "rustc_version", "serde", "serde_json", - "time 0.3.21", + "time", "url", "uuid", ] @@ -868,7 +734,7 @@ dependencies = [ "pin-project", "serde", "serde_json", - "time 0.3.21", + "time", "tz-rs", "url", "uuid", @@ -885,13 +751,13 @@ dependencies = [ "azure_core", "bytes", "futures", - "hmac 0.12.1", + "hmac", "log", "serde", "serde_derive", "serde_json", - "sha2 0.10.6", - "time 0.3.21", + "sha2", + "time", "url", "uuid", ] @@ -911,7 +777,7 @@ dependencies = [ "serde", "serde_derive", "serde_json", - "time 0.3.21", + "time", "url", "uuid", ] @@ -931,12 +797,6 @@ dependencies = [ "rustc-demangle", ] -[[package]] -name = "base-x" -version = "0.2.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4cbbc9d0964165b47557570cce6c952866c2678457aca742aafc9fb771d30270" - [[package]] name = "base64" version = "0.13.1" @@ -1009,15 +869,6 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" -[[package]] -name = "block-buffer" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4152116fd6e9dadb291ae18fc1ec3575ed6d84c29642d97890f4b4a3417297e4" -dependencies = [ - "generic-array", -] - [[package]] name = "block-buffer" version = "0.10.4" @@ -1027,22 +878,6 @@ dependencies = [ "generic-array", ] -[[package]] -name = "blocking" -version = "1.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c36a4d0d48574b3dd360b4b7d95cc651d2b6557b6402848a27d4b228a473e2a" -dependencies = [ - "async-channel", - "async-lock", - "async-task", - "fastrand 2.0.0", - "futures-io", - "futures-lite", - "piper", - "tracing", -] - [[package]] name = "bstr" version = "1.5.0" @@ -1187,15 +1022,6 @@ dependencies = [ "half", ] -[[package]] -name = "cipher" -version = "0.2.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12f8e7987cbd042a63249497f41aed09f8e65add917ea6566effbc56578d6801" -dependencies = [ - "generic-array", -] - [[package]] name = "clang-sys" version = "1.6.1" @@ -1413,23 +1239,6 @@ dependencies = [ "workspace_hack", ] -[[package]] -name = "cookie" -version = "0.14.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03a5d7b21829bc7b4bf4754a978a241ae54ea55a40f92bb20216e54096f4b951" -dependencies = [ - "aes-gcm", - "base64 0.13.1", - "hkdf", - "hmac 0.10.1", - "percent-encoding", - "rand 0.8.5", - "sha2 0.9.9", - "time 0.2.27", - "version_check", -] - [[package]] name = "core-foundation" version = "0.9.3" @@ -1455,19 +1264,13 @@ dependencies = [ "libc", ] -[[package]] -name = "cpuid-bool" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dcb25d077389e53838a8158c8e99174c5a9d902dee4904320db714f3c653ffba" - [[package]] name = "crc32c" version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"3dfea2db42e9927a3845fb268a10a72faed6d416065f77873f05e411457c363e" dependencies = [ - "rustc_version 0.4.0", + "rustc_version", ] [[package]] @@ -1599,25 +1402,6 @@ dependencies = [ "typenum", ] -[[package]] -name = "crypto-mac" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4857fd85a0c34b3c3297875b747c1e02e06b6a0ea32dd892d8192b9ce0813ea6" -dependencies = [ - "generic-array", - "subtle", -] - -[[package]] -name = "ctr" -version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb4a30d54f7443bf3d6191dcd486aca19e67cb3c49fa7a06a319966346707e7f" -dependencies = [ - "cipher", -] - [[package]] name = "darling" version = "0.20.1" @@ -1696,32 +1480,17 @@ dependencies = [ "rusticata-macros", ] -[[package]] -name = "digest" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3dd60d1080a57a05ab032377049e0591415d2b31afd7028356dbf3cc6dcb066" -dependencies = [ - "generic-array", -] - [[package]] name = "digest" version = "0.10.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" dependencies = [ - "block-buffer 0.10.4", + "block-buffer", "crypto-common", "subtle", ] -[[package]] -name = "discard" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "212d0f5754cb6769937f4501cc0e67f4f4483c8d2c3e1e922ee9edbe4ab4c7c0" - [[package]] name = "displaydoc" version = "0.2.4" @@ -2088,16 +1857,6 @@ dependencies = [ "wasm-bindgen", ] -[[package]] -name = "ghash" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97304e4cd182c3846f7575ced3890c53012ce534ad9114046b0a9e00bb30a375" -dependencies = [ - "opaque-debug", - "polyval", -] - [[package]] name = "gimli" version = "0.27.2" @@ -2132,18 +1891,6 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" -[[package]] -name = "gloo-timers" -version = "0.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b995a66bb87bebce9a0f4a95aed01daca4872c050bfcb21653361c03bc35e5c" -dependencies = [ - "futures-channel", - "futures-core", - "js-sys", - "wasm-bindgen", -] - [[package]] name = "h2" version = "0.3.19" @@ -2215,7 +1962,7 @@ source = "git+https://github.com/japaric/heapless.git?rev=644653bf3b831c6bb4963b dependencies = [ "atomic-polyfill", "hash32", - "rustc_version 0.4.0", + "rustc_version", "spin 0.9.8", "stable_deref_trait", ] @@ -2257,33 +2004,13 @@ dependencies = [ "thiserror", ] -[[package]] -name = "hkdf" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51ab2f639c231793c5f6114bdb9bbe50a7dbbfcd7c7c6bd8475dec2d991e964f" -dependencies = [ - "digest 0.9.0", - "hmac 0.10.1", -] - -[[package]] -name = "hmac" -version = "0.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1441c6b1e930e2817404b5046f1f989899143a12bf92de603b69f4e0aee1e15" -dependencies = [ - "crypto-mac", - "digest 0.9.0", -] - [[package]] name = "hmac" version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" dependencies = [ - "digest 0.10.7", + "digest", ] [[package]] @@ -2327,9 +2054,7 @@ checksum = "6e9b187a72d63adbfba487f48095306ac823049cb504ee195541e91c7775f5ad" 
dependencies = [ "anyhow", "async-channel", - "async-std", "base64 0.13.1", - "cookie", "futures-lite", "infer", "pin-project-lite", @@ -2643,15 +2368,6 @@ dependencies = [ "libc", ] -[[package]] -name = "kv-log-macro" -version = "1.0.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0de8b303297635ad57c9f5059fd9cee7a47f8e8daa09df0fcd07dd39fb22977f" -dependencies = [ - "log", -] - [[package]] name = "lazy_static" version = "1.4.0" @@ -2707,9 +2423,6 @@ name = "log" version = "0.4.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" -dependencies = [ - "value-bag", -] [[package]] name = "match_cfg" @@ -2738,7 +2451,7 @@ version = "0.10.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6365506850d44bff6e2fbcb5176cf63650e48bd45ef2fe2665ae1570e0f4b9ca" dependencies = [ - "digest 0.10.7", + "digest", ] [[package]] @@ -2984,7 +2697,7 @@ dependencies = [ "serde", "serde_json", "serde_path_to_error", - "sha2 0.10.6", + "sha2", "thiserror", "url", ] @@ -3019,12 +2732,6 @@ version = "11.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" -[[package]] -name = "opaque-debug" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "624a8340c38c1b80fd549087862da4ba43e08858af025b236e509b6649fc13d5" - [[package]] name = "openssl" version = "0.10.55" @@ -3378,10 +3085,10 @@ version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0ca0b5a68607598bf3bad68f32227a8164f6254833f84eafaac409cd6746c31" dependencies = [ - "digest 0.10.7", - "hmac 0.12.1", + "digest", + "hmac", "password-hash", - "sha2 0.10.6", + "sha2", ] [[package]] @@ -3475,17 +3182,6 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" -[[package]] -name = "piper" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "668d31b1c4eba19242f2088b2bf3316b82ca31082a8335764db4e083db7485d4" -dependencies = [ - "atomic-waker", - "fastrand 2.0.0", - "futures-io", -] - [[package]] name = "pkg-config" version = "0.3.27" @@ -3520,33 +3216,6 @@ dependencies = [ "plotters-backend", ] -[[package]] -name = "polling" -version = "2.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b2d323e8ca7996b3e23126511a523f7e62924d93ecd5ae73b333815b0eb3dce" -dependencies = [ - "autocfg", - "bitflags", - "cfg-if", - "concurrent-queue", - "libc", - "log", - "pin-project-lite", - "windows-sys 0.48.0", -] - -[[package]] -name = "polyval" -version = "0.4.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eebcc4aa140b9abd2bc40d9c3f7ccec842679cd79045ac3a7ac698c1a064b7cd" -dependencies = [ - "cpuid-bool", - "opaque-debug", - "universal-hash", -] - [[package]] name = "postgres" version = "0.19.4" @@ -3580,12 +3249,12 @@ dependencies = [ "byteorder", "bytes", "fallible-iterator", - "hmac 0.12.1", + "hmac", "lazy_static", "md-5", "memchr", "rand 0.8.5", - "sha2 0.10.6", + "sha2", "stringprep", ] @@ -3814,7 +3483,7 @@ dependencies = [ "hashbrown 0.13.2", "hashlink", "hex", - "hmac 0.12.1", + "hmac", "hostname", "humantime", "hyper", @@ -3847,7 +3516,7 @@ dependencies = [ "scopeguard", "serde", "serde_json", - "sha2 0.10.6", + "sha2", "socket2 
0.5.3", "sync_wrapper", "thiserror", @@ -3989,7 +3658,7 @@ checksum = "4954fbc00dcd4d8282c987710e50ba513d351400dbdd00e803a05172a90d8976" dependencies = [ "pem 2.0.1", "ring", - "time 0.3.21", + "time", "yasna", ] @@ -4058,6 +3727,7 @@ dependencies = [ "aws-config", "aws-credential-types", "aws-sdk-s3", + "aws-smithy-async", "aws-smithy-http", "aws-types", "azure_core", @@ -4245,7 +3915,7 @@ dependencies = [ "futures", "futures-timer", "rstest_macros", - "rustc_version 0.4.0", + "rustc_version", ] [[package]] @@ -4260,7 +3930,7 @@ dependencies = [ "quote", "regex", "relative-path", - "rustc_version 0.4.0", + "rustc_version", "syn 2.0.28", "unicode-ident", ] @@ -4277,22 +3947,13 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" -[[package]] -name = "rustc_version" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a" -dependencies = [ - "semver 0.9.0", -] - [[package]] name = "rustc_version" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" dependencies = [ - "semver 1.0.17", + "semver", ] [[package]] @@ -4554,27 +4215,12 @@ dependencies = [ "libc", ] -[[package]] -name = "semver" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403" -dependencies = [ - "semver-parser", -] - [[package]] name = "semver" version = "1.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bebd363326d05ec3e2f532ab7660680f3b02130d780c299bca73469d521bc0ed" -[[package]] -name = "semver-parser" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3" - [[package]] name = "sentry" version = "0.31.6" @@ -4615,7 +4261,7 @@ dependencies = [ "hostname", "libc", "os_info", - "rustc_version 0.4.0", + "rustc_version", "sentry-core", "uname", ] @@ -4667,7 +4313,7 @@ dependencies = [ "serde", "serde_json", "thiserror", - "time 0.3.21", + "time", "url", "uuid", ] @@ -4768,7 +4414,7 @@ dependencies = [ "serde", "serde_json", "serde_with_macros", - "time 0.3.21", + "time", ] [[package]] @@ -4783,15 +4429,6 @@ dependencies = [ "syn 2.0.28", ] -[[package]] -name = "sha1" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1da05c97445caa12d05e848c4a4fcbbea29e748ac28f7e80e9b010392063770" -dependencies = [ - "sha1_smol", -] - [[package]] name = "sha1" version = "0.10.5" @@ -4800,26 +4437,7 @@ checksum = "f04293dc80c3993519f2d7f6f511707ee7094fe0c6d3406feb330cdb3540eba3" dependencies = [ "cfg-if", "cpufeatures", - "digest 0.10.7", -] - -[[package]] -name = "sha1_smol" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae1a47186c03a32177042e55dbc5fd5aee900b8e0069a8d70fba96a9375cd012" - -[[package]] -name = "sha2" -version = "0.9.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4d58a1e1bf39749807d89cf2d98ac2dfa0ff1cb3faa38fbb64dd88ac8013d800" -dependencies = [ - "block-buffer 0.9.0", - "cfg-if", - "cpufeatures", - "digest 0.9.0", - "opaque-debug", + "digest", ] [[package]] @@ -4830,7 +4448,7 @@ checksum = 
"82e6b795fe2e3b1e845bafcb27aa35405c4d47cdfc92af5fc8d3002f76cebdc0" dependencies = [ "cfg-if", "cpufeatures", - "digest 0.10.7", + "digest", ] [[package]] @@ -4887,7 +4505,7 @@ dependencies = [ "num-bigint", "num-traits", "thiserror", - "time 0.3.21", + "time", ] [[package]] @@ -4952,70 +4570,12 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" -[[package]] -name = "standback" -version = "0.2.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e113fb6f3de07a243d434a56ec6f186dfd51cb08448239fe7bcae73f87ff28ff" -dependencies = [ - "version_check", -] - [[package]] name = "static_assertions" version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" -[[package]] -name = "stdweb" -version = "0.4.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d022496b16281348b52d0e30ae99e01a73d737b2f45d38fed4edf79f9325a1d5" -dependencies = [ - "discard", - "rustc_version 0.2.3", - "stdweb-derive", - "stdweb-internal-macros", - "stdweb-internal-runtime", - "wasm-bindgen", -] - -[[package]] -name = "stdweb-derive" -version = "0.5.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c87a60a40fccc84bef0652345bbbbbe20a605bf5d0ce81719fc476f5c03b50ef" -dependencies = [ - "proc-macro2", - "quote", - "serde", - "serde_derive", - "syn 1.0.109", -] - -[[package]] -name = "stdweb-internal-macros" -version = "0.2.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "58fa5ff6ad0d98d1ffa8cb115892b6e69d67799f6763e162a1c9db421dc22e11" -dependencies = [ - "base-x", - "proc-macro2", - "quote", - "serde", - "serde_derive", - "serde_json", - "sha1 0.6.1", - "syn 1.0.109", -] - -[[package]] -name = "stdweb-internal-runtime" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "213701ba3370744dcd1a12960caa4843b3d68b4d1c0a5d575e0d65b2ee9d16c0" - [[package]] name = "storage_broker" version = "0.1.0" @@ -5249,21 +4809,6 @@ dependencies = [ "once_cell", ] -[[package]] -name = "time" -version = "0.2.27" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4752a97f8eebd6854ff91f1c1824cd6160626ac4bd44287f7f4ea2035a02a242" -dependencies = [ - "const_fn", - "libc", - "standback", - "stdweb", - "time-macros 0.1.1", - "version_check", - "winapi", -] - [[package]] name = "time" version = "0.3.21" @@ -5276,7 +4821,7 @@ dependencies = [ "num_threads", "serde", "time-core", - "time-macros 0.2.9", + "time-macros", ] [[package]] @@ -5285,16 +4830,6 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7300fbefb4dadc1af235a9cef3737cea692a9d97e1b9cbcd4ebdae6f8868e6fb" -[[package]] -name = "time-macros" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "957e9c6e26f12cb6d0dd7fc776bb67a706312e7299aed74c8dd5b17ebb27e2f1" -dependencies = [ - "proc-macro-hack", - "time-macros-impl", -] - [[package]] name = "time-macros" version = "0.2.9" @@ -5304,19 +4839,6 @@ dependencies = [ "time-core", ] -[[package]] -name = "time-macros-impl" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd3c141a1b43194f3f56a1411225df8646c55781d5f26db825b3d98507eb482f" -dependencies = [ - "proc-macro-hack", - "proc-macro2", - "quote", - 
"standback", - "syn 1.0.109", -] - [[package]] name = "tinytemplate" version = "1.2.1" @@ -5678,7 +5200,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09d48f71a791638519505cefafe162606f706c25592e4bde4d97600c0195312e" dependencies = [ "crossbeam-channel", - "time 0.3.21", + "time", "tracing-subscriber", ] @@ -5813,7 +5335,7 @@ dependencies = [ "httparse", "log", "rand 0.8.5", - "sha1 0.10.5", + "sha1", "thiserror", "url", "utf-8", @@ -5885,16 +5407,6 @@ version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f962df74c8c05a667b5ee8bcf162993134c104e96440b663c8daa176dc772d8c" -[[package]] -name = "universal-hash" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8326b2c654932e3e4f9196e69d08fdf7cfd718e1dc6f66b347e6024a0c961402" -dependencies = [ - "generic-array", - "subtle", -] - [[package]] name = "untrusted" version = "0.7.1" @@ -5951,6 +5463,7 @@ name = "utils" version = "0.1.0" dependencies = [ "anyhow", + "arc-swap", "async-trait", "bincode", "byteorder", @@ -6011,12 +5524,6 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" -[[package]] -name = "value-bag" -version = "1.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a72e1902dde2bd6441347de2b70b7f5d59bf157c6c62f0c44572607a1d55bbe" - [[package]] name = "vcpkg" version = "0.2.15" @@ -6048,7 +5555,6 @@ dependencies = [ "tokio-util", "tracing", "tracing-subscriber", - "workspace_hack", ] [[package]] @@ -6476,6 +5982,7 @@ dependencies = [ "clap", "clap_builder", "crossbeam-utils", + "dashmap", "either", "fail", "futures", @@ -6506,11 +6013,10 @@ dependencies = [ "serde", "serde_json", "smallvec", - "standback", "syn 1.0.109", "syn 2.0.28", - "time 0.3.21", - "time-macros 0.2.9", + "time", + "time-macros", "tokio", "tokio-rustls", "tokio-util", @@ -6538,7 +6044,7 @@ dependencies = [ "oid-registry", "rusticata-macros", "thiserror", - "time 0.3.21", + "time", ] [[package]] @@ -6562,7 +6068,27 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e17bb3549cc1321ae1296b9cdc2698e2b6cb1992adfa19a8c72e5b7a738f44cd" dependencies = [ - "time 0.3.21", + "time", +] + +[[package]] +name = "zerocopy" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a7af71d8643341260a65f89fa60c0eeaa907f34544d8f6d9b0df72f069b5e74" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9731702e2f0617ad526794ae28fbc6f6ca8849b5ba729666c2a5bc4b6ddee2cd" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.28", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index d992db4858..363d4c6fe4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -36,6 +36,7 @@ license = "Apache-2.0" ## All dependency versions, used in the project [workspace.dependencies] anyhow = { version = "1.0", features = ["backtrace"] } +arc-swap = "1.6" async-compression = { version = "0.4.0", features = ["tokio", "gzip"] } azure_core = "0.16" azure_identity = "0.16" @@ -47,6 +48,7 @@ async-trait = "0.1" aws-config = { version = "0.56", default-features = false, features=["rustls"] } aws-sdk-s3 = "0.29" aws-smithy-http = "0.56" +aws-smithy-async = { version = "0.56", default-features = false, features=["rt-tokio"] } 
aws-credential-types = "0.56" aws-types = "0.56" axum = { version = "0.6.20", features = ["ws"] } @@ -65,7 +67,7 @@ comfy-table = "6.1" const_format = "0.2" crc32c = "0.6" crossbeam-utils = "0.8.5" -dashmap = "5.5.0" +dashmap = { version = "5.5.0", features = ["raw-api"] } either = "1.8" enum-map = "2.4.2" enumset = "1.0.12" @@ -81,7 +83,7 @@ hex = "0.4" hex-literal = "0.4" hmac = "0.12.1" hostname = "0.3.1" -http-types = "2" +http-types = { version = "2", default-features = false } humantime = "2.1" humantime-serde = "1.1.1" hyper = "0.14" diff --git a/Dockerfile b/Dockerfile index eb4c4bba25..60de9cfa3e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -27,6 +27,7 @@ RUN set -e \ FROM $REPOSITORY/$IMAGE:$TAG AS build WORKDIR /home/nonroot ARG GIT_VERSION=local +ARG BUILD_TAG # Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds. # Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations. @@ -78,9 +79,9 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pg_sni_router COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin -COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker /usr/local/bin +COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin -COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin +COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/ COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/ diff --git a/Makefile b/Makefile index 64bbc1677c..89acbe564a 100644 --- a/Makefile +++ b/Makefile @@ -72,6 +72,10 @@ neon: postgres-headers walproposer-lib # $(POSTGRES_INSTALL_DIR)/build/%/config.status: +@echo "Configuring Postgres $* build" + @test -s $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure || { \ + echo "\nPostgres submodule not found in $(ROOT_PROJECT_DIR)/vendor/postgres-$*/, execute "; \ + echo "'git submodule update --init --recursive --depth 2 --progress .' in project root.\n"; \ + exit 1; } mkdir -p $(POSTGRES_INSTALL_DIR)/build/$* (cd $(POSTGRES_INSTALL_DIR)/build/$* && \ env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure \ diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 9fd88e5818..373f05ab2f 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -710,8 +710,12 @@ impl ComputeNode { // `pg_ctl` for start / stop, so this just seems much easier to do as we already // have opened connection to Postgres and superuser access. 
#[instrument(skip_all)] - fn pg_reload_conf(&self, client: &mut Client) -> Result<()> { - client.simple_query("SELECT pg_reload_conf()")?; + fn pg_reload_conf(&self) -> Result<()> { + let pgctl_bin = Path::new(&self.pgbin).parent().unwrap().join("pg_ctl"); + Command::new(pgctl_bin) + .args(["reload", "-D", &self.pgdata]) + .output() + .expect("cannot run pg_ctl process"); Ok(()) } @@ -724,9 +728,9 @@ impl ComputeNode { // Write new config let pgdata_path = Path::new(&self.pgdata); config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec, None)?; + self.pg_reload_conf()?; let mut client = Client::connect(self.connstr.as_str(), NoTls)?; - self.pg_reload_conf(&mut client)?; // Proceed with post-startup configuration. Note, that order of operations is important. // Disable DDL forwarding because control plane already knows about these roles/databases. diff --git a/compute_tools/src/extension_server.rs b/compute_tools/src/extension_server.rs index 3d7ed8c360..2278509c1f 100644 --- a/compute_tools/src/extension_server.rs +++ b/compute_tools/src/extension_server.rs @@ -78,7 +78,7 @@ use regex::Regex; use remote_storage::*; use serde_json; use std::io::Read; -use std::num::{NonZeroU32, NonZeroUsize}; +use std::num::NonZeroUsize; use std::path::Path; use std::str; use tar::Archive; @@ -281,8 +281,6 @@ pub fn init_remote_storage(remote_ext_config: &str) -> anyhow::Result, { let path: Utf8PathBuf = path.into(); - // SAFETY + // SAFETY: // pre_exec is marked unsafe because it runs between fork and exec. // Why is that dangerous in various ways? // Long answer: https://github.com/rust-lang/rust/issues/39575 diff --git a/control_plane/src/lib.rs b/control_plane/src/lib.rs index 7592880402..bb79d36bfc 100644 --- a/control_plane/src/lib.rs +++ b/control_plane/src/lib.rs @@ -1,11 +1,10 @@ -// -// Local control plane. -// -// Can start, configure and stop postgres instances running as a local processes. -// -// Intended to be used in integration tests and in CLI tools for -// local installations. -// +//! Local control plane. +//! +//! Can start, configure and stop postgres instances running as a local processes. +//! +//! Intended to be used in integration tests and in CLI tools for +//! local installations. +#![deny(clippy::undocumented_unsafe_blocks)] pub mod attachment_service; mod background_process; diff --git a/deny.toml b/deny.toml index f4ea0d4dac..079dcac679 100644 --- a/deny.toml +++ b/deny.toml @@ -74,10 +74,30 @@ highlight = "all" workspace-default-features = "allow" external-default-features = "allow" allow = [] -deny = [] + skip = [] skip-tree = [] +[[bans.deny]] +# we use tokio, the same rationale applies for async-{io,waker,global-executor,executor,channel,lock}, smol +# if you find yourself here while adding a dependency, try "default-features = false", ask around on #rust +name = "async-std" + +[[bans.deny]] +name = "async-io" + +[[bans.deny]] +name = "async-waker" + +[[bans.deny]] +name = "async-global-executor" + +[[bans.deny]] +name = "async-executor" + +[[bans.deny]] +name = "smol" + # This section is considered when running `cargo deny check sources`. 
# More documentation about the 'sources' section can be found here: # https://embarkstudios.github.io/cargo-deny/checks/sources/cfg.html diff --git a/libs/compute_api/src/lib.rs b/libs/compute_api/src/lib.rs index b660799ec0..210a52d089 100644 --- a/libs/compute_api/src/lib.rs +++ b/libs/compute_api/src/lib.rs @@ -1,3 +1,5 @@ +#![deny(unsafe_code)] +#![deny(clippy::undocumented_unsafe_blocks)] pub mod requests; pub mod responses; pub mod spec; diff --git a/libs/consumption_metrics/src/lib.rs b/libs/consumption_metrics/src/lib.rs index 9e89327e84..810196aff6 100644 --- a/libs/consumption_metrics/src/lib.rs +++ b/libs/consumption_metrics/src/lib.rs @@ -1,6 +1,6 @@ -//! //! Shared code for consumption metics collection -//! +#![deny(unsafe_code)] +#![deny(clippy::undocumented_unsafe_blocks)] use chrono::{DateTime, Utc}; use rand::Rng; use serde::{Deserialize, Serialize}; diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index 0b3b24c18e..ed375a152f 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -2,6 +2,7 @@ //! make sure that we use the same dep version everywhere. //! Otherwise, we might not see all metrics registered via //! a default registry. +#![deny(clippy::undocumented_unsafe_blocks)] use once_cell::sync::Lazy; use prometheus::core::{AtomicU64, Collector, GenericGauge, GenericGaugeVec}; pub use prometheus::opts; diff --git a/libs/pageserver_api/src/lib.rs b/libs/pageserver_api/src/lib.rs index d844021785..e49f7d00c1 100644 --- a/libs/pageserver_api/src/lib.rs +++ b/libs/pageserver_api/src/lib.rs @@ -1,3 +1,5 @@ +#![deny(unsafe_code)] +#![deny(clippy::undocumented_unsafe_blocks)] use const_format::formatcp; /// Public API types diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 455fe7a481..1dae008a4f 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -2,6 +2,8 @@ //! To use, create PostgresBackend and run() it, passing the Handler //! implementation determining how to process the queries. Currently its API //! is rather narrow, but we can extend it once required. 
+#![deny(unsafe_code)] +#![deny(clippy::undocumented_unsafe_blocks)] use anyhow::Context; use bytes::Bytes; use futures::pin_mut; @@ -15,7 +17,7 @@ use std::{fmt, io}; use std::{future::Future, str::FromStr}; use tokio::io::{AsyncRead, AsyncWrite}; use tokio_rustls::TlsAcceptor; -use tracing::{debug, error, info, trace}; +use tracing::{debug, error, info, trace, warn}; use pq_proto::framed::{ConnectionError, Framed, FramedReader, FramedWriter}; use pq_proto::{ @@ -33,6 +35,11 @@ pub enum QueryError { /// We were instructed to shutdown while processing the query #[error("Shutting down")] Shutdown, + /// Authentication failure + #[error("Unauthorized: {0}")] + Unauthorized(std::borrow::Cow<'static, str>), + #[error("Simulated Connection Error")] + SimulatedConnectionError, /// Some other error #[error(transparent)] Other(#[from] anyhow::Error), @@ -47,8 +54,9 @@ impl From for QueryError { impl QueryError { pub fn pg_error_code(&self) -> &'static [u8; 5] { match self { - Self::Disconnected(_) => b"08006", // connection failure + Self::Disconnected(_) | Self::SimulatedConnectionError => b"08006", // connection failure Self::Shutdown => SQLSTATE_ADMIN_SHUTDOWN, + Self::Unauthorized(_) => SQLSTATE_INTERNAL_ERROR, Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error } } @@ -608,7 +616,7 @@ impl PostgresBackend { if let Err(e) = handler.check_auth_jwt(self, jwt_response) { self.write_message_noflush(&BeMessage::ErrorResponse( - &e.to_string(), + &short_error(&e), Some(e.pg_error_code()), ))?; return Err(e); @@ -730,6 +738,9 @@ impl PostgresBackend { if let Err(e) = handler.process_query(self, query_string).await { match e { QueryError::Shutdown => return Ok(ProcessMsgResult::Break), + QueryError::SimulatedConnectionError => { + return Err(QueryError::SimulatedConnectionError) + } e => { log_query_error(query_string, &e); let short_error = short_error(&e); @@ -964,6 +975,8 @@ pub fn short_error(e: &QueryError) -> String { match e { QueryError::Disconnected(connection_error) => connection_error.to_string(), QueryError::Shutdown => "shutdown".to_string(), + QueryError::Unauthorized(_e) => "JWT authentication error".to_string(), + QueryError::SimulatedConnectionError => "simulated connection error".to_string(), QueryError::Other(e) => format!("{e:#}"), } } @@ -980,9 +993,15 @@ fn log_query_error(query: &str, e: &QueryError) { QueryError::Disconnected(other_connection_error) => { error!("query handler for '{query}' failed with connection error: {other_connection_error:?}") } + QueryError::SimulatedConnectionError => { + error!("query handler for query '{query}' failed due to a simulated connection error") + } QueryError::Shutdown => { info!("query handler for '{query}' cancelled during tenant shutdown") } + QueryError::Unauthorized(e) => { + warn!("query handler for '{query}' failed with authentication error: {e}"); + } QueryError::Other(e) => { error!("query handler for '{query}' failed: {e:?}"); } diff --git a/libs/postgres_connection/src/lib.rs b/libs/postgres_connection/src/lib.rs index 35344a9168..35cb1a2691 100644 --- a/libs/postgres_connection/src/lib.rs +++ b/libs/postgres_connection/src/lib.rs @@ -1,3 +1,5 @@ +#![deny(unsafe_code)] +#![deny(clippy::undocumented_unsafe_blocks)] use anyhow::{bail, Context}; use itertools::Itertools; use std::borrow::Cow; diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index d0009d0dc9..d10ebfe277 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -8,6 +8,7 @@ // modules included with the postgres_ffi 
macro depend on the types of the specific version's // types, and trigger a too eager lint. #![allow(clippy::duplicate_mod)] +#![deny(clippy::undocumented_unsafe_blocks)] use bytes::Bytes; use utils::bin_ser::SerializeError; @@ -20,6 +21,7 @@ macro_rules! postgres_ffi { pub mod bindings { // bindgen generates bindings for a lot of stuff we don't need #![allow(dead_code)] + #![allow(clippy::undocumented_unsafe_blocks)] use serde::{Deserialize, Serialize}; include!(concat!( diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index 94bc7f60f8..41fc206cd7 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -1,6 +1,7 @@ //! Postgres protocol messages serialization-deserialization. See //! //! on message formats. +#![deny(clippy::undocumented_unsafe_blocks)] pub mod framed; diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index 65b5392389..d7bcce28cb 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -8,6 +8,7 @@ license.workspace = true anyhow.workspace = true async-trait.workspace = true once_cell.workspace = true +aws-smithy-async.workspace = true aws-smithy-http.workspace = true aws-types.workspace = true aws-config.workspace = true diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 9b5b340db0..e6d306ff66 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -6,19 +6,15 @@ //! * [`s3_bucket`] uses AWS S3 bucket as an external storage //! * [`azure_blob`] allows to use Azure Blob storage as an external storage //! +#![deny(unsafe_code)] +#![deny(clippy::undocumented_unsafe_blocks)] mod azure_blob; mod local_fs; mod s3_bucket; mod simulate_failures; -use std::{ - collections::HashMap, - fmt::Debug, - num::{NonZeroU32, NonZeroUsize}, - pin::Pin, - sync::Arc, -}; +use std::{collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::Arc}; use anyhow::{bail, Context}; use camino::{Utf8Path, Utf8PathBuf}; @@ -34,12 +30,6 @@ pub use self::{ }; use s3_bucket::RequestKind; -/// How many different timelines can be processed simultaneously when synchronizing layers with the remote storage. -/// During regular work, pageserver produces one layer file per timeline checkpoint, with bursts of concurrency -/// during start (where local and remote timelines are compared and initial sync tasks are scheduled) and timeline attach. -/// Both cases may trigger timeline download, that might download a lot of layers. This concurrency is limited by the clients internally, if needed. -pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS: usize = 50; -pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10; /// Currently, sync happens with AWS S3, that has two limits on requests per second: /// ~200 RPS for IAM services /// @@ -441,10 +431,6 @@ pub struct StorageMetadata(HashMap); /// External backup storage configuration, enough for creating a client for that storage. #[derive(Debug, Clone, PartialEq, Eq)] pub struct RemoteStorageConfig { - /// Max allowed number of concurrent sync operations between the API user and the remote storage. - pub max_concurrent_syncs: NonZeroUsize, - /// Max allowed errors before the sync task is considered failed and evicted. - pub max_sync_errors: NonZeroU32, /// The storage connection configuration. 
pub storage: RemoteStorageKind, } @@ -540,18 +526,6 @@ impl RemoteStorageConfig { let use_azure = container_name.is_some() && container_region.is_some(); - let max_concurrent_syncs = NonZeroUsize::new( - parse_optional_integer("max_concurrent_syncs", toml)? - .unwrap_or(DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS), - ) - .context("Failed to parse 'max_concurrent_syncs' as a positive integer")?; - - let max_sync_errors = NonZeroU32::new( - parse_optional_integer("max_sync_errors", toml)? - .unwrap_or(DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS), - ) - .context("Failed to parse 'max_sync_errors' as a positive integer")?; - let default_concurrency_limit = if use_azure { DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT } else { @@ -633,11 +607,7 @@ impl RemoteStorageConfig { } }; - Ok(Some(RemoteStorageConfig { - max_concurrent_syncs, - max_sync_errors, - storage, - })) + Ok(Some(RemoteStorageConfig { storage })) } } diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 560a2c14e9..ab3fd3fe62 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -4,23 +4,27 @@ //! allowing multiple api users to independently work with the same S3 bucket, if //! their bucket prefixes are both specified and different. -use std::borrow::Cow; +use std::{borrow::Cow, sync::Arc}; use anyhow::Context; use aws_config::{ environment::credentials::EnvironmentVariableCredentialsProvider, - imds::credentials::ImdsCredentialsProvider, meta::credentials::CredentialsProviderChain, - provider_config::ProviderConfig, web_identity_token::WebIdentityTokenCredentialsProvider, + imds::credentials::ImdsCredentialsProvider, + meta::credentials::CredentialsProviderChain, + provider_config::ProviderConfig, + retry::{RetryConfigBuilder, RetryMode}, + web_identity_token::WebIdentityTokenCredentialsProvider, }; use aws_credential_types::cache::CredentialsCache; use aws_sdk_s3::{ - config::{Config, Region}, + config::{AsyncSleep, Config, Region, SharedAsyncSleep}, error::SdkError, operation::get_object::GetObjectError, primitives::ByteStream, types::{Delete, ObjectIdentifier}, Client, }; +use aws_smithy_async::rt::sleep::TokioSleep; use aws_smithy_http::body::SdkBody; use hyper::Body; use scopeguard::ScopeGuard; @@ -83,10 +87,23 @@ impl S3Bucket { .or_else("imds", ImdsCredentialsProvider::builder().build()) }; + // AWS SDK requires us to specify how the RetryConfig should sleep when it wants to back off + let sleep_impl: Arc = Arc::new(TokioSleep::new()); + + // We do our own retries (see [`backoff::retry`]). However, for the AWS SDK to enable rate limiting in response to throttling + // responses (e.g. 429 on too many ListObjectsv2 requests), we must provide a retry config. We set it to use at most one + // attempt, and enable 'Adaptive' mode, which causes rate limiting to be enabled. 
+ let mut retry_config = RetryConfigBuilder::new(); + retry_config + .set_max_attempts(Some(1)) + .set_mode(Some(RetryMode::Adaptive)); + let mut config_builder = Config::builder() .region(region) .credentials_cache(CredentialsCache::lazy()) - .credentials_provider(credentials_provider); + .credentials_provider(credentials_provider) + .sleep_impl(SharedAsyncSleep::from(sleep_impl)) + .retry_config(retry_config.build()); if let Some(custom_endpoint) = aws_config.endpoint.clone() { config_builder = config_builder diff --git a/libs/remote_storage/tests/test_real_azure.rs b/libs/remote_storage/tests/test_real_azure.rs index 0338270aaf..59b92b48fb 100644 --- a/libs/remote_storage/tests/test_real_azure.rs +++ b/libs/remote_storage/tests/test_real_azure.rs @@ -1,6 +1,6 @@ use std::collections::HashSet; use std::env; -use std::num::{NonZeroU32, NonZeroUsize}; +use std::num::NonZeroUsize; use std::ops::ControlFlow; use std::path::PathBuf; use std::sync::Arc; @@ -469,8 +469,6 @@ fn create_azure_client( let random = rand::thread_rng().gen::(); let remote_storage_config = RemoteStorageConfig { - max_concurrent_syncs: NonZeroUsize::new(100).unwrap(), - max_sync_errors: NonZeroU32::new(5).unwrap(), storage: RemoteStorageKind::AzureContainer(AzureConfig { container_name: remote_storage_azure_container, container_region: remote_storage_azure_region, diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index 7e2aa9f6d7..e2360b7533 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -1,6 +1,6 @@ use std::collections::HashSet; use std::env; -use std::num::{NonZeroU32, NonZeroUsize}; +use std::num::NonZeroUsize; use std::ops::ControlFlow; use std::path::PathBuf; use std::sync::Arc; @@ -396,8 +396,6 @@ fn create_s3_client( let random = rand::thread_rng().gen::(); let remote_storage_config = RemoteStorageConfig { - max_concurrent_syncs: NonZeroUsize::new(100).unwrap(), - max_sync_errors: NonZeroU32::new(5).unwrap(), storage: RemoteStorageKind::AwsS3(S3Config { bucket_name: remote_storage_s3_bucket, bucket_region: remote_storage_s3_region, diff --git a/libs/safekeeper_api/src/lib.rs b/libs/safekeeper_api/src/lib.rs index 0a391478da..63c2c51188 100644 --- a/libs/safekeeper_api/src/lib.rs +++ b/libs/safekeeper_api/src/lib.rs @@ -1,3 +1,5 @@ +#![deny(unsafe_code)] +#![deny(clippy::undocumented_unsafe_blocks)] use const_format::formatcp; /// Public API types diff --git a/libs/tenant_size_model/src/lib.rs b/libs/tenant_size_model/src/lib.rs index c151e3b42c..a3e12cf0e3 100644 --- a/libs/tenant_size_model/src/lib.rs +++ b/libs/tenant_size_model/src/lib.rs @@ -1,4 +1,6 @@ //! Synthetic size calculation +#![deny(unsafe_code)] +#![deny(clippy::undocumented_unsafe_blocks)] mod calculation; pub mod svg; diff --git a/libs/tracing-utils/src/lib.rs b/libs/tracing-utils/src/lib.rs index de0e2ad799..9cf2495771 100644 --- a/libs/tracing-utils/src/lib.rs +++ b/libs/tracing-utils/src/lib.rs @@ -32,6 +32,8 @@ //! .init(); //! } //! 
``` +#![deny(unsafe_code)] +#![deny(clippy::undocumented_unsafe_blocks)] use opentelemetry::sdk::Resource; use opentelemetry::KeyValue; diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 3f4ef2abeb..ccf6f4f2d7 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -5,6 +5,7 @@ edition.workspace = true license.workspace = true [dependencies] +arc-swap.workspace = true sentry.workspace = true async-trait.workspace = true anyhow.workspace = true diff --git a/libs/utils/src/auth.rs b/libs/utils/src/auth.rs index 37299e4e7f..66b1f6e866 100644 --- a/libs/utils/src/auth.rs +++ b/libs/utils/src/auth.rs @@ -1,7 +1,8 @@ // For details about authentication see docs/authentication.md +use arc_swap::ArcSwap; use serde; -use std::fs; +use std::{borrow::Cow, fmt::Display, fs, sync::Arc}; use anyhow::Result; use camino::Utf8Path; @@ -10,7 +11,7 @@ use jsonwebtoken::{ }; use serde::{Deserialize, Serialize}; -use crate::id::TenantId; +use crate::{http::error::ApiError, id::TenantId}; /// Algorithm to use. We require EdDSA. const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA; @@ -44,31 +45,106 @@ impl Claims { } } +pub struct SwappableJwtAuth(ArcSwap); + +impl SwappableJwtAuth { + pub fn new(jwt_auth: JwtAuth) -> Self { + SwappableJwtAuth(ArcSwap::new(Arc::new(jwt_auth))) + } + pub fn swap(&self, jwt_auth: JwtAuth) { + self.0.swap(Arc::new(jwt_auth)); + } + pub fn decode(&self, token: &str) -> std::result::Result, AuthError> { + self.0.load().decode(token) + } +} + +impl std::fmt::Debug for SwappableJwtAuth { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "Swappable({:?})", self.0.load()) + } +} + +#[derive(Clone, PartialEq, Eq, Hash, Debug)] +pub struct AuthError(pub Cow<'static, str>); + +impl Display for AuthError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.0) + } +} + +impl From for ApiError { + fn from(_value: AuthError) -> Self { + // Don't pass on the value of the AuthError as a precautionary measure. + // Being intentionally vague in public error communication hurts debugability + // but it is more secure. + ApiError::Forbidden("JWT authentication error".to_string()) + } +} + pub struct JwtAuth { - decoding_key: DecodingKey, + decoding_keys: Vec, validation: Validation, } impl JwtAuth { - pub fn new(decoding_key: DecodingKey) -> Self { + pub fn new(decoding_keys: Vec) -> Self { let mut validation = Validation::default(); validation.algorithms = vec![STORAGE_TOKEN_ALGORITHM]; // The default 'required_spec_claims' is 'exp'. But we don't want to require // expiration. validation.required_spec_claims = [].into(); Self { - decoding_key, + decoding_keys, validation, } } pub fn from_key_path(key_path: &Utf8Path) -> Result { - let public_key = fs::read(key_path)?; - Ok(Self::new(DecodingKey::from_ed_pem(&public_key)?)) + let metadata = key_path.metadata()?; + let decoding_keys = if metadata.is_dir() { + let mut keys = Vec::new(); + for entry in fs::read_dir(key_path)? { + let path = entry?.path(); + if !path.is_file() { + // Ignore directories (don't recurse) + continue; + } + let public_key = fs::read(path)?; + keys.push(DecodingKey::from_ed_pem(&public_key)?); + } + keys + } else if metadata.is_file() { + let public_key = fs::read(key_path)?; + vec![DecodingKey::from_ed_pem(&public_key)?] + } else { + anyhow::bail!("path is neither a directory or a file") + }; + if decoding_keys.is_empty() { + anyhow::bail!("Configured for JWT auth with zero decoding keys. 
All JWT gated requests would be rejected."); + } + Ok(Self::new(decoding_keys)) } - pub fn decode(&self, token: &str) -> Result> { - Ok(decode(token, &self.decoding_key, &self.validation)?) + /// Attempt to decode the token with the internal decoding keys. + /// + /// The function tries the stored decoding keys in succession, + /// and returns the first yielding a successful result. + /// If there is no working decoding key, it returns the last error. + pub fn decode(&self, token: &str) -> std::result::Result, AuthError> { + let mut res = None; + for decoding_key in &self.decoding_keys { + res = Some(decode(token, decoding_key, &self.validation)); + if let Some(Ok(res)) = res { + return Ok(res); + } + } + if let Some(res) = res { + res.map_err(|e| AuthError(Cow::Owned(e.to_string()))) + } else { + Err(AuthError(Cow::Borrowed("no JWT decoding keys configured"))) + } } } @@ -108,9 +184,9 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH "#; #[test] - fn test_decode() -> Result<(), anyhow::Error> { + fn test_decode() { let expected_claims = Claims { - tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081")?), + tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081").unwrap()), scope: Scope::Tenant, }; @@ -129,28 +205,24 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH let encoded_eddsa = "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJleHAiOjE3MDkyMDA4NzksImlhdCI6MTY3ODQ0MjQ3OX0.U3eA8j-uU-JnhzeO3EDHRuXLwkAUFCPxtGHEgw6p7Ccc3YRbFs2tmCdbD9PZEXP-XsxSeBQi1FY0YPcT3NXADw"; // Check it can be validated with the public key - let auth = JwtAuth::new(DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519)?); - let claims_from_token = auth.decode(encoded_eddsa)?.claims; + let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap()]); + let claims_from_token = auth.decode(encoded_eddsa).unwrap().claims; assert_eq!(claims_from_token, expected_claims); - - Ok(()) } #[test] - fn test_encode() -> Result<(), anyhow::Error> { + fn test_encode() { let claims = Claims { - tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081")?), + tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081").unwrap()), scope: Scope::Tenant, }; - let encoded = encode_from_key_file(&claims, TEST_PRIV_KEY_ED25519)?; + let encoded = encode_from_key_file(&claims, TEST_PRIV_KEY_ED25519).unwrap(); // decode it back - let auth = JwtAuth::new(DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519)?); - let decoded = auth.decode(&encoded)?; + let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap()]); + let decoded = auth.decode(&encoded).unwrap(); assert_eq!(decoded.claims, claims); - - Ok(()) } } diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index a9ee2425a4..550ab10700 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -1,4 +1,4 @@ -use crate::auth::{Claims, JwtAuth}; +use crate::auth::{AuthError, Claims, SwappableJwtAuth}; use crate::http::error::{api_error_handler, route_error_handler, ApiError}; use anyhow::Context; use hyper::header::{HeaderName, AUTHORIZATION}; @@ -389,7 +389,7 @@ fn parse_token(header_value: &str) -> Result<&str, ApiError> { } pub fn auth_middleware( - provide_auth: fn(&Request) -> Option<&JwtAuth>, + provide_auth: fn(&Request) -> Option<&SwappableJwtAuth>, ) -> Middleware { Middleware::pre(move |req| async move 
{ if let Some(auth) = provide_auth(&req) { @@ -400,9 +400,11 @@ pub fn auth_middleware( })?; let token = parse_token(header_value)?; - let data = auth - .decode(token) - .map_err(|_| ApiError::Unauthorized("malformed jwt token".to_string()))?; + let data = auth.decode(token).map_err(|err| { + warn!("Authentication error: {err}"); + // Rely on From for ApiError impl + err + })?; req.set_context(data.claims); } None => { @@ -450,12 +452,11 @@ where pub fn check_permission_with( req: &Request, - check_permission: impl Fn(&Claims) -> Result<(), anyhow::Error>, + check_permission: impl Fn(&Claims) -> Result<(), AuthError>, ) -> Result<(), ApiError> { match req.context::() { - Some(claims) => { - Ok(check_permission(&claims).map_err(|err| ApiError::Forbidden(err.to_string()))?) - } + Some(claims) => Ok(check_permission(&claims) + .map_err(|_err| ApiError::Forbidden("JWT authentication error".to_string()))?), None => Ok(()), // claims is None because auth is disabled } } diff --git a/libs/utils/src/http/error.rs b/libs/utils/src/http/error.rs index 7233d3a662..ac68b04888 100644 --- a/libs/utils/src/http/error.rs +++ b/libs/utils/src/http/error.rs @@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize}; use std::borrow::Cow; use std::error::Error as StdError; use thiserror::Error; -use tracing::{error, info}; +use tracing::{error, info, warn}; #[derive(Debug, Error)] pub enum ApiError { @@ -118,6 +118,9 @@ pub fn api_error_handler(api_error: ApiError) -> Response { // Print a stack trace for Internal Server errors match api_error { + ApiError::Forbidden(_) | ApiError::Unauthorized(_) => { + warn!("Error processing HTTP request: {api_error:#}") + } ApiError::ResourceUnavailable(_) => info!("Error processing HTTP request: {api_error:#}"), ApiError::NotFound(_) => info!("Error processing HTTP request: {api_error:#}"), ApiError::InternalServerError(_) => error!("Error processing HTTP request: {api_error:?}"), diff --git a/libs/utils/src/id.rs b/libs/utils/src/id.rs index eef6a358e6..57dcc27719 100644 --- a/libs/utils/src/id.rs +++ b/libs/utils/src/id.rs @@ -120,6 +120,8 @@ impl Id { chunk[0] = HEX[((b >> 4) & 0xf) as usize]; chunk[1] = HEX[(b & 0xf) as usize]; } + + // SAFETY: vec constructed out of `HEX`, it can only be ascii unsafe { String::from_utf8_unchecked(buf) } } } diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 6ec17d2fc6..6ff2997636 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -1,5 +1,6 @@ //! `utils` is intended to be a place to put code that is shared //! between other crates in this repository. +#![deny(clippy::undocumented_unsafe_blocks)] pub mod backoff; diff --git a/libs/utils/src/seqwait.rs b/libs/utils/src/seqwait.rs index 5bc7ca91d6..effc9c67b5 100644 --- a/libs/utils/src/seqwait.rs +++ b/libs/utils/src/seqwait.rs @@ -125,6 +125,9 @@ where // Wake everyone with an error. let mut internal = self.internal.lock().unwrap(); + // Block any future waiters from starting + internal.shutdown = true; + // This will steal the entire waiters map. // When we drop it all waiters will be woken. mem::take(&mut internal.waiters) diff --git a/libs/utils/src/shutdown.rs b/libs/utils/src/shutdown.rs index 7eba905997..cb5a44d664 100644 --- a/libs/utils/src/shutdown.rs +++ b/libs/utils/src/shutdown.rs @@ -1,6 +1,7 @@ /// Immediately terminate the calling process without calling /// atexit callbacks, C runtime destructors etc. We mainly use /// this to protect coverage data from concurrent writes. -pub fn exit_now(code: u8) { +pub fn exit_now(code: u8) -> ! 
{ + // SAFETY: exiting is safe, the ffi is not safe unsafe { nix::libc::_exit(code as _) }; } diff --git a/libs/utils/src/sync/gate.rs b/libs/utils/src/sync/gate.rs index 1391d238e6..9aad0af22d 100644 --- a/libs/utils/src/sync/gate.rs +++ b/libs/utils/src/sync/gate.rs @@ -85,6 +85,13 @@ impl Gate { warn_if_stuck(self.do_close(), &self.name, Duration::from_millis(1000)).await } + /// Check if [`Self::close()`] has finished waiting for all [`Self::enter()`] users to finish. This + /// is usually analoguous for "Did shutdown finish?" for types that include a Gate, whereas checking + /// the CancellationToken on such types is analogous to "Did shutdown start?" + pub fn close_complete(&self) -> bool { + self.sem.is_closed() + } + async fn do_close(&self) { tracing::debug!(gate = self.name, "Closing Gate..."); match self.sem.acquire_many(Self::MAX_UNITS).await { diff --git a/libs/vm_monitor/Cargo.toml b/libs/vm_monitor/Cargo.toml index 26b976830a..46e9f880a1 100644 --- a/libs/vm_monitor/Cargo.toml +++ b/libs/vm_monitor/Cargo.toml @@ -19,13 +19,12 @@ inotify.workspace = true serde.workspace = true serde_json.workspace = true sysinfo.workspace = true -tokio.workspace = true +tokio = { workspace = true, features = ["rt-multi-thread"] } tokio-postgres.workspace = true tokio-stream.workspace = true tokio-util.workspace = true tracing.workspace = true tracing-subscriber.workspace = true -workspace_hack = { version = "0.1", path = "../../workspace_hack" } [target.'cfg(target_os = "linux")'.dependencies] cgroups-rs = "0.3.3" diff --git a/libs/vm_monitor/src/lib.rs b/libs/vm_monitor/src/lib.rs index a844f78bd6..89ca91fdd7 100644 --- a/libs/vm_monitor/src/lib.rs +++ b/libs/vm_monitor/src/lib.rs @@ -1,3 +1,5 @@ +#![deny(unsafe_code)] +#![deny(clippy::undocumented_unsafe_blocks)] #![cfg(target_os = "linux")] use anyhow::Context; diff --git a/libs/walproposer/src/api_bindings.rs b/libs/walproposer/src/api_bindings.rs index b0ac2a879e..7f1bbc3b80 100644 --- a/libs/walproposer/src/api_bindings.rs +++ b/libs/walproposer/src/api_bindings.rs @@ -188,6 +188,7 @@ extern "C" fn recovery_download( } } +#[allow(clippy::unnecessary_cast)] extern "C" fn wal_read( sk: *mut Safekeeper, buf: *mut ::std::os::raw::c_char, @@ -421,6 +422,7 @@ impl std::fmt::Display for Level { } /// Take ownership of `Vec` from StringInfoData. 
+#[allow(clippy::unnecessary_cast)] pub(crate) fn take_vec_u8(pg: &mut StringInfoData) -> Option> { if pg.data.is_null() { return None; diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs index 4be15344c5..0661d3a969 100644 --- a/libs/walproposer/src/walproposer.rs +++ b/libs/walproposer/src/walproposer.rs @@ -186,7 +186,7 @@ impl Wrapper { .unwrap() .into_bytes_with_nul(); assert!(safekeepers_list_vec.len() == safekeepers_list_vec.capacity()); - let safekeepers_list = safekeepers_list_vec.as_mut_ptr() as *mut i8; + let safekeepers_list = safekeepers_list_vec.as_mut_ptr() as *mut std::ffi::c_char; let callback_data = Box::into_raw(Box::new(api)) as *mut ::std::os::raw::c_void; diff --git a/pageserver/src/auth.rs b/pageserver/src/auth.rs index 268117cae2..2cb661863d 100644 --- a/pageserver/src/auth.rs +++ b/pageserver/src/auth.rs @@ -1,22 +1,21 @@ -use anyhow::{bail, Result}; -use utils::auth::{Claims, Scope}; +use utils::auth::{AuthError, Claims, Scope}; use utils::id::TenantId; -pub fn check_permission(claims: &Claims, tenant_id: Option) -> Result<()> { +pub fn check_permission(claims: &Claims, tenant_id: Option) -> Result<(), AuthError> { match (&claims.scope, tenant_id) { - (Scope::Tenant, None) => { - bail!("Attempt to access management api with tenant scope. Permission denied") - } + (Scope::Tenant, None) => Err(AuthError( + "Attempt to access management api with tenant scope. Permission denied".into(), + )), (Scope::Tenant, Some(tenant_id)) => { if claims.tenant_id.unwrap() != tenant_id { - bail!("Tenant id mismatch. Permission denied") + return Err(AuthError("Tenant id mismatch. Permission denied".into())); } Ok(()) } (Scope::PageServerApi, None) => Ok(()), // access to management api for PageServerApi scope (Scope::PageServerApi, Some(_)) => Ok(()), // access to tenant api using PageServerApi scope - (Scope::SafekeeperData, _) => { - bail!("SafekeeperData scope makes no sense for Pageserver") - } + (Scope::SafekeeperData, _) => Err(AuthError( + "SafekeeperData scope makes no sense for Pageserver".into(), + )), } } diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index df2db25ec6..5b0c140d00 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -34,8 +34,11 @@ use postgres_backend::AuthType; use utils::logging::TracingErrorLayerEnablement; use utils::signals::ShutdownSignals; use utils::{ - auth::JwtAuth, logging, project_build_tag, project_git_version, sentry_init::init_sentry, - signals::Signal, tcp_listener, + auth::{JwtAuth, SwappableJwtAuth}, + logging, project_build_tag, project_git_version, + sentry_init::init_sentry, + signals::Signal, + tcp_listener, }; project_git_version!(GIT_VERSION); @@ -321,13 +324,12 @@ fn start_pageserver( let http_auth; let pg_auth; if conf.http_auth_type == AuthType::NeonJWT || conf.pg_auth_type == AuthType::NeonJWT { - // unwrap is ok because check is performed when creating config, so path is set and file exists + // unwrap is ok because check is performed when creating config, so path is set and exists let key_path = conf.auth_validation_public_key_path.as_ref().unwrap(); - info!( - "Loading public key for verifying JWT tokens from {:#?}", - key_path - ); - let auth: Arc = Arc::new(JwtAuth::from_key_path(key_path)?); + info!("Loading public key(s) for verifying JWT tokens from {key_path:?}"); + + let jwt_auth = JwtAuth::from_key_path(key_path)?; + let auth: Arc = Arc::new(SwappableJwtAuth::new(jwt_auth)); http_auth = match &conf.http_auth_type { 
AuthType::Trust => None, @@ -410,7 +412,7 @@ fn start_pageserver( // Scan the local 'tenants/' directory and start loading the tenants let deletion_queue_client = deletion_queue.new_client(); - BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr( + let tenant_manager = BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr( conf, TenantSharedResources { broker_client: broker_client.clone(), @@ -420,6 +422,7 @@ fn start_pageserver( order, shutdown_pageserver.clone(), ))?; + let tenant_manager = Arc::new(tenant_manager); BACKGROUND_RUNTIME.spawn({ let init_done_rx = init_done_rx; @@ -548,6 +551,7 @@ fn start_pageserver( let router_state = Arc::new( http::routes::State::new( conf, + tenant_manager, http_auth.clone(), remote_storage.clone(), broker_client.clone(), diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index a58c08e29b..87d9cc522e 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -161,7 +161,7 @@ pub struct PageServerConf { pub http_auth_type: AuthType, /// authentication method for libpq connections from compute pub pg_auth_type: AuthType, - /// Path to a file containing public key for verifying JWT tokens. + /// Path to a file or directory containing public key(s) for verifying JWT tokens. /// Used for both mgmt and compute auth, if enabled. pub auth_validation_public_key_path: Option, @@ -1314,12 +1314,6 @@ broker_endpoint = '{broker_endpoint}' assert_eq!( parsed_remote_storage_config, RemoteStorageConfig { - max_concurrent_syncs: NonZeroUsize::new( - remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS - ) - .unwrap(), - max_sync_errors: NonZeroU32::new(remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS) - .unwrap(), storage: RemoteStorageKind::LocalFs(local_storage_path.clone()), }, "Remote storage config should correctly parse the local FS config and fill other storage defaults" @@ -1380,8 +1374,6 @@ broker_endpoint = '{broker_endpoint}' assert_eq!( parsed_remote_storage_config, RemoteStorageConfig { - max_concurrent_syncs, - max_sync_errors, storage: RemoteStorageKind::AwsS3(S3Config { bucket_name: bucket_name.clone(), bucket_region: bucket_region.clone(), diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index b6f889c682..6a732d1029 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -893,14 +893,6 @@ mod test { std::fs::create_dir_all(remote_fs_dir)?; let remote_fs_dir = harness.conf.workdir.join("remote_fs").canonicalize_utf8()?; let storage_config = RemoteStorageConfig { - max_concurrent_syncs: std::num::NonZeroUsize::new( - remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS, - ) - .unwrap(), - max_sync_errors: std::num::NonZeroU32::new( - remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS, - ) - .unwrap(), storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()), }; let storage = GenericRemoteStorage::from_config(&storage_config).unwrap(); diff --git a/pageserver/src/deletion_queue/deleter.rs b/pageserver/src/deletion_queue/deleter.rs index 81cb016d49..57421b1547 100644 --- a/pageserver/src/deletion_queue/deleter.rs +++ b/pageserver/src/deletion_queue/deleter.rs @@ -55,21 +55,24 @@ impl Deleter { /// Wrap the remote `delete_objects` with a failpoint async fn remote_delete(&self) -> Result<(), anyhow::Error> { - fail::fail_point!("deletion-queue-before-execute", |_| { - info!("Skipping execution, failpoint set"); - metrics::DELETION_QUEUE - .remote_errors - .with_label_values(&["failpoint"]) - .inc(); - Err(anyhow::anyhow!("failpoint hit")) - }); - // A 
backoff::retry is used here for two reasons: // - To provide a backoff rather than busy-polling the API on errors // - To absorb transient 429/503 conditions without hitting our error // logging path for issues deleting objects. backoff::retry( - || async { self.remote_storage.delete_objects(&self.accumulator).await }, + || async { + fail::fail_point!("deletion-queue-before-execute", |_| { + info!("Skipping execution, failpoint set"); + + metrics::DELETION_QUEUE + .remote_errors + .with_label_values(&["failpoint"]) + .inc(); + Err(anyhow::anyhow!("failpoint: deletion-queue-before-execute")) + }); + + self.remote_storage.delete_objects(&self.accumulator).await + }, |_| false, 3, 10, diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index d31e4a1554..4d455243f0 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -52,6 +52,31 @@ paths: schema: type: object + /v1/reload_auth_validation_keys: + post: + description: Reloads the JWT public keys from their pre-configured location on disk. + responses: + "200": + description: The reload completed successfully. + "401": + description: Unauthorized Error + content: + application/json: + schema: + $ref: "#/components/schemas/UnauthorizedError" + "403": + description: Forbidden Error + content: + application/json: + schema: + $ref: "#/components/schemas/ForbiddenError" + "500": + description: Generic operation error (also hits if no keys were found) + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + /v1/tenant/{tenant_id}: parameters: - name: tenant_id @@ -327,7 +352,8 @@ paths: in: query required: true schema: - type: integer + type: string + format: hex description: A LSN to get the timestamp responses: "200": diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 3114ab469b..dcd5d31e26 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -20,6 +20,7 @@ use remote_storage::GenericRemoteStorage; use tenant_size_model::{SizeResult, StorageModel}; use tokio_util::sync::CancellationToken; use tracing::*; +use utils::auth::JwtAuth; use utils::http::endpoint::request_span; use utils::http::json::json_request_or_empty_body; use utils::http::request::{get_request_param, must_get_query_param, parse_query_param}; @@ -35,8 +36,8 @@ use crate::pgdatadir_mapping::LsnForTimestamp; use crate::task_mgr::TaskKind; use crate::tenant::config::{LocationConf, TenantConfOpt}; use crate::tenant::mgr::{ - GetTenantError, SetNewTenantConfigError, TenantMapError, TenantMapInsertError, TenantSlotError, - TenantSlotUpsertError, TenantStateError, + GetTenantError, SetNewTenantConfigError, TenantManager, TenantMapError, TenantMapInsertError, + TenantSlotError, TenantSlotUpsertError, TenantStateError, }; use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::LayerAccessStatsReset; @@ -45,7 +46,7 @@ use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, TenantSha use crate::{config::PageServerConf, tenant::mgr}; use crate::{disk_usage_eviction_task, tenant}; use utils::{ - auth::JwtAuth, + auth::SwappableJwtAuth, generation::Generation, http::{ endpoint::{self, attach_openapi_ui, auth_middleware, check_permission_with}, @@ -63,7 +64,8 @@ use super::models::ConfigureFailpointsRequest; pub struct State { conf: &'static PageServerConf, - auth: Option>, + tenant_manager: Arc, + auth: Option>, allowlist_routes: Vec, remote_storage: Option, broker_client: storage_broker::BrokerClientChannel, 
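The handler behind this new route (added in the routes.rs hunk that follows) boils down to re-reading the configured key file or key directory and swapping the result into the shared `SwappableJwtAuth`. A minimal sketch of that pattern, using the `utils::auth` types introduced earlier (the helper name is illustrative, not part of this change):

```rust
use std::sync::Arc;

use camino::Utf8Path;
use utils::auth::{JwtAuth, SwappableJwtAuth};

/// Re-read one PEM file or a directory of PEM files and atomically replace the
/// decoding keys. Callers that already loaded the previous `JwtAuth` keep using
/// it for their in-flight decode; new requests see the new keys (ArcSwap semantics).
fn reload_jwt_keys(auth: &Arc<SwappableJwtAuth>, key_path: &Utf8Path) -> anyhow::Result<()> {
    let new_auth = JwtAuth::from_key_path(key_path)?;
    auth.swap(new_auth);
    Ok(())
}
```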
@@ -74,7 +76,8 @@ pub struct State { impl State { pub fn new( conf: &'static PageServerConf, - auth: Option>, + tenant_manager: Arc, + auth: Option>, remote_storage: Option, broker_client: storage_broker::BrokerClientChannel, disk_usage_eviction_state: Arc, @@ -86,6 +89,7 @@ impl State { .collect::>(); Ok(Self { conf, + tenant_manager, auth, allowlist_routes, remote_storage, @@ -299,11 +303,7 @@ async fn build_timeline_info( // we're executing this function, we will outlive the timeline on-disk state. info.current_logical_size_non_incremental = Some( timeline - .get_current_logical_size_non_incremental( - info.last_record_lsn, - CancellationToken::new(), - ctx, - ) + .get_current_logical_size_non_incremental(info.last_record_lsn, ctx) .await?, ); } @@ -389,6 +389,32 @@ async fn status_handler( json_response(StatusCode::OK, StatusResponse { id: config.id }) } +async fn reload_auth_validation_keys_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + check_permission(&request, None)?; + let config = get_config(&request); + let state = get_state(&request); + let Some(shared_auth) = &state.auth else { + return json_response(StatusCode::BAD_REQUEST, ()); + }; + // unwrap is ok because check is performed when creating config, so path is set and exists + let key_path = config.auth_validation_public_key_path.as_ref().unwrap(); + info!("Reloading public key(s) for verifying JWT tokens from {key_path:?}"); + + match JwtAuth::from_key_path(key_path) { + Ok(new_auth) => { + shared_auth.swap(new_auth); + json_response(StatusCode::OK, ()) + } + Err(e) => { + warn!("Error reloading public keys from {key_path:?}: {e:}"); + json_response(StatusCode::INTERNAL_SERVER_ERROR, ()) + } + } +} + async fn timeline_create_handler( mut request: Request, _cancel: CancellationToken, @@ -1140,20 +1166,14 @@ async fn put_tenant_location_config_handler( let location_conf = LocationConf::try_from(&request_data.config).map_err(ApiError::BadRequest)?; - mgr::upsert_location( - state.conf, - tenant_id, - location_conf, - state.broker_client.clone(), - state.remote_storage.clone(), - state.deletion_queue_client.clone(), - &ctx, - ) - .await - // TODO: badrequest assumes the caller was asking for something unreasonable, but in - // principle we might have hit something like concurrent API calls to the same tenant, - // which is not a 400 but a 409. - .map_err(ApiError::BadRequest)?; + state + .tenant_manager + .upsert_location(tenant_id, location_conf, &ctx) + .await + // TODO: badrequest assumes the caller was asking for something unreasonable, but in + // principle we might have hit something like concurrent API calls to the same tenant, + // which is not a 400 but a 409. + .map_err(ApiError::BadRequest)?; json_response(StatusCode::OK, ()) } @@ -1458,7 +1478,7 @@ async fn timeline_collect_keyspace( let keys = timeline .collect_keyspace(at_lsn, &ctx) .await - .map_err(ApiError::InternalServerError)?; + .map_err(|e| ApiError::InternalServerError(e.into()))?; json_response(StatusCode::OK, Partitioning { keys, at_lsn }) } @@ -1647,6 +1667,8 @@ where ); match handle.await { + // TODO: never actually return Err from here, always Ok(...) so that we can log + // spanned errors. Call api_error_handler instead and return appropriate Body. Ok(result) => result, Err(e) => { // The handler task panicked. 
We have a global panic handler that logs the @@ -1695,7 +1717,7 @@ where pub fn make_router( state: Arc, launch_ts: &'static LaunchTimestamp, - auth: Option>, + auth: Option>, ) -> anyhow::Result> { let spec = include_bytes!("openapi_spec.yml"); let mut router = attach_openapi_ui(endpoint::make_router(), spec, "/swagger.yml", "/v1/doc"); @@ -1724,6 +1746,9 @@ pub fn make_router( .put("/v1/failpoints", |r| { testing_api_handler("manage failpoints", r, failpoints_handler) }) + .post("/v1/reload_auth_validation_keys", |r| { + api_handler(r, reload_auth_validation_keys_handler) + }) .get("/v1/tenant", |r| api_handler(r, tenant_list_handler)) .post("/v1/tenant", |r| api_handler(r, tenant_create_handler)) .get("/v1/tenant/:tenant_id", |r| api_handler(r, tenant_status)) diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 04c3ae9746..3f74694ef2 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -1,3 +1,5 @@ +#![deny(clippy::undocumented_unsafe_blocks)] + mod auth; pub mod basebackup; pub mod config; diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 429ab801d9..4b52f07326 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1225,15 +1225,6 @@ pub(crate) static WAL_REDO_TIME: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub(crate) static WAL_REDO_WAIT_TIME: Lazy = Lazy::new(|| { - register_histogram!( - "pageserver_wal_redo_wait_seconds", - "Time spent waiting for access to the Postgres WAL redo process", - redo_histogram_time_buckets!(), - ) - .expect("failed to define a metric") -}); - pub(crate) static WAL_REDO_RECORDS_HISTOGRAM: Lazy = Lazy::new(|| { register_histogram!( "pageserver_wal_redo_records_histogram", @@ -1261,6 +1252,46 @@ pub(crate) static WAL_REDO_RECORD_COUNTER: Lazy = Lazy::new(|| { .unwrap() }); +pub(crate) struct WalRedoProcessCounters { + pub(crate) started: IntCounter, + pub(crate) killed_by_cause: enum_map::EnumMap, +} + +#[derive(Debug, enum_map::Enum, strum_macros::IntoStaticStr)] +pub(crate) enum WalRedoKillCause { + WalRedoProcessDrop, + NoLeakChildDrop, + Startup, +} + +impl Default for WalRedoProcessCounters { + fn default() -> Self { + let started = register_int_counter!( + "pageserver_wal_redo_process_started_total", + "Number of WAL redo processes started", + ) + .unwrap(); + + let killed = register_int_counter_vec!( + "pageserver_wal_redo_process_stopped_total", + "Number of WAL redo processes stopped", + &["cause"], + ) + .unwrap(); + Self { + started, + killed_by_cause: EnumMap::from_array(std::array::from_fn(|i| { + let cause = ::from_usize(i); + let cause_str: &'static str = cause.into(); + killed.with_label_values(&[cause_str]) + })), + } + } +} + +pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy = + Lazy::new(WalRedoProcessCounters::default); + /// Similar to `prometheus::HistogramTimer` but does not record on drop. 
pub struct StorageTimeMetricsTimer { metrics: StorageTimeMetrics, @@ -1928,7 +1959,6 @@ pub fn preinitialize_metrics() { &READ_NUM_FS_LAYERS, &WAIT_LSN_TIME, &WAL_REDO_TIME, - &WAL_REDO_WAIT_TIME, &WAL_REDO_RECORDS_HISTOGRAM, &WAL_REDO_BYTES_HISTOGRAM, ] diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 6086d0b063..ee5f1732e4 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -14,6 +14,7 @@ use async_compression::tokio::write::GzipEncoder; use bytes::Buf; use bytes::Bytes; use futures::Stream; +use pageserver_api::models::TenantState; use pageserver_api::models::{ PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse, PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse, @@ -39,7 +40,7 @@ use tracing::field; use tracing::*; use utils::id::ConnectionId; use utils::{ - auth::{Claims, JwtAuth, Scope}, + auth::{Claims, Scope, SwappableJwtAuth}, id::{TenantId, TimelineId}, lsn::Lsn, simple_rcu::RcuReadGuard, @@ -121,7 +122,7 @@ async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<() pub async fn libpq_listener_main( conf: &'static PageServerConf, broker_client: storage_broker::BrokerClientChannel, - auth: Option>, + auth: Option>, listener: TcpListener, auth_type: AuthType, listener_ctx: RequestContext, @@ -189,7 +190,7 @@ pub async fn libpq_listener_main( async fn page_service_conn_main( conf: &'static PageServerConf, broker_client: storage_broker::BrokerClientChannel, - auth: Option>, + auth: Option>, socket: tokio::net::TcpStream, auth_type: AuthType, connection_ctx: RequestContext, @@ -217,9 +218,27 @@ async fn page_service_conn_main( // no write timeout is used, because the kernel is assumed to error writes after some time. let mut socket = tokio_io_timeout::TimeoutReader::new(socket); - // timeout should be lower, but trying out multiple days for - // - socket.set_timeout(Some(std::time::Duration::from_secs(60 * 60 * 24 * 3))); + let default_timeout_ms = 10 * 60 * 1000; // 10 minutes by default + let socket_timeout_ms = (|| { + fail::fail_point!("simulated-bad-compute-connection", |avg_timeout_ms| { + // Exponential distribution for simulating + // poor network conditions, expect about avg_timeout_ms to be around 15 + // in tests + if let Some(avg_timeout_ms) = avg_timeout_ms { + let avg = avg_timeout_ms.parse::().unwrap() as f32; + let u = rand::random::(); + ((1.0 - u).ln() / (-avg)) as u64 + } else { + default_timeout_ms + } + }); + default_timeout_ms + })(); + + // A timeout here does not mean the client died, it can happen if it's just idle for + // a while: we will tear down this PageServerHandler and instantiate a new one if/when + // they reconnect. 
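A note on the sampling expression in the failpoint above: the usual inverse-transform draw for an exponential distribution with mean m is -m·ln(1-u). Dividing by `avg` instead, as the hunk does, gives a mean of 1/avg milliseconds, which is almost always 0 once cast to `u64` for `avg` around 15. If the intent is an average timeout of `avg_timeout_ms` milliseconds, the standard form looks like this (hypothetical helper, using the `rand` crate already in scope here):

```rust
/// Draw an exponentially distributed timeout with mean `mean_ms` milliseconds
/// via inverse-transform sampling.
fn sample_timeout_ms(mean_ms: f32) -> u64 {
    let u = rand::random::<f32>(); // uniform in [0, 1)
    (-mean_ms * (1.0 - u).ln()) as u64 // -m * ln(1 - U) has mean m
}
```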
+ socket.set_timeout(Some(std::time::Duration::from_millis(socket_timeout_ms))); let socket = std::pin::pin!(socket); // XXX: pgbackend.run() should take the connection_ctx, @@ -252,7 +271,7 @@ async fn page_service_conn_main( struct PageServerHandler { _conf: &'static PageServerConf, broker_client: storage_broker::BrokerClientChannel, - auth: Option>, + auth: Option>, claims: Option, /// The context created for the lifetime of the connection @@ -266,7 +285,7 @@ impl PageServerHandler { pub fn new( conf: &'static PageServerConf, broker_client: storage_broker::BrokerClientChannel, - auth: Option>, + auth: Option>, connection_ctx: RequestContext, ) -> Self { PageServerHandler { @@ -493,7 +512,11 @@ impl PageServerHandler { }; if let Err(e) = &response { - if timeline.cancel.is_cancelled() { + // Requests may fail as soon as we are Stopping, even if the Timeline's cancellation token wasn't fired yet, + // because wait_lsn etc will drop out + // is_stopping(): [`Timeline::flush_and_shutdown`] has entered + // is_canceled(): [`Timeline::shutdown`]` has entered + if timeline.cancel.is_cancelled() || timeline.is_stopping() { // If we fail to fulfil a request during shutdown, which may be _because_ of // shutdown, then do not send the error to the client. Instead just drop the // connection. @@ -897,7 +920,7 @@ impl PageServerHandler { // when accessing management api supply None as an argument // when using to authorize tenant pass corresponding tenant id - fn check_permission(&self, tenant_id: Option) -> anyhow::Result<()> { + fn check_permission(&self, tenant_id: Option) -> Result<(), QueryError> { if self.auth.is_none() { // auth is set to Trust, nothing to check so just return ok return Ok(()); @@ -909,7 +932,7 @@ impl PageServerHandler { .claims .as_ref() .expect("claims presence already checked"); - check_permission(claims, tenant_id) + check_permission(claims, tenant_id).map_err(|e| QueryError::Unauthorized(e.0)) } /// Shorthand for getting a reference to a Timeline of an Active tenant. @@ -948,16 +971,17 @@ where .auth .as_ref() .unwrap() - .decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)?; + .decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?) + .map_err(|e| QueryError::Unauthorized(e.0))?; if matches!(data.claims.scope, Scope::Tenant) && data.claims.tenant_id.is_none() { - return Err(QueryError::Other(anyhow::anyhow!( - "jwt token scope is Tenant, but tenant id is missing" - ))); + return Err(QueryError::Unauthorized( + "jwt token scope is Tenant, but tenant id is missing".into(), + )); } - info!( - "jwt auth succeeded for scope: {:#?} by tenant id: {:?}", + debug!( + "jwt scope check succeeded for scope: {:#?} by tenant id: {:?}", data.claims.scope, data.claims.tenant_id, ); @@ -979,9 +1003,13 @@ where pgb: &mut PostgresBackend, query_string: &str, ) -> Result<(), QueryError> { + fail::fail_point!("simulated-bad-compute-connection", |_| { + info!("Hit failpoint for bad connection"); + Err(QueryError::SimulatedConnectionError) + }); + let ctx = self.connection_ctx.attached_child(); debug!("process query {query_string:?}"); - if query_string.starts_with("pagestream ") { let (_, params_raw) = query_string.split_at("pagestream ".len()); let params = params_raw.split(' ').collect::>(); @@ -1330,6 +1358,9 @@ impl From for QueryError { GetActiveTenantError::WaitForActiveTimeout { .. 
} => QueryError::Disconnected( ConnectionError::Io(io::Error::new(io::ErrorKind::TimedOut, e.to_string())), ), + GetActiveTenantError::WillNotBecomeActive(TenantState::Stopping { .. }) => { + QueryError::Shutdown + } e => QueryError::Other(anyhow::anyhow!(e)), } } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 938672a908..a38be87257 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -21,8 +21,8 @@ use serde::{Deserialize, Serialize}; use std::collections::{hash_map, HashMap, HashSet}; use std::ops::ControlFlow; use std::ops::Range; -use tokio_util::sync::CancellationToken; use tracing::{debug, trace, warn}; +use utils::bin_ser::DeserializeError; use utils::exp_counter::ExpCounter; use utils::{bin_ser::BeSer, lsn::Lsn}; @@ -34,18 +34,30 @@ pub enum LsnForTimestamp { /// Found commits both before and after the given timestamp Present(Lsn), + /// Found no commits after the given timestamp, this means + /// that the newest data in the branch is older than the given + /// timestamp. + /// /// All commits <= LSN happened before the given timestamp Future(Lsn), + /// The queried timestamp is past our horizon we look back at (PITR) + /// /// All commits > LSN happened after the given timestamp, - /// but any commits older than the returned LSN - /// might have happened before or after the given timestamp. - /// We don't know because no data before the given lsn is available + /// but any commits < LSN might have happened before or after + /// the given timestamp. We don't know because no data before + /// the given lsn is available. Past(Lsn), /// We have found no commit with a timestamp, /// so we can't return anything meaningful. - /// The associated LSN is a lower bound value. + /// + /// The associated LSN is the lower bound value we can safely + /// create branches on, but no statement is made if it is + /// older or newer than the timestamp. + /// + /// This variant can e.g. be returned right after a + /// cluster import. NoData(Lsn), } @@ -57,6 +69,25 @@ pub enum CalculateLogicalSizeError { Other(#[from] anyhow::Error), } +#[derive(Debug, thiserror::Error)] +pub(crate) enum CollectKeySpaceError { + #[error(transparent)] + Decode(#[from] DeserializeError), + #[error(transparent)] + PageRead(PageReconstructError), + #[error("cancelled")] + Cancelled, +} + +impl From for CollectKeySpaceError { + fn from(err: PageReconstructError) -> Self { + match err { + PageReconstructError::Cancelled => Self::Cancelled, + err => Self::PageRead(err), + } + } +} + impl From for CalculateLogicalSizeError { fn from(pre: PageReconstructError) -> Self { match pre { @@ -407,6 +438,11 @@ impl Timeline { } // If `found_smaller == true`, `low` is the LSN of the last commit record // before or at `search_timestamp` + // + // FIXME: it would be better to get the LSN of the previous commit. + // Otherwise, if you restore to the returned LSN, the database will + // include physical changes from later commits that will be marked + // as aborted, and will need to be vacuumed away. let commit_lsn = Lsn(low * 8); match (found_smaller, found_larger) { (false, false) => { @@ -419,7 +455,7 @@ impl Timeline { Ok(LsnForTimestamp::Past(min_lsn)) } (true, false) => { - // Only found a commit with timestamp smaller than the request. + // Only found commits with timestamps smaller than the request. // It's still a valid case for branch creation, return it. // And `update_gc_info()` ignores LSN for a `LsnForTimestamp::Future` // case, anyway. 
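To make the four `LsnForTimestamp` variants documented above concrete, here is one plausible way a caller could map them when choosing a branch point (illustrative only; the function name and error text are not part of this change):

```rust
use utils::lsn::Lsn;

use crate::pgdatadir_mapping::LsnForTimestamp;

fn pick_branch_point(result: LsnForTimestamp) -> Result<Lsn, &'static str> {
    match result {
        // Commits exist on both sides of the requested timestamp.
        LsnForTimestamp::Present(lsn) => Ok(lsn),
        // Everything on the branch is older than the timestamp: branching at
        // this LSN simply gives you all the data there is.
        LsnForTimestamp::Future(lsn) => Ok(lsn),
        // The timestamp predates the PITR horizon; data before this LSN is no
        // longer available, so refuse rather than guess.
        LsnForTimestamp::Past(_lsn) => Err("timestamp is older than the PITR horizon"),
        // No commit timestamps at all (e.g. right after an import): the LSN is
        // only a safe lower bound for branch creation.
        LsnForTimestamp::NoData(lsn) => Ok(lsn),
    }
}
```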
@@ -599,7 +635,6 @@ impl Timeline { pub async fn get_current_logical_size_non_incremental( &self, lsn: Lsn, - cancel: CancellationToken, ctx: &RequestContext, ) -> Result { crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id(); @@ -610,12 +645,8 @@ impl Timeline { let mut total_size: u64 = 0; for (spcnode, dbnode) in dbdir.dbdirs.keys() { - for rel in self - .list_rels(*spcnode, *dbnode, lsn, ctx) - .await - .context("list rels")? - { - if cancel.is_cancelled() { + for rel in self.list_rels(*spcnode, *dbnode, lsn, ctx).await? { + if self.cancel.is_cancelled() { return Err(CalculateLogicalSizeError::Cancelled); } let relsize_key = rel_size_to_key(rel); @@ -632,11 +663,11 @@ impl Timeline { /// Get a KeySpace that covers all the Keys that are in use at the given LSN. /// Anything that's not listed maybe removed from the underlying storage (from /// that LSN forwards). - pub async fn collect_keyspace( + pub(crate) async fn collect_keyspace( &self, lsn: Lsn, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> Result { // Iterate through key ranges, greedily packing them into partitions let mut result = KeySpaceAccum::new(); @@ -645,7 +676,7 @@ impl Timeline { // Fetch list of database dirs and iterate them let buf = self.get(DBDIR_KEY, lsn, ctx).await?; - let dbdir = DbDirectory::des(&buf).context("deserialization failure")?; + let dbdir = DbDirectory::des(&buf)?; let mut dbs: Vec<(Oid, Oid)> = dbdir.dbdirs.keys().cloned().collect(); dbs.sort_unstable(); @@ -678,7 +709,7 @@ impl Timeline { let slrudir_key = slru_dir_to_key(kind); result.add_key(slrudir_key); let buf = self.get(slrudir_key, lsn, ctx).await?; - let dir = SlruSegmentDirectory::des(&buf).context("deserialization failure")?; + let dir = SlruSegmentDirectory::des(&buf)?; let mut segments: Vec = dir.segments.iter().cloned().collect(); segments.sort_unstable(); for segno in segments { @@ -696,7 +727,7 @@ impl Timeline { // Then pg_twophase result.add_key(TWOPHASEDIR_KEY); let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?; - let twophase_dir = TwoPhaseDirectory::des(&buf).context("deserialization failure")?; + let twophase_dir = TwoPhaseDirectory::des(&buf)?; let mut xids: Vec = twophase_dir.xids.iter().cloned().collect(); xids.sort_unstable(); for xid in xids { diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index a738633d5e..758f8b15a1 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1841,7 +1841,13 @@ impl Tenant { timelines.values().for_each(|timeline| { let timeline = Arc::clone(timeline); let span = Span::current(); - js.spawn(async move { timeline.shutdown(freeze_and_flush).instrument(span).await }); + js.spawn(async move { + if freeze_and_flush { + timeline.flush_and_shutdown().instrument(span).await + } else { + timeline.shutdown().instrument(span).await + } + }); }) }; tracing::info!("Waiting for timelines..."); @@ -3528,10 +3534,6 @@ pub(crate) mod harness { let remote_fs_dir = conf.workdir.join("localfs"); std::fs::create_dir_all(&remote_fs_dir).unwrap(); let config = RemoteStorageConfig { - // TODO: why not remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS, - max_concurrent_syncs: std::num::NonZeroUsize::new(2_000_000).unwrap(), - // TODO: why not remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS, - max_sync_errors: std::num::NonZeroU32::new(3_000_000).unwrap(), storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()), }; let remote_storage = GenericRemoteStorage::from_config(&config).unwrap(); @@ -4731,7 +4733,7 @@ mod tests { // Keeps uninit 
mark in place let raw_tline = tline.raw_timeline().unwrap(); raw_tline - .shutdown(false) + .shutdown() .instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_id)) .await; std::mem::forget(tline); diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index bedf09a40c..6de2e95055 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -327,7 +327,7 @@ mod tests { let mut sz: u16 = rng.gen(); // Make 50% of the arrays small if rng.gen() { - sz |= 63; + sz &= 63; } random_array(sz.into()) }) diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 576bcea2ce..c71eac0672 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -200,6 +200,22 @@ async fn unsafe_create_dir_all(path: &Utf8PathBuf) -> std::io::Result<()> { Ok(()) } +/// The TenantManager is responsible for storing and mutating the collection of all tenants +/// that this pageserver process has state for. Every Tenant and SecondaryTenant instance +/// lives inside the TenantManager. +/// +/// The most important role of the TenantManager is to prevent conflicts: e.g. trying to attach +/// the same tenant twice concurrently, or trying to configure the same tenant into secondary +/// and attached modes concurrently. +pub struct TenantManager { + conf: &'static PageServerConf, + // TODO: currently this is a &'static pointing to TENANTs. When we finish refactoring + // out of that static variable, the TenantManager can own this. + // See https://github.com/neondatabase/neon/issues/5796 + tenants: &'static std::sync::RwLock, + resources: TenantSharedResources, +} + fn emergency_generations( tenant_confs: &HashMap>, ) -> HashMap { @@ -366,7 +382,7 @@ pub async fn init_tenant_mgr( resources: TenantSharedResources, init_order: InitializationOrder, cancel: CancellationToken, -) -> anyhow::Result<()> { +) -> anyhow::Result { let mut tenants = HashMap::new(); let ctx = RequestContext::todo_child(TaskKind::Startup, DownloadBehavior::Warn); @@ -468,7 +484,12 @@ pub async fn init_tenant_mgr( assert!(matches!(&*tenants_map, &TenantsMap::Initializing)); METRICS.tenant_slots.set(tenants.len() as u64); *tenants_map = TenantsMap::Open(tenants); - Ok(()) + + Ok(TenantManager { + conf, + tenants: &TENANTS, + resources, + }) } /// Wrapper for Tenant::spawn that checks invariants before running, and inserts @@ -545,8 +566,10 @@ pub(crate) async fn shutdown_all_tenants() { async fn shutdown_all_tenants0(tenants: &std::sync::RwLock) { use utils::completion; - // Atomically, 1. extract the list of tenants to shut down and 2. prevent creation of new tenants. - let (in_progress_ops, tenants_to_shut_down) = { + let mut join_set = JoinSet::new(); + + // Atomically, 1. create the shutdown tasks and 2. prevent creation of new tenants. 
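Stripped of the tenant-specific details, the fan-out/fan-in shape that the block below moves to looks roughly like this (illustrative names; the real code also tracks InProgress slots and buffers its logging):

```rust
use tokio::task::JoinSet;

async fn shut_down_all(tenant_ids: Vec<String>) {
    let mut join_set = JoinSet::new();
    // Fan out: one shutdown task per tenant, spawned while the map lock is held.
    for tenant_id in tenant_ids {
        join_set.spawn(async move {
            // per-tenant shutdown work goes here
            tracing::debug!(%tenant_id, "tenant successfully stopped");
        });
    }
    // Fan in: drain the set, surfacing panics without aborting the rest.
    while let Some(joined) = join_set.join_next().await {
        if let Err(e) = joined {
            if e.is_panic() {
                tracing::warn!("shutdown task panicked: {}", e);
            }
        }
    }
}
```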
+ let (total_in_progress, total_attached) = { let mut m = tenants.write().unwrap(); match &mut *m { TenantsMap::Initializing => { @@ -556,78 +579,67 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock) { } TenantsMap::Open(tenants) => { let mut shutdown_state = HashMap::new(); - let mut in_progress_ops = Vec::new(); - let mut tenants_to_shut_down = Vec::new(); + let mut total_in_progress = 0; + let mut total_attached = 0; - for (k, v) in tenants.drain() { + for (tenant_id, v) in tenants.drain() { match v { TenantSlot::Attached(t) => { - tenants_to_shut_down.push(t.clone()); - shutdown_state.insert(k, TenantSlot::Attached(t)); + shutdown_state.insert(tenant_id, TenantSlot::Attached(t.clone())); + join_set.spawn( + async move { + let freeze_and_flush = true; + + let res = { + let (_guard, shutdown_progress) = completion::channel(); + t.shutdown(shutdown_progress, freeze_and_flush).await + }; + + if let Err(other_progress) = res { + // join the another shutdown in progress + other_progress.wait().await; + } + + // we cannot afford per tenant logging here, because if s3 is degraded, we are + // going to log too many lines + debug!("tenant successfully stopped"); + } + .instrument(info_span!("shutdown", %tenant_id)), + ); + + total_attached += 1; } TenantSlot::Secondary => { - shutdown_state.insert(k, TenantSlot::Secondary); + shutdown_state.insert(tenant_id, TenantSlot::Secondary); } TenantSlot::InProgress(notify) => { // InProgress tenants are not visible in TenantsMap::ShuttingDown: we will // wait for their notifications to fire in this function. - in_progress_ops.push(notify); + join_set.spawn(async move { + notify.wait().await; + }); + + total_in_progress += 1; } } } *m = TenantsMap::ShuttingDown(shutdown_state); - (in_progress_ops, tenants_to_shut_down) + (total_in_progress, total_attached) } TenantsMap::ShuttingDown(_) => { - // TODO: it is possible that detach and shutdown happen at the same time. as a - // result, during shutdown we do not wait for detach. 
error!("already shutting down, this function isn't supposed to be called more than once"); return; } } }; + let started_at = std::time::Instant::now(); + info!( "Waiting for {} InProgress tenants and {} Attached tenants to shut down", - in_progress_ops.len(), - tenants_to_shut_down.len() + total_in_progress, total_attached ); - for barrier in in_progress_ops { - barrier.wait().await; - } - - info!( - "InProgress tenants shut down, waiting for {} Attached tenants to shut down", - tenants_to_shut_down.len() - ); - let started_at = std::time::Instant::now(); - let mut join_set = JoinSet::new(); - for tenant in tenants_to_shut_down { - let tenant_id = tenant.get_tenant_id(); - join_set.spawn( - async move { - let freeze_and_flush = true; - - let res = { - let (_guard, shutdown_progress) = completion::channel(); - tenant.shutdown(shutdown_progress, freeze_and_flush).await - }; - - if let Err(other_progress) = res { - // join the another shutdown in progress - other_progress.wait().await; - } - - // we cannot afford per tenant logging here, because if s3 is degraded, we are - // going to log too many lines - - debug!("tenant successfully stopped"); - } - .instrument(info_span!("shutdown", %tenant_id)), - ); - } - let total = join_set.len(); let mut panicked = 0; let mut buffering = true; @@ -640,7 +652,7 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock) { match joined { Ok(()) => {} Err(join_error) if join_error.is_cancelled() => { - unreachable!("we are not cancelling any of the futures"); + unreachable!("we are not cancelling any of the tasks"); } Err(join_error) if join_error.is_panic() => { // cannot really do anything, as this panic is likely a bug @@ -742,139 +754,134 @@ pub(crate) async fn set_new_tenant_config( Ok(()) } -#[instrument(skip_all, fields(%tenant_id))] -pub(crate) async fn upsert_location( - conf: &'static PageServerConf, - tenant_id: TenantId, - new_location_config: LocationConf, - broker_client: storage_broker::BrokerClientChannel, - remote_storage: Option, - deletion_queue_client: DeletionQueueClient, - ctx: &RequestContext, -) -> Result<(), anyhow::Error> { - info!("configuring tenant location {tenant_id} to state {new_location_config:?}"); +impl TenantManager { + #[instrument(skip_all, fields(%tenant_id))] + pub(crate) async fn upsert_location( + &self, + tenant_id: TenantId, + new_location_config: LocationConf, + ctx: &RequestContext, + ) -> Result<(), anyhow::Error> { + info!("configuring tenant location {tenant_id} to state {new_location_config:?}"); - // Special case fast-path for updates to Tenant: if our upsert is only updating configuration, - // then we do not need to set the slot to InProgress, we can just call into the - // existng tenant. - { - let locked = TENANTS.read().unwrap(); - let peek_slot = tenant_map_peek_slot(&locked, &tenant_id, TenantSlotPeekMode::Write)?; - match (&new_location_config.mode, peek_slot) { - (LocationMode::Attached(attach_conf), Some(TenantSlot::Attached(tenant))) => { - if attach_conf.generation == tenant.generation { - // A transition from Attached to Attached in the same generation, we may - // take our fast path and just provide the updated configuration - // to the tenant. - tenant.set_new_location_config(AttachedTenantConf::try_from( - new_location_config, - )?); + // Special case fast-path for updates to Tenant: if our upsert is only updating configuration, + // then we do not need to set the slot to InProgress, we can just call into the + // existng tenant. 
+ { + let locked = self.tenants.read().unwrap(); + let peek_slot = tenant_map_peek_slot(&locked, &tenant_id, TenantSlotPeekMode::Write)?; + match (&new_location_config.mode, peek_slot) { + (LocationMode::Attached(attach_conf), Some(TenantSlot::Attached(tenant))) => { + if attach_conf.generation == tenant.generation { + // A transition from Attached to Attached in the same generation, we may + // take our fast path and just provide the updated configuration + // to the tenant. + tenant.set_new_location_config(AttachedTenantConf::try_from( + new_location_config, + )?); - // Persist the new config in the background, to avoid holding up any - // locks while we do so. - // TODO + // Persist the new config in the background, to avoid holding up any + // locks while we do so. + // TODO - return Ok(()); - } else { - // Different generations, fall through to general case + return Ok(()); + } else { + // Different generations, fall through to general case + } + } + _ => { + // Not an Attached->Attached transition, fall through to general case } } - _ => { - // Not an Attached->Attached transition, fall through to general case - } } - } - // General case for upserts to TenantsMap, excluding the case above: we will substitute an - // InProgress value to the slot while we make whatever changes are required. The state for - // the tenant is inaccessible to the outside world while we are doing this, but that is sensible: - // the state is ill-defined while we're in transition. Transitions are async, but fast: we do - // not do significant I/O, and shutdowns should be prompt via cancellation tokens. - let mut slot_guard = tenant_map_acquire_slot(&tenant_id, TenantSlotAcquireMode::Any)?; + // General case for upserts to TenantsMap, excluding the case above: we will substitute an + // InProgress value to the slot while we make whatever changes are required. The state for + // the tenant is inaccessible to the outside world while we are doing this, but that is sensible: + // the state is ill-defined while we're in transition. Transitions are async, but fast: we do + // not do significant I/O, and shutdowns should be prompt via cancellation tokens. + let mut slot_guard = tenant_map_acquire_slot(&tenant_id, TenantSlotAcquireMode::Any)?; - if let Some(TenantSlot::Attached(tenant)) = slot_guard.get_old_value() { - // The case where we keep a Tenant alive was covered above in the special case - // for Attached->Attached transitions in the same generation. By this point, - // if we see an attached tenant we know it will be discarded and should be - // shut down. - let (_guard, progress) = utils::completion::channel(); + if let Some(TenantSlot::Attached(tenant)) = slot_guard.get_old_value() { + // The case where we keep a Tenant alive was covered above in the special case + // for Attached->Attached transitions in the same generation. By this point, + // if we see an attached tenant we know it will be discarded and should be + // shut down. + let (_guard, progress) = utils::completion::channel(); - match tenant.get_attach_mode() { - AttachmentMode::Single | AttachmentMode::Multi => { - // Before we leave our state as the presumed holder of the latest generation, - // flush any outstanding deletions to reduce the risk of leaking objects. 
- deletion_queue_client.flush_advisory() + match tenant.get_attach_mode() { + AttachmentMode::Single | AttachmentMode::Multi => { + // Before we leave our state as the presumed holder of the latest generation, + // flush any outstanding deletions to reduce the risk of leaking objects. + self.resources.deletion_queue_client.flush_advisory() + } + AttachmentMode::Stale => { + // If we're stale there's not point trying to flush deletions + } + }; + + info!("Shutting down attached tenant"); + match tenant.shutdown(progress, false).await { + Ok(()) => {} + Err(barrier) => { + info!("Shutdown already in progress, waiting for it to complete"); + barrier.wait().await; + } } - AttachmentMode::Stale => { - // If we're stale there's not point trying to flush deletions + slot_guard.drop_old_value().expect("We just shut it down"); + } + + let tenant_path = self.conf.tenant_path(&tenant_id); + + let new_slot = match &new_location_config.mode { + LocationMode::Secondary(_) => { + let tenant_path = self.conf.tenant_path(&tenant_id); + // Directory doesn't need to be fsync'd because if we crash it can + // safely be recreated next time this tenant location is configured. + unsafe_create_dir_all(&tenant_path) + .await + .with_context(|| format!("Creating {tenant_path}"))?; + + Tenant::persist_tenant_config(self.conf, &tenant_id, &new_location_config) + .await + .map_err(SetNewTenantConfigError::Persist)?; + + TenantSlot::Secondary + } + LocationMode::Attached(_attach_config) => { + let timelines_path = self.conf.timelines_path(&tenant_id); + + // Directory doesn't need to be fsync'd because we do not depend on + // it to exist after crashes: it may be recreated when tenant is + // re-attached, see https://github.com/neondatabase/neon/issues/5550 + unsafe_create_dir_all(&timelines_path) + .await + .with_context(|| format!("Creating {timelines_path}"))?; + + Tenant::persist_tenant_config(self.conf, &tenant_id, &new_location_config) + .await + .map_err(SetNewTenantConfigError::Persist)?; + + let tenant = tenant_spawn( + self.conf, + tenant_id, + &tenant_path, + self.resources.clone(), + AttachedTenantConf::try_from(new_location_config)?, + None, + self.tenants, + SpawnMode::Normal, + ctx, + )?; + + TenantSlot::Attached(tenant) } }; - info!("Shutting down attached tenant"); - match tenant.shutdown(progress, false).await { - Ok(()) => {} - Err(barrier) => { - info!("Shutdown already in progress, waiting for it to complete"); - barrier.wait().await; - } - } - slot_guard.drop_old_value().expect("We just shut it down"); + slot_guard.upsert(new_slot)?; + + Ok(()) } - - let tenant_path = conf.tenant_path(&tenant_id); - - let new_slot = match &new_location_config.mode { - LocationMode::Secondary(_) => { - let tenant_path = conf.tenant_path(&tenant_id); - // Directory doesn't need to be fsync'd because if we crash it can - // safely be recreated next time this tenant location is configured. 
- unsafe_create_dir_all(&tenant_path) - .await - .with_context(|| format!("Creating {tenant_path}"))?; - - Tenant::persist_tenant_config(conf, &tenant_id, &new_location_config) - .await - .map_err(SetNewTenantConfigError::Persist)?; - - TenantSlot::Secondary - } - LocationMode::Attached(_attach_config) => { - let timelines_path = conf.timelines_path(&tenant_id); - - // Directory doesn't need to be fsync'd because we do not depend on - // it to exist after crashes: it may be recreated when tenant is - // re-attached, see https://github.com/neondatabase/neon/issues/5550 - unsafe_create_dir_all(&timelines_path) - .await - .with_context(|| format!("Creating {timelines_path}"))?; - - Tenant::persist_tenant_config(conf, &tenant_id, &new_location_config) - .await - .map_err(SetNewTenantConfigError::Persist)?; - - let tenant = tenant_spawn( - conf, - tenant_id, - &tenant_path, - TenantSharedResources { - broker_client, - remote_storage, - deletion_queue_client, - }, - AttachedTenantConf::try_from(new_location_config)?, - None, - &TENANTS, - SpawnMode::Normal, - ctx, - )?; - - TenantSlot::Attached(tenant) - } - }; - - slot_guard.upsert(new_slot)?; - - Ok(()) } #[derive(Debug, thiserror::Error)] @@ -1430,9 +1437,6 @@ pub struct SlotGuard { _completion: utils::completion::Completion, } -unsafe impl Send for SlotGuard {} -unsafe impl Sync for SlotGuard {} - impl SlotGuard { fn new( tenant_id: TenantId, @@ -1539,14 +1543,7 @@ impl SlotGuard { /// is responsible for protecting fn old_value_is_shutdown(&self) -> bool { match self.old_value.as_ref() { - Some(TenantSlot::Attached(tenant)) => { - // TODO: PR #5711 will add a gate that enables properly checking that - // shutdown completed. - matches!( - tenant.current_state(), - TenantState::Stopping { .. } | TenantState::Broken { .. } - ) - } + Some(TenantSlot::Attached(tenant)) => tenant.gate.close_complete(), Some(TenantSlot::Secondary) => { // TODO: when adding secondary mode tenants, this will check for shutdown // in the same way that we do for `Tenant` above diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index e4df94b8e9..a85dc9231c 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -6,7 +6,6 @@ use std::sync::Arc; use anyhow::{bail, Context}; use tokio::sync::oneshot::error::RecvError; use tokio::sync::Semaphore; -use tokio_util::sync::CancellationToken; use crate::context::RequestContext; use crate::pgdatadir_mapping::CalculateLogicalSizeError; @@ -350,10 +349,6 @@ async fn fill_logical_sizes( // our advantage with `?` error handling. let mut joinset = tokio::task::JoinSet::new(); - let cancel = tokio_util::sync::CancellationToken::new(); - // be sure to cancel all spawned tasks if we are dropped - let _dg = cancel.clone().drop_guard(); - // For each point that would benefit from having a logical size available, // spawn a Task to fetch it, unless we have it cached already. 
for seg in segments.iter() { @@ -371,15 +366,8 @@ async fn fill_logical_sizes( let parallel_size_calcs = Arc::clone(limit); let ctx = ctx.attached_child(); joinset.spawn( - calculate_logical_size( - parallel_size_calcs, - timeline, - lsn, - cause, - ctx, - cancel.child_token(), - ) - .in_current_span(), + calculate_logical_size(parallel_size_calcs, timeline, lsn, cause, ctx) + .in_current_span(), ); } e.insert(cached_size); @@ -487,14 +475,13 @@ async fn calculate_logical_size( lsn: utils::lsn::Lsn, cause: LogicalSizeCalculationCause, ctx: RequestContext, - cancel: CancellationToken, ) -> Result { let _permit = tokio::sync::Semaphore::acquire_owned(limit) .await .expect("global semaphore should not had been closed"); let size_res = timeline - .spawn_ondemand_logical_size_calculation(lsn, cause, ctx, cancel) + .spawn_ondemand_logical_size_calculation(lsn, cause, ctx) .instrument(info_span!("spawn_ondemand_logical_size_calculation")) .await?; Ok(TimelineAtLsnSizeResult(timeline, lsn, size_res)) diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index d72982a9a0..17df39733f 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -251,6 +251,7 @@ impl Layer { layer .get_value_reconstruct_data(key, lsn_range, reconstruct_data, &self.0, ctx) + .instrument(tracing::info_span!("get_value_reconstruct_data", layer=%self)) .await } @@ -1211,8 +1212,10 @@ impl DownloadedLayer { // this will be a permanent failure .context("load layer"); - if res.is_err() { + if let Err(e) = res.as_ref() { LAYER_IMPL_METRICS.inc_permanent_loading_failures(); + // TODO(#5815): we are not logging all errors, so temporarily log them here as well + tracing::error!("layer loading failed permanently: {e:#}"); } res }; @@ -1291,6 +1294,7 @@ impl ResidentLayer { } /// Loads all keys stored in the layer. Returns key, lsn and value size. + #[tracing::instrument(skip_all, fields(layer=%self))] pub(crate) async fn load_keys<'a>( &'a self, ctx: &RequestContext, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 36629e0655..bbb96cb172 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -36,7 +36,6 @@ use std::time::{Duration, Instant, SystemTime}; use crate::context::{ AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder, }; -use crate::deletion_queue::DeletionQueueClient; use crate::tenant::storage_layer::delta_layer::DeltaEntry; use crate::tenant::storage_layer::{ AsLayerDesc, DeltaLayerWriter, EvictionError, ImageLayerWriter, InMemoryLayer, Layer, @@ -50,6 +49,7 @@ use crate::tenant::{ metadata::{save_metadata, TimelineMetadata}, par_fsync, }; +use crate::{deletion_queue::DeletionQueueClient, tenant::remote_timeline_client::StopError}; use crate::config::PageServerConf; use crate::keyspace::{KeyPartitioning, KeySpace, KeySpaceRandomAccum}; @@ -247,7 +247,7 @@ pub struct Timeline { /// the flush finishes. You can use that to wait for the flush to finish. layer_flush_start_tx: tokio::sync::watch::Sender, /// to be notified when layer flushing has finished, subscribe to the layer_flush_done channel - layer_flush_done_tx: tokio::sync::watch::Sender<(u64, anyhow::Result<()>)>, + layer_flush_done_tx: tokio::sync::watch::Sender<(u64, Result<(), FlushLayerError>)>, /// Layer removal lock. /// A lock to ensure that no layer of the timeline is removed concurrently by other tasks. 
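The hunk above switches `layer_flush_done_tx` to carry a typed `Result<(), FlushLayerError>` instead of `anyhow::Result<()>`, so waiters can tell a shutdown apart from a real flush failure. As a rough, free-standing sketch of the request/acknowledge handshake that the `layer_flush_start_tx` / `layer_flush_done_tx` watch channels implement (placeholder names and a simplified error type, not the pageserver's real code):

```rust
// Sketch only: a flush loop acknowledges each request number it has processed,
// and a waiter blocks until the acknowledged number catches up with its own.
use tokio::sync::watch;

#[derive(Clone, Debug)]
enum FlushError {
    Cancelled,
    Other(String),
}

#[tokio::main]
async fn main() {
    // "start" carries a monotonically increasing request counter;
    // "done" echoes the counter back together with a typed result.
    let (start_tx, mut start_rx) = watch::channel(0u64);
    let (done_tx, mut done_rx) = watch::channel((0u64, Ok::<(), FlushError>(())));

    // Flush loop: wait for a new request number, do the work, report back.
    let flusher = tokio::spawn(async move {
        while start_rx.changed().await.is_ok() {
            let request = *start_rx.borrow();
            // ... flush frozen layers here; on shutdown this would send
            // Err(FlushError::Cancelled) or simply drop out of the loop ...
            let _ = done_tx.send((request, Ok(())));
        }
    });

    // Caller side: bump the counter, then wait until "done" catches up.
    let my_request = 1u64;
    start_tx.send(my_request).unwrap();
    loop {
        {
            // Borrow in an inner scope so the watch guard is not held across an await.
            let (done_up_to, result) = &*done_rx.borrow();
            if *done_up_to >= my_request {
                result.clone().expect("flush failed");
                break;
            }
        }
        done_rx.changed().await.unwrap();
    }

    drop(start_tx);
    flusher.await.unwrap();
}
```

The typed error is what lets the real flush loop drop out quietly on `FlushLayerError::Cancelled` while still surfacing genuine failures to the task waiting on the done channel.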
@@ -374,6 +374,19 @@ pub enum PageReconstructError { WalRedo(anyhow::Error), } +#[derive(thiserror::Error, Debug)] +enum FlushLayerError { + /// Timeline cancellation token was cancelled + #[error("timeline shutting down")] + Cancelled, + + #[error(transparent)] + PageReconstructError(#[from] PageReconstructError), + + #[error(transparent)] + Other(#[from] anyhow::Error), +} + impl std::fmt::Debug for PageReconstructError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { match self { @@ -891,15 +904,16 @@ impl Timeline { self.launch_eviction_task(background_jobs_can_start); } + /// Graceful shutdown, may do a lot of I/O as we flush any open layers to disk and then + /// also to remote storage. This method can easily take multiple seconds for a busy timeline. + /// + /// While we are flushing, we continue to accept read I/O. #[instrument(skip_all, fields(timeline_id=%self.timeline_id))] - pub async fn shutdown(self: &Arc, freeze_and_flush: bool) { + pub(crate) async fn flush_and_shutdown(&self) { debug_assert_current_span_has_tenant_and_timeline_id(); - // Signal any subscribers to our cancellation token to drop out - tracing::debug!("Cancelling CancellationToken"); - self.cancel.cancel(); - - // prevent writes to the InMemoryLayer + // Stop ingesting data, so that we are not still writing to an InMemoryLayer while + // trying to flush tracing::debug!("Waiting for WalReceiverManager..."); task_mgr::shutdown_tasks( Some(TaskKind::WalReceiverManager), @@ -908,40 +922,70 @@ impl Timeline { ) .await; + // Since we have shut down WAL ingest, we should not let anyone start waiting for the LSN to advance + self.last_record_lsn.shutdown(); + // now all writers to InMemory layer are gone, do the final flush if requested - if freeze_and_flush { - match self.freeze_and_flush().await { - Ok(()) => {} - Err(e) => { - warn!("failed to freeze and flush: {e:#}"); - return; // TODO: should probably drain remote timeline client anyways? + match self.freeze_and_flush().await { + Ok(_) => { + // drain the upload queue + if let Some(client) = self.remote_client.as_ref() { + // if we did not wait for completion here, it might be our shutdown process + // didn't wait for remote uploads to complete at all, as new tasks can forever + // be spawned. + // + // what is problematic is the shutting down of RemoteTimelineClient, because + // obviously it does not make sense to stop while we wait for it, but what + // about corner cases like s3 suddenly hanging up? + if let Err(e) = client.wait_completion().await { + // Non-fatal. Shutdown is infallible. Failures to flush just mean that + // we have some extra WAL replay to do next time the timeline starts. + warn!("failed to flush to remote storage: {e:#}"); + } } } - - // drain the upload queue - let res = if let Some(client) = self.remote_client.as_ref() { - // if we did not wait for completion here, it might be our shutdown process - // didn't wait for remote uploads to complete at all, as new tasks can forever - // be spawned. - // - // what is problematic is the shutting down of RemoteTimelineClient, because - // obviously it does not make sense to stop while we wait for it, but what - // about corner cases like s3 suddenly hanging up? - client.wait_completion().await - } else { - Ok(()) - }; - - if let Err(e) = res { - warn!("failed to await for frozen and flushed uploads: {e:#}"); + Err(e) => { + // Non-fatal. Shutdown is infallible. 
Failures to flush just mean that + // we have some extra WAL replay to do next time the timeline starts. + warn!("failed to freeze and flush: {e:#}"); } } + self.shutdown().await; + } + + /// Shut down immediately, without waiting for any open layers to flush to disk. This is a subset of + /// the graceful [`Timeline::flush_and_shutdown`] function. + pub(crate) async fn shutdown(&self) { + // Signal any subscribers to our cancellation token to drop out + tracing::debug!("Cancelling CancellationToken"); + self.cancel.cancel(); + // Page request handlers might be waiting for LSN to advance: they do not respect Timeline::cancel // while doing so. self.last_record_lsn.shutdown(); + // Shut down the layer flush task before the remote client, as one depends on the other + task_mgr::shutdown_tasks( + Some(TaskKind::LayerFlushTask), + Some(self.tenant_id), + Some(self.timeline_id), + ) + .await; + + // Shut down remote timeline client: this gracefully moves its metadata into its Stopping state in + // case our caller wants to use that for a deletion + if let Some(remote_client) = self.remote_client.as_ref() { + match remote_client.stop() { + Ok(()) => {} + Err(StopError::QueueUninitialized) => { + // Shutting down during initialization is legal + } + } + } + tracing::debug!("Waiting for tasks..."); + task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(self.timeline_id)).await; // Finally wait until any gate-holders are complete @@ -985,7 +1029,12 @@ impl Timeline { reason, backtrace: backtrace_str, }; - self.set_state(broken_state) + self.set_state(broken_state); + + // Although the Broken state is not equivalent to shutdown() (shutdown will be called + // later when this tenant is detach or the process shuts down), firing the cancellation token + // here avoids the need for other tasks to watch for the Broken state explicitly. + self.cancel.cancel(); } pub fn current_state(&self) -> TimelineState { @@ -1741,12 +1790,8 @@ impl Timeline { // delay will be terminated by a timeout regardless. let _completion = { self_clone.initial_logical_size_attempt.lock().expect("unexpected initial_logical_size_attempt poisoned").take() }; - // no extra cancellation here, because nothing really waits for this to complete compared - // to spawn_ondemand_logical_size_calculation. 
- let cancel = CancellationToken::new(); - let calculated_size = match self_clone - .logical_size_calculation_task(lsn, LogicalSizeCalculationCause::Initial, &background_ctx, cancel) + .logical_size_calculation_task(lsn, LogicalSizeCalculationCause::Initial, &background_ctx) .await { Ok(s) => s, @@ -1815,7 +1860,6 @@ impl Timeline { lsn: Lsn, cause: LogicalSizeCalculationCause, ctx: RequestContext, - cancel: CancellationToken, ) -> oneshot::Receiver> { let (sender, receiver) = oneshot::channel(); let self_clone = Arc::clone(self); @@ -1836,7 +1880,7 @@ impl Timeline { false, async move { let res = self_clone - .logical_size_calculation_task(lsn, cause, &ctx, cancel) + .logical_size_calculation_task(lsn, cause, &ctx) .await; let _ = sender.send(res).ok(); Ok(()) // Receiver is responsible for handling errors @@ -1852,58 +1896,28 @@ impl Timeline { lsn: Lsn, cause: LogicalSizeCalculationCause, ctx: &RequestContext, - cancel: CancellationToken, ) -> Result { span::debug_assert_current_span_has_tenant_and_timeline_id(); - let mut timeline_state_updates = self.subscribe_for_state_updates(); + let _guard = self.gate.enter(); + let self_calculation = Arc::clone(self); let mut calculation = pin!(async { - let cancel = cancel.child_token(); let ctx = ctx.attached_child(); self_calculation - .calculate_logical_size(lsn, cause, cancel, &ctx) + .calculate_logical_size(lsn, cause, &ctx) .await }); - let timeline_state_cancellation = async { - loop { - match timeline_state_updates.changed().await { - Ok(()) => { - let new_state = timeline_state_updates.borrow().clone(); - match new_state { - // we're running this job for active timelines only - TimelineState::Active => continue, - TimelineState::Broken { .. } - | TimelineState::Stopping - | TimelineState::Loading => { - break format!("aborted because timeline became inactive (new state: {new_state:?})") - } - } - } - Err(_sender_dropped_error) => { - // can't happen, the sender is not dropped as long as the Timeline exists - break "aborted because state watch was dropped".to_string(); - } - } - } - }; - - let taskmgr_shutdown_cancellation = async { - task_mgr::shutdown_watcher().await; - "aborted because task_mgr shutdown requested".to_string() - }; tokio::select! { res = &mut calculation => { res } - reason = timeline_state_cancellation => { - debug!(reason = reason, "cancelling calculation"); - cancel.cancel(); + _ = self.cancel.cancelled() => { + debug!("cancelling logical size calculation for timeline shutdown"); calculation.await } - reason = taskmgr_shutdown_cancellation => { - debug!(reason = reason, "cancelling calculation"); - cancel.cancel(); + _ = task_mgr::shutdown_watcher() => { + debug!("cancelling logical size calculation for task shutdown"); calculation.await } } @@ -1917,7 +1931,6 @@ impl Timeline { &self, up_to_lsn: Lsn, cause: LogicalSizeCalculationCause, - cancel: CancellationToken, ctx: &RequestContext, ) -> Result { info!( @@ -1960,7 +1973,7 @@ impl Timeline { }; let timer = storage_time_metrics.start_timer(); let logical_size = self - .get_current_logical_size_non_incremental(up_to_lsn, cancel, ctx) + .get_current_logical_size_non_incremental(up_to_lsn, ctx) .await?; debug!("calculated logical size: {logical_size}"); timer.stop_and_record(); @@ -2373,6 +2386,10 @@ impl Timeline { info!("started flush loop"); loop { tokio::select! 
{ + _ = self.cancel.cancelled() => { + info!("shutting down layer flush task"); + break; + }, _ = task_mgr::shutdown_watcher() => { info!("shutting down layer flush task"); break; @@ -2384,6 +2401,14 @@ impl Timeline { let timer = self.metrics.flush_time_histo.start_timer(); let flush_counter = *layer_flush_start_rx.borrow(); let result = loop { + if self.cancel.is_cancelled() { + info!("dropping out of flush loop for timeline shutdown"); + // Note: we do not bother transmitting into [`layer_flush_done_tx`], because + // anyone waiting on that will respect self.cancel as well: they will stop + // waiting at the same time we as drop out of this loop. + return; + } + let layer_to_flush = { let guard = self.layers.read().await; guard.layer_map().frozen_layers.front().cloned() @@ -2392,9 +2417,18 @@ impl Timeline { let Some(layer_to_flush) = layer_to_flush else { break Ok(()); }; - if let Err(err) = self.flush_frozen_layer(layer_to_flush, ctx).await { - error!("could not flush frozen layer: {err:?}"); - break Err(err); + match self.flush_frozen_layer(layer_to_flush, ctx).await { + Ok(()) => {} + Err(FlushLayerError::Cancelled) => { + info!("dropping out of flush loop for timeline shutdown"); + return; + } + err @ Err( + FlushLayerError::Other(_) | FlushLayerError::PageReconstructError(_), + ) => { + error!("could not flush frozen layer: {err:?}"); + break err; + } } }; // Notify any listeners that we're done @@ -2443,7 +2477,17 @@ impl Timeline { } } trace!("waiting for flush to complete"); - rx.changed().await?; + tokio::select! { + rx_e = rx.changed() => { + rx_e?; + }, + // Cancellation safety: we are not leaving an I/O in-flight for the flush, we're just ignoring + // the notification from [`flush_loop`] that it completed. + _ = self.cancel.cancelled() => { + tracing::info!("Cancelled layer flush due on timeline shutdown"); + return Ok(()) + } + }; trace!("done") } } @@ -2458,7 +2502,7 @@ impl Timeline { self: &Arc, frozen_layer: Arc, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), FlushLayerError> { // As a special case, when we have just imported an image into the repository, // instead of writing out a L0 delta layer, we directly write out image layer // files instead. This is possible as long as *all* the data imported into the @@ -2483,6 +2527,11 @@ impl Timeline { let (partitioning, _lsn) = self .repartition(self.initdb_lsn, self.get_compaction_target_size(), ctx) .await?; + + if self.cancel.is_cancelled() { + return Err(FlushLayerError::Cancelled); + } + // For image layers, we add them immediately into the layer map. 
( self.create_image_layers(&partitioning, self.initdb_lsn, true, ctx) @@ -2514,6 +2563,10 @@ impl Timeline { ) }; + if self.cancel.is_cancelled() { + return Err(FlushLayerError::Cancelled); + } + let disk_consistent_lsn = Lsn(lsn_range.end.0 - 1); let old_disk_consistent_lsn = self.disk_consistent_lsn.load(); @@ -2523,6 +2576,10 @@ impl Timeline { let metadata = { let mut guard = self.layers.write().await; + if self.cancel.is_cancelled() { + return Err(FlushLayerError::Cancelled); + } + guard.finish_flush_l0_layer(delta_layer_to_add.as_ref(), &frozen_layer, &self.metrics); if disk_consistent_lsn != old_disk_consistent_lsn { diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index e3aad22e40..79bc434a2a 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -26,6 +26,7 @@ use tracing::{debug, error, info, info_span, instrument, warn, Instrument}; use crate::{ context::{DownloadBehavior, RequestContext}, + pgdatadir_mapping::CollectKeySpaceError, task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, tenant::{ config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold}, @@ -326,8 +327,7 @@ impl Timeline { match state.last_layer_access_imitation { Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ } _ => { - self.imitate_timeline_cached_layer_accesses(cancel, ctx) - .await; + self.imitate_timeline_cached_layer_accesses(ctx).await; state.last_layer_access_imitation = Some(tokio::time::Instant::now()) } } @@ -367,21 +367,12 @@ impl Timeline { /// Recompute the values which would cause on-demand downloads during restart. #[instrument(skip_all)] - async fn imitate_timeline_cached_layer_accesses( - &self, - cancel: &CancellationToken, - ctx: &RequestContext, - ) { + async fn imitate_timeline_cached_layer_accesses(&self, ctx: &RequestContext) { let lsn = self.get_last_record_lsn(); // imitiate on-restart initial logical size let size = self - .calculate_logical_size( - lsn, - LogicalSizeCalculationCause::EvictionTaskImitation, - cancel.clone(), - ctx, - ) + .calculate_logical_size(lsn, LogicalSizeCalculationCause::EvictionTaskImitation, ctx) .instrument(info_span!("calculate_logical_size")) .await; @@ -407,9 +398,16 @@ impl Timeline { if size.is_err() { // ignore, see above comment } else { - warn!( - "failed to collect keyspace but succeeded in calculating logical size: {e:#}" - ); + match e { + CollectKeySpaceError::Cancelled => { + // Shutting down, ignore + } + err => { + warn!( + "failed to collect keyspace but succeeded in calculating logical size: {err:#}" + ); + } + } } } } diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index e1d699fcba..ccdf621c30 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -43,8 +43,8 @@ use std::sync::atomic::{AtomicUsize, Ordering}; use crate::config::PageServerConf; use crate::metrics::{ - WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME, - WAL_REDO_WAIT_TIME, + WalRedoKillCause, WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_COUNTERS, + WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME, }; use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block}; use crate::repository::Key; @@ -207,11 +207,8 @@ impl PostgresRedoManager { ) -> anyhow::Result { let (rel, blknum) = key_to_rel_block(key).context("invalid record")?; const MAX_RETRY_ATTEMPTS: u32 = 1; - let start_time = Instant::now(); let mut n_attempts = 0u32; loop { - 
let lock_time = Instant::now(); - // launch the WAL redo process on first use let proc: Arc = { let proc_guard = self.redo_process.read().unwrap(); @@ -236,7 +233,7 @@ impl PostgresRedoManager { } }; - WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64()); + let started_at = std::time::Instant::now(); // Relational WAL records are applied using wal-redo-postgres let buf_tag = BufferTag { rel, blknum }; @@ -244,8 +241,7 @@ impl PostgresRedoManager { .apply_wal_records(buf_tag, &base_img, records, wal_redo_timeout) .context("apply_wal_records"); - let end_time = Instant::now(); - let duration = end_time.duration_since(lock_time); + let duration = started_at.elapsed(); let len = records.len(); let nbytes = records.iter().fold(0, |acumulator, record| { @@ -596,21 +592,21 @@ trait CloseFileDescriptors: CommandExt { impl CloseFileDescriptors for C { fn close_fds(&mut self) -> &mut Command { + // SAFETY: Code executed inside pre_exec should have async-signal-safety, + // which means it should be safe to execute inside a signal handler. + // The precise meaning depends on platform. See `man signal-safety` + // for the linux definition. + // + // The set_fds_cloexec_threadsafe function is documented to be + // async-signal-safe. + // + // Aside from this function, the rest of the code is re-entrant and + // doesn't make any syscalls. We're just passing constants. + // + // NOTE: It's easy to indirectly cause a malloc or lock a mutex, + // which is not async-signal-safe. Be careful. unsafe { self.pre_exec(move || { - // SAFETY: Code executed inside pre_exec should have async-signal-safety, - // which means it should be safe to execute inside a signal handler. - // The precise meaning depends on platform. See `man signal-safety` - // for the linux definition. - // - // The set_fds_cloexec_threadsafe function is documented to be - // async-signal-safe. - // - // Aside from this function, the rest of the code is re-entrant and - // doesn't make any syscalls. We're just passing constants. - // - // NOTE: It's easy to indirectly cause a malloc or lock a mutex, - // which is not async-signal-safe. Be careful. close_fds::set_fds_cloexec_threadsafe(3, &[]); Ok(()) }) @@ -667,10 +663,10 @@ impl WalRedoProcess { .close_fds() .spawn_no_leak_child(tenant_id) .context("spawn process")?; - + WAL_REDO_PROCESS_COUNTERS.started.inc(); let mut child = scopeguard::guard(child, |child| { error!("killing wal-redo-postgres process due to a problem during launch"); - child.kill_and_wait(); + child.kill_and_wait(WalRedoKillCause::Startup); }); let stdin = child.stdin.take().unwrap(); @@ -1001,7 +997,7 @@ impl Drop for WalRedoProcess { self.child .take() .expect("we only do this once") - .kill_and_wait(); + .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop); self.stderr_logger_cancel.cancel(); // no way to wait for stderr_logger_task from Drop because that is async only } @@ -1037,16 +1033,19 @@ impl NoLeakChild { }) } - fn kill_and_wait(mut self) { + fn kill_and_wait(mut self, cause: WalRedoKillCause) { let child = match self.child.take() { Some(child) => child, None => return, }; - Self::kill_and_wait_impl(child); + Self::kill_and_wait_impl(child, cause); } - #[instrument(skip_all, fields(pid=child.id()))] - fn kill_and_wait_impl(mut child: Child) { + #[instrument(skip_all, fields(pid=child.id(), ?cause))] + fn kill_and_wait_impl(mut child: Child, cause: WalRedoKillCause) { + scopeguard::defer! 
{ + WAL_REDO_PROCESS_COUNTERS.killed_by_cause[cause].inc(); + } let res = child.kill(); if let Err(e) = res { // This branch is very unlikely because: @@ -1091,7 +1090,7 @@ impl Drop for NoLeakChild { // This thread here is going to outlive of our dropper. let span = tracing::info_span!("walredo", %tenant_id); let _entered = span.enter(); - Self::kill_and_wait_impl(child); + Self::kill_and_wait_impl(child, WalRedoKillCause::NoLeakChildDrop); }) .await }); diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 0a944a6bf0..cc09fb849d 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -19,7 +19,10 @@ #include "access/xlog.h" #include "access/xlogutils.h" #include "storage/buf_internals.h" +#include "storage/lwlock.h" +#include "storage/ipc.h" #include "c.h" +#include "postmaster/interrupt.h" #include "libpq-fe.h" #include "libpq/pqformat.h" @@ -61,23 +64,63 @@ int flush_every_n_requests = 8; int n_reconnect_attempts = 0; int max_reconnect_attempts = 60; +#define MAX_PAGESERVER_CONNSTRING_SIZE 256 + +typedef struct +{ + LWLockId lock; + pg_atomic_uint64 update_counter; + char pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE]; +} PagestoreShmemState; + +#if PG_VERSION_NUM >= 150000 +static shmem_request_hook_type prev_shmem_request_hook = NULL; +static void walproposer_shmem_request(void); +#endif +static shmem_startup_hook_type prev_shmem_startup_hook; +static PagestoreShmemState *pagestore_shared; +static uint64 pagestore_local_counter = 0; +static char local_pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE]; + bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL; static bool pageserver_flush(void); static void pageserver_disconnect(void); - -static pqsigfunc prev_signal_handler; +static bool +CheckPageserverConnstring(char **newval, void **extra, GucSource source) +{ + return strlen(*newval) < MAX_PAGESERVER_CONNSTRING_SIZE; +} static void -pageserver_sighup_handler(SIGNAL_ARGS) +AssignPageserverConnstring(const char *newval, void *extra) { - if (prev_signal_handler) - { - prev_signal_handler(postgres_signal_arg); - } - neon_log(LOG, "Received SIGHUP, disconnecting pageserver. New pageserver connstring is %s", page_server_connstring); - pageserver_disconnect(); + if(!pagestore_shared) + return; + LWLockAcquire(pagestore_shared->lock, LW_EXCLUSIVE); + strlcpy(pagestore_shared->pageserver_connstring, newval, MAX_PAGESERVER_CONNSTRING_SIZE); + pg_atomic_fetch_add_u64(&pagestore_shared->update_counter, 1); + LWLockRelease(pagestore_shared->lock); +} + +static bool +CheckConnstringUpdated() +{ + if(!pagestore_shared) + return false; + return pagestore_local_counter < pg_atomic_read_u64(&pagestore_shared->update_counter); +} + +static void +ReloadConnstring() +{ + if(!pagestore_shared) + return; + LWLockAcquire(pagestore_shared->lock, LW_SHARED); + strlcpy(local_pageserver_connstring, pagestore_shared->pageserver_connstring, sizeof(local_pageserver_connstring)); + pagestore_local_counter = pg_atomic_read_u64(&pagestore_shared->update_counter); + LWLockRelease(pagestore_shared->lock); } static bool @@ -91,6 +134,11 @@ pageserver_connect(int elevel) Assert(!connected); + if(CheckConnstringUpdated()) + { + ReloadConnstring(); + } + /* * Connect using the connection string we got from the * neon.pageserver_connstring GUC. 
If the NEON_AUTH_TOKEN environment @@ -110,7 +158,7 @@ pageserver_connect(int elevel) n++; } keywords[n] = "dbname"; - values[n] = page_server_connstring; + values[n] = local_pageserver_connstring; n++; keywords[n] = NULL; values[n] = NULL; @@ -254,6 +302,12 @@ pageserver_send(NeonRequest * request) { StringInfoData req_buff; + if(CheckConnstringUpdated()) + { + pageserver_disconnect(); + ReloadConnstring(); + } + /* If the connection was lost for some reason, reconnect */ if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD) { @@ -274,6 +328,7 @@ pageserver_send(NeonRequest * request) { while (!pageserver_connect(n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR)) { + HandleMainLoopInterrupts(); n_reconnect_attempts += 1; pg_usleep(RECONNECT_INTERVAL_USEC); } @@ -391,7 +446,8 @@ pageserver_flush(void) return true; } -page_server_api api = { +page_server_api api = +{ .send = pageserver_send, .flush = pageserver_flush, .receive = pageserver_receive @@ -405,12 +461,72 @@ check_neon_id(char **newval, void **extra, GucSource source) return **newval == '\0' || HexDecodeString(id, *newval, 16); } +static Size +PagestoreShmemSize(void) +{ + return sizeof(PagestoreShmemState); +} + +static bool +PagestoreShmemInit(void) +{ + bool found; + LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); + pagestore_shared = ShmemInitStruct("libpagestore shared state", + PagestoreShmemSize(), + &found); + if(!found) + { + pagestore_shared->lock = &(GetNamedLWLockTranche("neon_libpagestore")->lock); + pg_atomic_init_u64(&pagestore_shared->update_counter, 0); + AssignPageserverConnstring(page_server_connstring, NULL); + } + LWLockRelease(AddinShmemInitLock); + return found; +} + +static void +pagestore_shmem_startup_hook(void) +{ + if(prev_shmem_startup_hook) + prev_shmem_startup_hook(); + + PagestoreShmemInit(); +} + +static void +pagestore_shmem_request(void) +{ +#if PG_VERSION_NUM >= 150000 + if(prev_shmem_request_hook) + prev_shmem_request_hook(); +#endif + + RequestAddinShmemSpace(PagestoreShmemSize()); + RequestNamedLWLockTranche("neon_libpagestore", 1); +} + +static void +pagestore_prepare_shmem(void) +{ +#if PG_VERSION_NUM >= 150000 + prev_shmem_request_hook = shmem_request_hook; + shmem_request_hook = pagestore_shmem_request; +#else + pagestore_shmem_request(); +#endif + prev_shmem_startup_hook = shmem_startup_hook; + shmem_startup_hook = pagestore_shmem_startup_hook; +} + /* * Module initialization function */ void pg_init_libpagestore(void) { + pagestore_prepare_shmem(); + DefineCustomStringVariable("neon.pageserver_connstring", "connection string to the page server", NULL, @@ -418,7 +534,7 @@ pg_init_libpagestore(void) "", PGC_SIGHUP, 0, /* no flags required */ - NULL, NULL, NULL); + CheckPageserverConnstring, AssignPageserverConnstring, NULL); DefineCustomStringVariable("neon.timeline_id", "Neon timeline_id the server is running on", @@ -499,7 +615,5 @@ pg_init_libpagestore(void) redo_read_buffer_filter = neon_redo_read_buffer_filter; } - prev_signal_handler = pqsignal(SIGHUP, pageserver_sighup_handler); - lfc_init(); } diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index 7dc304d7ac..d7a8edca79 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -1,6 +1,8 @@ //! User credentials used in authentication. 
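The `libpagestore.c` hunks above replace the old SIGHUP handler with a small shared-memory protocol: the GUC assign hook writes the new `neon.pageserver_connstring` into a shmem slot under an LWLock and bumps an atomic update counter, and each backend compares its local counter on the next request, copying the string and reconnecting only when it has fallen behind. A rough Rust rendition of that counter-guarded reload pattern, for illustration only (the real code is C and uses Postgres shmem, LWLock and GUC hooks):

```rust
// Sketch only: a shared slot holds the current connection string plus an
// update counter; each worker keeps a local copy and refreshes it lazily.
use std::sync::{
    atomic::{AtomicU64, Ordering},
    Arc, RwLock,
};

struct Shared {
    update_counter: AtomicU64,
    connstring: RwLock<String>,
}

struct Worker {
    shared: Arc<Shared>,
    local_counter: u64,
    local_connstring: String,
}

impl Worker {
    fn refresh_if_updated(&mut self) -> bool {
        let current = self.shared.update_counter.load(Ordering::Acquire);
        if self.local_counter >= current {
            return false; // nothing changed; keep the existing connection
        }
        self.local_connstring = self.shared.connstring.read().unwrap().clone();
        self.local_counter = current;
        true // caller should drop and re-establish its connection
    }
}

fn assign_connstring(shared: &Shared, newval: &str) {
    // Writer side, analogous to the GUC assign hook: store the new value,
    // then bump the counter so readers notice on their next request.
    *shared.connstring.write().unwrap() = newval.to_string();
    shared.update_counter.fetch_add(1, Ordering::Release);
}

fn main() {
    let shared = Arc::new(Shared {
        update_counter: AtomicU64::new(0),
        connstring: RwLock::new("postgresql://old".to_string()),
    });
    let mut worker = Worker {
        shared: shared.clone(),
        local_counter: 0,
        local_connstring: shared.connstring.read().unwrap().clone(),
    };

    assign_connstring(&shared, "postgresql://new");
    assert!(worker.refresh_if_updated());
    assert_eq!(worker.local_connstring, "postgresql://new");
    assert!(!worker.refresh_if_updated()); // already up to date
}
```

Keeping the staleness check to a single atomic load means the common path, where nothing has changed, never touches the lock.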
-use crate::{auth::password_hack::parse_endpoint_param, error::UserFacingError}; +use crate::{ + auth::password_hack::parse_endpoint_param, error::UserFacingError, proxy::neon_options, +}; use itertools::Itertools; use pq_proto::StartupMessageParams; use std::collections::HashSet; @@ -38,6 +40,8 @@ pub struct ClientCredentials<'a> { pub user: &'a str, // TODO: this is a severe misnomer! We should think of a new name ASAP. pub project: Option, + + pub cache_key: String, } impl ClientCredentials<'_> { @@ -53,6 +57,7 @@ impl<'a> ClientCredentials<'a> { ClientCredentials { user: "", project: None, + cache_key: "".to_string(), } } @@ -120,7 +125,17 @@ impl<'a> ClientCredentials<'a> { info!(user, project = project.as_deref(), "credentials"); - Ok(Self { user, project }) + let cache_key = format!( + "{}{}", + project.as_deref().unwrap_or(""), + neon_options(params).unwrap_or("".to_string()) + ); + + Ok(Self { + user, + project, + cache_key, + }) } } @@ -176,6 +191,7 @@ mod tests { let creds = ClientCredentials::parse(&options, sni, common_names)?; assert_eq!(creds.user, "john_doe"); assert_eq!(creds.project.as_deref(), Some("foo")); + assert_eq!(creds.cache_key, "foo"); Ok(()) } @@ -303,4 +319,23 @@ mod tests { _ => panic!("bad error: {err:?}"), } } + + #[test] + fn parse_neon_options() -> anyhow::Result<()> { + let options = StartupMessageParams::new([ + ("user", "john_doe"), + ("options", "neon_lsn:0/2 neon_endpoint_type:read_write"), + ]); + + let sni = Some("project.localhost"); + let common_names = Some(["localhost".into()].into()); + let creds = ClientCredentials::parse(&options, sni, common_names)?; + assert_eq!(creds.project.as_deref(), Some("project")); + assert_eq!( + creds.cache_key, + "projectneon_endpoint_type:read_write neon_lsn:0/2" + ); + + Ok(()) + } } diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 28d0d95c5b..7d1b7eaaae 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -80,6 +80,9 @@ struct ProxyCliArgs { /// cache for `wake_compute` api method (use `size=0` to disable) #[clap(long, default_value = config::CacheOptions::DEFAULT_OPTIONS_NODE_INFO)] wake_compute_cache: String, + /// lock for `wake_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable). 
+ #[clap(long, default_value = config::WakeComputeLockOptions::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK)] + wake_compute_lock: String, /// Allow self-signed certificates for compute nodes (for testing) #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] allow_self_signed_compute: bool, @@ -220,10 +223,23 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { node_info: console::caches::NodeInfoCache::new("node_info_cache", size, ttl), })); + let config::WakeComputeLockOptions { + shards, + permits, + epoch, + timeout, + } = args.wake_compute_lock.parse()?; + info!(permits, shards, ?epoch, "Using NodeLocks (wake_compute)"); + let locks = Box::leak(Box::new( + console::locks::ApiLocks::new("wake_compute_lock", permits, shards, timeout) + .unwrap(), + )); + tokio::spawn(locks.garbage_collect_worker(epoch)); + let url = args.auth_endpoint.parse()?; let endpoint = http::Endpoint::new(url, http::new_client()); - let api = console::provider::neon::Api::new(endpoint, caches); + let api = console::provider::neon::Api::new(endpoint, caches, locks); auth::BackendType::Console(Cow::Owned(api), ()) } AuthBackend::Postgres => { diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index e96b79ed92..53eb0e3a76 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -3,6 +3,7 @@ use crate::{ cancellation::CancelClosure, console::errors::WakeComputeError, error::{io_error, UserFacingError}, + proxy::is_neon_param, }; use futures::{FutureExt, TryFutureExt}; use itertools::Itertools; @@ -278,7 +279,7 @@ fn filtered_options(params: &StartupMessageParams) -> Option { #[allow(unstable_name_collisions)] let options: String = params .options_raw()? - .filter(|opt| parse_endpoint_param(opt).is_none()) + .filter(|opt| parse_endpoint_param(opt).is_none() && !is_neon_param(opt)) .intersperse(" ") // TODO: use impl from std once it's stabilized .collect(); @@ -313,5 +314,11 @@ mod tests { let params = StartupMessageParams::new([("options", "project = foo")]); assert_eq!(filtered_options(¶ms).as_deref(), Some("project = foo")); + + let params = StartupMessageParams::new([( + "options", + "project = foo neon_endpoint_type:read_write neon_lsn:0/2", + )]); + assert_eq!(filtered_options(¶ms).as_deref(), Some("project = foo")); } } diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 9607ecd153..bd00123905 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -264,6 +264,79 @@ impl FromStr for CacheOptions { } } +/// Helper for cmdline cache options parsing. +pub struct WakeComputeLockOptions { + /// The number of shards the lock map should have + pub shards: usize, + /// The number of allowed concurrent requests for each endpoitn + pub permits: usize, + /// Garbage collection epoch + pub epoch: Duration, + /// Lock timeout + pub timeout: Duration, +} + +impl WakeComputeLockOptions { + /// Default options for [`crate::console::provider::ApiLocks`]. + pub const DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK: &'static str = "permits=0"; + + // pub const DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK: &'static str = "shards=32,permits=4,epoch=10m,timeout=1s"; + + /// Parse lock options passed via cmdline. + /// Example: [`Self::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK`]. 
+ fn parse(options: &str) -> anyhow::Result { + let mut shards = None; + let mut permits = None; + let mut epoch = None; + let mut timeout = None; + + for option in options.split(',') { + let (key, value) = option + .split_once('=') + .with_context(|| format!("bad key-value pair: {option}"))?; + + match key { + "shards" => shards = Some(value.parse()?), + "permits" => permits = Some(value.parse()?), + "epoch" => epoch = Some(humantime::parse_duration(value)?), + "timeout" => timeout = Some(humantime::parse_duration(value)?), + unknown => bail!("unknown key: {unknown}"), + } + } + + // these dont matter if lock is disabled + if let Some(0) = permits { + timeout = Some(Duration::default()); + epoch = Some(Duration::default()); + shards = Some(2); + } + + let out = Self { + shards: shards.context("missing `shards`")?, + permits: permits.context("missing `permits`")?, + epoch: epoch.context("missing `epoch`")?, + timeout: timeout.context("missing `timeout`")?, + }; + + ensure!(out.shards > 1, "shard count must be > 1"); + ensure!( + out.shards.is_power_of_two(), + "shard count must be a power of two" + ); + + Ok(out) + } +} + +impl FromStr for WakeComputeLockOptions { + type Err = anyhow::Error; + + fn from_str(options: &str) -> Result { + let error = || format!("failed to parse cache lock options '{options}'"); + Self::parse(options).with_context(error) + } +} + #[cfg(test)] mod tests { use super::*; @@ -288,4 +361,42 @@ mod tests { Ok(()) } + + #[test] + fn test_parse_lock_options() -> anyhow::Result<()> { + let WakeComputeLockOptions { + epoch, + permits, + shards, + timeout, + } = "shards=32,permits=4,epoch=10m,timeout=1s".parse()?; + assert_eq!(epoch, Duration::from_secs(10 * 60)); + assert_eq!(timeout, Duration::from_secs(1)); + assert_eq!(shards, 32); + assert_eq!(permits, 4); + + let WakeComputeLockOptions { + epoch, + permits, + shards, + timeout, + } = "epoch=60s,shards=16,timeout=100ms,permits=8".parse()?; + assert_eq!(epoch, Duration::from_secs(60)); + assert_eq!(timeout, Duration::from_millis(100)); + assert_eq!(shards, 16); + assert_eq!(permits, 8); + + let WakeComputeLockOptions { + epoch, + permits, + shards, + timeout, + } = "permits=0".parse()?; + assert_eq!(epoch, Duration::ZERO); + assert_eq!(timeout, Duration::ZERO); + assert_eq!(shards, 2); + assert_eq!(permits, 0); + + Ok(()) + } } diff --git a/proxy/src/console.rs b/proxy/src/console.rs index 0e5eaaf845..6da627389e 100644 --- a/proxy/src/console.rs +++ b/proxy/src/console.rs @@ -13,5 +13,10 @@ pub mod caches { pub use super::provider::{ApiCaches, NodeInfoCache}; } +/// Various cache-related types. +pub mod locks { + pub use super::provider::ApiLocks; +} + /// Console's management API. pub mod mgmt; diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index 32c3467092..54bcd1f081 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -8,7 +8,13 @@ use crate::{ compute, scram, }; use async_trait::async_trait; -use std::sync::Arc; +use dashmap::DashMap; +use std::{sync::Arc, time::Duration}; +use tokio::{ + sync::{OwnedSemaphorePermit, Semaphore}, + time::Instant, +}; +use tracing::info; pub mod errors { use crate::{ @@ -149,6 +155,9 @@ pub mod errors { #[error(transparent)] ApiError(ApiError), + + #[error("Timeout waiting to acquire wake compute lock")] + TimeoutError, } // This allows more useful interactions than `#[from]`. 
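The `TimeoutError` variant added above, together with the `From` conversions in the next hunk, exists because `ApiLocks::get_wake_compute_permit` further down races a semaphore acquisition against a deadline and then unwraps with `permit??`: the outer layer can fail with `tokio::time::error::Elapsed`, the inner one with `tokio::sync::AcquireError`, and both are folded into the same variant. A minimal sketch of that nested-`Result` shape (illustrative only, not the proxy's actual code):

```rust
// Sketch only: acquiring an owned semaphore permit under a deadline yields a
// Result nested inside another Result, hence the double `?` in the real code.
use std::{sync::Arc, time::Duration};
use tokio::{
    sync::Semaphore,
    time::{timeout_at, Instant},
};

#[tokio::main]
async fn main() {
    let semaphore = Arc::new(Semaphore::new(1));
    let deadline = Instant::now() + Duration::from_millis(100);

    // Outer Result: Err(Elapsed) if the deadline passes before a permit frees up.
    // Inner Result: Err(AcquireError) if the semaphore has been closed.
    let permit = timeout_at(deadline, semaphore.clone().acquire_owned()).await;

    match permit {
        Ok(Ok(_permit)) => println!("got a permit"),
        Ok(Err(closed)) => println!("semaphore closed: {closed}"),
        Err(elapsed) => println!("timed out: {elapsed}"),
    }
}
```

Folding both failure modes into one variant is why two separate `From` impls are needed even though the user-facing message is the same.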
@@ -158,6 +167,17 @@ pub mod errors { } } + impl From for WakeComputeError { + fn from(_: tokio::sync::AcquireError) -> Self { + WakeComputeError::TimeoutError + } + } + impl From for WakeComputeError { + fn from(_: tokio::time::error::Elapsed) -> Self { + WakeComputeError::TimeoutError + } + } + impl UserFacingError for WakeComputeError { fn to_string_client(&self) -> String { use WakeComputeError::*; @@ -167,6 +187,8 @@ pub mod errors { BadComputeAddress(_) => REQUEST_FAILED.to_owned(), // However, API might return a meaningful error. ApiError(e) => e.to_string_client(), + + TimeoutError => "timeout while acquiring the compute resource lock".to_owned(), } } } @@ -178,6 +200,7 @@ pub struct ConsoleReqExtra<'a> { pub session_id: uuid::Uuid, /// Name of client application, if set. pub application_name: Option<&'a str>, + pub options: Option<&'a str>, } /// Auth secret which is managed by the cloud. @@ -232,3 +255,145 @@ pub struct ApiCaches { /// Cache for the `wake_compute` API method. pub node_info: NodeInfoCache, } + +/// Various caches for [`console`](super). +pub struct ApiLocks { + name: &'static str, + node_locks: DashMap, Arc>, + permits: usize, + timeout: Duration, + registered: prometheus::IntCounter, + unregistered: prometheus::IntCounter, + reclamation_lag: prometheus::Histogram, + lock_acquire_lag: prometheus::Histogram, +} + +impl ApiLocks { + pub fn new( + name: &'static str, + permits: usize, + shards: usize, + timeout: Duration, + ) -> prometheus::Result { + let registered = prometheus::IntCounter::with_opts( + prometheus::Opts::new( + "semaphores_registered", + "Number of semaphores registered in this api lock", + ) + .namespace(name), + )?; + prometheus::register(Box::new(registered.clone()))?; + let unregistered = prometheus::IntCounter::with_opts( + prometheus::Opts::new( + "semaphores_unregistered", + "Number of semaphores unregistered in this api lock", + ) + .namespace(name), + )?; + prometheus::register(Box::new(unregistered.clone()))?; + let reclamation_lag = prometheus::Histogram::with_opts( + prometheus::HistogramOpts::new( + "reclamation_lag_seconds", + "Time it takes to reclaim unused semaphores in the api lock", + ) + .namespace(name) + // 1us -> 65ms + // benchmarks on my mac indicate it's usually in the range of 256us and 512us + .buckets(prometheus::exponential_buckets(1e-6, 2.0, 16)?), + )?; + prometheus::register(Box::new(reclamation_lag.clone()))?; + let lock_acquire_lag = prometheus::Histogram::with_opts( + prometheus::HistogramOpts::new( + "semaphore_acquire_seconds", + "Time it takes to reclaim unused semaphores in the api lock", + ) + .namespace(name) + // 0.1ms -> 6s + .buckets(prometheus::exponential_buckets(1e-4, 2.0, 16)?), + )?; + prometheus::register(Box::new(lock_acquire_lag.clone()))?; + + Ok(Self { + name, + node_locks: DashMap::with_shard_amount(shards), + permits, + timeout, + lock_acquire_lag, + registered, + unregistered, + reclamation_lag, + }) + } + + pub async fn get_wake_compute_permit( + &self, + key: &Arc, + ) -> Result { + if self.permits == 0 { + return Ok(WakeComputePermit { permit: None }); + } + let now = Instant::now(); + let semaphore = { + // get fast path + if let Some(semaphore) = self.node_locks.get(key) { + semaphore.clone() + } else { + self.node_locks + .entry(key.clone()) + .or_insert_with(|| { + self.registered.inc(); + Arc::new(Semaphore::new(self.permits)) + }) + .clone() + } + }; + let permit = tokio::time::timeout_at(now + self.timeout, semaphore.acquire_owned()).await; + + self.lock_acquire_lag + 
.observe((Instant::now() - now).as_secs_f64()); + + Ok(WakeComputePermit { + permit: Some(permit??), + }) + } + + pub async fn garbage_collect_worker(&self, epoch: std::time::Duration) { + if self.permits == 0 { + return; + } + + let mut interval = tokio::time::interval(epoch / (self.node_locks.shards().len()) as u32); + loop { + for (i, shard) in self.node_locks.shards().iter().enumerate() { + interval.tick().await; + // temporary lock a single shard and then clear any semaphores that aren't currently checked out + // race conditions: if strong_count == 1, there's no way that it can increase while the shard is locked + // therefore releasing it is safe from race conditions + info!( + name = self.name, + shard = i, + "performing epoch reclamation on api lock" + ); + let mut lock = shard.write(); + let timer = self.reclamation_lag.start_timer(); + let count = lock + .extract_if(|_, semaphore| Arc::strong_count(semaphore.get_mut()) == 1) + .count(); + drop(lock); + self.unregistered.inc_by(count as u64); + timer.observe_duration() + } + } + } +} + +pub struct WakeComputePermit { + // None if the lock is disabled + permit: Option, +} + +impl WakeComputePermit { + pub fn should_check_cache(&self) -> bool { + self.permit.is_some() + } +} diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 927fea0a13..0dc7c71534 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -3,12 +3,12 @@ use super::{ super::messages::{ConsoleError, GetRoleSecret, WakeCompute}, errors::{ApiError, GetAuthInfoError, WakeComputeError}, - ApiCaches, AuthInfo, CachedNodeInfo, ConsoleReqExtra, NodeInfo, + ApiCaches, ApiLocks, AuthInfo, CachedNodeInfo, ConsoleReqExtra, NodeInfo, }; use crate::{auth::ClientCredentials, compute, http, scram}; use async_trait::async_trait; use futures::TryFutureExt; -use std::net::SocketAddr; +use std::{net::SocketAddr, sync::Arc}; use tokio::time::Instant; use tokio_postgres::config::SslMode; use tracing::{error, info, info_span, warn, Instrument}; @@ -17,12 +17,17 @@ use tracing::{error, info, info_span, warn, Instrument}; pub struct Api { endpoint: http::Endpoint, caches: &'static ApiCaches, + locks: &'static ApiLocks, jwt: String, } impl Api { /// Construct an API object containing the auth parameters. 
- pub fn new(endpoint: http::Endpoint, caches: &'static ApiCaches) -> Self { + pub fn new( + endpoint: http::Endpoint, + caches: &'static ApiCaches, + locks: &'static ApiLocks, + ) -> Self { let jwt: String = match std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") { Ok(v) => v, Err(_) => "".to_string(), @@ -30,6 +35,7 @@ impl Api { Self { endpoint, caches, + locks, jwt, } } @@ -99,6 +105,7 @@ impl Api { .query(&[ ("application_name", extra.application_name), ("project", Some(project)), + ("options", extra.options), ]) .build()?; @@ -151,7 +158,7 @@ impl super::Api for Api { extra: &ConsoleReqExtra<'_>, creds: &ClientCredentials, ) -> Result { - let key = creds.project().expect("impossible"); + let key: &str = &creds.cache_key; // Every time we do a wakeup http request, the compute node will stay up // for some time (highly depends on the console's scale-to-zero policy); @@ -162,9 +169,22 @@ impl super::Api for Api { return Ok(cached); } + let key: Arc = key.into(); + + let permit = self.locks.get_wake_compute_permit(&key).await?; + + // after getting back a permit - it's possible the cache was filled + // double check + if permit.should_check_cache() { + if let Some(cached) = self.caches.node_info.get(&key) { + info!(key = &*key, "found cached compute node info"); + return Ok(cached); + } + } + let node = self.do_wake_compute(extra, creds).await?; - let (_, cached) = self.caches.node_info.insert(key.into(), node); - info!(key = key, "created a cache entry for compute node info"); + let (_, cached) = self.caches.node_info.insert(key.clone(), node); + info!(key = &*key, "created a cache entry for compute node info"); Ok(cached) } diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index 803b278482..4a95e473f6 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -1,3 +1,5 @@ +#![deny(clippy::undocumented_unsafe_blocks)] + use std::convert::Infallible; use anyhow::{bail, Context}; diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 884aae1651..a1ebf03545 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -15,10 +15,12 @@ use crate::{ use anyhow::{bail, Context}; use async_trait::async_trait; use futures::TryFutureExt; +use itertools::Itertools; use metrics::{exponential_buckets, register_int_counter_vec, IntCounterVec}; -use once_cell::sync::Lazy; +use once_cell::sync::{Lazy, OnceCell}; use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams}; use prometheus::{register_histogram_vec, HistogramVec}; +use regex::Regex; use std::{error::Error, io, ops::ControlFlow, sync::Arc, time::Instant}; use tokio::{ io::{AsyncRead, AsyncWrite, AsyncWriteExt}, @@ -568,6 +570,7 @@ fn report_error(e: &WakeComputeError, retry: bool) { "api_console_other_server_error" } WakeComputeError::ApiError(ApiError::Console { .. }) => "api_console_other_error", + WakeComputeError::TimeoutError => "timeout_error", }; NUM_WAKEUP_FAILURES.with_label_values(&[retry, kind]).inc(); } @@ -881,9 +884,12 @@ impl Client<'_, S> { allow_self_signed_compute, } = self; + let console_options = neon_options(params); + let extra = console::ConsoleReqExtra { session_id, // aka this connection's id application_name: params.get("application_name"), + options: console_options.as_deref(), }; let mut latency_timer = LatencyTimer::new(mode.protocol_label()); @@ -945,3 +951,27 @@ impl Client<'_, S> { proxy_pass(stream, node.stream, &aux).await } } + +pub fn neon_options(params: &StartupMessageParams) -> Option { + #[allow(unstable_name_collisions)] + let options: String = params + .options_raw()? 
+ .filter(|opt| is_neon_param(opt)) + .sorted() // we sort it to use as cache key + .intersperse(" ") // TODO: use impl from std once it's stabilized + .collect(); + + // Don't even bother with empty options. + if options.is_empty() { + return None; + } + + Some(options) +} + +pub fn is_neon_param(bytes: &str) -> bool { + static RE: OnceCell = OnceCell::new(); + RE.get_or_init(|| Regex::new(r"^neon_\w+:").unwrap()); + + RE.get().unwrap().is_match(bytes) +} diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 142c32fb84..3ae4df46ef 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -440,6 +440,7 @@ fn helper_create_connect_info( let extra = console::ConsoleReqExtra { session_id: uuid::Uuid::new_v4(), application_name: Some("TEST"), + options: None, }; let creds = auth::BackendType::Test(mechanism); (cache, extra, creds) diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index c5bfc32568..d09554a922 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -22,7 +22,10 @@ use tokio_postgres::{AsyncMessage, ReadyForQueryStatus}; use crate::{ auth, console, - proxy::{LatencyTimer, NUM_DB_CONNECTIONS_CLOSED_COUNTER, NUM_DB_CONNECTIONS_OPENED_COUNTER}, + proxy::{ + neon_options, LatencyTimer, NUM_DB_CONNECTIONS_CLOSED_COUNTER, + NUM_DB_CONNECTIONS_OPENED_COUNTER, + }, usage_metrics::{Ids, MetricCounter, USAGE_METRICS}, }; use crate::{compute, config}; @@ -41,6 +44,7 @@ pub struct ConnInfo { pub dbname: String, pub hostname: String, pub password: String, + pub options: Option, } impl ConnInfo { @@ -401,26 +405,25 @@ async fn connect_to_compute( let tls = config.tls_config.as_ref(); let common_names = tls.and_then(|tls| tls.common_names.clone()); - let credential_params = StartupMessageParams::new([ + let params = StartupMessageParams::new([ ("user", &conn_info.username), ("database", &conn_info.dbname), ("application_name", APP_NAME), + ("options", conn_info.options.as_deref().unwrap_or("")), ]); let creds = config .auth_backend .as_ref() - .map(|_| { - auth::ClientCredentials::parse( - &credential_params, - Some(&conn_info.hostname), - common_names, - ) - }) + .map(|_| auth::ClientCredentials::parse(¶ms, Some(&conn_info.hostname), common_names)) .transpose()?; + + let console_options = neon_options(¶ms); + let extra = console::ConsoleReqExtra { session_id: uuid::Uuid::new_v4(), application_name: Some(APP_NAME), + options: console_options.as_deref(), }; let node_info = creds diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 8f7cf7fbaf..16736ac00d 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -174,11 +174,23 @@ fn get_conn_info( } } + let pairs = connection_url.query_pairs(); + + let mut options = Option::None; + + for (key, value) in pairs { + if key == "options" { + options = Some(value.to_string()); + break; + } + } + Ok(ConnInfo { username: username.to_owned(), dbname: dbname.to_owned(), hostname: hostname.to_owned(), password: password.to_owned(), + options, }) } diff --git a/s3_scrubber/src/lib.rs b/s3_scrubber/src/lib.rs index d892438633..777276a4d1 100644 --- a/s3_scrubber/src/lib.rs +++ b/s3_scrubber/src/lib.rs @@ -1,3 +1,5 @@ +#![deny(unsafe_code)] +#![deny(clippy::undocumented_unsafe_blocks)] pub mod checks; pub mod cloud_admin_api; pub mod garbage; diff --git a/safekeeper/src/auth.rs b/safekeeper/src/auth.rs index 2684d82365..bf4905aaa7 100644 --- a/safekeeper/src/auth.rs +++ 
b/safekeeper/src/auth.rs @@ -1,19 +1,20 @@ -use anyhow::{bail, Result}; -use utils::auth::{Claims, Scope}; +use utils::auth::{AuthError, Claims, Scope}; use utils::id::TenantId; -pub fn check_permission(claims: &Claims, tenant_id: Option) -> Result<()> { +pub fn check_permission(claims: &Claims, tenant_id: Option) -> Result<(), AuthError> { match (&claims.scope, tenant_id) { - (Scope::Tenant, None) => { - bail!("Attempt to access management api with tenant scope. Permission denied") - } + (Scope::Tenant, None) => Err(AuthError( + "Attempt to access management api with tenant scope. Permission denied".into(), + )), (Scope::Tenant, Some(tenant_id)) => { if claims.tenant_id.unwrap() != tenant_id { - bail!("Tenant id mismatch. Permission denied") + return Err(AuthError("Tenant id mismatch. Permission denied".into())); } Ok(()) } - (Scope::PageServerApi, _) => bail!("PageServerApi scope makes no sense for Safekeeper"), + (Scope::PageServerApi, _) => Err(AuthError( + "PageServerApi scope makes no sense for Safekeeper".into(), + )), (Scope::SafekeeperData, _) => Ok(()), } } diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index d476cf33ae..2e54380471 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -38,7 +38,7 @@ use safekeeper::{http, WAL_REMOVER_RUNTIME}; use safekeeper::{remove_wal, WAL_BACKUP_RUNTIME}; use safekeeper::{wal_backup, HTTP_RUNTIME}; use storage_broker::DEFAULT_ENDPOINT; -use utils::auth::{JwtAuth, Scope}; +use utils::auth::{JwtAuth, Scope, SwappableJwtAuth}; use utils::{ id::NodeId, logging::{self, LogFormat}, @@ -251,10 +251,9 @@ async fn main() -> anyhow::Result<()> { None } Some(path) => { - info!("loading http auth JWT key from {path}"); - Some(Arc::new( - JwtAuth::from_key_path(path).context("failed to load the auth key")?, - )) + info!("loading http auth JWT key(s) from {path}"); + let jwt_auth = JwtAuth::from_key_path(path).context("failed to load the auth key")?; + Some(Arc::new(SwappableJwtAuth::new(jwt_auth))) } }; diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index 6d0ed8650d..d5333abae6 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -6,7 +6,7 @@ use std::str::FromStr; use std::str::{self}; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; -use tracing::{info, info_span, Instrument}; +use tracing::{debug, info, info_span, Instrument}; use crate::auth::check_permission; use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage}; @@ -165,26 +165,27 @@ impl postgres_backend::Handler .auth .as_ref() .expect("auth_type is configured but .auth of handler is missing"); - let data = - auth.decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)?; + let data = auth + .decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?) + .map_err(|e| QueryError::Unauthorized(e.0))?; // The handler might be configured to allow only tenant scope tokens. 
if matches!(allowed_auth_scope, Scope::Tenant) && !matches!(data.claims.scope, Scope::Tenant) { - return Err(QueryError::Other(anyhow::anyhow!( - "passed JWT token is for full access, but only tenant scope is allowed" - ))); + return Err(QueryError::Unauthorized( + "passed JWT token is for full access, but only tenant scope is allowed".into(), + )); } if matches!(data.claims.scope, Scope::Tenant) && data.claims.tenant_id.is_none() { - return Err(QueryError::Other(anyhow::anyhow!( - "jwt token scope is Tenant, but tenant id is missing" - ))); + return Err(QueryError::Unauthorized( + "jwt token scope is Tenant, but tenant id is missing".into(), + )); } - info!( - "jwt auth succeeded for scope: {:#?} by tenant id: {:?}", + debug!( + "jwt scope check succeeded for scope: {:#?} by tenant id: {:?}", data.claims.scope, data.claims.tenant_id, ); @@ -263,7 +264,7 @@ impl SafekeeperPostgresHandler { // when accessing management api supply None as an argument // when using to authorize tenant pass corresponding tenant id - fn check_permission(&self, tenant_id: Option) -> anyhow::Result<()> { + fn check_permission(&self, tenant_id: Option) -> Result<(), QueryError> { if self.auth.is_none() { // auth is set to Trust, nothing to check so just return ok return Ok(()); @@ -275,7 +276,7 @@ impl SafekeeperPostgresHandler { .claims .as_ref() .expect("claims presence already checked"); - check_permission(claims, tenant_id) + check_permission(claims, tenant_id).map_err(|e| QueryError::Unauthorized(e.0)) } async fn handle_timeline_status( diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 06b903719e..c48b5330b3 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -30,7 +30,7 @@ use crate::timelines_global_map::TimelineDeleteForceResult; use crate::GlobalTimelines; use crate::SafeKeeperConf; use utils::{ - auth::JwtAuth, + auth::SwappableJwtAuth, http::{ endpoint::{self, auth_middleware, check_permission_with}, error::ApiError, @@ -428,8 +428,11 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder if ALLOWLIST_ROUTES.contains(request.uri()) { None } else { - // Option> is always provided as data below, hence unwrap(). - request.data::>>().unwrap().as_deref() + // Option> is always provided as data below, hence unwrap(). + request + .data::>>() + .unwrap() + .as_deref() } })) } diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index aa70cee591..3a086f1f54 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -1,3 +1,4 @@ +#![deny(clippy::undocumented_unsafe_blocks)] use camino::Utf8PathBuf; use once_cell::sync::Lazy; use remote_storage::RemoteStorageConfig; @@ -6,7 +7,10 @@ use tokio::runtime::Runtime; use std::time::Duration; use storage_broker::Uri; -use utils::id::{NodeId, TenantId, TenantTimelineId}; +use utils::{ + auth::SwappableJwtAuth, + id::{NodeId, TenantId, TenantTimelineId}, +}; mod auth; pub mod broker; @@ -69,7 +73,7 @@ pub struct SafeKeeperConf { pub wal_backup_enabled: bool, pub pg_auth: Option>, pub pg_tenant_only_auth: Option>, - pub http_auth: Option>, + pub http_auth: Option>, pub current_thread_runtime: bool, } diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index 934dda10b2..9ce9b049ba 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -111,7 +111,7 @@ impl WalReceivers { .count() } - /// Unregister walsender. + /// Unregister walreceiver. 
fn unregister(self: &Arc<Self>, id: WalReceiverId) { let mut shared = self.mutex.lock(); shared.slots[id] = None; @@ -138,8 +138,8 @@ pub enum WalReceiverStatus { Streaming, } -/// Scope guard to access slot in WalSenders registry and unregister from it in -/// Drop. +/// Scope guard to access slot in WalReceivers registry and unregister from +/// it in Drop. pub struct WalReceiverGuard { id: WalReceiverId, walreceivers: Arc<WalReceivers>, } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index d00bf8c4d8..4057f620a0 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -361,7 +361,6 @@ class PgProtocol: @dataclass class AuthKeys: - pub: str priv: str def generate_token(self, *, scope: str, **token_data: str) -> str: @@ -877,9 +876,31 @@ class NeonEnv: @cached_property def auth_keys(self) -> AuthKeys: - pub = (Path(self.repo_dir) / "auth_public_key.pem").read_text() priv = (Path(self.repo_dir) / "auth_private_key.pem").read_text() - return AuthKeys(pub=pub, priv=priv) + return AuthKeys(priv=priv) + + def regenerate_keys_at(self, privkey_path: Path, pubkey_path: Path): + # compare generate_auth_keys() in local_env.rs + subprocess.run( + ["openssl", "genpkey", "-algorithm", "ed25519", "-out", privkey_path], + cwd=self.repo_dir, + check=True, + ) + + subprocess.run( + [ + "openssl", + "pkey", + "-in", + privkey_path, + "-pubout", + "-out", + pubkey_path, + ], + cwd=self.repo_dir, + check=True, + ) + del self.auth_keys def generate_endpoint_id(self) -> str: """ diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index d2c3715c52..aff7959aa7 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -189,6 +189,10 @@ class PageserverHttpClient(requests.Session): assert res_json is None return res_json + def reload_auth_validation_keys(self): + res = self.post(f"http://localhost:{self.port}/v1/reload_auth_validation_keys") + self.verbose_error(res) + def tenant_list(self) -> List[Dict[Any, Any]]: res = self.get(f"http://localhost:{self.port}/v1/tenant") self.verbose_error(res) diff --git a/test_runner/regress/test_auth.py b/test_runner/regress/test_auth.py index 76b75c1caf..f729bdee98 100644 --- a/test_runner/regress/test_auth.py +++ b/test_runner/regress/test_auth.py @@ -1,12 +1,35 @@ +import os from contextlib import closing +from pathlib import Path import psycopg2 import pytest -from fixtures.neon_fixtures import NeonEnvBuilder, PgProtocol -from fixtures.pageserver.http import PageserverApiException +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + PgProtocol, +) +from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient from fixtures.types import TenantId, TimelineId +def assert_client_authorized(env: NeonEnv, http_client: PageserverHttpClient): + http_client.timeline_create( + pg_version=env.pg_version, + tenant_id=env.initial_tenant, + new_timeline_id=TimelineId.generate(), + ancestor_timeline_id=env.initial_timeline, + ) + + +def assert_client_not_authorized(env: NeonEnv, http_client: PageserverHttpClient): + with pytest.raises( + PageserverApiException, + match="Forbidden: JWT authentication error", + ): + assert_client_authorized(env, http_client) + + def test_pageserver_auth(neon_env_builder: NeonEnvBuilder): neon_env_builder.auth_enabled = True env = neon_env_builder.init_start() @@ -27,30 +50,14 @@ def test_pageserver_auth(neon_env_builder: NeonEnvBuilder): ps.safe_psql("set FOO",
password=pageserver_token) # tenant can create branches - tenant_http_client.timeline_create( - pg_version=env.pg_version, - tenant_id=env.initial_tenant, - new_timeline_id=TimelineId.generate(), - ancestor_timeline_id=env.initial_timeline, - ) + assert_client_authorized(env, tenant_http_client) + # console can create branches for tenant - pageserver_http_client.timeline_create( - pg_version=env.pg_version, - tenant_id=env.initial_tenant, - new_timeline_id=TimelineId.generate(), - ancestor_timeline_id=env.initial_timeline, - ) + assert_client_authorized(env, pageserver_http_client) # fail to create branch using token with different tenant_id - with pytest.raises( - PageserverApiException, match="Forbidden: Tenant id mismatch. Permission denied" - ): - invalid_tenant_http_client.timeline_create( - pg_version=env.pg_version, - tenant_id=env.initial_tenant, - new_timeline_id=TimelineId.generate(), - ancestor_timeline_id=env.initial_timeline, - ) + with pytest.raises(PageserverApiException, match="Forbidden: JWT authentication error"): + assert_client_authorized(env, invalid_tenant_http_client) # create tenant using management token pageserver_http_client.tenant_create(TenantId.generate()) @@ -58,7 +65,7 @@ def test_pageserver_auth(neon_env_builder: NeonEnvBuilder): # fail to create tenant using tenant token with pytest.raises( PageserverApiException, - match="Forbidden: Attempt to access management api with tenant scope. Permission denied", + match="Forbidden: JWT authentication error", ): tenant_http_client.tenant_create(TenantId.generate()) @@ -82,6 +89,96 @@ def test_compute_auth_to_pageserver(neon_env_builder: NeonEnvBuilder): assert cur.fetchone() == (5000050000,) +def test_pageserver_multiple_keys(neon_env_builder: NeonEnvBuilder): + neon_env_builder.auth_enabled = True + env = neon_env_builder.init_start() + env.pageserver.allowed_errors.append(".*Authentication error: InvalidSignature.*") + env.pageserver.allowed_errors.append(".*Unauthorized: malformed jwt token.*") + + pageserver_token_old = env.auth_keys.generate_pageserver_token() + pageserver_http_client_old = env.pageserver.http_client(pageserver_token_old) + + pageserver_http_client_old.reload_auth_validation_keys() + + # This test is to ensure that the pageserver supports multiple keys. + # The neon_local tool generates one key pair at a hardcoded path by default. + # As a preparation for our test, move the public key of the key pair into a + # directory at the same location as the hardcoded path by: + # 1. moving the file at `configured_pub_key_path` to a temporary location + # 2. creating a new directory at `configured_pub_key_path` + # 3.
moving the file from the temporary location into the newly created directory + configured_pub_key_path = Path(env.repo_dir) / "auth_public_key.pem" + os.rename(configured_pub_key_path, Path(env.repo_dir) / "auth_public_key.pem.file") + os.mkdir(configured_pub_key_path) + os.rename( + Path(env.repo_dir) / "auth_public_key.pem.file", + configured_pub_key_path / "auth_public_key_old.pem", + ) + + # Add a new key pair + # This invalidates env.auth_keys and makes them be regenerated + env.regenerate_keys_at( + Path("auth_private_key.pem"), Path("auth_public_key.pem/auth_public_key_new.pem") + ) + + # Reload the keys on the pageserver side + pageserver_http_client_old.reload_auth_validation_keys() + + # We can continue doing things using the old token + assert_client_authorized(env, pageserver_http_client_old) + + pageserver_token_new = env.auth_keys.generate_pageserver_token() + pageserver_http_client_new = env.pageserver.http_client(pageserver_token_new) + + # The new token also works + assert_client_authorized(env, pageserver_http_client_new) + + # Remove the old token and reload + os.remove(Path(env.repo_dir) / "auth_public_key.pem" / "auth_public_key_old.pem") + pageserver_http_client_old.reload_auth_validation_keys() + + # Reloading fails now with the old token, but the new token still works + assert_client_not_authorized(env, pageserver_http_client_old) + assert_client_authorized(env, pageserver_http_client_new) + + +def test_pageserver_key_reload(neon_env_builder: NeonEnvBuilder): + neon_env_builder.auth_enabled = True + env = neon_env_builder.init_start() + env.pageserver.allowed_errors.append(".*Authentication error: InvalidSignature.*") + env.pageserver.allowed_errors.append(".*Unauthorized: malformed jwt token.*") + + pageserver_token_old = env.auth_keys.generate_pageserver_token() + pageserver_http_client_old = env.pageserver.http_client(pageserver_token_old) + + pageserver_http_client_old.reload_auth_validation_keys() + + # Regenerate the keys + env.regenerate_keys_at(Path("auth_private_key.pem"), Path("auth_public_key.pem")) + + # Reload the keys on the pageserver side + pageserver_http_client_old.reload_auth_validation_keys() + + # Next attempt fails as we use the old auth token + with pytest.raises( + PageserverApiException, + match="Forbidden: JWT authentication error", + ): + pageserver_http_client_old.reload_auth_validation_keys() + + # same goes for attempts trying to create a timeline + assert_client_not_authorized(env, pageserver_http_client_old) + + pageserver_token_new = env.auth_keys.generate_pageserver_token() + pageserver_http_client_new = env.pageserver.http_client(pageserver_token_new) + + # timeline creation works with the new token + assert_client_authorized(env, pageserver_http_client_new) + + # reloading also works with the new token + pageserver_http_client_new.reload_auth_validation_keys() + + @pytest.mark.parametrize("auth_enabled", [False, True]) def test_auth_failures(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): neon_env_builder.auth_enabled = auth_enabled diff --git a/test_runner/regress/test_bad_connection.py b/test_runner/regress/test_bad_connection.py new file mode 100644 index 0000000000..ba0624c730 --- /dev/null +++ b/test_runner/regress/test_bad_connection.py @@ -0,0 +1,60 @@ +import random +import time + +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder + + +def test_compute_pageserver_connection_stress(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + 
env.pageserver.allowed_errors.append(".*simulated connection error.*") + + pageserver_http = env.pageserver.http_client() + env.neon_cli.create_branch("test_compute_pageserver_connection_stress") + endpoint = env.endpoints.create_start("test_compute_pageserver_connection_stress") + + # Enable failpoint after starting everything else up so that loading initial + # basebackup doesn't fail + pageserver_http.configure_failpoints(("simulated-bad-compute-connection", "50%return(15)")) + + pg_conn = endpoint.connect() + cur = pg_conn.cursor() + + # Create table, and insert some rows. Make it big enough that it doesn't fit in + # shared_buffers, otherwise the SELECT after restart will just return answer + # from shared_buffers without hitting the page server, which defeats the point + # of this test. + cur.execute("CREATE TABLE foo (t text)") + cur.execute( + """ + INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 100000) g + """ + ) + + # Verify that the table is larger than shared_buffers + cur.execute( + """ + select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size + from pg_settings where name = 'shared_buffers' + """ + ) + row = cur.fetchone() + assert row is not None + log.info(f"shared_buffers is {row[0]}, table size {row[1]}") + assert int(row[0]) < int(row[1]) + + cur.execute("SELECT count(*) FROM foo") + assert cur.fetchone() == (100000,) + + end_time = time.time() + 30 + times_executed = 0 + while time.time() < end_time: + if random.random() < 0.5: + cur.execute("INSERT INTO foo VALUES ('stas'), ('heikki')") + else: + cur.execute("SELECT t FROM foo ORDER BY RANDOM() LIMIT 10") + cur.fetchall() + times_executed += 1 + log.info(f"Workload executed {times_executed} times") diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index b1b47b3f2c..23839a4dd1 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -26,6 +26,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder): ".*will not become active. Current state: Broken.*", ".*failed to load metadata.*", ".*load failed.*load local timeline.*", + ".*layer loading failed permanently: load layer: .*", ] ) diff --git a/test_runner/regress/test_change_pageserver.py b/test_runner/regress/test_change_pageserver.py index 31092b70b9..410bf03c2b 100644 --- a/test_runner/regress/test_change_pageserver.py +++ b/test_runner/regress/test_change_pageserver.py @@ -1,9 +1,13 @@ +import asyncio + from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.remote_storage import RemoteStorageKind def test_change_pageserver(neon_env_builder: NeonEnvBuilder): + num_connections = 3 + neon_env_builder.num_pageservers = 2 neon_env_builder.enable_pageserver_remote_storage( remote_storage_kind=RemoteStorageKind.MOCK_S3, @@ -16,15 +20,24 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder): alt_pageserver_id = env.pageservers[1].id env.pageservers[1].tenant_attach(env.initial_tenant) - pg_conn = endpoint.connect() - cur = pg_conn.cursor() + pg_conns = [endpoint.connect() for i in range(num_connections)] + curs = [pg_conn.cursor() for pg_conn in pg_conns] + + def execute(statement: str): + for cur in curs: + cur.execute(statement) + + def fetchone(): + results = [cur.fetchone() for cur in curs] + assert all(result == results[0] for result in results) + return results[0] # Create table, and insert some rows. 
Make it big enough that it doesn't fit in # shared_buffers, otherwise the SELECT after restart will just return answer # from shared_buffers without hitting the page server, which defeats the point # of this test. - cur.execute("CREATE TABLE foo (t text)") - cur.execute( + curs[0].execute("CREATE TABLE foo (t text)") + curs[0].execute( """ INSERT INTO foo SELECT 'long string to consume some space' || g @@ -33,25 +46,25 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder): ) # Verify that the table is larger than shared_buffers - cur.execute( + curs[0].execute( """ select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size from pg_settings where name = 'shared_buffers' """ ) - row = cur.fetchone() + row = curs[0].fetchone() assert row is not None log.info(f"shared_buffers is {row[0]}, table size {row[1]}") assert int(row[0]) < int(row[1]) - cur.execute("SELECT count(*) FROM foo") - assert cur.fetchone() == (100000,) + execute("SELECT count(*) FROM foo") + assert fetchone() == (100000,) endpoint.reconfigure(pageserver_id=alt_pageserver_id) # Verify that the neon.pageserver_connstring GUC is set to the correct thing - cur.execute("SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'") - connstring = cur.fetchone() + execute("SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'") + connstring = fetchone() assert connstring is not None expected_connstring = f"postgresql://no_user:@localhost:{env.pageservers[1].service_port.pg}" assert expected_connstring == expected_connstring @@ -60,5 +73,45 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder): 0 ].stop() # Stop the old pageserver just to make sure we're reading from the new one - cur.execute("SELECT count(*) FROM foo") - assert cur.fetchone() == (100000,) + execute("SELECT count(*) FROM foo") + assert fetchone() == (100000,) + + # Try failing back, and this time we will stop the current pageserver before reconfiguring + # the endpoint. Whereas the previous reconfiguration was like a healthy migration, this + # is more like what happens in an unexpected pageserver failure. + env.pageservers[0].start() + env.pageservers[1].stop() + + endpoint.reconfigure(pageserver_id=env.pageservers[0].id) + + execute("SELECT count(*) FROM foo") + assert fetchone() == (100000,) + + env.pageservers[0].stop() + env.pageservers[1].start() + + # Test a (former) bug where a child process spins without updating its connection string + # by executing a query separately. This query will hang until we issue the reconfigure. 
+ async def reconfigure_async(): + await asyncio.sleep( + 1 + ) # Sleep for 1 second just to make sure we actually started our count(*) query + endpoint.reconfigure(pageserver_id=env.pageservers[1].id) + + def execute_count(): + execute("SELECT count(*) FROM FOO") + + async def execute_and_reconfigure(): + task_exec = asyncio.to_thread(execute_count) + task_reconfig = asyncio.create_task(reconfigure_async()) + await asyncio.gather( + task_exec, + task_reconfig, + ) + + asyncio.run(execute_and_reconfigure()) + assert fetchone() == (100000,) + + # One final check that nothing hangs + execute("SELECT count(*) FROM foo") + assert fetchone() == (100000,) diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py index cb0e9a358c..ffab540e01 100644 --- a/test_runner/regress/test_lsn_mapping.py +++ b/test_runner/regress/test_lsn_mapping.py @@ -79,11 +79,27 @@ def test_lsn_mapping_old(neon_env_builder: NeonEnvBuilder): def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() - new_timeline_id = env.neon_cli.create_branch("test_lsn_mapping") - endpoint_main = env.endpoints.create_start("test_lsn_mapping") - log.info("postgres is running on 'test_lsn_mapping' branch") + tenant_id, _ = env.neon_cli.create_tenant( + conf={ + # disable default GC and compaction + "gc_period": "1000 m", + "compaction_period": "0 s", + "gc_horizon": f"{1024 ** 2}", + "checkpoint_distance": f"{1024 ** 2}", + "compaction_target_size": f"{1024 ** 2}", + } + ) + + timeline_id = env.neon_cli.create_branch("test_lsn_mapping", tenant_id=tenant_id) + endpoint_main = env.endpoints.create_start("test_lsn_mapping", tenant_id=tenant_id) + timeline_id = endpoint_main.safe_psql("show neon.timeline_id")[0][0] + log.info("postgres is running on 'main' branch") cur = endpoint_main.connect().cursor() + + # Obtain an lsn before all write operations on this branch + start_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_lsn()")) + # Create table, and insert rows, each in a separate transaction # Disable `synchronous_commit` to make this initialization go faster. 
# XXX: on my laptop this test takes 7s, and setting `synchronous_commit=off` @@ -106,29 +122,33 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): cur.execute("INSERT INTO foo VALUES (-1)") # Wait until WAL is received by pageserver - wait_for_last_flush_lsn(env, endpoint_main, env.initial_tenant, new_timeline_id) + last_flush_lsn = wait_for_last_flush_lsn(env, endpoint_main, tenant_id, timeline_id) with env.pageserver.http_client() as client: # Check edge cases # Timestamp is in the future probe_timestamp = tbl[-1][1] + timedelta(hours=1) result = client.timeline_get_lsn_by_timestamp( - env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z", 2 + tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z", 2 ) assert result["kind"] in ["present", "future"] + # make sure that we return a well advanced lsn here + assert Lsn(result["lsn"]) > start_lsn # Timestamp is in the unreachable past probe_timestamp = tbl[0][1] - timedelta(hours=10) result = client.timeline_get_lsn_by_timestamp( - env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z", 2 + tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z", 2 ) assert result["kind"] == "past" + # make sure that we return the minimum lsn here at the start of the range + assert Lsn(result["lsn"]) < start_lsn # Probe a bunch of timestamps in the valid range for i in range(1, len(tbl), 100): probe_timestamp = tbl[i][1] result = client.timeline_get_lsn_by_timestamp( - env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z", 2 + tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z", 2 ) assert result["kind"] not in ["past", "nodata"] lsn = result["lsn"] @@ -136,12 +156,29 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): # Launch a new read-only node at that LSN, and check that only the rows # that were supposed to be committed at that point in time are visible. endpoint_here = env.endpoints.create_start( - branch_name="test_lsn_mapping", endpoint_id="ep-lsn_mapping_read", lsn=lsn + branch_name="test_lsn_mapping", + endpoint_id="ep-lsn_mapping_read", + lsn=lsn, + tenant_id=tenant_id, ) assert endpoint_here.safe_psql("SELECT max(x) FROM foo")[0][0] == i endpoint_here.stop_and_destroy() + # Do the "past" check again at a new branch to ensure that we don't return something before the branch cutoff + timeline_id_child = env.neon_cli.create_branch( + "test_lsn_mapping_child", tenant_id=tenant_id, ancestor_branch_name="test_lsn_mapping" + ) + + # Timestamp is in the unreachable past + probe_timestamp = tbl[0][1] - timedelta(hours=10) + result = client.timeline_get_lsn_by_timestamp( + tenant_id, timeline_id_child, f"{probe_timestamp.isoformat()}Z", 2 + ) + assert result["kind"] == "past" + # make sure that we return the minimum lsn here at the start of the range + assert Lsn(result["lsn"]) >= last_flush_lsn + # Test pageserver get_timestamp_of_lsn API def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder): diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 3e5021ae06..c3f4ad476f 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -366,11 +366,17 @@ def test_deletion_queue_recovery( assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0 if validate_before == ValidateBefore.VALIDATE: + # At this point, one or more DeletionLists have been written. 
We have set a failpoint + # to prevent them successfully executing, but we want to see them get validated. + # + # We await _some_ validations instead of _all_ validations, because our execution failpoint + # will prevent validation proceeding for any but the first DeletionList. Usually the workload + # just generates one, but if it generates two due to timing, then we must not expect that the + # second one will be validated. + def assert_some_validations(): + assert get_deletion_queue_validated(ps_http) > 0 - def assert_validation_complete(): - assert get_deletion_queue_submitted(ps_http) == get_deletion_queue_validated(ps_http) - - wait_until(20, 1, assert_validation_complete) + wait_until(20, 1, assert_some_validations) # The validatated keys statistic advances before the header is written, so we # also wait to see the header hit the disk: this seems paranoid but the race @@ -380,6 +386,11 @@ def test_deletion_queue_recovery( wait_until(20, 1, assert_header_written) + # If we will lose attachment, then our expectation on restart is that only the ones + # we already validated will execute. Act like only those were present in the queue. + if keep_attachment == KeepAttachment.LOSE: + before_restart_depth = get_deletion_queue_validated(ps_http) + log.info(f"Restarting pageserver with {before_restart_depth} deletions enqueued") env.pageserver.stop(immediate=True) @@ -402,11 +413,13 @@ def test_deletion_queue_recovery( ps_http.deletion_queue_flush(execute=True) wait_until(10, 1, lambda: assert_deletion_queue(ps_http, lambda n: n == 0)) - if keep_attachment == KeepAttachment.KEEP or validate_before == ValidateBefore.VALIDATE: + if keep_attachment == KeepAttachment.KEEP: # - If we kept the attachment, then our pre-restart deletions should execute # because on re-attach they were from the immediately preceding generation - # - If we validated before restart, then the deletions should execute because the - # deletion queue header records a validated deletion list sequence number. + assert get_deletion_queue_executed(ps_http) == before_restart_depth + elif validate_before == ValidateBefore.VALIDATE: + # - If we validated before restart, then we should execute however many keys were + # validated before restart. assert get_deletion_queue_executed(ps_http) == before_restart_depth else: env.pageserver.allowed_errors.extend([".*Dropping stale deletions.*"]) diff --git a/test_runner/regress/test_pageserver_restarts_under_workload.py b/test_runner/regress/test_pageserver_restarts_under_workload.py index d07b8dbe6b..65569f3bac 100644 --- a/test_runner/regress/test_pageserver_restarts_under_workload.py +++ b/test_runner/regress/test_pageserver_restarts_under_workload.py @@ -17,10 +17,6 @@ def test_pageserver_restarts_under_worload(neon_simple_env: NeonEnv, pg_bin: PgB n_restarts = 10 scale = 10 - # Pageserver currently logs requests on non-active tenants at error level - # https://github.com/neondatabase/neon/issues/5784 - env.pageserver.allowed_errors.append(".* will not become active. 
Current state: Stopping.*") - def run_pgbench(connstr: str): log.info(f"Start a pgbench workload on pg {connstr}") pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr]) diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index e2a65ad150..3535c2cf94 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -25,6 +25,7 @@ chrono = { version = "0.4", default-features = false, features = ["clock", "serd clap = { version = "4", features = ["derive", "string"] } clap_builder = { version = "4", default-features = false, features = ["color", "help", "std", "string", "suggestions", "usage"] } crossbeam-utils = { version = "0.8" } +dashmap = { version = "5", default-features = false, features = ["raw-api"] } either = { version = "1" } fail = { version = "0.5", default-features = false, features = ["failpoints"] } futures = { version = "0.3" } @@ -38,7 +39,7 @@ hex = { version = "0.4", features = ["serde"] } hyper = { version = "0.14", features = ["full"] } itertools = { version = "0.10" } libc = { version = "0.2", features = ["extra_traits"] } -log = { version = "0.4", default-features = false, features = ["kv_unstable", "std"] } +log = { version = "0.4", default-features = false, features = ["std"] } memchr = { version = "2" } nom = { version = "7" } num-bigint = { version = "0.4" } @@ -55,7 +56,6 @@ scopeguard = { version = "1" } serde = { version = "1", features = ["alloc", "derive"] } serde_json = { version = "1", features = ["raw_value"] } smallvec = { version = "1", default-features = false, features = ["write"] } -standback = { version = "0.2", default-features = false, features = ["std"] } time = { version = "0.3", features = ["local-offset", "macros", "serde-well-known"] } tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] } tokio-rustls = { version = "0.24" } @@ -76,14 +76,13 @@ cc = { version = "1", default-features = false, features = ["parallel"] } either = { version = "1" } itertools = { version = "0.10" } libc = { version = "0.2", features = ["extra_traits"] } -log = { version = "0.4", default-features = false, features = ["kv_unstable", "std"] } +log = { version = "0.4", default-features = false, features = ["std"] } memchr = { version = "2" } nom = { version = "7" } prost = { version = "0.11" } regex = { version = "1" } regex-syntax = { version = "0.7" } serde = { version = "1", features = ["alloc", "derive"] } -standback = { version = "0.2", default-features = false, features = ["std"] } syn-dff4ba8e3ae991db = { package = "syn", version = "1", features = ["extra-traits", "full", "visit"] } syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "full", "visit", "visit-mut"] } time-macros = { version = "0.2", default-features = false, features = ["formatting", "parsing", "serde"] }