diff --git a/.config/hakari.toml b/.config/hakari.toml index 9991cd92b0..dcbc44cc33 100644 --- a/.config/hakari.toml +++ b/.config/hakari.toml @@ -21,13 +21,14 @@ platforms = [ # "x86_64-apple-darwin", # "x86_64-pc-windows-msvc", ] - [final-excludes] workspace-members = [ # vm_monitor benefits from the same Cargo.lock as the rest of our artifacts, but # it is built primarly in separate repo neondatabase/autoscaling and thus is excluded # from depending on workspace-hack because most of the dependencies are not used. "vm_monitor", + # subzero-core is a stub crate that should be excluded from workspace-hack + "subzero-core", # All of these exist in libs and are not usually built independently. # Putting workspace hack there adds a bottleneck for cargo builds. "compute_api", diff --git a/.github/actions/prepare-for-subzero/action.yml b/.github/actions/prepare-for-subzero/action.yml new file mode 100644 index 0000000000..11beb11880 --- /dev/null +++ b/.github/actions/prepare-for-subzero/action.yml @@ -0,0 +1,28 @@ +name: 'Prepare current job for subzero' +description: > + Set git token to access `neondatabase/subzero` from cargo build, + and set `CARGO_NET_GIT_FETCH_WITH_CLI=true` env variable to use git CLI + +inputs: + token: + description: 'GitHub token with access to neondatabase/subzero' + required: true + +runs: + using: "composite" + + steps: + - name: Set git token for neondatabase/subzero + uses: pyTooling/Actions/with-post-step@2307b526df64d55e95884e072e49aac2a00a9afa # v5.1.0 + env: + SUBZERO_ACCESS_TOKEN: ${{ inputs.token }} + with: + main: | + git config --global url."https://x-access-token:${SUBZERO_ACCESS_TOKEN}@github.com/neondatabase/subzero".insteadOf "https://github.com/neondatabase/subzero" + cargo add -p proxy subzero-core --git https://github.com/neondatabase/subzero --rev 396264617e78e8be428682f87469bb25429af88a + post: | + git config --global --unset url."https://x-access-token:${SUBZERO_ACCESS_TOKEN}@github.com/neondatabase/subzero".insteadOf "https://github.com/neondatabase/subzero" + + - name: Set `CARGO_NET_GIT_FETCH_WITH_CLI=true` env variable + shell: bash -euxo pipefail {0} + run: echo "CARGO_NET_GIT_FETCH_WITH_CLI=true" >> ${GITHUB_ENV} diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index b3e68ab606..1f2012358e 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -181,6 +181,8 @@ runs: # Ref https://github.com/neondatabase/neon/issues/4540 # cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run) cov_prefix=() + # Explicitly set LLVM_PROFILE_FILE to /dev/null to avoid writing *.profraw files + export LLVM_PROFILE_FILE=/dev/null else cov_prefix=() fi diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 94115572df..1b03dc9c03 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -86,6 +86,10 @@ jobs: with: submodules: true + - uses: ./.github/actions/prepare-for-subzero + with: + token: ${{ secrets.CI_ACCESS_TOKEN }} + - name: Set pg 14 revision for caching id: pg_v14_rev run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT @@ -116,7 +120,7 @@ jobs: ARCH: ${{ inputs.arch }} SANITIZERS: ${{ inputs.sanitizers }} run: | - CARGO_FLAGS="--locked --features testing" + CARGO_FLAGS="--locked --features testing,rest_broker" if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' ]]; then 
cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run" CARGO_PROFILE="" diff --git a/.github/workflows/_check-codestyle-rust.yml b/.github/workflows/_check-codestyle-rust.yml index 4f844b0bf6..af29e10e97 100644 --- a/.github/workflows/_check-codestyle-rust.yml +++ b/.github/workflows/_check-codestyle-rust.yml @@ -46,6 +46,10 @@ jobs: uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: submodules: true + + - uses: ./.github/actions/prepare-for-subzero + with: + token: ${{ secrets.CI_ACCESS_TOKEN }} - name: Cache cargo deps uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1 # v1.8.0 diff --git a/.github/workflows/build-macos.yml b/.github/workflows/build-macos.yml index 2296807d2d..e43eec1133 100644 --- a/.github/workflows/build-macos.yml +++ b/.github/workflows/build-macos.yml @@ -54,6 +54,10 @@ jobs: uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: submodules: true + + - uses: ./.github/actions/prepare-for-subzero + with: + token: ${{ secrets.CI_ACCESS_TOKEN }} - name: Install build dependencies run: | diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index cc9534f05d..f237a991cc 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -87,22 +87,27 @@ jobs: uses: ./.github/workflows/build-build-tools-image.yml secrets: inherit - lint-openapi-spec: - runs-on: ubuntu-22.04 - needs: [ meta, check-permissions ] + lint-yamls: + needs: [ meta, check-permissions, build-build-tools-image ] # We do need to run this in `.*-rc-pr` because of hotfixes. if: ${{ contains(fromJSON('["pr", "push-main", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} + runs-on: [ self-hosted, small ] + container: + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + options: --init + steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 with: egress-policy: audit + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} + + - run: make -C compute manifest-schema-validation - run: make lint-openapi-spec check-codestyle-python: @@ -217,28 +222,6 @@ jobs: build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm secrets: inherit - validate-compute-manifest: - runs-on: ubuntu-22.04 - needs: [ meta, check-permissions ] - # We do need to run this in `.*-rc-pr` because of hotfixes. 
- if: ${{ contains(fromJSON('["pr", "push-main", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} - steps: - - name: Harden the runner (Audit all outbound calls) - uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 - with: - egress-policy: audit - - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - - name: Set up Node.js - uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4.4.0 - with: - node-version: '24' - - - name: Validate manifest against schema - run: | - make -C compute manifest-schema-validation - build-and-test-locally: needs: [ meta, build-build-tools-image ] # We do need to run this in `.*-rc-pr` because of hotfixes. @@ -649,6 +632,8 @@ jobs: BUILD_TAG=${{ needs.meta.outputs.release-tag || needs.meta.outputs.build-tag }} TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-bookworm DEBIAN_VERSION=bookworm + secrets: | + SUBZERO_ACCESS_TOKEN=${{ secrets.CI_ACCESS_TOKEN }} provenance: false push: true pull: true diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 3e81183687..10ca1a1591 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -72,6 +72,7 @@ jobs: check-macos-build: needs: [ check-permissions, files-changed ] uses: ./.github/workflows/build-macos.yml + secrets: inherit with: pg_versions: ${{ needs.files-changed.outputs.postgres_changes }} rebuild_rust_code: ${{ fromJSON(needs.files-changed.outputs.rebuild_rust_code) }} diff --git a/.gitignore b/.gitignore index 4857972f1d..1e1c2316af 100644 --- a/.gitignore +++ b/.gitignore @@ -26,6 +26,14 @@ docker-compose/docker-compose-parallel.yml *.o *.so *.Po +*.pid # pgindent typedef lists *.list + +# Node +**/node_modules/ + +# various files for local testing +/proxy/.subzero +local_proxy.json diff --git a/Cargo.lock b/Cargo.lock index 2f36790d30..f503b45577 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -52,6 +52,12 @@ dependencies = [ "memchr", ] +[[package]] +name = "aliasable" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "250f629c0161ad8107cf89319e990051fae62832fd343083bea452d93e2205fd" + [[package]] name = "aligned-vec" version = "0.6.1" @@ -490,7 +496,7 @@ dependencies = [ "hex", "hmac", "http 0.2.9", - "http 1.1.0", + "http 1.3.1", "once_cell", "p256 0.11.1", "percent-encoding", @@ -631,7 +637,7 @@ dependencies = [ "aws-smithy-types", "bytes", "http 0.2.9", - "http 1.1.0", + "http 1.3.1", "pin-project-lite", "tokio", "tracing", @@ -649,7 +655,7 @@ dependencies = [ "bytes-utils", "futures-core", "http 0.2.9", - "http 1.1.0", + "http 1.3.1", "http-body 0.4.5", "http-body 1.0.0", "http-body-util", @@ -698,7 +704,7 @@ dependencies = [ "bytes", "form_urlencoded", "futures-util", - "http 1.1.0", + "http 1.3.1", "http-body 1.0.0", "http-body-util", "hyper 1.4.1", @@ -732,7 +738,7 @@ checksum = "df1362f362fd16024ae199c1970ce98f9661bf5ef94b9808fee734bc3698b733" dependencies = [ "bytes", "futures-util", - "http 1.1.0", + "http 1.3.1", "http-body 1.0.0", "http-body-util", "mime", @@ -756,7 +762,7 @@ dependencies = [ "form_urlencoded", "futures-util", "headers", - "http 1.1.0", + "http 1.3.1", "http-body 1.0.0", "http-body-util", "mime", @@ -1090,8 +1096,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "975982cdb7ad6a142be15bdf84aea7ec6a9e5d4d797c004d43185b24cfe4e684" dependencies = [ "clap", - "heck", - "indexmap 2.9.0", + "heck 0.5.0", 
+ "indexmap 2.10.0", "log", "proc-macro2", "quote", @@ -1228,7 +1234,7 @@ version = "4.5.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab" dependencies = [ - "heck", + "heck 0.5.0", "proc-macro2", "quote", "syn 2.0.100", @@ -1290,8 +1296,14 @@ dependencies = [ name = "communicator" version = "0.1.0" dependencies = [ + "axum", "cbindgen", - "neon-shmem", + "http 1.3.1", + "measured", + "tokio", + "tracing", + "tracing-subscriber", + "utils", "workspace_hack", ] @@ -1301,7 +1313,7 @@ version = "0.1.0" dependencies = [ "anyhow", "chrono", - "indexmap 2.9.0", + "indexmap 2.10.0", "jsonwebtoken", "regex", "remote_storage", @@ -1334,8 +1346,11 @@ dependencies = [ "flate2", "futures", "hostname-validator", - "http 1.1.0", - "indexmap 2.9.0", + "http 1.3.1", + "http-body-util", + "hyper 1.4.1", + "hyper-util", + "indexmap 2.10.0", "itertools 0.10.5", "jsonwebtoken", "metrics", @@ -1357,6 +1372,7 @@ dependencies = [ "ring", "rlimit", "rust-ini", + "scopeguard", "serde", "serde_json", "serde_with", @@ -1367,7 +1383,7 @@ dependencies = [ "tokio-postgres", "tokio-stream", "tokio-util", - "tonic 0.13.1", + "tonic", "tower 0.5.2", "tower-http", "tower-otel", @@ -1445,7 +1461,7 @@ name = "consumption_metrics" version = "0.1.0" dependencies = [ "chrono", - "rand 0.8.5", + "rand 0.9.1", "serde", ] @@ -1848,7 +1864,7 @@ dependencies = [ "bytes", "hex", "parking_lot 0.12.1", - "rand 0.8.5", + "rand 0.9.1", "smallvec", "tracing", "utils", @@ -1872,6 +1888,7 @@ dependencies = [ "diesel_derives", "itoa", "serde_json", + "uuid", ] [[package]] @@ -1968,7 +1985,7 @@ checksum = "0892a17df262a24294c382f0d5997571006e7a4348b4327557c4ff1cd4a8bccc" dependencies = [ "darling", "either", - "heck", + "heck 0.5.0", "proc-macro2", "quote", "syn 2.0.100", @@ -2092,7 +2109,7 @@ dependencies = [ "itertools 0.10.5", "jsonwebtoken", "prometheus", - "rand 0.8.5", + "rand 0.9.1", "remote_storage", "serde", "serde_json", @@ -2533,6 +2550,18 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "getrandom" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4" +dependencies = [ + "cfg-if", + "libc", + "r-efi", + "wasi 0.14.2+wasi-0.2.4", +] + [[package]] name = "gettid" version = "0.1.3" @@ -2630,7 +2659,7 @@ dependencies = [ "futures-sink", "futures-util", "http 0.2.9", - "indexmap 2.9.0", + "indexmap 2.10.0", "slab", "tokio", "tokio-util", @@ -2648,8 +2677,8 @@ dependencies = [ "futures-core", "futures-sink", "futures-util", - "http 1.1.0", - "indexmap 2.9.0", + "http 1.3.1", + "indexmap 2.10.0", "slab", "tokio", "tokio-util", @@ -2730,7 +2759,7 @@ dependencies = [ "base64 0.21.7", "bytes", "headers-core", - "http 1.1.0", + "http 1.3.1", "httpdate", "mime", "sha1", @@ -2742,9 +2771,15 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "54b4a22553d4242c49fddb9ba998a99962b5cc6f22cb5a3482bec22522403ce4" dependencies = [ - "http 1.1.0", + "http 1.3.1", ] +[[package]] +name = "heck" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" + [[package]] name = "heck" version = "0.5.0" @@ -2820,9 +2855,9 @@ dependencies = [ [[package]] name = "http" -version = "1.1.0" +version = "1.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"21b9ddb458710bc376481b842f5da65cdf31522de232c1ca8146abce2a358258" +checksum = "f4a85d31aea989eead29a3aaf9e1115a180df8282431156e533de47660892565" dependencies = [ "bytes", "fnv", @@ -2847,7 +2882,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1cac85db508abc24a2e48553ba12a996e87244a0395ce011e62b37158745d643" dependencies = [ "bytes", - "http 1.1.0", + "http 1.3.1", ] [[package]] @@ -2858,7 +2893,7 @@ checksum = "793429d76616a256bcb62c2a2ec2bed781c8307e797e2598c50010f2bee2544f" dependencies = [ "bytes", "futures-util", - "http 1.1.0", + "http 1.3.1", "http-body 1.0.0", "pin-project-lite", ] @@ -2902,7 +2937,7 @@ dependencies = [ "pprof", "regex", "routerify", - "rustls 0.23.27", + "rustls 0.23.29", "rustls-pemfile 2.1.1", "serde", "serde_json", @@ -2982,7 +3017,7 @@ dependencies = [ "futures-channel", "futures-util", "h2 0.4.4", - "http 1.1.0", + "http 1.3.1", "http-body 1.0.0", "httparse", "httpdate", @@ -3015,7 +3050,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a0bea761b46ae2b24eb4aef630d8d1c398157b6fc29e6350ecf090a0b70c952c" dependencies = [ "futures-util", - "http 1.1.0", + "http 1.3.1", "hyper 1.4.1", "hyper-util", "rustls 0.22.4", @@ -3047,7 +3082,7 @@ dependencies = [ "bytes", "futures-channel", "futures-util", - "http 1.1.0", + "http 1.3.1", "http-body 1.0.0", "hyper 1.4.1", "pin-project-lite", @@ -3239,9 +3274,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.9.0" +version = "2.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cea70ddb795996207ad57735b50c5982d8844f38ba9ee5f1aedcfb708a2aa11e" +checksum = "fe4cd85333e22411419a0bcae1297d25e58c9443848b11dc6a86fefe8c78a661" dependencies = [ "equivalent", "hashbrown 0.15.2", @@ -3267,7 +3302,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "232929e1d75fe899576a3d5c7416ad0d88dbfbb3c3d6aa00873a7408a50ddb88" dependencies = [ "ahash", - "indexmap 2.9.0", + "indexmap 2.10.0", "is-terminal", "itoa", "log", @@ -3290,7 +3325,7 @@ dependencies = [ "crossbeam-utils", "dashmap 6.1.0", "env_logger", - "indexmap 2.9.0", + "indexmap 2.10.0", "itoa", "log", "num-format", @@ -3606,9 +3641,9 @@ checksum = "4ee93343901ab17bd981295f2cf0026d4ad018c7c31ba84549a4ddbb47a45104" [[package]] name = "lock_api" -version = "0.4.10" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1cc9717a20b1bb222f333e6a92fd32f7d8a18ddc5a3191a11af45dcbf4dcd16" +checksum = "96936507f153605bddfcda068dd804796c84324ed2510809e5b2a624c81da765" dependencies = [ "autocfg", "scopeguard", @@ -3696,7 +3731,7 @@ version = "0.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b9e6777fc80a575f9503d908c8b498782a6c3ee88a06cb416dc3941401e43b94" dependencies = [ - "heck", + "heck 0.5.0", "proc-macro2", "quote", "syn 2.0.100", @@ -3757,7 +3792,7 @@ dependencies = [ "once_cell", "procfs", "prometheus", - "rand 0.8.5", + "rand 0.9.1", "rand_distr", "twox-hash", ] @@ -3846,7 +3881,12 @@ checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" name = "neon-shmem" version = "0.1.0" dependencies = [ + "libc", + "lock_api", "nix 0.30.1", + "rand 0.9.1", + "rand_distr", + "rustc-hash 2.1.1", "tempfile", "thiserror 1.0.69", "workspace_hack", @@ -4122,86 +4162,81 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "opentelemetry" -version = "0.27.1" +version = "0.30.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab70038c28ed37b97d8ed414b6429d343a8bbf44c9f79ec854f3a643029ba6d7" +checksum = "aaf416e4cb72756655126f7dd7bb0af49c674f4c1b9903e80c009e0c37e552e6" dependencies = [ "futures-core", "futures-sink", "js-sys", "pin-project-lite", - "thiserror 1.0.69", + "thiserror 2.0.11", "tracing", ] [[package]] name = "opentelemetry-http" -version = "0.27.0" +version = "0.30.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10a8a7f5f6ba7c1b286c2fbca0454eaba116f63bbe69ed250b642d36fbb04d80" +checksum = "50f6639e842a97dbea8886e3439710ae463120091e2e064518ba8e716e6ac36d" dependencies = [ "async-trait", "bytes", - "http 1.1.0", + "http 1.3.1", "opentelemetry", "reqwest", ] [[package]] name = "opentelemetry-otlp" -version = "0.27.0" +version = "0.30.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91cf61a1868dacc576bf2b2a1c3e9ab150af7272909e80085c3173384fe11f76" +checksum = "dbee664a43e07615731afc539ca60c6d9f1a9425e25ca09c57bc36c87c55852b" dependencies = [ - "async-trait", - "futures-core", - "http 1.1.0", + "http 1.3.1", "opentelemetry", "opentelemetry-http", "opentelemetry-proto", "opentelemetry_sdk", "prost 0.13.5", "reqwest", - "thiserror 1.0.69", + "thiserror 2.0.11", ] [[package]] name = "opentelemetry-proto" -version = "0.27.0" +version = "0.30.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6e05acbfada5ec79023c85368af14abd0b307c015e9064d249b2a950ef459a6" +checksum = "2e046fd7660710fe5a05e8748e70d9058dc15c94ba914e7c4faa7c728f0e8ddc" dependencies = [ "opentelemetry", "opentelemetry_sdk", "prost 0.13.5", - "tonic 0.12.3", + "tonic", ] [[package]] name = "opentelemetry-semantic-conventions" -version = "0.27.0" +version = "0.30.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc1b6902ff63b32ef6c489e8048c5e253e2e4a803ea3ea7e783914536eb15c52" +checksum = "83d059a296a47436748557a353c5e6c5705b9470ef6c95cfc52c21a8814ddac2" [[package]] name = "opentelemetry_sdk" -version = "0.27.1" +version = "0.30.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "231e9d6ceef9b0b2546ddf52335785ce41252bc7474ee8ba05bfad277be13ab8" +checksum = "11f644aa9e5e31d11896e024305d7e3c98a88884d9f8919dbf37a9991bc47a4b" dependencies = [ - "async-trait", "futures-channel", "futures-executor", "futures-util", - "glob", "opentelemetry", "percent-encoding", - "rand 0.8.5", + "rand 0.9.1", "serde_json", - "thiserror 1.0.69", + "thiserror 2.0.11", "tokio", "tokio-stream", - "tracing", ] [[package]] @@ -4234,6 +4269,30 @@ dependencies = [ "winapi", ] +[[package]] +name = "ouroboros" +version = "0.18.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e0f050db9c44b97a94723127e6be766ac5c340c48f2c4bb3ffa11713744be59" +dependencies = [ + "aliasable", + "ouroboros_macro", + "static_assertions", +] + +[[package]] +name = "ouroboros_macro" +version = "0.18.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c7028bdd3d43083f6d8d4d5187680d0d3560d54df4cc9d752005268b41e64d0" +dependencies = [ + "heck 0.4.1", + "proc-macro2", + "proc-macro2-diagnostics", + "quote", + "syn 2.0.100", +] + [[package]] name = "outref" version = "0.5.1" @@ -4297,14 +4356,14 @@ dependencies = [ "pageserver_client_grpc", "pageserver_page_api", "pprof", - "rand 0.8.5", + "rand 0.9.1", "reqwest", "serde", "serde_json", "tokio", "tokio-stream", "tokio-util", - "tonic 0.13.1", + "tonic", "tracing", "url", 
"utils", @@ -4363,7 +4422,7 @@ dependencies = [ "hashlink", "hex", "hex-literal", - "http 1.1.0", + "http 1.3.1", "http-utils", "humantime", "humantime-serde", @@ -4394,14 +4453,14 @@ dependencies = [ "pprof", "pq_proto", "procfs", - "rand 0.8.5", + "rand 0.9.1", "range-set-blaze", "regex", "remote_storage", "reqwest", "rpds", "rstest", - "rustls 0.23.27", + "rustls 0.23.29", "scopeguard", "send-future", "serde", @@ -4425,7 +4484,7 @@ dependencies = [ "tokio-tar", "tokio-util", "toml_edit", - "tonic 0.13.1", + "tonic", "tonic-reflection", "tower 0.5.2", "tracing", @@ -4461,7 +4520,7 @@ dependencies = [ "postgres_ffi_types", "postgres_versioninfo", "posthog_client_lite", - "rand 0.8.5", + "rand 0.9.1", "remote_storage", "reqwest", "serde", @@ -4511,7 +4570,7 @@ dependencies = [ "tokio", "tokio-stream", "tokio-util", - "tonic 0.13.1", + "tonic", "tracing", "utils", "workspace_hack", @@ -4531,7 +4590,7 @@ dependencies = [ "once_cell", "pageserver_api", "pin-project-lite", - "rand 0.8.5", + "rand 0.9.1", "svg_fmt", "tokio", "tracing", @@ -4556,7 +4615,7 @@ dependencies = [ "thiserror 1.0.69", "tokio", "tokio-util", - "tonic 0.13.1", + "tonic", "tonic-build", "utils", "workspace_hack", @@ -4904,7 +4963,7 @@ dependencies = [ "fallible-iterator", "hmac", "memchr", - "rand 0.8.5", + "rand 0.9.1", "sha2", "stringprep", "tokio", @@ -4938,7 +4997,7 @@ dependencies = [ "bytes", "once_cell", "pq_proto", - "rustls 0.23.27", + "rustls 0.23.29", "rustls-pemfile 2.1.1", "serde", "thiserror 1.0.69", @@ -5096,7 +5155,7 @@ dependencies = [ "bytes", "itertools 0.10.5", "postgres-protocol", - "rand 0.8.5", + "rand 0.9.1", "serde", "thiserror 1.0.69", "tokio", @@ -5130,6 +5189,19 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "proc-macro2-diagnostics" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af066a9c399a26e020ada66a034357a868728e72cd426f3adcd35f80d88d88c8" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.100", + "version_check", + "yansi", +] + [[package]] name = "procfs" version = "0.16.0" @@ -5199,7 +5271,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "22505a5c94da8e3b7c2996394d1c933236c4d743e81a410bcca4e6989fc066a4" dependencies = [ "bytes", - "heck", + "heck 0.5.0", "itertools 0.12.1", "log", "multimap", @@ -5220,7 +5292,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c1318b19085f08681016926435853bbf7858f9c082d0999b80550ff5d9abe15" dependencies = [ "bytes", - "heck", + "heck 0.5.0", "itertools 0.12.1", "log", "multimap", @@ -5316,7 +5388,7 @@ dependencies = [ "hex", "hmac", "hostname", - "http 1.1.0", + "http 1.3.1", "http-body-util", "http-utils", "humantime", @@ -5324,7 +5396,7 @@ dependencies = [ "hyper 0.14.30", "hyper 1.4.1", "hyper-util", - "indexmap 2.9.0", + "indexmap 2.10.0", "ipnet", "itertools 0.10.5", "itoa", @@ -5336,6 +5408,7 @@ dependencies = [ "metrics", "once_cell", "opentelemetry", + "ouroboros", "p256 0.13.2", "papaya", "parking_lot 0.12.1", @@ -5346,7 +5419,8 @@ dependencies = [ "postgres-protocol2", "postgres_backend", "pq_proto", - "rand 0.8.5", + "rand 0.9.1", + "rand_core 0.6.4", "rand_distr", "rcgen", "redis", @@ -5358,8 +5432,8 @@ dependencies = [ "reqwest-tracing", "rsa", "rstest", - "rustc-hash 1.1.0", - "rustls 0.23.27", + "rustc-hash 2.1.1", + "rustls 0.23.29", "rustls-native-certs 0.8.0", "rustls-pemfile 2.1.1", "scopeguard", @@ -5372,6 +5446,7 @@ dependencies = [ "socket2", "strum_macros", "subtle", + "subzero-core", "thiserror 
1.0.69", "tikv-jemalloc-ctl", "tikv-jemallocator", @@ -5451,6 +5526,12 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "r-efi" +version = "5.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" + [[package]] name = "rand" version = "0.7.3" @@ -5475,6 +5556,16 @@ dependencies = [ "rand_core 0.6.4", ] +[[package]] +name = "rand" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fbfd9d094a40bf3ae768db9361049ace4c0e04a4fd6b359518bd7b73a73dd97" +dependencies = [ + "rand_chacha 0.9.0", + "rand_core 0.9.3", +] + [[package]] name = "rand_chacha" version = "0.2.2" @@ -5495,6 +5586,16 @@ dependencies = [ "rand_core 0.6.4", ] +[[package]] +name = "rand_chacha" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +dependencies = [ + "ppv-lite86", + "rand_core 0.9.3", +] + [[package]] name = "rand_core" version = "0.5.1" @@ -5514,13 +5615,22 @@ dependencies = [ ] [[package]] -name = "rand_distr" -version = "0.4.3" +name = "rand_core" +version = "0.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32cb0b9bc82b0a0876c2dd994a7e7a2683d3e7390ca40e6886785ef0c7e3ee31" +checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38" +dependencies = [ + "getrandom 0.3.3", +] + +[[package]] +name = "rand_distr" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a8615d50dcf34fa31f7ab52692afec947c4dd0ab803cc87cb3b0b4570ff7463" dependencies = [ "num-traits", - "rand 0.8.5", + "rand 0.9.1", ] [[package]] @@ -5602,7 +5712,7 @@ dependencies = [ "num-bigint", "percent-encoding", "pin-project-lite", - "rustls 0.23.27", + "rustls 0.23.29", "rustls-native-certs 0.8.0", "ryu", "sha1_smol", @@ -5642,14 +5752,14 @@ dependencies = [ [[package]] name = "regex" -version = "1.10.2" +version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343" +checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" dependencies = [ "aho-corasick", "memchr", - "regex-automata 0.4.3", - "regex-syntax 0.8.2", + "regex-automata 0.4.9", + "regex-syntax 0.8.5", ] [[package]] @@ -5663,13 +5773,13 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.3" +version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f" +checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" dependencies = [ "aho-corasick", "memchr", - "regex-syntax 0.8.2", + "regex-syntax 0.8.5", ] [[package]] @@ -5686,9 +5796,9 @@ checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" [[package]] name = "regex-syntax" -version = "0.8.2" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" +checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" [[package]] name = "relative-path" @@ -5726,7 +5836,7 @@ dependencies = [ "metrics", "once_cell", "pin-project-lite", - "rand 0.8.5", + "rand 0.9.1", "reqwest", "scopeguard", "serde", @@ -5758,7 +5868,7 @@ dependencies = [ "futures-channel", "futures-core", "futures-util", - "http 
1.1.0", + "http 1.3.1", "http-body 1.0.0", "http-body-util", "hyper 1.4.1", @@ -5800,7 +5910,7 @@ checksum = "d1ccd3b55e711f91a9885a2fa6fbbb2e39db1776420b062efc058c6410f7e5e3" dependencies = [ "anyhow", "async-trait", - "http 1.1.0", + "http 1.3.1", "reqwest", "serde", "thiserror 1.0.69", @@ -5817,7 +5927,7 @@ dependencies = [ "async-trait", "futures", "getrandom 0.2.11", - "http 1.1.0", + "http 1.3.1", "hyper 1.4.1", "parking_lot 0.11.2", "reqwest", @@ -5831,14 +5941,14 @@ dependencies = [ [[package]] name = "reqwest-tracing" -version = "0.5.5" +version = "0.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73e6153390585f6961341b50e5a1931d6be6dee4292283635903c26ef9d980d2" +checksum = "d70ea85f131b2ee9874f0b160ac5976f8af75f3c9badfe0d955880257d10bd83" dependencies = [ "anyhow", "async-trait", "getrandom 0.2.11", - "http 1.1.0", + "http 1.3.1", "matchit", "opentelemetry", "reqwest", @@ -6058,15 +6168,15 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.27" +version = "0.23.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "730944ca083c1c233a75c09f199e973ca499344a2b7ba9e755c457e86fb4a321" +checksum = "2491382039b29b9b11ff08b76ff6c97cf287671dbb74f0be44bda389fffe9bd1" dependencies = [ "log", "once_cell", "ring", "rustls-pki-types", - "rustls-webpki 0.103.3", + "rustls-webpki 0.103.4", "subtle", "zeroize", ] @@ -6130,9 +6240,12 @@ dependencies = [ [[package]] name = "rustls-pki-types" -version = "1.11.0" +version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "917ce264624a4b4db1c364dcc35bfca9ded014d0a958cd47ad3e960e988ea51c" +checksum = "229a4a4c221013e7e1f1a043678c5cc39fe5171437c88fb47151a21e6f5b5c79" +dependencies = [ + "zeroize", +] [[package]] name = "rustls-webpki" @@ -6157,9 +6270,9 @@ dependencies = [ [[package]] name = "rustls-webpki" -version = "0.103.3" +version = "0.103.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4a72fe2bcf7a6ac6fd7d0b9e5cb68aeb7d4c0a0271730218b3e92d43b4eb435" +checksum = "0a17884ae0c1b773f1ccd2bd4a8c72f16da897310a98b0e84bf349ad5ead92fc" dependencies = [ "ring", "rustls-pki-types", @@ -6197,13 +6310,14 @@ dependencies = [ "fail", "futures", "hex", - "http 1.1.0", + "http 1.3.1", "http-utils", "humantime", "hyper 0.14.30", "itertools 0.10.5", "jsonwebtoken", "metrics", + "nix 0.30.1", "once_cell", "pageserver_api", "parking_lot 0.12.1", @@ -6211,14 +6325,15 @@ dependencies = [ "postgres-protocol", "postgres_backend", "postgres_ffi", + "postgres_ffi_types", "postgres_versioninfo", "pprof", "pq_proto", - "rand 0.8.5", + "rand 0.9.1", "regex", "remote_storage", "reqwest", - "rustls 0.23.27", + "rustls 0.23.29", "safekeeper_api", "safekeeper_client", "scopeguard", @@ -6255,7 +6370,7 @@ dependencies = [ "anyhow", "const_format", "pageserver_api", - "postgres_ffi", + "postgres_ffi_types", "postgres_versioninfo", "pq_proto", "serde", @@ -6408,7 +6523,7 @@ checksum = "255914a8e53822abd946e2ce8baa41d4cded6b8e938913b7f7b9da5b7ab44335" dependencies = [ "httpdate", "reqwest", - "rustls 0.23.27", + "rustls 0.23.29", "sentry-backtrace", "sentry-contexts", "sentry-core", @@ -6540,7 +6655,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9d2de91cf02bbc07cde38891769ccd5d4f073d22a40683aa4bc7a95781aaa2c4" dependencies = [ "form_urlencoded", - "indexmap 2.9.0", + "indexmap 2.10.0", "itoa", "ryu", "serde", @@ -6621,7 +6736,7 @@ dependencies = [ "chrono", "hex", "indexmap 1.9.3", - "indexmap 2.9.0", + 
"indexmap 2.10.0", "serde", "serde_derive", "serde_json", @@ -6864,10 +6979,10 @@ dependencies = [ "once_cell", "parking_lot 0.12.1", "prost 0.13.5", - "rustls 0.23.27", + "rustls 0.23.29", "tokio", "tokio-rustls 0.26.2", - "tonic 0.13.1", + "tonic", "tonic-build", "tracing", "utils", @@ -6908,11 +7023,11 @@ dependencies = [ "pageserver_client", "postgres_connection", "posthog_client_lite", - "rand 0.8.5", + "rand 0.9.1", "regex", "reqwest", "routerify", - "rustls 0.23.27", + "rustls 0.23.29", "rustls-native-certs 0.8.0", "safekeeper_api", "safekeeper_client", @@ -6931,6 +7046,7 @@ dependencies = [ "tokio-util", "tracing", "utils", + "uuid", "workspace_hack", ] @@ -6965,7 +7081,7 @@ dependencies = [ "postgres_ffi", "remote_storage", "reqwest", - "rustls 0.23.27", + "rustls 0.23.29", "rustls-native-certs 0.8.0", "serde", "serde_json", @@ -7043,7 +7159,7 @@ version = "0.26.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" dependencies = [ - "heck", + "heck 0.5.0", "proc-macro2", "quote", "rustversion", @@ -7056,6 +7172,10 @@ version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" +[[package]] +name = "subzero-core" +version = "3.0.1" + [[package]] name = "svg_fmt" version = "0.4.3" @@ -7500,7 +7620,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "04fb792ccd6bbcd4bba408eb8a292f70fc4a3589e5d793626f45190e6454b6ab" dependencies = [ "ring", - "rustls 0.23.27", + "rustls 0.23.29", "tokio", "tokio-postgres", "tokio-rustls 0.26.2", @@ -7551,7 +7671,7 @@ version = "0.26.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e727b36a1a0e8b74c376ac2211e40c2c8af09fb4013c60d910495810f008e9b" dependencies = [ - "rustls 0.23.27", + "rustls 0.23.29", "tokio", ] @@ -7650,34 +7770,13 @@ version = "0.22.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f21c7aaf97f1bd9ca9d4f9e73b0a6c74bd5afef56f2bc931943a6e1c37e04e38" dependencies = [ - "indexmap 2.9.0", + "indexmap 2.10.0", "serde", "serde_spanned", "toml_datetime", "winnow", ] -[[package]] -name = "tonic" -version = "0.12.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "877c5b330756d856ffcc4553ab34a5684481ade925ecc54bcd1bf02b1d0d4d52" -dependencies = [ - "async-trait", - "base64 0.22.1", - "bytes", - "http 1.1.0", - "http-body 1.0.0", - "http-body-util", - "percent-encoding", - "pin-project", - "prost 0.13.5", - "tokio-stream", - "tower-layer", - "tower-service", - "tracing", -] - [[package]] name = "tonic" version = "0.13.1" @@ -7690,7 +7789,7 @@ dependencies = [ "bytes", "flate2", "h2 0.4.4", - "http 1.1.0", + "http 1.3.1", "http-body 1.0.0", "http-body-util", "hyper 1.4.1", @@ -7735,7 +7834,7 @@ dependencies = [ "prost-types 0.13.5", "tokio", "tokio-stream", - "tonic 0.13.1", + "tonic", ] [[package]] @@ -7761,7 +7860,7 @@ checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9" dependencies = [ "futures-core", "futures-util", - "indexmap 2.9.0", + "indexmap 2.10.0", "pin-project-lite", "slab", "sync_wrapper 1.0.1", @@ -7781,7 +7880,7 @@ dependencies = [ "base64 0.22.1", "bitflags 2.8.0", "bytes", - "http 1.1.0", + "http 1.3.1", "http-body 1.0.0", "mime", "pin-project-lite", @@ -7799,10 +7898,14 @@ checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e" [[package]] name = "tower-otel" -version = 
"0.2.0" -source = "git+https://github.com/mattiapenati/tower-otel?rev=56a7321053bcb72443888257b622ba0d43a11fcd#56a7321053bcb72443888257b622ba0d43a11fcd" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "345000ea5ae33222624a8ccfdd88892c30db4d413a39c2d4bd714b77e0a4b23c" dependencies = [ - "http 1.1.0", + "axum", + "cfg-if", + "http 1.3.1", + "http-body 1.0.0", "opentelemetry", "pin-project", "tower-layer", @@ -7884,9 +7987,9 @@ dependencies = [ [[package]] name = "tracing-opentelemetry" -version = "0.28.0" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97a971f6058498b5c0f1affa23e7ea202057a7301dbff68e968b2d578bcbd053" +checksum = "ddcf5959f39507d0d04d6413119c04f33b623f4f951ebcbdddddfad2d0623a9c" dependencies = [ "js-sys", "once_cell", @@ -7983,7 +8086,7 @@ dependencies = [ "byteorder", "bytes", "data-encoding", - "http 1.1.0", + "http 1.3.1", "httparse", "log", "rand 0.8.5", @@ -8002,7 +8105,7 @@ dependencies = [ "byteorder", "bytes", "data-encoding", - "http 1.1.0", + "http 1.3.1", "httparse", "log", "rand 0.8.5", @@ -8094,7 +8197,7 @@ dependencies = [ "base64 0.22.1", "log", "once_cell", - "rustls 0.23.27", + "rustls 0.23.29", "rustls-pki-types", "url", "webpki-roots", @@ -8184,7 +8287,7 @@ dependencies = [ "postgres_connection", "pprof", "pq_proto", - "rand 0.8.5", + "rand 0.9.1", "regex", "scopeguard", "sentry", @@ -8204,6 +8307,7 @@ dependencies = [ "tracing-error", "tracing-subscriber", "tracing-utils", + "uuid", "walkdir", ] @@ -8346,6 +8450,15 @@ version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" +[[package]] +name = "wasi" +version = "0.14.2+wasi-0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9683f9a5a998d873c0d21fcbe3c083009670149a8fab228644b8bd36b2c48cb3" +dependencies = [ + "wit-bindgen-rt", +] + [[package]] name = "wasite" version = "0.1.0" @@ -8703,6 +8816,15 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "wit-bindgen-rt" +version = "0.39.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1" +dependencies = [ + "bitflags 2.8.0", +] + [[package]] name = "workspace_hack" version = "0.1.0" @@ -8747,7 +8869,7 @@ dependencies = [ "hyper 0.14.30", "hyper 1.4.1", "hyper-util", - "indexmap 2.9.0", + "indexmap 2.10.0", "itertools 0.12.1", "lazy_static", "libc", @@ -8770,14 +8892,14 @@ dependencies = [ "proc-macro2", "prost 0.13.5", "quote", - "rand 0.8.5", + "rand 0.9.1", "regex", - "regex-automata 0.4.3", - "regex-syntax 0.8.2", + "regex-automata 0.4.9", + "regex-syntax 0.8.5", "reqwest", - "rustls 0.23.27", + "rustls 0.23.29", "rustls-pki-types", - "rustls-webpki 0.103.3", + "rustls-webpki 0.103.4", "scopeguard", "sec1 0.7.3", "serde", @@ -8790,6 +8912,7 @@ dependencies = [ "subtle", "syn 2.0.100", "sync_wrapper 0.1.2", + "thiserror 2.0.11", "tikv-jemalloc-ctl", "tikv-jemalloc-sys", "time", @@ -8799,13 +8922,13 @@ dependencies = [ "tokio-stream", "tokio-util", "toml_edit", + "tonic", "tower 0.5.2", "tracing", "tracing-core", "tracing-log", "tracing-subscriber", "url", - "uuid", "zeroize", "zstd", "zstd-safe", @@ -8870,6 +8993,12 @@ version = "0.13.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4d25c75bf9ea12c4040a97f829154768bbbce366287e2dc044af160cd79a13fd" +[[package]] +name 
= "yansi" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfe53a6657fd280eaa890a3bc59152892ffa3e30101319d168b781ed6529b049" + [[package]] name = "yasna" version = "0.5.2" diff --git a/Cargo.toml b/Cargo.toml index df2064a4a7..00efe79554 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -49,6 +49,7 @@ members = [ "libs/proxy/tokio-postgres2", "endpoint_storage", "pgxn/neon/communicator", + "proxy/subzero_core", ] [workspace.package] @@ -130,6 +131,7 @@ jemalloc_pprof = { version = "0.7", features = ["symbolize", "flamegraph"] } jsonwebtoken = "9" lasso = "0.7" libc = "0.2" +lock_api = "0.4.13" md5 = "0.7.0" measured = { version = "0.0.22", features=["lasso"] } measured-process = { version = "0.0.22" } @@ -141,10 +143,10 @@ notify = "6.0.0" num_cpus = "1.15" num-traits = "0.2.19" once_cell = "1.13" -opentelemetry = "0.27" -opentelemetry_sdk = "0.27" -opentelemetry-otlp = { version = "0.27", default-features = false, features = ["http-proto", "trace", "http", "reqwest-client"] } -opentelemetry-semantic-conventions = "0.27" +opentelemetry = "0.30" +opentelemetry_sdk = "0.30" +opentelemetry-otlp = { version = "0.30", default-features = false, features = ["http-proto", "trace", "http", "reqwest-client"] } +opentelemetry-semantic-conventions = "0.30" parking_lot = "0.12" parquet = { version = "53", default-features = false, features = ["zstd"] } parquet_derive = "53" @@ -156,16 +158,18 @@ procfs = "0.16" prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency prost = "0.13.5" prost-types = "0.13.5" -rand = "0.8" +rand = "0.9" +# Remove after p256 is updated to 0.14. +rand_core = "=0.6" redis = { version = "0.29.2", features = ["tokio-rustls-comp", "keep-alive"] } regex = "1.10.2" reqwest = { version = "0.12", default-features = false, features = ["rustls-tls"] } -reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_27"] } +reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_30"] } reqwest-middleware = "0.4" reqwest-retry = "0.7" routerify = "3" rpds = "0.13" -rustc-hash = "1.1.0" +rustc-hash = "2.1.1" rustls = { version = "0.23.16", default-features = false } rustls-pemfile = "2" rustls-pki-types = "1.11" @@ -210,15 +214,12 @@ tonic = { version = "0.13.1", default-features = false, features = ["channel", " tonic-reflection = { version = "0.13.1", features = ["server"] } tower = { version = "0.5.2", default-features = false } tower-http = { version = "0.6.2", features = ["auth", "request-id", "trace"] } - -# This revision uses opentelemetry 0.27. There's no tag for it. -tower-otel = { git = "https://github.com/mattiapenati/tower-otel", rev = "56a7321053bcb72443888257b622ba0d43a11fcd" } - +tower-otel = { version = "0.6", features = ["axum"] } tower-service = "0.3.3" tracing = "0.1" tracing-error = "0.2" tracing-log = "0.2" -tracing-opentelemetry = "0.28" +tracing-opentelemetry = "0.31" tracing-serde = "0.2.0" tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] } try-lock = "0.2.5" diff --git a/Dockerfile b/Dockerfile index 55b87d4012..654ae72e56 100644 --- a/Dockerfile +++ b/Dockerfile @@ -63,7 +63,14 @@ WORKDIR /home/nonroot COPY --chown=nonroot . . 
-RUN cargo chef prepare --recipe-path recipe.json +RUN --mount=type=secret,uid=1000,id=SUBZERO_ACCESS_TOKEN \ + set -e \ + && if [ -s /run/secrets/SUBZERO_ACCESS_TOKEN ]; then \ + export CARGO_NET_GIT_FETCH_WITH_CLI=true && \ + git config --global url."https://$(cat /run/secrets/SUBZERO_ACCESS_TOKEN)@github.com/neondatabase/subzero".insteadOf "https://github.com/neondatabase/subzero" && \ + cargo add -p proxy subzero-core --git https://github.com/neondatabase/subzero --rev 396264617e78e8be428682f87469bb25429af88a; \ + fi \ + && cargo chef prepare --recipe-path recipe.json # Main build image FROM $REPOSITORY/$IMAGE:$TAG AS build @@ -71,20 +78,33 @@ WORKDIR /home/nonroot ARG GIT_VERSION=local ARG BUILD_TAG ARG ADDITIONAL_RUSTFLAGS="" +ENV CARGO_FEATURES="default" # 3. Build cargo dependencies. Note that this step doesn't depend on anything else than # `recipe.json`, so the layer can be reused as long as none of the dependencies change. COPY --from=plan /home/nonroot/recipe.json recipe.json -RUN set -e \ +RUN --mount=type=secret,uid=1000,id=SUBZERO_ACCESS_TOKEN \ + set -e \ + && if [ -s /run/secrets/SUBZERO_ACCESS_TOKEN ]; then \ + export CARGO_NET_GIT_FETCH_WITH_CLI=true && \ + git config --global url."https://$(cat /run/secrets/SUBZERO_ACCESS_TOKEN)@github.com/neondatabase/subzero".insteadOf "https://github.com/neondatabase/subzero"; \ + fi \ && RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo chef cook --locked --release --recipe-path recipe.json # Perform the main build. We reuse the Postgres build artifacts from the intermediate 'pg-build' # layer, and the cargo dependencies built in the previous step. COPY --chown=nonroot --from=pg-build /home/nonroot/pg_install/ pg_install COPY --chown=nonroot . . +COPY --chown=nonroot --from=plan /home/nonroot/proxy/Cargo.toml proxy/Cargo.toml +COPY --chown=nonroot --from=plan /home/nonroot/Cargo.lock Cargo.lock -RUN set -e \ +RUN --mount=type=secret,uid=1000,id=SUBZERO_ACCESS_TOKEN \ + set -e \ + && if [ -s /run/secrets/SUBZERO_ACCESS_TOKEN ]; then \ + export CARGO_FEATURES="rest_broker"; \ + fi \ && RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo build \ + --features $CARGO_FEATURES \ --bin pg_sni_router \ --bin pageserver \ --bin pagectl \ diff --git a/Makefile b/Makefile index d07ac907b4..dc8bacc78e 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ ROOT_PROJECT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Where to install Postgres, default is ./pg_install, maybe useful for package # managers. -POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/pg_install/ +POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/pg_install # Supported PostgreSQL versions POSTGRES_VERSIONS = v17 v16 v15 v14 @@ -14,7 +14,7 @@ POSTGRES_VERSIONS = v17 v16 v15 v14 # it is derived from BUILD_TYPE. # All intermediate build artifacts are stored here. 
-BUILD_DIR := build +BUILD_DIR := $(ROOT_PROJECT_DIR)/build ICU_PREFIX_DIR := /usr/local/icu @@ -212,7 +212,7 @@ neon-pgindent: postgres-v17-pg-bsd-indent neon-pg-ext-v17 FIND_TYPEDEF=$(ROOT_PROJECT_DIR)/vendor/postgres-v17/src/tools/find_typedef \ INDENT=$(BUILD_DIR)/v17/src/tools/pg_bsd_indent/pg_bsd_indent \ PGINDENT_SCRIPT=$(ROOT_PROJECT_DIR)/vendor/postgres-v17/src/tools/pgindent/pgindent \ - -C $(BUILD_DIR)/neon-v17 \ + -C $(BUILD_DIR)/pgxn-v17/neon \ -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile pgindent @@ -220,11 +220,15 @@ neon-pgindent: postgres-v17-pg-bsd-indent neon-pg-ext-v17 setup-pre-commit-hook: ln -s -f $(ROOT_PROJECT_DIR)/pre-commit.py .git/hooks/pre-commit +build-tools/node_modules: build-tools/package.json + cd build-tools && $(if $(CI),npm ci,npm install) + touch build-tools/node_modules + .PHONY: lint-openapi-spec -lint-openapi-spec: +lint-openapi-spec: build-tools/node_modules # operation-2xx-response: pageserver timeline delete returns 404 on success find . -iname "openapi_spec.y*ml" -exec\ - docker run --rm -v ${PWD}:/spec ghcr.io/redocly/cli:1.34.4\ + npx --prefix=build-tools/ redocly\ --skip-rule=operation-operationId --skip-rule=operation-summary --extends=minimal\ --skip-rule=no-server-example.com --skip-rule=operation-2xx-response\ lint {} \+ diff --git a/build-tools/Dockerfile b/build-tools/Dockerfile index 2ed7bb4f36..b5fe642e6f 100644 --- a/build-tools/Dockerfile +++ b/build-tools/Dockerfile @@ -188,6 +188,12 @@ RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \ && bash -c 'for f in /usr/bin/clang*-${LLVM_VERSION} /usr/bin/llvm*-${LLVM_VERSION}; do ln -s "${f}" "${f%-${LLVM_VERSION}}"; done' \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* +# Install node +ENV NODE_VERSION=24 +RUN curl -fsSL https://deb.nodesource.com/setup_${NODE_VERSION}.x | bash - \ + && apt install -y nodejs \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + # Install docker RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg \ && echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/debian ${DEBIAN_VERSION} stable" > /etc/apt/sources.list.d/docker.list \ @@ -311,14 +317,14 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux . 
"$HOME/.cargo/env" && \ cargo --version && rustup --version && \ rustup component add llvm-tools rustfmt clippy && \ - cargo install rustfilt --version ${RUSTFILT_VERSION} --locked && \ - cargo install cargo-hakari --version ${CARGO_HAKARI_VERSION} --locked && \ - cargo install cargo-deny --version ${CARGO_DENY_VERSION} --locked && \ - cargo install cargo-hack --version ${CARGO_HACK_VERSION} --locked && \ - cargo install cargo-nextest --version ${CARGO_NEXTEST_VERSION} --locked && \ - cargo install cargo-chef --version ${CARGO_CHEF_VERSION} --locked && \ - cargo install diesel_cli --version ${CARGO_DIESEL_CLI_VERSION} --locked \ - --features postgres-bundled --no-default-features && \ + cargo install rustfilt --locked --version ${RUSTFILT_VERSION} && \ + cargo install cargo-hakari --locked --version ${CARGO_HAKARI_VERSION} && \ + cargo install cargo-deny --locked --version ${CARGO_DENY_VERSION} && \ + cargo install cargo-hack --locked --version ${CARGO_HACK_VERSION} && \ + cargo install cargo-nextest --locked --version ${CARGO_NEXTEST_VERSION} && \ + cargo install cargo-chef --locked --version ${CARGO_CHEF_VERSION} && \ + cargo install diesel_cli --locked --version ${CARGO_DIESEL_CLI_VERSION} \ + --features postgres-bundled --no-default-features && \ rm -rf /home/nonroot/.cargo/registry && \ rm -rf /home/nonroot/.cargo/git diff --git a/build-tools/package-lock.json b/build-tools/package-lock.json new file mode 100644 index 0000000000..b2c44ed9b4 --- /dev/null +++ b/build-tools/package-lock.json @@ -0,0 +1,3189 @@ +{ + "name": "build-tools", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "build-tools", + "devDependencies": { + "@redocly/cli": "1.34.4", + "@sourcemeta/jsonschema": "10.0.0" + } + }, + "node_modules/@babel/code-frame": { + "version": "7.27.1", + "resolved": "https://registry.npmjs.org/@babel/code-frame/-/code-frame-7.27.1.tgz", + "integrity": "sha512-cjQ7ZlQ0Mv3b47hABuTevyTuYN4i+loJKGeV9flcCgIK37cCXRh+L1bd3iBHlynerhQ7BhCkn2BPbQUL+rGqFg==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/helper-validator-identifier": "^7.27.1", + "js-tokens": "^4.0.0", + "picocolors": "^1.1.1" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helper-validator-identifier": { + "version": "7.27.1", + "resolved": "https://registry.npmjs.org/@babel/helper-validator-identifier/-/helper-validator-identifier-7.27.1.tgz", + "integrity": "sha512-D2hP9eA+Sqx1kBZgzxZh0y1trbuU+JoDkiEwqhQ36nodYqJwyEIhPSdMNd7lOm/4io72luTPWH20Yda0xOuUow==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/runtime": { + "version": "7.27.6", + "resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.27.6.tgz", + "integrity": "sha512-vbavdySgbTTrmFE+EsiqUTzlOr5bzlnJtUv9PynGCAKvfQqjIXbvFdumPM/GxMDfyuGMJaJAU6TO4zc1Jf1i8Q==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@emotion/is-prop-valid": { + "version": "1.2.2", + "resolved": "https://registry.npmjs.org/@emotion/is-prop-valid/-/is-prop-valid-1.2.2.tgz", + "integrity": "sha512-uNsoYd37AFmaCdXlg6EYD1KaPOaRWRByMCYzbKUX4+hhMfrxdVSelShywL4JVaAeM/eHUOSprYBQls+/neX3pw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@emotion/memoize": "^0.8.1" + } + }, + "node_modules/@emotion/memoize": { + "version": "0.8.1", + "resolved": "https://registry.npmjs.org/@emotion/memoize/-/memoize-0.8.1.tgz", + "integrity": 
"sha512-W2P2c/VRW1/1tLox0mVUalvnWXxavmv/Oum2aPsRcoDJuob75FC3Y8FbpfLwUegRcxINtGUMPq0tFCvYNTBXNA==", + "dev": true, + "license": "MIT" + }, + "node_modules/@emotion/unitless": { + "version": "0.8.1", + "resolved": "https://registry.npmjs.org/@emotion/unitless/-/unitless-0.8.1.tgz", + "integrity": "sha512-KOEGMu6dmJZtpadb476IsZBclKvILjopjUii3V+7MnXIQCYh8W3NgNcgwo21n9LXZX6EDIKvqfjYxXebDwxKmQ==", + "dev": true, + "license": "MIT" + }, + "node_modules/@exodus/schemasafe": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/@exodus/schemasafe/-/schemasafe-1.3.0.tgz", + "integrity": "sha512-5Aap/GaRupgNx/feGBwLLTVv8OQFfv3pq2lPRzPg9R+IOBnDgghTGW7l7EuVXOvg5cc/xSAlRW8rBrjIC3Nvqw==", + "dev": true, + "license": "MIT" + }, + "node_modules/@faker-js/faker": { + "version": "7.6.0", + "resolved": "https://registry.npmjs.org/@faker-js/faker/-/faker-7.6.0.tgz", + "integrity": "sha512-XK6BTq1NDMo9Xqw/YkYyGjSsg44fbNwYRx7QK2CuoQgyy+f1rrTDHoExVM5PsyXCtfl2vs2vVJ0MN0yN6LppRw==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=14.0.0", + "npm": ">=6.0.0" + } + }, + "node_modules/@humanwhocodes/momoa": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/@humanwhocodes/momoa/-/momoa-2.0.4.tgz", + "integrity": "sha512-RE815I4arJFtt+FVeU1Tgp9/Xvecacji8w/V6XtXsWWH/wz/eNkNbhb+ny/+PlVZjV0rxQpRSQKNKE3lcktHEA==", + "dev": true, + "license": "Apache-2.0", + "engines": { + "node": ">=10.10.0" + } + }, + "node_modules/@jest/schemas": { + "version": "29.6.3", + "resolved": "https://registry.npmjs.org/@jest/schemas/-/schemas-29.6.3.tgz", + "integrity": "sha512-mo5j5X+jIZmJQveBKeS/clAueipV7KgiX1vMgCxam1RNYiqE1w62n0/tJJnHtjW8ZHcQco5gY85jA3mi0L+nSA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@sinclair/typebox": "^0.27.8" + }, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + } + }, + "node_modules/@jsep-plugin/assignment": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/@jsep-plugin/assignment/-/assignment-1.3.0.tgz", + "integrity": "sha512-VVgV+CXrhbMI3aSusQyclHkenWSAm95WaiKrMxRFam3JSUiIaQjoMIw2sEs/OX4XifnqeQUN4DYbJjlA8EfktQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 10.16.0" + }, + "peerDependencies": { + "jsep": "^0.4.0||^1.0.0" + } + }, + "node_modules/@jsep-plugin/regex": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/@jsep-plugin/regex/-/regex-1.0.4.tgz", + "integrity": "sha512-q7qL4Mgjs1vByCaTnDFcBnV9HS7GVPJX5vyVoCgZHNSC9rjwIlmbXG5sUuorR5ndfHAIlJ8pVStxvjXHbNvtUg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 10.16.0" + }, + "peerDependencies": { + "jsep": "^0.4.0||^1.0.0" + } + }, + "node_modules/@opentelemetry/api": { + "version": "1.9.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/api/-/api-1.9.0.tgz", + "integrity": "sha512-3giAOQvZiH5F9bMlMiv8+GSPMeqg0dbaeo58/0SlA9sxSqZhnUtxzX9/2FzyhS9sWQf5S0GJE0AKBrFqjpeYcg==", + "dev": true, + "license": "Apache-2.0", + "engines": { + "node": ">=8.0.0" + } + }, + "node_modules/@opentelemetry/api-logs": { + "version": "0.53.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/api-logs/-/api-logs-0.53.0.tgz", + "integrity": "sha512-8HArjKx+RaAI8uEIgcORbZIPklyh1YLjPSBus8hjRmvLi6DeFzgOcdZ7KwPabKj8mXF8dX0hyfAyGfycz0DbFw==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/api": "^1.0.0" + }, + "engines": { + "node": ">=14" + } + }, + "node_modules/@opentelemetry/context-async-hooks": { + "version": "1.26.0", + "resolved": 
"https://registry.npmjs.org/@opentelemetry/context-async-hooks/-/context-async-hooks-1.26.0.tgz", + "integrity": "sha512-HedpXXYzzbaoutw6DFLWLDket2FwLkLpil4hGCZ1xYEIMTcivdfwEOISgdbLEWyG3HW52gTq2V9mOVJrONgiwg==", + "dev": true, + "license": "Apache-2.0", + "engines": { + "node": ">=14" + }, + "peerDependencies": { + "@opentelemetry/api": ">=1.0.0 <1.10.0" + } + }, + "node_modules/@opentelemetry/core": { + "version": "1.26.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/core/-/core-1.26.0.tgz", + "integrity": "sha512-1iKxXXE8415Cdv0yjG3G6hQnB5eVEsJce3QaawX8SjDn0mAS0ZM8fAbZZJD4ajvhC15cePvosSCut404KrIIvQ==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/semantic-conventions": "1.27.0" + }, + "engines": { + "node": ">=14" + }, + "peerDependencies": { + "@opentelemetry/api": ">=1.0.0 <1.10.0" + } + }, + "node_modules/@opentelemetry/exporter-trace-otlp-http": { + "version": "0.53.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/exporter-trace-otlp-http/-/exporter-trace-otlp-http-0.53.0.tgz", + "integrity": "sha512-m7F5ZTq+V9mKGWYpX8EnZ7NjoqAU7VemQ1E2HAG+W/u0wpY1x0OmbxAXfGKFHCspdJk8UKlwPGrpcB8nay3P8A==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/core": "1.26.0", + "@opentelemetry/otlp-exporter-base": "0.53.0", + "@opentelemetry/otlp-transformer": "0.53.0", + "@opentelemetry/resources": "1.26.0", + "@opentelemetry/sdk-trace-base": "1.26.0" + }, + "engines": { + "node": ">=14" + }, + "peerDependencies": { + "@opentelemetry/api": "^1.0.0" + } + }, + "node_modules/@opentelemetry/otlp-exporter-base": { + "version": "0.53.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/otlp-exporter-base/-/otlp-exporter-base-0.53.0.tgz", + "integrity": "sha512-UCWPreGQEhD6FjBaeDuXhiMf6kkBODF0ZQzrk/tuQcaVDJ+dDQ/xhJp192H9yWnKxVpEjFrSSLnpqmX4VwX+eA==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/core": "1.26.0", + "@opentelemetry/otlp-transformer": "0.53.0" + }, + "engines": { + "node": ">=14" + }, + "peerDependencies": { + "@opentelemetry/api": "^1.0.0" + } + }, + "node_modules/@opentelemetry/otlp-transformer": { + "version": "0.53.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/otlp-transformer/-/otlp-transformer-0.53.0.tgz", + "integrity": "sha512-rM0sDA9HD8dluwuBxLetUmoqGJKSAbWenwD65KY9iZhUxdBHRLrIdrABfNDP7aiTjcgK8XFyTn5fhDz7N+W6DA==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/api-logs": "0.53.0", + "@opentelemetry/core": "1.26.0", + "@opentelemetry/resources": "1.26.0", + "@opentelemetry/sdk-logs": "0.53.0", + "@opentelemetry/sdk-metrics": "1.26.0", + "@opentelemetry/sdk-trace-base": "1.26.0", + "protobufjs": "^7.3.0" + }, + "engines": { + "node": ">=14" + }, + "peerDependencies": { + "@opentelemetry/api": "^1.3.0" + } + }, + "node_modules/@opentelemetry/propagator-b3": { + "version": "1.26.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/propagator-b3/-/propagator-b3-1.26.0.tgz", + "integrity": "sha512-vvVkQLQ/lGGyEy9GT8uFnI047pajSOVnZI2poJqVGD3nJ+B9sFGdlHNnQKophE3lHfnIH0pw2ubrCTjZCgIj+Q==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/core": "1.26.0" + }, + "engines": { + "node": ">=14" + }, + "peerDependencies": { + "@opentelemetry/api": ">=1.0.0 <1.10.0" + } + }, + "node_modules/@opentelemetry/propagator-jaeger": { + "version": "1.26.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/propagator-jaeger/-/propagator-jaeger-1.26.0.tgz", + "integrity": 
"sha512-DelFGkCdaxA1C/QA0Xilszfr0t4YbGd3DjxiCDPh34lfnFr+VkkrjV9S8ZTJvAzfdKERXhfOxIKBoGPJwoSz7Q==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/core": "1.26.0" + }, + "engines": { + "node": ">=14" + }, + "peerDependencies": { + "@opentelemetry/api": ">=1.0.0 <1.10.0" + } + }, + "node_modules/@opentelemetry/resources": { + "version": "1.26.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/resources/-/resources-1.26.0.tgz", + "integrity": "sha512-CPNYchBE7MBecCSVy0HKpUISEeJOniWqcHaAHpmasZ3j9o6V3AyBzhRc90jdmemq0HOxDr6ylhUbDhBqqPpeNw==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/core": "1.26.0", + "@opentelemetry/semantic-conventions": "1.27.0" + }, + "engines": { + "node": ">=14" + }, + "peerDependencies": { + "@opentelemetry/api": ">=1.0.0 <1.10.0" + } + }, + "node_modules/@opentelemetry/sdk-logs": { + "version": "0.53.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/sdk-logs/-/sdk-logs-0.53.0.tgz", + "integrity": "sha512-dhSisnEgIj/vJZXZV6f6KcTnyLDx/VuQ6l3ejuZpMpPlh9S1qMHiZU9NMmOkVkwwHkMy3G6mEBwdP23vUZVr4g==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/api-logs": "0.53.0", + "@opentelemetry/core": "1.26.0", + "@opentelemetry/resources": "1.26.0" + }, + "engines": { + "node": ">=14" + }, + "peerDependencies": { + "@opentelemetry/api": ">=1.4.0 <1.10.0" + } + }, + "node_modules/@opentelemetry/sdk-metrics": { + "version": "1.26.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/sdk-metrics/-/sdk-metrics-1.26.0.tgz", + "integrity": "sha512-0SvDXmou/JjzSDOjUmetAAvcKQW6ZrvosU0rkbDGpXvvZN+pQF6JbK/Kd4hNdK4q/22yeruqvukXEJyySTzyTQ==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/core": "1.26.0", + "@opentelemetry/resources": "1.26.0" + }, + "engines": { + "node": ">=14" + }, + "peerDependencies": { + "@opentelemetry/api": ">=1.3.0 <1.10.0" + } + }, + "node_modules/@opentelemetry/sdk-trace-base": { + "version": "1.26.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/sdk-trace-base/-/sdk-trace-base-1.26.0.tgz", + "integrity": "sha512-olWQldtvbK4v22ymrKLbIcBi9L2SpMO84sCPY54IVsJhP9fRsxJT194C/AVaAuJzLE30EdhhM1VmvVYR7az+cw==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/core": "1.26.0", + "@opentelemetry/resources": "1.26.0", + "@opentelemetry/semantic-conventions": "1.27.0" + }, + "engines": { + "node": ">=14" + }, + "peerDependencies": { + "@opentelemetry/api": ">=1.0.0 <1.10.0" + } + }, + "node_modules/@opentelemetry/sdk-trace-node": { + "version": "1.26.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/sdk-trace-node/-/sdk-trace-node-1.26.0.tgz", + "integrity": "sha512-Fj5IVKrj0yeUwlewCRwzOVcr5avTuNnMHWf7GPc1t6WaT78J6CJyF3saZ/0RkZfdeNO8IcBl/bNcWMVZBMRW8Q==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/context-async-hooks": "1.26.0", + "@opentelemetry/core": "1.26.0", + "@opentelemetry/propagator-b3": "1.26.0", + "@opentelemetry/propagator-jaeger": "1.26.0", + "@opentelemetry/sdk-trace-base": "1.26.0", + "semver": "^7.5.2" + }, + "engines": { + "node": ">=14" + }, + "peerDependencies": { + "@opentelemetry/api": ">=1.0.0 <1.10.0" + } + }, + "node_modules/@opentelemetry/semantic-conventions": { + "version": "1.27.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/semantic-conventions/-/semantic-conventions-1.27.0.tgz", + "integrity": 
"sha512-sAay1RrB+ONOem0OZanAR1ZI/k7yDpnOQSQmTMuGImUQb2y8EbSaCJ94FQluM74xoU03vlb2d2U90hZluL6nQg==", + "dev": true, + "license": "Apache-2.0", + "engines": { + "node": ">=14" + } + }, + "node_modules/@protobufjs/aspromise": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/@protobufjs/aspromise/-/aspromise-1.1.2.tgz", + "integrity": "sha512-j+gKExEuLmKwvz3OgROXtrJ2UG2x8Ch2YZUxahh+s1F2HZ+wAceUNLkvy6zKCPVRkU++ZWQrdxsUeQXmcg4uoQ==", + "dev": true, + "license": "BSD-3-Clause" + }, + "node_modules/@protobufjs/base64": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/@protobufjs/base64/-/base64-1.1.2.tgz", + "integrity": "sha512-AZkcAA5vnN/v4PDqKyMR5lx7hZttPDgClv83E//FMNhR2TMcLUhfRUBHCmSl0oi9zMgDDqRUJkSxO3wm85+XLg==", + "dev": true, + "license": "BSD-3-Clause" + }, + "node_modules/@protobufjs/codegen": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/@protobufjs/codegen/-/codegen-2.0.4.tgz", + "integrity": "sha512-YyFaikqM5sH0ziFZCN3xDC7zeGaB/d0IUb9CATugHWbd1FRFwWwt4ld4OYMPWu5a3Xe01mGAULCdqhMlPl29Jg==", + "dev": true, + "license": "BSD-3-Clause" + }, + "node_modules/@protobufjs/eventemitter": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@protobufjs/eventemitter/-/eventemitter-1.1.0.tgz", + "integrity": "sha512-j9ednRT81vYJ9OfVuXG6ERSTdEL1xVsNgqpkxMsbIabzSo3goCjDIveeGv5d03om39ML71RdmrGNjG5SReBP/Q==", + "dev": true, + "license": "BSD-3-Clause" + }, + "node_modules/@protobufjs/fetch": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@protobufjs/fetch/-/fetch-1.1.0.tgz", + "integrity": "sha512-lljVXpqXebpsijW71PZaCYeIcE5on1w5DlQy5WH6GLbFryLUrBD4932W/E2BSpfRJWseIL4v/KPgBFxDOIdKpQ==", + "dev": true, + "license": "BSD-3-Clause", + "dependencies": { + "@protobufjs/aspromise": "^1.1.1", + "@protobufjs/inquire": "^1.1.0" + } + }, + "node_modules/@protobufjs/float": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/@protobufjs/float/-/float-1.0.2.tgz", + "integrity": "sha512-Ddb+kVXlXst9d+R9PfTIxh1EdNkgoRe5tOX6t01f1lYWOvJnSPDBlG241QLzcyPdoNTsblLUdujGSE4RzrTZGQ==", + "dev": true, + "license": "BSD-3-Clause" + }, + "node_modules/@protobufjs/inquire": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@protobufjs/inquire/-/inquire-1.1.0.tgz", + "integrity": "sha512-kdSefcPdruJiFMVSbn801t4vFK7KB/5gd2fYvrxhuJYg8ILrmn9SKSX2tZdV6V+ksulWqS7aXjBcRXl3wHoD9Q==", + "dev": true, + "license": "BSD-3-Clause" + }, + "node_modules/@protobufjs/path": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/@protobufjs/path/-/path-1.1.2.tgz", + "integrity": "sha512-6JOcJ5Tm08dOHAbdR3GrvP+yUUfkjG5ePsHYczMFLq3ZmMkAD98cDgcT2iA1lJ9NVwFd4tH/iSSoe44YWkltEA==", + "dev": true, + "license": "BSD-3-Clause" + }, + "node_modules/@protobufjs/pool": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@protobufjs/pool/-/pool-1.1.0.tgz", + "integrity": "sha512-0kELaGSIDBKvcgS4zkjz1PeddatrjYcmMWOlAuAPwAeccUrPHdUqo/J6LiymHHEiJT5NrF1UVwxY14f+fy4WQw==", + "dev": true, + "license": "BSD-3-Clause" + }, + "node_modules/@protobufjs/utf8": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@protobufjs/utf8/-/utf8-1.1.0.tgz", + "integrity": "sha512-Vvn3zZrhQZkkBE8LSuW3em98c0FwgO4nxzv6OdSxPKJIEKY2bGbHn+mhGIPerzI4twdxaP8/0+06HBpwf345Lw==", + "dev": true, + "license": "BSD-3-Clause" + }, + "node_modules/@redocly/ajv": { + "version": "8.11.2", + "resolved": "https://registry.npmjs.org/@redocly/ajv/-/ajv-8.11.2.tgz", + "integrity": 
"sha512-io1JpnwtIcvojV7QKDUSIuMN/ikdOUd1ReEnUnMKGfDVridQZ31J0MmIuqwuRjWDZfmvr+Q0MqCcfHM2gTivOg==", + "dev": true, + "license": "MIT", + "dependencies": { + "fast-deep-equal": "^3.1.1", + "json-schema-traverse": "^1.0.0", + "require-from-string": "^2.0.2", + "uri-js-replace": "^1.0.1" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/epoberezkin" + } + }, + "node_modules/@redocly/cli": { + "version": "1.34.4", + "resolved": "https://registry.npmjs.org/@redocly/cli/-/cli-1.34.4.tgz", + "integrity": "sha512-seH/GgrjSB1EeOsgJ/4Ct6Jk2N7sh12POn/7G8UQFARMyUMJpe1oHtBwT2ndfp4EFCpgBAbZ/82Iw6dwczNxEA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@opentelemetry/api": "1.9.0", + "@opentelemetry/exporter-trace-otlp-http": "0.53.0", + "@opentelemetry/resources": "1.26.0", + "@opentelemetry/sdk-trace-node": "1.26.0", + "@opentelemetry/semantic-conventions": "1.27.0", + "@redocly/config": "^0.22.0", + "@redocly/openapi-core": "1.34.4", + "@redocly/respect-core": "1.34.4", + "abort-controller": "^3.0.0", + "chokidar": "^3.5.1", + "colorette": "^1.2.0", + "core-js": "^3.32.1", + "dotenv": "16.4.7", + "form-data": "^4.0.0", + "get-port-please": "^3.0.1", + "glob": "^7.1.6", + "handlebars": "^4.7.6", + "mobx": "^6.0.4", + "pluralize": "^8.0.0", + "react": "^17.0.0 || ^18.2.0 || ^19.0.0", + "react-dom": "^17.0.0 || ^18.2.0 || ^19.0.0", + "redoc": "2.5.0", + "semver": "^7.5.2", + "simple-websocket": "^9.0.0", + "styled-components": "^6.0.7", + "yargs": "17.0.1" + }, + "bin": { + "openapi": "bin/cli.js", + "redocly": "bin/cli.js" + }, + "engines": { + "node": ">=18.17.0", + "npm": ">=9.5.0" + } + }, + "node_modules/@redocly/config": { + "version": "0.22.2", + "resolved": "https://registry.npmjs.org/@redocly/config/-/config-0.22.2.tgz", + "integrity": "sha512-roRDai8/zr2S9YfmzUfNhKjOF0NdcOIqF7bhf4MVC5UxpjIysDjyudvlAiVbpPHp3eDRWbdzUgtkK1a7YiDNyQ==", + "dev": true, + "license": "MIT" + }, + "node_modules/@redocly/openapi-core": { + "version": "1.34.4", + "resolved": "https://registry.npmjs.org/@redocly/openapi-core/-/openapi-core-1.34.4.tgz", + "integrity": "sha512-hf53xEgpXIgWl3b275PgZU3OTpYh1RoD2LHdIfQ1JzBNTWsiNKczTEsI/4Tmh2N1oq9YcphhSMyk3lDh85oDjg==", + "dev": true, + "license": "MIT", + "dependencies": { + "@redocly/ajv": "^8.11.2", + "@redocly/config": "^0.22.0", + "colorette": "^1.2.0", + "https-proxy-agent": "^7.0.5", + "js-levenshtein": "^1.1.6", + "js-yaml": "^4.1.0", + "minimatch": "^5.0.1", + "pluralize": "^8.0.0", + "yaml-ast-parser": "0.0.43" + }, + "engines": { + "node": ">=18.17.0", + "npm": ">=9.5.0" + } + }, + "node_modules/@redocly/respect-core": { + "version": "1.34.4", + "resolved": "https://registry.npmjs.org/@redocly/respect-core/-/respect-core-1.34.4.tgz", + "integrity": "sha512-MitKyKyQpsizA4qCVv+MjXL4WltfhFQAoiKiAzrVR1Kusro3VhYb6yJuzoXjiJhR0ukLP5QOP19Vcs7qmj9dZg==", + "dev": true, + "license": "MIT", + "dependencies": { + "@faker-js/faker": "^7.6.0", + "@redocly/ajv": "8.11.2", + "@redocly/openapi-core": "1.34.4", + "better-ajv-errors": "^1.2.0", + "colorette": "^2.0.20", + "concat-stream": "^2.0.0", + "cookie": "^0.7.2", + "dotenv": "16.4.7", + "form-data": "4.0.0", + "jest-diff": "^29.3.1", + "jest-matcher-utils": "^29.3.1", + "js-yaml": "4.1.0", + "json-pointer": "^0.6.2", + "jsonpath-plus": "^10.0.6", + "open": "^10.1.0", + "openapi-sampler": "^1.6.1", + "outdent": "^0.8.0", + "set-cookie-parser": "^2.3.5", + "undici": "^6.21.1" + }, + "engines": { + "node": ">=18.17.0", + "npm": ">=9.5.0" + } + }, + 
"node_modules/@redocly/respect-core/node_modules/colorette": { + "version": "2.0.20", + "resolved": "https://registry.npmjs.org/colorette/-/colorette-2.0.20.tgz", + "integrity": "sha512-IfEDxwoWIjkeXL1eXcDiow4UbKjhLdq6/EuSVR9GMN7KVH3r9gQ83e73hsz1Nd1T3ijd5xv1wcWRYO+D6kCI2w==", + "dev": true, + "license": "MIT" + }, + "node_modules/@redocly/respect-core/node_modules/form-data": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.0.tgz", + "integrity": "sha512-ETEklSGi5t0QMZuiXoA/Q6vcnxcLQP5vdugSpuAyi6SVGi2clPPp+xgEhuMaHC+zGgn31Kd235W35f7Hykkaww==", + "dev": true, + "license": "MIT", + "dependencies": { + "asynckit": "^0.4.0", + "combined-stream": "^1.0.8", + "mime-types": "^2.1.12" + }, + "engines": { + "node": ">= 6" + } + }, + "node_modules/@sinclair/typebox": { + "version": "0.27.8", + "resolved": "https://registry.npmjs.org/@sinclair/typebox/-/typebox-0.27.8.tgz", + "integrity": "sha512-+Fj43pSMwJs4KRrH/938Uf+uAELIgVBmQzg/q1YG10djyfA3TnrU8N8XzqCh/okZdszqBQTZf96idMfE5lnwTA==", + "dev": true, + "license": "MIT" + }, + "node_modules/@sourcemeta/jsonschema": { + "version": "10.0.0", + "resolved": "https://registry.npmjs.org/@sourcemeta/jsonschema/-/jsonschema-10.0.0.tgz", + "integrity": "sha512-NyRjy3JxFrcDU9zci4fTe4dhoUZu61UNONgxJ13hmhaUAYF51gYvVEoWpDtl1ckikdboMuAm/QVeelh/+B8hGQ==", + "cpu": [ + "x64", + "arm64" + ], + "dev": true, + "license": "AGPL-3.0", + "os": [ + "darwin", + "linux", + "win32" + ], + "bin": { + "jsonschema": "cli.js" + }, + "engines": { + "node": ">=16" + }, + "funding": { + "url": "https://github.com/sponsors/sourcemeta" + } + }, + "node_modules/@types/json-schema": { + "version": "7.0.15", + "resolved": "https://registry.npmjs.org/@types/json-schema/-/json-schema-7.0.15.tgz", + "integrity": "sha512-5+fP8P8MFNC+AyZCDxrB2pkZFPGzqQWUzpSeuuVLvm8VMcorNYavBqoFcxK8bQz4Qsbn4oUEEem4wDLfcysGHA==", + "dev": true, + "license": "MIT" + }, + "node_modules/@types/node": { + "version": "24.0.13", + "resolved": "https://registry.npmjs.org/@types/node/-/node-24.0.13.tgz", + "integrity": "sha512-Qm9OYVOFHFYg3wJoTSrz80hoec5Lia/dPp84do3X7dZvLikQvM1YpmvTBEdIr/e+U8HTkFjLHLnl78K/qjf+jQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "undici-types": "~7.8.0" + } + }, + "node_modules/@types/stylis": { + "version": "4.2.5", + "resolved": "https://registry.npmjs.org/@types/stylis/-/stylis-4.2.5.tgz", + "integrity": "sha512-1Xve+NMN7FWjY14vLoY5tL3BVEQ/n42YLwaqJIPYhotZ9uBHt87VceMwWQpzmdEt2TNXIorIFG+YeCUUW7RInw==", + "dev": true, + "license": "MIT" + }, + "node_modules/@types/trusted-types": { + "version": "2.0.7", + "resolved": "https://registry.npmjs.org/@types/trusted-types/-/trusted-types-2.0.7.tgz", + "integrity": "sha512-ScaPdn1dQczgbl0QFTeTOmVHFULt394XJgOQNoyVhZ6r2vLnMLJfBPd53SB52T/3G36VI1/g2MZaX0cwDuXsfw==", + "dev": true, + "license": "MIT", + "optional": true + }, + "node_modules/abort-controller": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/abort-controller/-/abort-controller-3.0.0.tgz", + "integrity": "sha512-h8lQ8tacZYnR3vNQTgibj+tODHI5/+l06Au2Pcriv/Gmet0eaj4TwWH41sO9wnHDiQsEj19q0drzdWdeAHtweg==", + "dev": true, + "license": "MIT", + "dependencies": { + "event-target-shim": "^5.0.0" + }, + "engines": { + "node": ">=6.5" + } + }, + "node_modules/agent-base": { + "version": "7.1.4", + "resolved": "https://registry.npmjs.org/agent-base/-/agent-base-7.1.4.tgz", + "integrity": "sha512-MnA+YT8fwfJPgBx3m60MNqakm30XOkyIoH1y6huTQvC0PwZG7ki8NacLBcrPbNoo8vEZy7Jpuk7+jMO+CUovTQ==", + "dev": true, + "license": "MIT", + 
"engines": { + "node": ">= 14" + } + }, + "node_modules/ajv": { + "version": "8.17.1", + "resolved": "https://registry.npmjs.org/ajv/-/ajv-8.17.1.tgz", + "integrity": "sha512-B/gBuNg5SiMTrPkC+A2+cW0RszwxYmn6VYxB/inlBStS5nx6xHIt/ehKRhIMhqusl7a8LjQoZnjCs5vhwxOQ1g==", + "dev": true, + "license": "MIT", + "peer": true, + "dependencies": { + "fast-deep-equal": "^3.1.3", + "fast-uri": "^3.0.1", + "json-schema-traverse": "^1.0.0", + "require-from-string": "^2.0.2" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/epoberezkin" + } + }, + "node_modules/ansi-regex": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz", + "integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=8" + } + }, + "node_modules/ansi-styles": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", + "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", + "dev": true, + "license": "MIT", + "dependencies": { + "color-convert": "^2.0.1" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, + "node_modules/anymatch": { + "version": "3.1.3", + "resolved": "https://registry.npmjs.org/anymatch/-/anymatch-3.1.3.tgz", + "integrity": "sha512-KMReFUr0B4t+D+OBkjR3KYqvocp2XaSzO55UcB6mgQMd3KbcE+mWTyvVV7D/zsdEbNnV6acZUutkiHQXvTr1Rw==", + "dev": true, + "license": "ISC", + "dependencies": { + "normalize-path": "^3.0.0", + "picomatch": "^2.0.4" + }, + "engines": { + "node": ">= 8" + } + }, + "node_modules/argparse": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/argparse/-/argparse-2.0.1.tgz", + "integrity": "sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q==", + "dev": true, + "license": "Python-2.0" + }, + "node_modules/asynckit": { + "version": "0.4.0", + "resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz", + "integrity": "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==", + "dev": true, + "license": "MIT" + }, + "node_modules/balanced-match": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.2.tgz", + "integrity": "sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==", + "dev": true, + "license": "MIT" + }, + "node_modules/better-ajv-errors": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/better-ajv-errors/-/better-ajv-errors-1.2.0.tgz", + "integrity": "sha512-UW+IsFycygIo7bclP9h5ugkNH8EjCSgqyFB/yQ4Hqqa1OEYDtb0uFIkYE0b6+CjkgJYVM5UKI/pJPxjYe9EZlA==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@babel/code-frame": "^7.16.0", + "@humanwhocodes/momoa": "^2.0.2", + "chalk": "^4.1.2", + "jsonpointer": "^5.0.0", + "leven": "^3.1.0 < 4" + }, + "engines": { + "node": ">= 12.13.0" + }, + "peerDependencies": { + "ajv": "4.11.8 - 8" + } + }, + "node_modules/binary-extensions": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/binary-extensions/-/binary-extensions-2.3.0.tgz", + "integrity": "sha512-Ceh+7ox5qe7LJuLHoY0feh3pHuUDHAcRUeyL2VYghZwfpkNIy/+8Ocg0a3UuSoYzavmylwuLWQOf3hl0jjMMIw==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=8" + }, + "funding": { + "url": 
"https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/brace-expansion": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.2.tgz", + "integrity": "sha512-Jt0vHyM+jmUBqojB7E1NIYadt0vI0Qxjxd2TErW94wDz+E2LAm5vKMXXwg6ZZBTHPuUlDgQHKXvjGBdfcF1ZDQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "balanced-match": "^1.0.0" + } + }, + "node_modules/braces": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/braces/-/braces-3.0.3.tgz", + "integrity": "sha512-yQbXgO/OSZVD2IsiLlro+7Hf6Q18EJrKSEsdoMzKePKXct3gvD8oLcOQdIzGupr5Fj+EDe8gO/lxc1BzfMpxvA==", + "dev": true, + "license": "MIT", + "dependencies": { + "fill-range": "^7.1.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/buffer-from": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/buffer-from/-/buffer-from-1.1.2.tgz", + "integrity": "sha512-E+XQCRwSbaaiChtv6k6Dwgc+bx+Bs6vuKJHHl5kox/BaKbhiXzqQOwK4cO22yElGp2OCmjwVhT3HmxgyPGnJfQ==", + "dev": true, + "license": "MIT" + }, + "node_modules/bundle-name": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/bundle-name/-/bundle-name-4.1.0.tgz", + "integrity": "sha512-tjwM5exMg6BGRI+kNmTntNsvdZS1X8BFYS6tnJ2hdH0kVxM6/eVZ2xy+FqStSWvYmtfFMDLIxurorHwDKfDz5Q==", + "dev": true, + "license": "MIT", + "dependencies": { + "run-applescript": "^7.0.0" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/call-bind-apply-helpers": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/call-bind-apply-helpers/-/call-bind-apply-helpers-1.0.2.tgz", + "integrity": "sha512-Sp1ablJ0ivDkSzjcaJdxEunN5/XvksFJ2sMBFfq6x0ryhQV/2b/KwFe21cMpmHtPOSij8K99/wSfoEuTObmuMQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "es-errors": "^1.3.0", + "function-bind": "^1.1.2" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/call-me-maybe": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/call-me-maybe/-/call-me-maybe-1.0.2.tgz", + "integrity": "sha512-HpX65o1Hnr9HH25ojC1YGs7HCQLq0GCOibSaWER0eNpgJ/Z1MZv2mTc7+xh6WOPxbRVcmgbv4hGU+uSQ/2xFZQ==", + "dev": true, + "license": "MIT" + }, + "node_modules/camelize": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/camelize/-/camelize-1.0.1.tgz", + "integrity": "sha512-dU+Tx2fsypxTgtLoE36npi3UqcjSSMNYfkqgmoEhtZrraP5VWq0K7FkWVTYa8eMPtnU/G2txVsfdCJTn9uzpuQ==", + "dev": true, + "license": "MIT", + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/chalk": { + "version": "4.1.2", + "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", + "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==", + "dev": true, + "license": "MIT", + "dependencies": { + "ansi-styles": "^4.1.0", + "supports-color": "^7.1.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/chalk/chalk?sponsor=1" + } + }, + "node_modules/chokidar": { + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/chokidar/-/chokidar-3.6.0.tgz", + "integrity": "sha512-7VT13fmjotKpGipCW9JEQAusEPE+Ei8nl6/g4FBAmIm0GOOLMua9NDDo/DWp0ZAxCr3cPq5ZpBqmPAQgDda2Pw==", + "dev": true, + "license": "MIT", + "dependencies": { + "anymatch": "~3.1.2", + "braces": "~3.0.2", + "glob-parent": "~5.1.2", + "is-binary-path": "~2.1.0", + "is-glob": "~4.0.1", + "normalize-path": "~3.0.0", + "readdirp": "~3.6.0" + }, + "engines": { + "node": ">= 8.10.0" + 
}, + "funding": { + "url": "https://paulmillr.com/funding/" + }, + "optionalDependencies": { + "fsevents": "~2.3.2" + } + }, + "node_modules/classnames": { + "version": "2.5.1", + "resolved": "https://registry.npmjs.org/classnames/-/classnames-2.5.1.tgz", + "integrity": "sha512-saHYOzhIQs6wy2sVxTM6bUDsQO4F50V9RQ22qBpEdCW+I+/Wmke2HOl6lS6dTpdxVhb88/I6+Hs+438c3lfUow==", + "dev": true, + "license": "MIT" + }, + "node_modules/cliui": { + "version": "7.0.4", + "resolved": "https://registry.npmjs.org/cliui/-/cliui-7.0.4.tgz", + "integrity": "sha512-OcRE68cOsVMXp1Yvonl/fzkQOyjLSu/8bhPDfQt0e0/Eb283TKP20Fs2MqoPsr9SwA595rRCA+QMzYc9nBP+JQ==", + "dev": true, + "license": "ISC", + "dependencies": { + "string-width": "^4.2.0", + "strip-ansi": "^6.0.0", + "wrap-ansi": "^7.0.0" + } + }, + "node_modules/clsx": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/clsx/-/clsx-2.1.1.tgz", + "integrity": "sha512-eYm0QWBtUrBWZWG0d386OGAw16Z995PiOVo2B7bjWSbHedGl5e0ZWaq65kOGgUSNesEIDkB9ISbTg/JK9dhCZA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6" + } + }, + "node_modules/color-convert": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz", + "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "color-name": "~1.1.4" + }, + "engines": { + "node": ">=7.0.0" + } + }, + "node_modules/color-name": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz", + "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==", + "dev": true, + "license": "MIT" + }, + "node_modules/colorette": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/colorette/-/colorette-1.4.0.tgz", + "integrity": "sha512-Y2oEozpomLn7Q3HFP7dpww7AtMJplbM9lGZP6RDfHqmbeRjiwRg4n6VM6j4KLmRke85uWEI7JqF17f3pqdRA0g==", + "dev": true, + "license": "MIT" + }, + "node_modules/combined-stream": { + "version": "1.0.8", + "resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz", + "integrity": "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==", + "dev": true, + "license": "MIT", + "dependencies": { + "delayed-stream": "~1.0.0" + }, + "engines": { + "node": ">= 0.8" + } + }, + "node_modules/concat-map": { + "version": "0.0.1", + "resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz", + "integrity": "sha512-/Srv4dswyQNBfohGpz9o6Yb3Gz3SrUDqBH5rTuhGR7ahtlbYKnVxw2bCFMRljaA7EXHaXZ8wsHdodFvbkhKmqg==", + "dev": true, + "license": "MIT" + }, + "node_modules/concat-stream": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/concat-stream/-/concat-stream-2.0.0.tgz", + "integrity": "sha512-MWufYdFw53ccGjCA+Ol7XJYpAlW6/prSMzuPOTRnJGcGzuhLn4Scrz7qf6o8bROZ514ltazcIFJZevcfbo0x7A==", + "dev": true, + "engines": [ + "node >= 6.0" + ], + "license": "MIT", + "dependencies": { + "buffer-from": "^1.0.0", + "inherits": "^2.0.3", + "readable-stream": "^3.0.2", + "typedarray": "^0.0.6" + } + }, + "node_modules/cookie": { + "version": "0.7.2", + "resolved": "https://registry.npmjs.org/cookie/-/cookie-0.7.2.tgz", + "integrity": "sha512-yki5XnKuf750l50uGTllt6kKILY4nQ1eNIQatoXEByZ5dWgnKqbnqmTrBE5B4N7lrMJKQ2ytWMiTO2o0v6Ew/w==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/core-js": { + "version": "3.44.0", + "resolved": 
"https://registry.npmjs.org/core-js/-/core-js-3.44.0.tgz", + "integrity": "sha512-aFCtd4l6GvAXwVEh3XbbVqJGHDJt0OZRa+5ePGx3LLwi12WfexqQxcsohb2wgsa/92xtl19Hd66G/L+TaAxDMw==", + "dev": true, + "hasInstallScript": true, + "license": "MIT", + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/core-js" + } + }, + "node_modules/css-color-keywords": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/css-color-keywords/-/css-color-keywords-1.0.0.tgz", + "integrity": "sha512-FyyrDHZKEjXDpNJYvVsV960FiqQyXc/LlYmsxl2BcdMb2WPx0OGRVgTg55rPSyLSNMqP52R9r8geSp7apN3Ofg==", + "dev": true, + "license": "ISC", + "engines": { + "node": ">=4" + } + }, + "node_modules/css-to-react-native": { + "version": "3.2.0", + "resolved": "https://registry.npmjs.org/css-to-react-native/-/css-to-react-native-3.2.0.tgz", + "integrity": "sha512-e8RKaLXMOFii+02mOlqwjbD00KSEKqblnpO9e++1aXS1fPQOpS1YoqdVHBqPjHNoxeF2mimzVqawm2KCbEdtHQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "camelize": "^1.0.0", + "css-color-keywords": "^1.0.0", + "postcss-value-parser": "^4.0.2" + } + }, + "node_modules/csstype": { + "version": "3.1.3", + "resolved": "https://registry.npmjs.org/csstype/-/csstype-3.1.3.tgz", + "integrity": "sha512-M1uQkMl8rQK/szD0LNhtqxIPLpimGm8sOBwU7lLnCpSbTyY3yeU1Vc7l4KT5zT4s/yOxHH5O7tIuuLOCnLADRw==", + "dev": true, + "license": "MIT" + }, + "node_modules/debug": { + "version": "4.4.1", + "resolved": "https://registry.npmjs.org/debug/-/debug-4.4.1.tgz", + "integrity": "sha512-KcKCqiftBJcZr++7ykoDIEwSa3XWowTfNPo92BYxjXiyYEVrUQh2aLyhxBCwww+heortUFxEJYcRzosstTEBYQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "ms": "^2.1.3" + }, + "engines": { + "node": ">=6.0" + }, + "peerDependenciesMeta": { + "supports-color": { + "optional": true + } + } + }, + "node_modules/decko": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/decko/-/decko-1.2.0.tgz", + "integrity": "sha512-m8FnyHXV1QX+S1cl+KPFDIl6NMkxtKsy6+U/aYyjrOqWMuwAwYWu7ePqrsUHtDR5Y8Yk2pi/KIDSgF+vT4cPOQ==", + "dev": true + }, + "node_modules/default-browser": { + "version": "5.2.1", + "resolved": "https://registry.npmjs.org/default-browser/-/default-browser-5.2.1.tgz", + "integrity": "sha512-WY/3TUME0x3KPYdRRxEJJvXRHV4PyPoUsxtZa78lwItwRQRHhd2U9xOscaT/YTf8uCXIAjeJOFBVEh/7FtD8Xg==", + "dev": true, + "license": "MIT", + "dependencies": { + "bundle-name": "^4.1.0", + "default-browser-id": "^5.0.0" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/default-browser-id": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/default-browser-id/-/default-browser-id-5.0.0.tgz", + "integrity": "sha512-A6p/pu/6fyBcA1TRz/GqWYPViplrftcW2gZC9q79ngNCKAeR/X3gcEdXQHl4KNXV+3wgIJ1CPkJQ3IHM6lcsyA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/define-lazy-prop": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/define-lazy-prop/-/define-lazy-prop-3.0.0.tgz", + "integrity": "sha512-N+MeXYoqr3pOgn8xfyRPREN7gHakLYjhsHhWGT3fWAiL4IkAt0iDw14QiiEm2bE30c5XX5q0FtAA3CK5f9/BUg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/delayed-stream": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz", + "integrity": 
"sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=0.4.0" + } + }, + "node_modules/diff-sequences": { + "version": "29.6.3", + "resolved": "https://registry.npmjs.org/diff-sequences/-/diff-sequences-29.6.3.tgz", + "integrity": "sha512-EjePK1srD3P08o2j4f0ExnylqRs5B9tJjcp9t1krH2qRi8CCdsYfwe9JgSLurFBWwq4uOlipzfk5fHNvwFKr8Q==", + "dev": true, + "license": "MIT", + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + } + }, + "node_modules/dompurify": { + "version": "3.2.6", + "resolved": "https://registry.npmjs.org/dompurify/-/dompurify-3.2.6.tgz", + "integrity": "sha512-/2GogDQlohXPZe6D6NOgQvXLPSYBqIWMnZ8zzOhn09REE4eyAzb+Hed3jhoM9OkuaJ8P6ZGTTVWQKAi8ieIzfQ==", + "dev": true, + "license": "(MPL-2.0 OR Apache-2.0)", + "optionalDependencies": { + "@types/trusted-types": "^2.0.7" + } + }, + "node_modules/dotenv": { + "version": "16.4.7", + "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.4.7.tgz", + "integrity": "sha512-47qPchRCykZC03FhkYAhrvwU4xDBFIj1QPqaarj6mdM/hgUzfPHcpkHJOn3mJAufFeeAxAzeGsr5X0M4k6fLZQ==", + "dev": true, + "license": "BSD-2-Clause", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://dotenvx.com" + } + }, + "node_modules/dunder-proto": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/dunder-proto/-/dunder-proto-1.0.1.tgz", + "integrity": "sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bind-apply-helpers": "^1.0.1", + "es-errors": "^1.3.0", + "gopd": "^1.2.0" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/emoji-regex": { + "version": "8.0.0", + "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-8.0.0.tgz", + "integrity": "sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==", + "dev": true, + "license": "MIT" + }, + "node_modules/es-define-property": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/es-define-property/-/es-define-property-1.0.1.tgz", + "integrity": "sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/es-errors": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/es-errors/-/es-errors-1.3.0.tgz", + "integrity": "sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/es-object-atoms": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/es-object-atoms/-/es-object-atoms-1.1.1.tgz", + "integrity": "sha512-FGgH2h8zKNim9ljj7dankFPcICIK9Cp5bm+c2gQSYePhpaG5+esrLODihIorn+Pe6FGJzWhXQotPv73jTaldXA==", + "dev": true, + "license": "MIT", + "dependencies": { + "es-errors": "^1.3.0" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/es-set-tostringtag": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/es-set-tostringtag/-/es-set-tostringtag-2.1.0.tgz", + "integrity": "sha512-j6vWzfrGVfyXxge+O0x5sh6cvxAog0a/4Rdd2K36zCMV5eJ+/+tOAngRO8cODMNWbVRdVlmGZQL2YS3yR8bIUA==", + "dev": true, + "license": "MIT", + "dependencies": { + "es-errors": "^1.3.0", + "get-intrinsic": "^1.2.6", + "has-tostringtag": "^1.0.2", + "hasown": "^2.0.2" + }, + "engines": { + "node": ">= 0.4" + } + }, + 
"node_modules/es6-promise": { + "version": "3.3.1", + "resolved": "https://registry.npmjs.org/es6-promise/-/es6-promise-3.3.1.tgz", + "integrity": "sha512-SOp9Phqvqn7jtEUxPWdWfWoLmyt2VaJ6MpvP9Comy1MceMXqE6bxvaTu4iaxpYYPzhny28Lc+M87/c2cPK6lDg==", + "dev": true, + "license": "MIT" + }, + "node_modules/escalade": { + "version": "3.2.0", + "resolved": "https://registry.npmjs.org/escalade/-/escalade-3.2.0.tgz", + "integrity": "sha512-WUj2qlxaQtO4g6Pq5c29GTcWGDyd8itL8zTlipgECz3JesAiiOKotd8JU6otB3PACgG6xkJUyVhboMS+bje/jA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6" + } + }, + "node_modules/event-target-shim": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/event-target-shim/-/event-target-shim-5.0.1.tgz", + "integrity": "sha512-i/2XbnSz/uxRCU6+NdVJgKWDTM427+MqYbkQzD321DuCQJUqOuJKIA0IM2+W2xtYHdKOmZ4dR6fExsd4SXL+WQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6" + } + }, + "node_modules/eventemitter3": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/eventemitter3/-/eventemitter3-5.0.1.tgz", + "integrity": "sha512-GWkBvjiSZK87ELrYOSESUYeVIc9mvLLf/nXalMOS5dYrgZq9o5OVkbZAVM06CVxYsCwH9BDZFPlQTlPA1j4ahA==", + "dev": true, + "license": "MIT" + }, + "node_modules/fast-deep-equal": { + "version": "3.1.3", + "resolved": "https://registry.npmjs.org/fast-deep-equal/-/fast-deep-equal-3.1.3.tgz", + "integrity": "sha512-f3qQ9oQy9j2AhBe/H9VC91wLmKBCCU/gDOnKNAYG5hswO7BLKj09Hc5HYNz9cGI++xlpDCIgDaitVs03ATR84Q==", + "dev": true, + "license": "MIT" + }, + "node_modules/fast-safe-stringify": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/fast-safe-stringify/-/fast-safe-stringify-2.1.1.tgz", + "integrity": "sha512-W+KJc2dmILlPplD/H4K9l9LcAHAfPtP6BY84uVLXQ6Evcz9Lcg33Y2z1IVblT6xdY54PXYVHEv+0Wpq8Io6zkA==", + "dev": true, + "license": "MIT" + }, + "node_modules/fast-uri": { + "version": "3.0.6", + "resolved": "https://registry.npmjs.org/fast-uri/-/fast-uri-3.0.6.tgz", + "integrity": "sha512-Atfo14OibSv5wAp4VWNsFYE1AchQRTv9cBGWET4pZWHzYshFSS9NQI6I57rdKn9croWVMbYFbLhJ+yJvmZIIHw==", + "dev": true, + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/fastify" + }, + { + "type": "opencollective", + "url": "https://opencollective.com/fastify" + } + ], + "license": "BSD-3-Clause", + "peer": true + }, + "node_modules/fast-xml-parser": { + "version": "4.5.3", + "resolved": "https://registry.npmjs.org/fast-xml-parser/-/fast-xml-parser-4.5.3.tgz", + "integrity": "sha512-RKihhV+SHsIUGXObeVy9AXiBbFwkVk7Syp8XgwN5U3JV416+Gwp/GO9i0JYKmikykgz/UHRrrV4ROuZEo/T0ig==", + "dev": true, + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/NaturalIntelligence" + } + ], + "license": "MIT", + "dependencies": { + "strnum": "^1.1.1" + }, + "bin": { + "fxparser": "src/cli/cli.js" + } + }, + "node_modules/fill-range": { + "version": "7.1.1", + "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-7.1.1.tgz", + "integrity": "sha512-YsGpe3WHLK8ZYi4tWDg2Jy3ebRz2rXowDxnld4bkQB00cc/1Zw9AWnC0i9ztDJitivtQvaI9KaLyKrc+hBW0yg==", + "dev": true, + "license": "MIT", + "dependencies": { + "to-regex-range": "^5.0.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/foreach": { + "version": "2.0.6", + "resolved": "https://registry.npmjs.org/foreach/-/foreach-2.0.6.tgz", + "integrity": "sha512-k6GAGDyqLe9JaebCsFCoudPPWfihKu8pylYXRlqP1J7ms39iPoTtk2fviNglIeQEwdh0bQeKJ01ZPyuyQvKzwg==", + "dev": true, + "license": "MIT" + }, + "node_modules/form-data": { + "version": "4.0.3", + "resolved": 
"https://registry.npmjs.org/form-data/-/form-data-4.0.3.tgz", + "integrity": "sha512-qsITQPfmvMOSAdeyZ+12I1c+CKSstAFAwu+97zrnWAbIr5u8wfsExUzCesVLC8NgHuRUqNN4Zy6UPWUTRGslcA==", + "dev": true, + "license": "MIT", + "dependencies": { + "asynckit": "^0.4.0", + "combined-stream": "^1.0.8", + "es-set-tostringtag": "^2.1.0", + "hasown": "^2.0.2", + "mime-types": "^2.1.12" + }, + "engines": { + "node": ">= 6" + } + }, + "node_modules/fs.realpath": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz", + "integrity": "sha512-OO0pH2lK6a0hZnAdau5ItzHPI6pUlvI7jMVnxUQRtw4owF2wk8lOSabtGDCTP4Ggrg2MbGnWO9X8K1t4+fGMDw==", + "dev": true, + "license": "ISC" + }, + "node_modules/fsevents": { + "version": "2.3.3", + "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz", + "integrity": "sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==", + "dev": true, + "hasInstallScript": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": "^8.16.0 || ^10.6.0 || >=11.0.0" + } + }, + "node_modules/function-bind": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.2.tgz", + "integrity": "sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==", + "dev": true, + "license": "MIT", + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/get-caller-file": { + "version": "2.0.5", + "resolved": "https://registry.npmjs.org/get-caller-file/-/get-caller-file-2.0.5.tgz", + "integrity": "sha512-DyFP3BM/3YHTQOCUL/w0OZHR0lpKeGrxotcHWcqNEdnltqFwXVfhEBQ94eIo34AfQpo0rGki4cyIiftY06h2Fg==", + "dev": true, + "license": "ISC", + "engines": { + "node": "6.* || 8.* || >= 10.*" + } + }, + "node_modules/get-intrinsic": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.3.0.tgz", + "integrity": "sha512-9fSjSaos/fRIVIp+xSJlE6lfwhES7LNtKaCBIamHsjr2na1BiABJPo0mOjjz8GJDURarmCPGqaiVg5mfjb98CQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bind-apply-helpers": "^1.0.2", + "es-define-property": "^1.0.1", + "es-errors": "^1.3.0", + "es-object-atoms": "^1.1.1", + "function-bind": "^1.1.2", + "get-proto": "^1.0.1", + "gopd": "^1.2.0", + "has-symbols": "^1.1.0", + "hasown": "^2.0.2", + "math-intrinsics": "^1.1.0" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/get-port-please": { + "version": "3.2.0", + "resolved": "https://registry.npmjs.org/get-port-please/-/get-port-please-3.2.0.tgz", + "integrity": "sha512-I9QVvBw5U/hw3RmWpYKRumUeaDgxTPd401x364rLmWBJcOQ753eov1eTgzDqRG9bqFIfDc7gfzcQEWrUri3o1A==", + "dev": true, + "license": "MIT" + }, + "node_modules/get-proto": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/get-proto/-/get-proto-1.0.1.tgz", + "integrity": "sha512-sTSfBjoXBp89JvIKIefqw7U2CCebsc74kiY6awiGogKtoSGbgjYE/G/+l9sF3MWFPNc9IcoOC4ODfKHfxFmp0g==", + "dev": true, + "license": "MIT", + "dependencies": { + "dunder-proto": "^1.0.1", + "es-object-atoms": "^1.0.0" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/glob": { + "version": "7.2.3", + "resolved": "https://registry.npmjs.org/glob/-/glob-7.2.3.tgz", + "integrity": "sha512-nFR0zLpU2YCaRxwoCJvL6UvCH2JFyFVIvwTLsIf21AuHlMskA1hhTdk+LlYJtOlYt9v6dvszD2BGRqBL+iQK9Q==", + "deprecated": "Glob versions prior to v9 are no longer supported", + "dev": true, + 
"license": "ISC", + "dependencies": { + "fs.realpath": "^1.0.0", + "inflight": "^1.0.4", + "inherits": "2", + "minimatch": "^3.1.1", + "once": "^1.3.0", + "path-is-absolute": "^1.0.0" + }, + "engines": { + "node": "*" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, + "node_modules/glob-parent": { + "version": "5.1.2", + "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-5.1.2.tgz", + "integrity": "sha512-AOIgSQCepiJYwP3ARnGx+5VnTu2HBYdzbGP45eLw1vr3zB3vZLeyed1sC9hnbcOc9/SrMyM5RPQrkGz4aS9Zow==", + "dev": true, + "license": "ISC", + "dependencies": { + "is-glob": "^4.0.1" + }, + "engines": { + "node": ">= 6" + } + }, + "node_modules/glob/node_modules/brace-expansion": { + "version": "1.1.12", + "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.12.tgz", + "integrity": "sha512-9T9UjW3r0UW5c1Q7GTwllptXwhvYmEzFhzMfZ9H7FQWt+uZePjZPjBP/W1ZEyZ1twGWom5/56TF4lPcqjnDHcg==", + "dev": true, + "license": "MIT", + "dependencies": { + "balanced-match": "^1.0.0", + "concat-map": "0.0.1" + } + }, + "node_modules/glob/node_modules/minimatch": { + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.1.2.tgz", + "integrity": "sha512-J7p63hRiAjw1NDEww1W7i37+ByIrOWO5XQQAzZ3VOcL0PNybwpfmV/N05zFAzwQ9USyEcX6t3UO+K5aqBQOIHw==", + "dev": true, + "license": "ISC", + "dependencies": { + "brace-expansion": "^1.1.7" + }, + "engines": { + "node": "*" + } + }, + "node_modules/gopd": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/gopd/-/gopd-1.2.0.tgz", + "integrity": "sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/handlebars": { + "version": "4.7.8", + "resolved": "https://registry.npmjs.org/handlebars/-/handlebars-4.7.8.tgz", + "integrity": "sha512-vafaFqs8MZkRrSX7sFVUdo3ap/eNiLnb4IakshzvP56X5Nr1iGKAIqdX6tMlm6HcNRIkr6AxO5jFEoJzzpT8aQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "minimist": "^1.2.5", + "neo-async": "^2.6.2", + "source-map": "^0.6.1", + "wordwrap": "^1.0.0" + }, + "bin": { + "handlebars": "bin/handlebars" + }, + "engines": { + "node": ">=0.4.7" + }, + "optionalDependencies": { + "uglify-js": "^3.1.4" + } + }, + "node_modules/has-flag": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz", + "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=8" + } + }, + "node_modules/has-symbols": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/has-symbols/-/has-symbols-1.1.0.tgz", + "integrity": "sha512-1cDNdwJ2Jaohmb3sg4OmKaMBwuC48sYni5HUw2DvsC8LjGTLK9h+eb1X6RyuOHe4hT0ULCW68iomhjUoKUqlPQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/has-tostringtag": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/has-tostringtag/-/has-tostringtag-1.0.2.tgz", + "integrity": "sha512-NqADB8VjPFLM2V0VvHUewwwsw0ZWBaIdgo+ieHtK3hasLz4qeCRjYcqfB6AQrBggRKppKF8L52/VqdVsO47Dlw==", + "dev": true, + "license": "MIT", + "dependencies": { + "has-symbols": "^1.0.3" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + 
"node_modules/hasown": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/hasown/-/hasown-2.0.2.tgz", + "integrity": "sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "function-bind": "^1.1.2" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/http2-client": { + "version": "1.3.5", + "resolved": "https://registry.npmjs.org/http2-client/-/http2-client-1.3.5.tgz", + "integrity": "sha512-EC2utToWl4RKfs5zd36Mxq7nzHHBuomZboI0yYL6Y0RmBgT7Sgkq4rQ0ezFTYoIsSs7Tm9SJe+o2FcAg6GBhGA==", + "dev": true, + "license": "MIT" + }, + "node_modules/https-proxy-agent": { + "version": "7.0.6", + "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-7.0.6.tgz", + "integrity": "sha512-vK9P5/iUfdl95AI+JVyUuIcVtd4ofvtrOr3HNtM2yxC9bnMbEdp3x01OhQNnjb8IJYi38VlTE3mBXwcfvywuSw==", + "dev": true, + "license": "MIT", + "dependencies": { + "agent-base": "^7.1.2", + "debug": "4" + }, + "engines": { + "node": ">= 14" + } + }, + "node_modules/inflight": { + "version": "1.0.6", + "resolved": "https://registry.npmjs.org/inflight/-/inflight-1.0.6.tgz", + "integrity": "sha512-k92I/b08q4wvFscXCLvqfsHCrjrF7yiXsQuIVvVE7N82W3+aqpzuUdBbfhWcy/FZR3/4IgflMgKLOsvPDrGCJA==", + "deprecated": "This module is not supported, and leaks memory. Do not use it. Check out lru-cache if you want a good and tested way to coalesce async requests by a key value, which is much more comprehensive and powerful.", + "dev": true, + "license": "ISC", + "dependencies": { + "once": "^1.3.0", + "wrappy": "1" + } + }, + "node_modules/inherits": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz", + "integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==", + "dev": true, + "license": "ISC" + }, + "node_modules/is-binary-path": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/is-binary-path/-/is-binary-path-2.1.0.tgz", + "integrity": "sha512-ZMERYes6pDydyuGidse7OsHxtbI7WVeUEozgR/g7rd0xUimYNlvZRE/K2MgZTjWy725IfelLeVcEM97mmtRGXw==", + "dev": true, + "license": "MIT", + "dependencies": { + "binary-extensions": "^2.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/is-docker": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/is-docker/-/is-docker-3.0.0.tgz", + "integrity": "sha512-eljcgEDlEns/7AXFosB5K/2nCM4P7FQPkGc/DWLy5rmFEWvZayGrik1d9/QIY5nJ4f9YsVvBkA6kJpHn9rISdQ==", + "dev": true, + "license": "MIT", + "bin": { + "is-docker": "cli.js" + }, + "engines": { + "node": "^12.20.0 || ^14.13.1 || >=16.0.0" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/is-extglob": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-2.1.1.tgz", + "integrity": "sha512-SbKbANkN603Vi4jEZv49LeVJMn4yGwsbzZworEoyEiutsN3nJYdbO36zfhGJ6QEDpOZIFkDtnq5JRxmvl3jsoQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/is-fullwidth-code-point": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-3.0.0.tgz", + "integrity": "sha512-zymm5+u+sCsSWyD9qNaejV3DFvhCKclKdizYaJUuHA83RLjb7nSuGnddCHGv0hk+KY7BMAlsWeK4Ueg6EV6XQg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=8" + } + }, + "node_modules/is-glob": { + "version": "4.0.3", + "resolved": 
"https://registry.npmjs.org/is-glob/-/is-glob-4.0.3.tgz", + "integrity": "sha512-xelSayHH36ZgE7ZWhli7pW34hNbNl8Ojv5KVmkJD4hBdD3th8Tfk9vYasLM+mXWOZhFkgZfxhLSnrwRr4elSSg==", + "dev": true, + "license": "MIT", + "dependencies": { + "is-extglob": "^2.1.1" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/is-inside-container": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/is-inside-container/-/is-inside-container-1.0.0.tgz", + "integrity": "sha512-KIYLCCJghfHZxqjYBE7rEy0OBuTd5xCHS7tHVgvCLkx7StIoaxwNW3hCALgEUjFfeRk+MG/Qxmp/vtETEF3tRA==", + "dev": true, + "license": "MIT", + "dependencies": { + "is-docker": "^3.0.0" + }, + "bin": { + "is-inside-container": "cli.js" + }, + "engines": { + "node": ">=14.16" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/is-number": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/is-number/-/is-number-7.0.0.tgz", + "integrity": "sha512-41Cifkg6e8TylSpdtTpeLVMqvSBEVzTttHvERD741+pnZ8ANv0004MRL43QKPDlK9cGvNp6NZWZUBlbGXYxxng==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=0.12.0" + } + }, + "node_modules/is-wsl": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/is-wsl/-/is-wsl-3.1.0.tgz", + "integrity": "sha512-UcVfVfaK4Sc4m7X3dUSoHoozQGBEFeDC+zVo06t98xe8CzHSZZBekNXH+tu0NalHolcJ/QAGqS46Hef7QXBIMw==", + "dev": true, + "license": "MIT", + "dependencies": { + "is-inside-container": "^1.0.0" + }, + "engines": { + "node": ">=16" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/jest-diff": { + "version": "29.7.0", + "resolved": "https://registry.npmjs.org/jest-diff/-/jest-diff-29.7.0.tgz", + "integrity": "sha512-LMIgiIrhigmPrs03JHpxUh2yISK3vLFPkAodPeo0+BuF7wA2FoQbkEg1u8gBYBThncu7e1oEDUfIXVuTqLRUjw==", + "dev": true, + "license": "MIT", + "dependencies": { + "chalk": "^4.0.0", + "diff-sequences": "^29.6.3", + "jest-get-type": "^29.6.3", + "pretty-format": "^29.7.0" + }, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + } + }, + "node_modules/jest-get-type": { + "version": "29.6.3", + "resolved": "https://registry.npmjs.org/jest-get-type/-/jest-get-type-29.6.3.tgz", + "integrity": "sha512-zrteXnqYxfQh7l5FHyL38jL39di8H8rHoecLH3JNxH3BwOrBsNeabdap5e0I23lD4HHI8W5VFBZqG4Eaq5LNcw==", + "dev": true, + "license": "MIT", + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + } + }, + "node_modules/jest-matcher-utils": { + "version": "29.7.0", + "resolved": "https://registry.npmjs.org/jest-matcher-utils/-/jest-matcher-utils-29.7.0.tgz", + "integrity": "sha512-sBkD+Xi9DtcChsI3L3u0+N0opgPYnCRPtGcQYrgXmR+hmt/fYfWAL0xRXYU8eWOdfuLgBe0YCW3AFtnRLagq/g==", + "dev": true, + "license": "MIT", + "dependencies": { + "chalk": "^4.0.0", + "jest-diff": "^29.7.0", + "jest-get-type": "^29.6.3", + "pretty-format": "^29.7.0" + }, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + } + }, + "node_modules/js-levenshtein": { + "version": "1.1.6", + "resolved": "https://registry.npmjs.org/js-levenshtein/-/js-levenshtein-1.1.6.tgz", + "integrity": "sha512-X2BB11YZtrRqY4EnQcLX5Rh373zbK4alC1FW7D7MBhL2gtcC17cTnr6DmfHZeS0s2rTHjUTMMHfG7gO8SSdw+g==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/js-tokens": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/js-tokens/-/js-tokens-4.0.0.tgz", + "integrity": "sha512-RdJUflcE3cUzKiMqQgsCu06FPu9UdIJO0beYbPhHN4k6apgJtifcoCtT9bcxOpYBtpD2kCM6Sbzg4CausW/PKQ==", + "dev": true, + "license": "MIT" + }, 
+ "node_modules/js-yaml": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-4.1.0.tgz", + "integrity": "sha512-wpxZs9NoxZaJESJGIZTyDEaYpl0FKSA+FB9aJiyemKhMwkxQg63h4T1KJgUGHpTqPDNRcmmYLugrRjJlBtWvRA==", + "dev": true, + "license": "MIT", + "dependencies": { + "argparse": "^2.0.1" + }, + "bin": { + "js-yaml": "bin/js-yaml.js" + } + }, + "node_modules/jsep": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/jsep/-/jsep-1.4.0.tgz", + "integrity": "sha512-B7qPcEVE3NVkmSJbaYxvv4cHkVW7DQsZz13pUMrfS8z8Q/BuShN+gcTXrUlPiGqM2/t/EEaI030bpxMqY8gMlw==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 10.16.0" + } + }, + "node_modules/json-pointer": { + "version": "0.6.2", + "resolved": "https://registry.npmjs.org/json-pointer/-/json-pointer-0.6.2.tgz", + "integrity": "sha512-vLWcKbOaXlO+jvRy4qNd+TI1QUPZzfJj1tpJ3vAXDych5XJf93ftpUKe5pKCrzyIIwgBJcOcCVRUfqQP25afBw==", + "dev": true, + "license": "MIT", + "dependencies": { + "foreach": "^2.0.4" + } + }, + "node_modules/json-schema-traverse": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz", + "integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==", + "dev": true, + "license": "MIT" + }, + "node_modules/jsonpath-plus": { + "version": "10.3.0", + "resolved": "https://registry.npmjs.org/jsonpath-plus/-/jsonpath-plus-10.3.0.tgz", + "integrity": "sha512-8TNmfeTCk2Le33A3vRRwtuworG/L5RrgMvdjhKZxvyShO+mBu2fP50OWUjRLNtvw344DdDarFh9buFAZs5ujeA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@jsep-plugin/assignment": "^1.3.0", + "@jsep-plugin/regex": "^1.0.4", + "jsep": "^1.4.0" + }, + "bin": { + "jsonpath": "bin/jsonpath-cli.js", + "jsonpath-plus": "bin/jsonpath-cli.js" + }, + "engines": { + "node": ">=18.0.0" + } + }, + "node_modules/jsonpointer": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/jsonpointer/-/jsonpointer-5.0.1.tgz", + "integrity": "sha512-p/nXbhSEcu3pZRdkW1OfJhpsVtW1gd4Wa1fnQc9YLiTfAjn0312eMKimbdIQzuZl9aa9xUGaRlP9T/CJE/ditQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/leven": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/leven/-/leven-3.1.0.tgz", + "integrity": "sha512-qsda+H8jTaUaN/x5vzW2rzc+8Rw4TAQ/4KjB46IwK5VH+IlVeeeje/EoZRpiXvIqjFgK84QffqPztGI3VBLG1A==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6" + } + }, + "node_modules/long": { + "version": "5.3.2", + "resolved": "https://registry.npmjs.org/long/-/long-5.3.2.tgz", + "integrity": "sha512-mNAgZ1GmyNhD7AuqnTG3/VQ26o760+ZYBPKjPvugO8+nLbYfX6TVpJPseBvopbdY+qpZ/lKUnmEc1LeZYS3QAA==", + "dev": true, + "license": "Apache-2.0" + }, + "node_modules/loose-envify": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/loose-envify/-/loose-envify-1.4.0.tgz", + "integrity": "sha512-lyuxPGr/Wfhrlem2CL/UcnUc1zcqKAImBDzukY7Y5F/yQiNdko6+fRLevlw1HgMySw7f611UIY408EtxRSoK3Q==", + "dev": true, + "license": "MIT", + "dependencies": { + "js-tokens": "^3.0.0 || ^4.0.0" + }, + "bin": { + "loose-envify": "cli.js" + } + }, + "node_modules/lunr": { + "version": "2.3.9", + "resolved": "https://registry.npmjs.org/lunr/-/lunr-2.3.9.tgz", + "integrity": "sha512-zTU3DaZaF3Rt9rhN3uBMGQD3dD2/vFQqnvZCDv4dl5iOzq2IZQqTxu90r4E5J+nP70J3ilqVCrbho2eWaeW8Ow==", + "dev": true, + "license": "MIT" + }, + "node_modules/mark.js": { + "version": "8.11.1", + "resolved": 
"https://registry.npmjs.org/mark.js/-/mark.js-8.11.1.tgz", + "integrity": "sha512-1I+1qpDt4idfgLQG+BNWmrqku+7/2bi5nLf4YwF8y8zXvmfiTBY3PV3ZibfrjBueCByROpuBjLLFCajqkgYoLQ==", + "dev": true, + "license": "MIT" + }, + "node_modules/marked": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/marked/-/marked-4.3.0.tgz", + "integrity": "sha512-PRsaiG84bK+AMvxziE/lCFss8juXjNaWzVbN5tXAm4XjeaS9NAHhop+PjQxz2A9h8Q4M/xGmzP8vqNwy6JeK0A==", + "dev": true, + "license": "MIT", + "bin": { + "marked": "bin/marked.js" + }, + "engines": { + "node": ">= 12" + } + }, + "node_modules/math-intrinsics": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/math-intrinsics/-/math-intrinsics-1.1.0.tgz", + "integrity": "sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/mime-db": { + "version": "1.52.0", + "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.52.0.tgz", + "integrity": "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/mime-types": { + "version": "2.1.35", + "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.35.tgz", + "integrity": "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==", + "dev": true, + "license": "MIT", + "dependencies": { + "mime-db": "1.52.0" + }, + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/minimatch": { + "version": "5.1.6", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-5.1.6.tgz", + "integrity": "sha512-lKwV/1brpG6mBUFHtb7NUmtABCb2WZZmm2wNiOA5hAb8VdCS4B3dtMWyvcoViccwAW/COERjXLt0zP1zXUN26g==", + "dev": true, + "license": "ISC", + "dependencies": { + "brace-expansion": "^2.0.1" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/minimist": { + "version": "1.2.8", + "resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.8.tgz", + "integrity": "sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA==", + "dev": true, + "license": "MIT", + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/mobx": { + "version": "6.13.7", + "resolved": "https://registry.npmjs.org/mobx/-/mobx-6.13.7.tgz", + "integrity": "sha512-aChaVU/DO5aRPmk1GX8L+whocagUUpBQqoPtJk+cm7UOXUk87J4PeWCh6nNmTTIfEhiR9DI/+FnA8dln/hTK7g==", + "dev": true, + "license": "MIT", + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/mobx" + } + }, + "node_modules/mobx-react": { + "version": "9.2.0", + "resolved": "https://registry.npmjs.org/mobx-react/-/mobx-react-9.2.0.tgz", + "integrity": "sha512-dkGWCx+S0/1mfiuFfHRH8D9cplmwhxOV5CkXMp38u6rQGG2Pv3FWYztS0M7ncR6TyPRQKaTG/pnitInoYE9Vrw==", + "dev": true, + "license": "MIT", + "dependencies": { + "mobx-react-lite": "^4.1.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/mobx" + }, + "peerDependencies": { + "mobx": "^6.9.0", + "react": "^16.8.0 || ^17 || ^18 || ^19" + }, + "peerDependenciesMeta": { + "react-dom": { + "optional": true + }, + "react-native": { + "optional": true + } + } + }, + "node_modules/mobx-react-lite": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/mobx-react-lite/-/mobx-react-lite-4.1.0.tgz", + "integrity": 
"sha512-QEP10dpHHBeQNv1pks3WnHRCem2Zp636lq54M2nKO2Sarr13pL4u6diQXf65yzXUn0mkk18SyIDCm9UOJYTi1w==", + "dev": true, + "license": "MIT", + "dependencies": { + "use-sync-external-store": "^1.4.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/mobx" + }, + "peerDependencies": { + "mobx": "^6.9.0", + "react": "^16.8.0 || ^17 || ^18 || ^19" + }, + "peerDependenciesMeta": { + "react-dom": { + "optional": true + }, + "react-native": { + "optional": true + } + } + }, + "node_modules/ms": { + "version": "2.1.3", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", + "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==", + "dev": true, + "license": "MIT" + }, + "node_modules/nanoid": { + "version": "3.3.11", + "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.11.tgz", + "integrity": "sha512-N8SpfPUnUp1bK+PMYW8qSWdl9U+wwNWI4QKxOYDy9JAro3WMX7p2OeVRF9v+347pnakNevPmiHhNmZ2HbFA76w==", + "dev": true, + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "bin": { + "nanoid": "bin/nanoid.cjs" + }, + "engines": { + "node": "^10 || ^12 || ^13.7 || ^14 || >=15.0.1" + } + }, + "node_modules/neo-async": { + "version": "2.6.2", + "resolved": "https://registry.npmjs.org/neo-async/-/neo-async-2.6.2.tgz", + "integrity": "sha512-Yd3UES5mWCSqR+qNT93S3UoYUkqAZ9lLg8a7g9rimsWmYGK8cVToA4/sF3RrshdyV3sAGMXVUmpMYOw+dLpOuw==", + "dev": true, + "license": "MIT" + }, + "node_modules/node-fetch": { + "version": "2.7.0", + "resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-2.7.0.tgz", + "integrity": "sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==", + "dev": true, + "license": "MIT", + "dependencies": { + "whatwg-url": "^5.0.0" + }, + "engines": { + "node": "4.x || >=6.0.0" + }, + "peerDependencies": { + "encoding": "^0.1.0" + }, + "peerDependenciesMeta": { + "encoding": { + "optional": true + } + } + }, + "node_modules/node-fetch-h2": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/node-fetch-h2/-/node-fetch-h2-2.3.0.tgz", + "integrity": "sha512-ofRW94Ab0T4AOh5Fk8t0h8OBWrmjb0SSB20xh1H8YnPV9EJ+f5AMoYSUQ2zgJ4Iq2HAK0I2l5/Nequ8YzFS3Hg==", + "dev": true, + "license": "MIT", + "dependencies": { + "http2-client": "^1.2.5" + }, + "engines": { + "node": "4.x || >=6.0.0" + } + }, + "node_modules/node-readfiles": { + "version": "0.2.0", + "resolved": "https://registry.npmjs.org/node-readfiles/-/node-readfiles-0.2.0.tgz", + "integrity": "sha512-SU00ZarexNlE4Rjdm83vglt5Y9yiQ+XI1XpflWlb7q7UTN1JUItm69xMeiQCTxtTfnzt+83T8Cx+vI2ED++VDA==", + "dev": true, + "license": "MIT", + "dependencies": { + "es6-promise": "^3.2.1" + } + }, + "node_modules/normalize-path": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/normalize-path/-/normalize-path-3.0.0.tgz", + "integrity": "sha512-6eZs5Ls3WtCisHWp9S2GUy8dqkpGi4BVSz3GaqiE6ezub0512ESztXUwUB6C6IKbQkY2Pnb/mD4WYojCRwcwLA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/oas-kit-common": { + "version": "1.0.8", + "resolved": "https://registry.npmjs.org/oas-kit-common/-/oas-kit-common-1.0.8.tgz", + "integrity": "sha512-pJTS2+T0oGIwgjGpw7sIRU8RQMcUoKCDWFLdBqKB2BNmGpbBMH2sdqAaOXUg8OzonZHU0L7vfJu1mJFEiYDWOQ==", + "dev": true, + "license": "BSD-3-Clause", + "dependencies": { + "fast-safe-stringify": "^2.0.7" + } + }, + "node_modules/oas-linter": { + "version": "3.2.2", + "resolved": 
"https://registry.npmjs.org/oas-linter/-/oas-linter-3.2.2.tgz", + "integrity": "sha512-KEGjPDVoU5K6swgo9hJVA/qYGlwfbFx+Kg2QB/kd7rzV5N8N5Mg6PlsoCMohVnQmo+pzJap/F610qTodKzecGQ==", + "dev": true, + "license": "BSD-3-Clause", + "dependencies": { + "@exodus/schemasafe": "^1.0.0-rc.2", + "should": "^13.2.1", + "yaml": "^1.10.0" + }, + "funding": { + "url": "https://github.com/Mermade/oas-kit?sponsor=1" + } + }, + "node_modules/oas-resolver": { + "version": "2.5.6", + "resolved": "https://registry.npmjs.org/oas-resolver/-/oas-resolver-2.5.6.tgz", + "integrity": "sha512-Yx5PWQNZomfEhPPOphFbZKi9W93CocQj18NlD2Pa4GWZzdZpSJvYwoiuurRI7m3SpcChrnO08hkuQDL3FGsVFQ==", + "dev": true, + "license": "BSD-3-Clause", + "dependencies": { + "node-fetch-h2": "^2.3.0", + "oas-kit-common": "^1.0.8", + "reftools": "^1.1.9", + "yaml": "^1.10.0", + "yargs": "^17.0.1" + }, + "bin": { + "resolve": "resolve.js" + }, + "funding": { + "url": "https://github.com/Mermade/oas-kit?sponsor=1" + } + }, + "node_modules/oas-schema-walker": { + "version": "1.1.5", + "resolved": "https://registry.npmjs.org/oas-schema-walker/-/oas-schema-walker-1.1.5.tgz", + "integrity": "sha512-2yucenq1a9YPmeNExoUa9Qwrt9RFkjqaMAA1X+U7sbb0AqBeTIdMHky9SQQ6iN94bO5NW0W4TRYXerG+BdAvAQ==", + "dev": true, + "license": "BSD-3-Clause", + "funding": { + "url": "https://github.com/Mermade/oas-kit?sponsor=1" + } + }, + "node_modules/oas-validator": { + "version": "5.0.8", + "resolved": "https://registry.npmjs.org/oas-validator/-/oas-validator-5.0.8.tgz", + "integrity": "sha512-cu20/HE5N5HKqVygs3dt94eYJfBi0TsZvPVXDhbXQHiEityDN+RROTleefoKRKKJ9dFAF2JBkDHgvWj0sjKGmw==", + "dev": true, + "license": "BSD-3-Clause", + "dependencies": { + "call-me-maybe": "^1.0.1", + "oas-kit-common": "^1.0.8", + "oas-linter": "^3.2.2", + "oas-resolver": "^2.5.6", + "oas-schema-walker": "^1.1.5", + "reftools": "^1.1.9", + "should": "^13.2.1", + "yaml": "^1.10.0" + }, + "funding": { + "url": "https://github.com/Mermade/oas-kit?sponsor=1" + } + }, + "node_modules/object-assign": { + "version": "4.1.1", + "resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz", + "integrity": "sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/once": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz", + "integrity": "sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w==", + "dev": true, + "license": "ISC", + "dependencies": { + "wrappy": "1" + } + }, + "node_modules/open": { + "version": "10.2.0", + "resolved": "https://registry.npmjs.org/open/-/open-10.2.0.tgz", + "integrity": "sha512-YgBpdJHPyQ2UE5x+hlSXcnejzAvD0b22U2OuAP+8OnlJT+PjWPxtgmGqKKc+RgTM63U9gN0YzrYc71R2WT/hTA==", + "dev": true, + "license": "MIT", + "dependencies": { + "default-browser": "^5.2.1", + "define-lazy-prop": "^3.0.0", + "is-inside-container": "^1.0.0", + "wsl-utils": "^0.1.0" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/openapi-sampler": { + "version": "1.6.1", + "resolved": "https://registry.npmjs.org/openapi-sampler/-/openapi-sampler-1.6.1.tgz", + "integrity": "sha512-s1cIatOqrrhSj2tmJ4abFYZQK6l5v+V4toO5q1Pa0DyN8mtyqy2I+Qrj5W9vOELEtybIMQs/TBZGVO/DtTFK8w==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/json-schema": "^7.0.7", + "fast-xml-parser": "^4.5.0", + "json-pointer": "0.6.2" + 
} + }, + "node_modules/outdent": { + "version": "0.8.0", + "resolved": "https://registry.npmjs.org/outdent/-/outdent-0.8.0.tgz", + "integrity": "sha512-KiOAIsdpUTcAXuykya5fnVVT+/5uS0Q1mrkRHcF89tpieSmY33O/tmc54CqwA+bfhbtEfZUNLHaPUiB9X3jt1A==", + "dev": true, + "license": "MIT" + }, + "node_modules/path-browserify": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/path-browserify/-/path-browserify-1.0.1.tgz", + "integrity": "sha512-b7uo2UCUOYZcnF/3ID0lulOJi/bafxa1xPe7ZPsammBSpjSWQkjNxlt635YGS2MiR9GjvuXCtz2emr3jbsz98g==", + "dev": true, + "license": "MIT" + }, + "node_modules/path-is-absolute": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/path-is-absolute/-/path-is-absolute-1.0.1.tgz", + "integrity": "sha512-AVbw3UJ2e9bq64vSaS9Am0fje1Pa8pbGqTTsmXfaIiMpnr5DlDhfJOuLj9Sf95ZPVDAUerDfEk88MPmPe7UCQg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/perfect-scrollbar": { + "version": "1.5.6", + "resolved": "https://registry.npmjs.org/perfect-scrollbar/-/perfect-scrollbar-1.5.6.tgz", + "integrity": "sha512-rixgxw3SxyJbCaSpo1n35A/fwI1r2rdwMKOTCg/AcG+xOEyZcE8UHVjpZMFCVImzsFoCZeJTT+M/rdEIQYO2nw==", + "dev": true, + "license": "MIT" + }, + "node_modules/picocolors": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.1.1.tgz", + "integrity": "sha512-xceH2snhtb5M9liqDsmEw56le376mTZkEX/jEb/RxNFyegNul7eNslCXP9FDj/Lcu0X8KEyMceP2ntpaHrDEVA==", + "dev": true, + "license": "ISC" + }, + "node_modules/picomatch": { + "version": "2.3.1", + "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-2.3.1.tgz", + "integrity": "sha512-JU3teHTNjmE2VCGFzuY8EXzCDVwEqB2a8fsIvwaStHhAWJEeVd1o1QD80CU6+ZdEXXSLbSsuLwJjkCBWqRQUVA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=8.6" + }, + "funding": { + "url": "https://github.com/sponsors/jonschlinkert" + } + }, + "node_modules/pluralize": { + "version": "8.0.0", + "resolved": "https://registry.npmjs.org/pluralize/-/pluralize-8.0.0.tgz", + "integrity": "sha512-Nc3IT5yHzflTfbjgqWcCPpo7DaKy4FnpB0l/zCAW0Tc7jxAiuqSxHasntB3D7887LSrA93kDJ9IXovxJYxyLCA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=4" + } + }, + "node_modules/polished": { + "version": "4.3.1", + "resolved": "https://registry.npmjs.org/polished/-/polished-4.3.1.tgz", + "integrity": "sha512-OBatVyC/N7SCW/FaDHrSd+vn0o5cS855TOmYi4OkdWUMSJCET/xip//ch8xGUvtr3i44X9LVyWwQlRMTN3pwSA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/runtime": "^7.17.8" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/postcss": { + "version": "8.4.49", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-8.4.49.tgz", + "integrity": "sha512-OCVPnIObs4N29kxTjzLfUryOkvZEq+pf8jTF0lg8E7uETuWHA+v7j3c/xJmiqpX450191LlmZfUKkXxkTry7nA==", + "dev": true, + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/postcss/" + }, + { + "type": "tidelift", + "url": "https://tidelift.com/funding/github/npm/postcss" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "dependencies": { + "nanoid": "^3.3.7", + "picocolors": "^1.1.1", + "source-map-js": "^1.2.1" + }, + "engines": { + "node": "^10 || ^12 || >=14" + } + }, + "node_modules/postcss-value-parser": { + "version": "4.2.0", + "resolved": "https://registry.npmjs.org/postcss-value-parser/-/postcss-value-parser-4.2.0.tgz", + "integrity": 
"sha512-1NNCs6uurfkVbeXG4S8JFT9t19m45ICnif8zWLd5oPSZ50QnwMfK+H3jv408d4jw/7Bttv5axS5IiHoLaVNHeQ==", + "dev": true, + "license": "MIT" + }, + "node_modules/pretty-format": { + "version": "29.7.0", + "resolved": "https://registry.npmjs.org/pretty-format/-/pretty-format-29.7.0.tgz", + "integrity": "sha512-Pdlw/oPxN+aXdmM9R00JVC9WVFoCLTKJvDVLgmJ+qAffBMxsV85l/Lu7sNx4zSzPyoL2euImuEwHhOXdEgNFZQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@jest/schemas": "^29.6.3", + "ansi-styles": "^5.0.0", + "react-is": "^18.0.0" + }, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + } + }, + "node_modules/pretty-format/node_modules/ansi-styles": { + "version": "5.2.0", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-5.2.0.tgz", + "integrity": "sha512-Cxwpt2SfTzTtXcfOlzGEee8O+c+MmUgGrNiBcXnuWxuFJHe6a5Hz7qwhwe5OgaSYI0IJvkLqWX1ASG+cJOkEiA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, + "node_modules/prismjs": { + "version": "1.30.0", + "resolved": "https://registry.npmjs.org/prismjs/-/prismjs-1.30.0.tgz", + "integrity": "sha512-DEvV2ZF2r2/63V+tK8hQvrR2ZGn10srHbXviTlcv7Kpzw8jWiNTqbVgjO3IY8RxrrOUF8VPMQQFysYYYv0YZxw==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6" + } + }, + "node_modules/prop-types": { + "version": "15.8.1", + "resolved": "https://registry.npmjs.org/prop-types/-/prop-types-15.8.1.tgz", + "integrity": "sha512-oj87CgZICdulUohogVAR7AjlC0327U4el4L6eAvOqCeudMDVU0NThNaV+b9Df4dXgSP1gXMTnPdhfe/2qDH5cg==", + "dev": true, + "license": "MIT", + "dependencies": { + "loose-envify": "^1.4.0", + "object-assign": "^4.1.1", + "react-is": "^16.13.1" + } + }, + "node_modules/prop-types/node_modules/react-is": { + "version": "16.13.1", + "resolved": "https://registry.npmjs.org/react-is/-/react-is-16.13.1.tgz", + "integrity": "sha512-24e6ynE2H+OKt4kqsOvNd8kBpV65zoxbA4BVsEOB3ARVWQki/DHzaUoC5KuON/BiccDaCCTZBuOcfZs70kR8bQ==", + "dev": true, + "license": "MIT" + }, + "node_modules/protobufjs": { + "version": "7.5.3", + "resolved": "https://registry.npmjs.org/protobufjs/-/protobufjs-7.5.3.tgz", + "integrity": "sha512-sildjKwVqOI2kmFDiXQ6aEB0fjYTafpEvIBs8tOR8qI4spuL9OPROLVu2qZqi/xgCfsHIwVqlaF8JBjWFHnKbw==", + "dev": true, + "hasInstallScript": true, + "license": "BSD-3-Clause", + "dependencies": { + "@protobufjs/aspromise": "^1.1.2", + "@protobufjs/base64": "^1.1.2", + "@protobufjs/codegen": "^2.0.4", + "@protobufjs/eventemitter": "^1.1.0", + "@protobufjs/fetch": "^1.1.0", + "@protobufjs/float": "^1.0.2", + "@protobufjs/inquire": "^1.1.0", + "@protobufjs/path": "^1.1.2", + "@protobufjs/pool": "^1.1.0", + "@protobufjs/utf8": "^1.1.0", + "@types/node": ">=13.7.0", + "long": "^5.0.0" + }, + "engines": { + "node": ">=12.0.0" + } + }, + "node_modules/queue-microtask": { + "version": "1.2.3", + "resolved": "https://registry.npmjs.org/queue-microtask/-/queue-microtask-1.2.3.tgz", + "integrity": "sha512-NuaNSa6flKT5JaSYQzJok04JzTL1CA6aGhv5rfLW3PgqA+M2ChpZQnAC8h8i4ZFkBS8X5RqkDBHA7r4hej3K9A==", + "dev": true, + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "license": "MIT" + }, + "node_modules/randombytes": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/randombytes/-/randombytes-2.1.0.tgz", + "integrity": 
"sha512-vYl3iOX+4CKUWuxGi9Ukhie6fsqXqS9FE2Zaic4tNFD2N2QQaXOMFbuKK4QmDHC0JO6B1Zp41J0LpT0oR68amQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "safe-buffer": "^5.1.0" + } + }, + "node_modules/react": { + "version": "19.1.0", + "resolved": "https://registry.npmjs.org/react/-/react-19.1.0.tgz", + "integrity": "sha512-FS+XFBNvn3GTAWq26joslQgWNoFu08F4kl0J4CgdNKADkdSGXQyTCnKteIAJy96Br6YbpEU1LSzV5dYtjMkMDg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/react-dom": { + "version": "19.1.0", + "resolved": "https://registry.npmjs.org/react-dom/-/react-dom-19.1.0.tgz", + "integrity": "sha512-Xs1hdnE+DyKgeHJeJznQmYMIBG3TKIHJJT95Q58nHLSrElKlGQqDTR2HQ9fx5CN/Gk6Vh/kupBTDLU11/nDk/g==", + "dev": true, + "license": "MIT", + "dependencies": { + "scheduler": "^0.26.0" + }, + "peerDependencies": { + "react": "^19.1.0" + } + }, + "node_modules/react-is": { + "version": "18.3.1", + "resolved": "https://registry.npmjs.org/react-is/-/react-is-18.3.1.tgz", + "integrity": "sha512-/LLMVyas0ljjAtoYiPqYiL8VWXzUUdThrmU5+n20DZv+a+ClRoevUzw5JxU+Ieh5/c87ytoTBV9G1FiKfNJdmg==", + "dev": true, + "license": "MIT" + }, + "node_modules/react-tabs": { + "version": "6.1.0", + "resolved": "https://registry.npmjs.org/react-tabs/-/react-tabs-6.1.0.tgz", + "integrity": "sha512-6QtbTRDKM+jA/MZTTefvigNxo0zz+gnBTVFw2CFVvq+f2BuH0nF0vDLNClL045nuTAdOoK/IL1vTP0ZLX0DAyQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "clsx": "^2.0.0", + "prop-types": "^15.5.0" + }, + "peerDependencies": { + "react": "^18.0.0 || ^19.0.0" + } + }, + "node_modules/readable-stream": { + "version": "3.6.2", + "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-3.6.2.tgz", + "integrity": "sha512-9u/sniCrY3D5WdsERHzHE4G2YCXqoG5FTHUiCC4SIbr6XcLZBY05ya9EKjYek9O5xOAwjGq+1JdGBAS7Q9ScoA==", + "dev": true, + "license": "MIT", + "dependencies": { + "inherits": "^2.0.3", + "string_decoder": "^1.1.1", + "util-deprecate": "^1.0.1" + }, + "engines": { + "node": ">= 6" + } + }, + "node_modules/readdirp": { + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/readdirp/-/readdirp-3.6.0.tgz", + "integrity": "sha512-hOS089on8RduqdbhvQ5Z37A0ESjsqz6qnRcffsMU3495FuTdqSm+7bhJ29JvIOsBDEEnan5DPu9t3To9VRlMzA==", + "dev": true, + "license": "MIT", + "dependencies": { + "picomatch": "^2.2.1" + }, + "engines": { + "node": ">=8.10.0" + } + }, + "node_modules/redoc": { + "version": "2.5.0", + "resolved": "https://registry.npmjs.org/redoc/-/redoc-2.5.0.tgz", + "integrity": "sha512-NpYsOZ1PD9qFdjbLVBZJWptqE+4Y6TkUuvEOqPUmoH7AKOmPcE+hYjotLxQNTqVoWL4z0T2uxILmcc8JGDci+Q==", + "dev": true, + "license": "MIT", + "dependencies": { + "@redocly/openapi-core": "^1.4.0", + "classnames": "^2.3.2", + "decko": "^1.2.0", + "dompurify": "^3.2.4", + "eventemitter3": "^5.0.1", + "json-pointer": "^0.6.2", + "lunr": "^2.3.9", + "mark.js": "^8.11.1", + "marked": "^4.3.0", + "mobx-react": "^9.1.1", + "openapi-sampler": "^1.5.0", + "path-browserify": "^1.0.1", + "perfect-scrollbar": "^1.5.5", + "polished": "^4.2.2", + "prismjs": "^1.29.0", + "prop-types": "^15.8.1", + "react-tabs": "^6.0.2", + "slugify": "~1.4.7", + "stickyfill": "^1.1.1", + "swagger2openapi": "^7.0.8", + "url-template": "^2.0.8" + }, + "engines": { + "node": ">=6.9", + "npm": ">=3.0.0" + }, + "peerDependencies": { + "core-js": "^3.1.4", + "mobx": "^6.0.4", + "react": "^16.8.4 || ^17.0.0 || ^18.0.0 || ^19.0.0", + "react-dom": "^16.8.4 || ^17.0.0 || ^18.0.0 || ^19.0.0", + "styled-components": "^4.1.1 || ^5.1.1 || ^6.0.5" + } + }, + 
"node_modules/reftools": { + "version": "1.1.9", + "resolved": "https://registry.npmjs.org/reftools/-/reftools-1.1.9.tgz", + "integrity": "sha512-OVede/NQE13xBQ+ob5CKd5KyeJYU2YInb1bmV4nRoOfquZPkAkxuOXicSe1PvqIuZZ4kD13sPKBbR7UFDmli6w==", + "dev": true, + "license": "BSD-3-Clause", + "funding": { + "url": "https://github.com/Mermade/oas-kit?sponsor=1" + } + }, + "node_modules/require-directory": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/require-directory/-/require-directory-2.1.1.tgz", + "integrity": "sha512-fGxEI7+wsG9xrvdjsrlmL22OMTTiHRwAMroiEeMgq8gzoLC/PQr7RsRDSTLUg/bZAZtF+TVIkHc6/4RIKrui+Q==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/require-from-string": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/require-from-string/-/require-from-string-2.0.2.tgz", + "integrity": "sha512-Xf0nWe6RseziFMu+Ap9biiUbmplq6S9/p+7w7YXP/JBHhrUDDUhwa+vANyubuqfZWTveU//DYVGsDG7RKL/vEw==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/run-applescript": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/run-applescript/-/run-applescript-7.0.0.tgz", + "integrity": "sha512-9by4Ij99JUr/MCFBUkDKLWK3G9HVXmabKz9U5MlIAIuvuzkiOicRYs8XJLxX+xahD+mLiiCYDqF9dKAgtzKP1A==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/safe-buffer": { + "version": "5.2.1", + "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz", + "integrity": "sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==", + "dev": true, + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "license": "MIT" + }, + "node_modules/scheduler": { + "version": "0.26.0", + "resolved": "https://registry.npmjs.org/scheduler/-/scheduler-0.26.0.tgz", + "integrity": "sha512-NlHwttCI/l5gCPR3D1nNXtWABUmBwvZpEQiD4IXSbIDq8BzLIK/7Ir5gTFSGZDUu37K5cMNp0hFtzO38sC7gWA==", + "dev": true, + "license": "MIT" + }, + "node_modules/semver": { + "version": "7.7.2", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.7.2.tgz", + "integrity": "sha512-RF0Fw+rO5AMf9MAyaRXI4AV0Ulj5lMHqVxxdSgiVbixSCXoEmmX/jk0CuJw4+3SqroYO9VoUh+HcuJivvtJemA==", + "dev": true, + "license": "ISC", + "bin": { + "semver": "bin/semver.js" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/set-cookie-parser": { + "version": "2.7.1", + "resolved": "https://registry.npmjs.org/set-cookie-parser/-/set-cookie-parser-2.7.1.tgz", + "integrity": "sha512-IOc8uWeOZgnb3ptbCURJWNjWUPcO3ZnTTdzsurqERrP6nPyv+paC55vJM0LpOlT2ne+Ix+9+CRG1MNLlyZ4GjQ==", + "dev": true, + "license": "MIT" + }, + "node_modules/shallowequal": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/shallowequal/-/shallowequal-1.1.0.tgz", + "integrity": "sha512-y0m1JoUZSlPAjXVtPPW70aZWfIL/dSP7AFkRnniLCrK/8MDKog3TySTBmckD+RObVxH0v4Tox67+F14PdED2oQ==", + "dev": true, + "license": "MIT" + }, + "node_modules/should": { + "version": "13.2.3", + "resolved": "https://registry.npmjs.org/should/-/should-13.2.3.tgz", + "integrity": "sha512-ggLesLtu2xp+ZxI+ysJTmNjh2U0TsC+rQ/pfED9bUZZ4DKefP27D+7YJVVTvKsmjLpIi9jAa7itwDGkDDmt1GQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "should-equal": "^2.0.0", + 
"should-format": "^3.0.3", + "should-type": "^1.4.0", + "should-type-adaptors": "^1.0.1", + "should-util": "^1.0.0" + } + }, + "node_modules/should-equal": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/should-equal/-/should-equal-2.0.0.tgz", + "integrity": "sha512-ZP36TMrK9euEuWQYBig9W55WPC7uo37qzAEmbjHz4gfyuXrEUgF8cUvQVO+w+d3OMfPvSRQJ22lSm8MQJ43LTA==", + "dev": true, + "license": "MIT", + "dependencies": { + "should-type": "^1.4.0" + } + }, + "node_modules/should-format": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/should-format/-/should-format-3.0.3.tgz", + "integrity": "sha512-hZ58adtulAk0gKtua7QxevgUaXTTXxIi8t41L3zo9AHvjXO1/7sdLECuHeIN2SRtYXpNkmhoUP2pdeWgricQ+Q==", + "dev": true, + "license": "MIT", + "dependencies": { + "should-type": "^1.3.0", + "should-type-adaptors": "^1.0.1" + } + }, + "node_modules/should-type": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/should-type/-/should-type-1.4.0.tgz", + "integrity": "sha512-MdAsTu3n25yDbIe1NeN69G4n6mUnJGtSJHygX3+oN0ZbO3DTiATnf7XnYJdGT42JCXurTb1JI0qOBR65shvhPQ==", + "dev": true, + "license": "MIT" + }, + "node_modules/should-type-adaptors": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/should-type-adaptors/-/should-type-adaptors-1.1.0.tgz", + "integrity": "sha512-JA4hdoLnN+kebEp2Vs8eBe9g7uy0zbRo+RMcU0EsNy+R+k049Ki+N5tT5Jagst2g7EAja+euFuoXFCa8vIklfA==", + "dev": true, + "license": "MIT", + "dependencies": { + "should-type": "^1.3.0", + "should-util": "^1.0.0" + } + }, + "node_modules/should-util": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/should-util/-/should-util-1.0.1.tgz", + "integrity": "sha512-oXF8tfxx5cDk8r2kYqlkUJzZpDBqVY/II2WhvU0n9Y3XYvAYRmeaf1PvvIvTgPnv4KJ+ES5M0PyDq5Jp+Ygy2g==", + "dev": true, + "license": "MIT" + }, + "node_modules/simple-websocket": { + "version": "9.1.0", + "resolved": "https://registry.npmjs.org/simple-websocket/-/simple-websocket-9.1.0.tgz", + "integrity": "sha512-8MJPnjRN6A8UCp1I+H/dSFyjwJhp6wta4hsVRhjf8w9qBHRzxYt14RaOcjvQnhD1N4yKOddEjflwMnQM4VtXjQ==", + "dev": true, + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "license": "MIT", + "dependencies": { + "debug": "^4.3.1", + "queue-microtask": "^1.2.2", + "randombytes": "^2.1.0", + "readable-stream": "^3.6.0", + "ws": "^7.4.2" + } + }, + "node_modules/slugify": { + "version": "1.4.7", + "resolved": "https://registry.npmjs.org/slugify/-/slugify-1.4.7.tgz", + "integrity": "sha512-tf+h5W1IrjNm/9rKKj0JU2MDMruiopx0jjVA5zCdBtcGjfp0+c5rHw/zADLC3IeKlGHtVbHtpfzvYA0OYT+HKg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=8.0.0" + } + }, + "node_modules/source-map": { + "version": "0.6.1", + "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz", + "integrity": "sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g==", + "dev": true, + "license": "BSD-3-Clause", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/source-map-js": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/source-map-js/-/source-map-js-1.2.1.tgz", + "integrity": "sha512-UXWMKhLOwVKb728IUtQPXxfYU+usdybtUrK/8uGE8CQMvrhOpwvzDBwj0QhSL7MQc7vIsISBG8VQ8+IDQxpfQA==", + "dev": true, + "license": "BSD-3-Clause", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/stickyfill": { + "version": "1.1.1", + 
"resolved": "https://registry.npmjs.org/stickyfill/-/stickyfill-1.1.1.tgz", + "integrity": "sha512-GCp7vHAfpao+Qh/3Flh9DXEJ/qSi0KJwJw6zYlZOtRYXWUIpMM6mC2rIep/dK8RQqwW0KxGJIllmjPIBOGN8AA==", + "dev": true + }, + "node_modules/string_decoder": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.3.0.tgz", + "integrity": "sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA==", + "dev": true, + "license": "MIT", + "dependencies": { + "safe-buffer": "~5.2.0" + } + }, + "node_modules/string-width": { + "version": "4.2.3", + "resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz", + "integrity": "sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==", + "dev": true, + "license": "MIT", + "dependencies": { + "emoji-regex": "^8.0.0", + "is-fullwidth-code-point": "^3.0.0", + "strip-ansi": "^6.0.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/strip-ansi": { + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz", + "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==", + "dev": true, + "license": "MIT", + "dependencies": { + "ansi-regex": "^5.0.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/strnum": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/strnum/-/strnum-1.1.2.tgz", + "integrity": "sha512-vrN+B7DBIoTTZjnPNewwhx6cBA/H+IS7rfW68n7XxC1y7uoiGQBxaKzqucGUgavX15dJgiGztLJ8vxuEzwqBdA==", + "dev": true, + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/NaturalIntelligence" + } + ], + "license": "MIT" + }, + "node_modules/styled-components": { + "version": "6.1.19", + "resolved": "https://registry.npmjs.org/styled-components/-/styled-components-6.1.19.tgz", + "integrity": "sha512-1v/e3Dl1BknC37cXMhwGomhO8AkYmN41CqyX9xhUDxry1ns3BFQy2lLDRQXJRdVVWB9OHemv/53xaStimvWyuA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@emotion/is-prop-valid": "1.2.2", + "@emotion/unitless": "0.8.1", + "@types/stylis": "4.2.5", + "css-to-react-native": "3.2.0", + "csstype": "3.1.3", + "postcss": "8.4.49", + "shallowequal": "1.1.0", + "stylis": "4.3.2", + "tslib": "2.6.2" + }, + "engines": { + "node": ">= 16" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/styled-components" + }, + "peerDependencies": { + "react": ">= 16.8.0", + "react-dom": ">= 16.8.0" + } + }, + "node_modules/stylis": { + "version": "4.3.2", + "resolved": "https://registry.npmjs.org/stylis/-/stylis-4.3.2.tgz", + "integrity": "sha512-bhtUjWd/z6ltJiQwg0dUfxEJ+W+jdqQd8TbWLWyeIJHlnsqmGLRFFd8e5mA0AZi/zx90smXRlN66YMTcaSFifg==", + "dev": true, + "license": "MIT" + }, + "node_modules/supports-color": { + "version": "7.2.0", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz", + "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==", + "dev": true, + "license": "MIT", + "dependencies": { + "has-flag": "^4.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/swagger2openapi": { + "version": "7.0.8", + "resolved": "https://registry.npmjs.org/swagger2openapi/-/swagger2openapi-7.0.8.tgz", + "integrity": "sha512-upi/0ZGkYgEcLeGieoz8gT74oWHA0E7JivX7aN9mAf+Tc7BQoRBvnIGHoPDw+f9TXTW4s6kGYCZJtauP6OYp7g==", + "dev": true, + "license": "BSD-3-Clause", + "dependencies": { + "call-me-maybe": "^1.0.1", + 
"node-fetch": "^2.6.1", + "node-fetch-h2": "^2.3.0", + "node-readfiles": "^0.2.0", + "oas-kit-common": "^1.0.8", + "oas-resolver": "^2.5.6", + "oas-schema-walker": "^1.1.5", + "oas-validator": "^5.0.8", + "reftools": "^1.1.9", + "yaml": "^1.10.0", + "yargs": "^17.0.1" + }, + "bin": { + "boast": "boast.js", + "oas-validate": "oas-validate.js", + "swagger2openapi": "swagger2openapi.js" + }, + "funding": { + "url": "https://github.com/Mermade/oas-kit?sponsor=1" + } + }, + "node_modules/to-regex-range": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/to-regex-range/-/to-regex-range-5.0.1.tgz", + "integrity": "sha512-65P7iz6X5yEr1cwcgvQxbbIw7Uk3gOy5dIdtZ4rDveLqhrdJP+Li/Hx6tyK0NEb+2GCyneCMJiGqrADCSNk8sQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "is-number": "^7.0.0" + }, + "engines": { + "node": ">=8.0" + } + }, + "node_modules/tr46": { + "version": "0.0.3", + "resolved": "https://registry.npmjs.org/tr46/-/tr46-0.0.3.tgz", + "integrity": "sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==", + "dev": true, + "license": "MIT" + }, + "node_modules/tslib": { + "version": "2.6.2", + "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.6.2.tgz", + "integrity": "sha512-AEYxH93jGFPn/a2iVAwW87VuUIkR1FVUKB77NwMF7nBTDkDrrT/Hpt/IrCJ0QXhW27jTBDcf5ZY7w6RiqTMw2Q==", + "dev": true, + "license": "0BSD" + }, + "node_modules/typedarray": { + "version": "0.0.6", + "resolved": "https://registry.npmjs.org/typedarray/-/typedarray-0.0.6.tgz", + "integrity": "sha512-/aCDEGatGvZ2BIk+HmLf4ifCJFwvKFNb9/JeZPMulfgFracn9QFcAf5GO8B/mweUjSoblS5In0cWhqpfs/5PQA==", + "dev": true, + "license": "MIT" + }, + "node_modules/uglify-js": { + "version": "3.19.3", + "resolved": "https://registry.npmjs.org/uglify-js/-/uglify-js-3.19.3.tgz", + "integrity": "sha512-v3Xu+yuwBXisp6QYTcH4UbH+xYJXqnq2m/LtQVWKWzYc1iehYnLixoQDN9FH6/j9/oybfd6W9Ghwkl8+UMKTKQ==", + "dev": true, + "license": "BSD-2-Clause", + "optional": true, + "bin": { + "uglifyjs": "bin/uglifyjs" + }, + "engines": { + "node": ">=0.8.0" + } + }, + "node_modules/undici": { + "version": "6.21.3", + "resolved": "https://registry.npmjs.org/undici/-/undici-6.21.3.tgz", + "integrity": "sha512-gBLkYIlEnSp8pFbT64yFgGE6UIB9tAkhukC23PmMDCe5Nd+cRqKxSjw5y54MK2AZMgZfJWMaNE4nYUHgi1XEOw==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=18.17" + } + }, + "node_modules/undici-types": { + "version": "7.8.0", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.8.0.tgz", + "integrity": "sha512-9UJ2xGDvQ43tYyVMpuHlsgApydB8ZKfVYTsLDhXkFL/6gfkp+U8xTGdh8pMJv1SpZna0zxG1DwsKZsreLbXBxw==", + "dev": true, + "license": "MIT" + }, + "node_modules/uri-js-replace": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/uri-js-replace/-/uri-js-replace-1.0.1.tgz", + "integrity": "sha512-W+C9NWNLFOoBI2QWDp4UT9pv65r2w5Cx+3sTYFvtMdDBxkKt1syCqsUdSFAChbEe1uK5TfS04wt/nGwmaeIQ0g==", + "dev": true, + "license": "MIT" + }, + "node_modules/url-template": { + "version": "2.0.8", + "resolved": "https://registry.npmjs.org/url-template/-/url-template-2.0.8.tgz", + "integrity": "sha512-XdVKMF4SJ0nP/O7XIPB0JwAEuT9lDIYnNsK8yGVe43y0AWoKeJNdv3ZNWh7ksJ6KqQFjOO6ox/VEitLnaVNufw==", + "dev": true, + "license": "BSD" + }, + "node_modules/use-sync-external-store": { + "version": "1.5.0", + "resolved": "https://registry.npmjs.org/use-sync-external-store/-/use-sync-external-store-1.5.0.tgz", + "integrity": "sha512-Rb46I4cGGVBmjamjphe8L/UnvJD+uPPtTkNvX5mZgqdbavhI4EbgIWJiIHXJ8bc/i9EQGPRh4DwEURJ552Do0A==", + "dev": 
true, + "license": "MIT", + "peerDependencies": { + "react": "^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0" + } + }, + "node_modules/util-deprecate": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz", + "integrity": "sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==", + "dev": true, + "license": "MIT" + }, + "node_modules/webidl-conversions": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-3.0.1.tgz", + "integrity": "sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ==", + "dev": true, + "license": "BSD-2-Clause" + }, + "node_modules/whatwg-url": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-5.0.0.tgz", + "integrity": "sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw==", + "dev": true, + "license": "MIT", + "dependencies": { + "tr46": "~0.0.3", + "webidl-conversions": "^3.0.0" + } + }, + "node_modules/wordwrap": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/wordwrap/-/wordwrap-1.0.0.tgz", + "integrity": "sha512-gvVzJFlPycKc5dZN4yPkP8w7Dc37BtP1yczEneOb4uq34pXZcvrtRTmWV8W+Ume+XCxKgbjM+nevkyFPMybd4Q==", + "dev": true, + "license": "MIT" + }, + "node_modules/wrap-ansi": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-7.0.0.tgz", + "integrity": "sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==", + "dev": true, + "license": "MIT", + "dependencies": { + "ansi-styles": "^4.0.0", + "string-width": "^4.1.0", + "strip-ansi": "^6.0.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/chalk/wrap-ansi?sponsor=1" + } + }, + "node_modules/wrappy": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz", + "integrity": "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==", + "dev": true, + "license": "ISC" + }, + "node_modules/ws": { + "version": "7.5.10", + "resolved": "https://registry.npmjs.org/ws/-/ws-7.5.10.tgz", + "integrity": "sha512-+dbF1tHwZpXcbOJdVOkzLDxZP1ailvSxM6ZweXTegylPny803bFhA+vqBYw4s31NSAk4S2Qz+AKXK9a4wkdjcQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=8.3.0" + }, + "peerDependencies": { + "bufferutil": "^4.0.1", + "utf-8-validate": "^5.0.2" + }, + "peerDependenciesMeta": { + "bufferutil": { + "optional": true + }, + "utf-8-validate": { + "optional": true + } + } + }, + "node_modules/wsl-utils": { + "version": "0.1.0", + "resolved": "https://registry.npmjs.org/wsl-utils/-/wsl-utils-0.1.0.tgz", + "integrity": "sha512-h3Fbisa2nKGPxCpm89Hk33lBLsnaGBvctQopaBSOW/uIs6FTe1ATyAnKFJrzVs9vpGdsTe73WF3V4lIsk4Gacw==", + "dev": true, + "license": "MIT", + "dependencies": { + "is-wsl": "^3.1.0" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/y18n": { + "version": "5.0.8", + "resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.8.tgz", + "integrity": "sha512-0pfFzegeDWJHJIAmTLRP2DwHjdF5s7jo9tuztdQxAhINCdvS+3nGINqPd00AphqJR/0LhANUS6/+7SCb98YOfA==", + "dev": true, + "license": "ISC", + "engines": { + "node": ">=10" + } + }, + "node_modules/yaml": { + "version": "1.10.2", + "resolved": "https://registry.npmjs.org/yaml/-/yaml-1.10.2.tgz", + "integrity": 
"sha512-r3vXyErRCYJ7wg28yvBY5VSoAF8ZvlcW9/BwUzEtUsjvX/DKs24dIkuwjtuprwJJHsbyUbLApepYTR1BN4uHrg==", + "dev": true, + "license": "ISC", + "engines": { + "node": ">= 6" + } + }, + "node_modules/yaml-ast-parser": { + "version": "0.0.43", + "resolved": "https://registry.npmjs.org/yaml-ast-parser/-/yaml-ast-parser-0.0.43.tgz", + "integrity": "sha512-2PTINUwsRqSd+s8XxKaJWQlUuEMHJQyEuh2edBbW8KNJz0SJPwUSD2zRWqezFEdN7IzAgeuYHFUCF7o8zRdZ0A==", + "dev": true, + "license": "Apache-2.0" + }, + "node_modules/yargs": { + "version": "17.0.1", + "resolved": "https://registry.npmjs.org/yargs/-/yargs-17.0.1.tgz", + "integrity": "sha512-xBBulfCc8Y6gLFcrPvtqKz9hz8SO0l1Ni8GgDekvBX2ro0HRQImDGnikfc33cgzcYUSncapnNcZDjVFIH3f6KQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "cliui": "^7.0.2", + "escalade": "^3.1.1", + "get-caller-file": "^2.0.5", + "require-directory": "^2.1.1", + "string-width": "^4.2.0", + "y18n": "^5.0.5", + "yargs-parser": "^20.2.2" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/yargs-parser": { + "version": "20.2.9", + "resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-20.2.9.tgz", + "integrity": "sha512-y11nGElTIV+CT3Zv9t7VKl+Q3hTQoT9a1Qzezhhl6Rp21gJ/IVTW7Z3y9EWXhuUBC2Shnf+DX0antecpAwSP8w==", + "dev": true, + "license": "ISC", + "engines": { + "node": ">=10" + } + } + } +} diff --git a/build-tools/package.json b/build-tools/package.json new file mode 100644 index 0000000000..000969c672 --- /dev/null +++ b/build-tools/package.json @@ -0,0 +1,8 @@ +{ + "name": "build-tools", + "private": true, + "devDependencies": { + "@redocly/cli": "1.34.4", + "@sourcemeta/jsonschema": "10.0.0" + } +} diff --git a/compute/Makefile b/compute/Makefile index ef2e55f7b1..25bbb30d3a 100644 --- a/compute/Makefile +++ b/compute/Makefile @@ -50,9 +50,9 @@ jsonnetfmt-format: jsonnetfmt --in-place $(jsonnet_files) .PHONY: manifest-schema-validation -manifest-schema-validation: node_modules - node_modules/.bin/jsonschema validate -d https://json-schema.org/draft/2020-12/schema manifest.schema.json manifest.yaml +manifest-schema-validation: ../build-tools/node_modules + npx --prefix=../build-tools/ jsonschema validate -d https://json-schema.org/draft/2020-12/schema manifest.schema.json manifest.yaml -node_modules: package.json - npm install - touch node_modules +../build-tools/node_modules: ../build-tools/package.json + cd ../build-tools && $(if $(CI),npm ci,npm install) + touch ../build-tools/node_modules diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 232b1e3bd5..a658738d76 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -170,7 +170,29 @@ RUN case $DEBIAN_VERSION in \ FROM build-deps AS pg-build ARG PG_VERSION COPY vendor/postgres-${PG_VERSION:?} postgres +COPY compute/patches/postgres_fdw.patch . +COPY compute/patches/pg_stat_statements_pg14-16.patch . +COPY compute/patches/pg_stat_statements_pg17.patch . RUN cd postgres && \ + # Apply patches to some contrib extensions + # For example, we need to grant EXECUTE on pg_stat_statements_reset() to {privileged_role_name}. + # In vanilla Postgres this function is limited to Postgres role superuser. + # In Neon we have {privileged_role_name} role that is not a superuser but replaces superuser in some cases. + # We could add the additional grant statements to the Postgres repository but it would be hard to maintain, + # whenever we need to pick up a new Postgres version and we want to limit the changes in our Postgres fork, + # so we do it here. 
+ case "${PG_VERSION}" in \ + "v14" | "v15" | "v16") \ + patch -p1 < /pg_stat_statements_pg14-16.patch; \ + ;; \ + "v17") \ + patch -p1 < /pg_stat_statements_pg17.patch; \ + ;; \ + *) \ + # To do not forget to migrate patches to the next major version + echo "No contrib patches for this PostgreSQL version" && exit 1;; \ + esac && \ + patch -p1 < /postgres_fdw.patch && \ export CONFIGURE_CMD="./configure CFLAGS='-O2 -g3 -fsigned-char' --enable-debug --with-openssl --with-uuid=ossp \ --with-icu --with-libxml --with-libxslt --with-lz4" && \ if [ "${PG_VERSION:?}" != "v14" ]; then \ @@ -184,8 +206,6 @@ RUN cd postgres && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/autoinc.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/dblink.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgres_fdw.control && \ - file=/usr/local/pgsql/share/extension/postgres_fdw--1.0.sql && [ -e $file ] && \ - echo 'GRANT USAGE ON FOREIGN DATA WRAPPER postgres_fdw TO neon_superuser;' >> $file && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/bloom.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/insert_username.control && \ @@ -195,34 +215,7 @@ RUN cd postgres && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/refint.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/xml2.control && \ - # We need to grant EXECUTE on pg_stat_statements_reset() to neon_superuser. - # In vanilla postgres this function is limited to Postgres role superuser. - # In neon we have neon_superuser role that is not a superuser but replaces superuser in some cases. - # We could add the additional grant statements to the postgres repository but it would be hard to maintain, - # whenever we need to pick up a new postgres version and we want to limit the changes in our postgres fork, - # so we do it here. - for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \ - filename=$(basename "$file"); \ - # Note that there are no downgrade scripts for pg_stat_statements, so we \ - # don't have to modify any downgrade paths or (much) older versions: we only \ - # have to make sure every creation of the pg_stat_statements_reset function \ - # also adds execute permissions to the neon_superuser. 
- case $filename in \ - pg_stat_statements--1.4.sql) \ - # pg_stat_statements_reset is first created with 1.4 - echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO neon_superuser;' >> $file; \ - ;; \ - pg_stat_statements--1.6--1.7.sql) \ - # Then with the 1.6-1.7 migration it is re-created with a new signature, thus add the permissions back - echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO neon_superuser;' >> $file; \ - ;; \ - pg_stat_statements--1.10--1.11.sql) \ - # Then with the 1.10-1.11 migration it is re-created with a new signature again, thus add the permissions back - echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint, boolean) TO neon_superuser;' >> $file; \ - ;; \ - esac; \ - done; + echo 'trusted = true' >> /usr/local/pgsql/share/extension/xml2.control # Set PATH for all the subsequent build steps ENV PATH="/usr/local/pgsql/bin:$PATH" @@ -1524,7 +1517,7 @@ WORKDIR /ext-src COPY compute/patches/pg_duckdb_v031.patch . COPY compute/patches/duckdb_v120.patch . # pg_duckdb build requires source dir to be a git repo to get submodules -# allow neon_superuser to execute some functions that in pg_duckdb are available to superuser only: +# allow {privileged_role_name} to execute some functions that in pg_duckdb are available to superuser only: # - extension management function duckdb.install_extension() # - access to duckdb.extensions table and its sequence RUN git clone --depth 1 --branch v0.3.1 https://github.com/duckdb/pg_duckdb.git pg_duckdb-src && \ diff --git a/compute/package.json b/compute/package.json deleted file mode 100644 index 581384dc13..0000000000 --- a/compute/package.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "name": "neon-compute", - "private": true, - "dependencies": { - "@sourcemeta/jsonschema": "9.3.4" - } -} \ No newline at end of file diff --git a/compute/patches/anon_v2.patch b/compute/patches/anon_v2.patch index 4faf927e39..ba9d7a8fe6 100644 --- a/compute/patches/anon_v2.patch +++ b/compute/patches/anon_v2.patch @@ -1,22 +1,26 @@ diff --git a/sql/anon.sql b/sql/anon.sql -index 0cdc769..b450327 100644 +index 0cdc769..5eab1d6 100644 --- a/sql/anon.sql +++ b/sql/anon.sql -@@ -1141,3 +1141,15 @@ $$ +@@ -1141,3 +1141,19 @@ $$ -- TODO : https://en.wikipedia.org/wiki/L-diversity -- TODO : https://en.wikipedia.org/wiki/T-closeness + +-- NEON Patches + -+GRANT ALL ON SCHEMA anon to neon_superuser; -+GRANT ALL ON ALL TABLES IN SCHEMA anon TO neon_superuser; -+ +DO $$ ++DECLARE ++ privileged_role_name text; +BEGIN -+ IF current_setting('server_version_num')::int >= 150000 THEN -+ GRANT SET ON PARAMETER anon.transparent_dynamic_masking TO neon_superuser; -+ END IF; ++ privileged_role_name := current_setting('neon.privileged_role_name'); ++ ++ EXECUTE format('GRANT ALL ON SCHEMA anon to %I', privileged_role_name); ++ EXECUTE format('GRANT ALL ON ALL TABLES IN SCHEMA anon TO %I', privileged_role_name); ++ ++ IF current_setting('server_version_num')::int >= 150000 THEN ++ EXECUTE format('GRANT SET ON PARAMETER anon.transparent_dynamic_masking TO %I', privileged_role_name); ++ END IF; +END $$; diff --git a/sql/init.sql b/sql/init.sql index 7da6553..9b6164b 100644 diff --git a/compute/patches/pg_duckdb_v031.patch b/compute/patches/pg_duckdb_v031.patch index edc7fbf69d..f7aa374116 100644 --- a/compute/patches/pg_duckdb_v031.patch +++ b/compute/patches/pg_duckdb_v031.patch @@ -21,13 +21,21 @@ index 3235cc8..6b892bc 100644 include Makefile.global diff --git a/sql/pg_duckdb--0.2.0--0.3.0.sql 
b/sql/pg_duckdb--0.2.0--0.3.0.sql -index d777d76..af60106 100644 +index d777d76..3b54396 100644 --- a/sql/pg_duckdb--0.2.0--0.3.0.sql +++ b/sql/pg_duckdb--0.2.0--0.3.0.sql -@@ -1056,3 +1056,6 @@ GRANT ALL ON FUNCTION duckdb.cache(TEXT, TEXT) TO PUBLIC; +@@ -1056,3 +1056,14 @@ GRANT ALL ON FUNCTION duckdb.cache(TEXT, TEXT) TO PUBLIC; GRANT ALL ON FUNCTION duckdb.cache_info() TO PUBLIC; GRANT ALL ON FUNCTION duckdb.cache_delete(TEXT) TO PUBLIC; GRANT ALL ON PROCEDURE duckdb.recycle_ddb() TO PUBLIC; -+GRANT ALL ON FUNCTION duckdb.install_extension(TEXT) TO neon_superuser; -+GRANT ALL ON TABLE duckdb.extensions TO neon_superuser; -+GRANT ALL ON SEQUENCE duckdb.extensions_table_seq TO neon_superuser; ++ ++DO $$ ++DECLARE ++ privileged_role_name text; ++BEGIN ++ privileged_role_name := current_setting('neon.privileged_role_name'); ++ ++ EXECUTE format('GRANT ALL ON FUNCTION duckdb.install_extension(TEXT) TO %I', privileged_role_name); ++ EXECUTE format('GRANT ALL ON TABLE duckdb.extensions TO %I', privileged_role_name); ++ EXECUTE format('GRANT ALL ON SEQUENCE duckdb.extensions_table_seq TO %I', privileged_role_name); ++END $$; diff --git a/compute/patches/pg_stat_statements_pg14-16.patch b/compute/patches/pg_stat_statements_pg14-16.patch new file mode 100644 index 0000000000..368c6791c7 --- /dev/null +++ b/compute/patches/pg_stat_statements_pg14-16.patch @@ -0,0 +1,34 @@ +diff --git a/contrib/pg_stat_statements/pg_stat_statements--1.4.sql b/contrib/pg_stat_statements/pg_stat_statements--1.4.sql +index 58cdf600fce..8be57a996f6 100644 +--- a/contrib/pg_stat_statements/pg_stat_statements--1.4.sql ++++ b/contrib/pg_stat_statements/pg_stat_statements--1.4.sql +@@ -46,3 +46,12 @@ GRANT SELECT ON pg_stat_statements TO PUBLIC; + + -- Don't want this to be available to non-superusers. + REVOKE ALL ON FUNCTION pg_stat_statements_reset() FROM PUBLIC; ++ ++DO $$ ++DECLARE ++ privileged_role_name text; ++BEGIN ++ privileged_role_name := current_setting('neon.privileged_role_name'); ++ ++ EXECUTE format('GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO %I', privileged_role_name); ++END $$; +diff --git a/contrib/pg_stat_statements/pg_stat_statements--1.6--1.7.sql b/contrib/pg_stat_statements/pg_stat_statements--1.6--1.7.sql +index 6fc3fed4c93..256345a8f79 100644 +--- a/contrib/pg_stat_statements/pg_stat_statements--1.6--1.7.sql ++++ b/contrib/pg_stat_statements/pg_stat_statements--1.6--1.7.sql +@@ -20,3 +20,12 @@ LANGUAGE C STRICT PARALLEL SAFE; + + -- Don't want this to be available to non-superusers. + REVOKE ALL ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) FROM PUBLIC; ++ ++DO $$ ++DECLARE ++ privileged_role_name text; ++BEGIN ++ privileged_role_name := current_setting('neon.privileged_role_name'); ++ ++ EXECUTE format('GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO %I', privileged_role_name); ++END $$; diff --git a/compute/patches/pg_stat_statements_pg17.patch b/compute/patches/pg_stat_statements_pg17.patch new file mode 100644 index 0000000000..ff63b3255c --- /dev/null +++ b/compute/patches/pg_stat_statements_pg17.patch @@ -0,0 +1,52 @@ +diff --git a/contrib/pg_stat_statements/pg_stat_statements--1.10--1.11.sql b/contrib/pg_stat_statements/pg_stat_statements--1.10--1.11.sql +index 0bb2c397711..32764db1d8b 100644 +--- a/contrib/pg_stat_statements/pg_stat_statements--1.10--1.11.sql ++++ b/contrib/pg_stat_statements/pg_stat_statements--1.10--1.11.sql +@@ -80,3 +80,12 @@ LANGUAGE C STRICT PARALLEL SAFE; + + -- Don't want this to be available to non-superusers. 
+ REVOKE ALL ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint, boolean) FROM PUBLIC; ++ ++DO $$ ++DECLARE ++ privileged_role_name text; ++BEGIN ++ privileged_role_name := current_setting('neon.privileged_role_name'); ++ ++ EXECUTE format('GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint, boolean) TO %I', privileged_role_name); ++END $$; +\ No newline at end of file +diff --git a/contrib/pg_stat_statements/pg_stat_statements--1.4.sql b/contrib/pg_stat_statements/pg_stat_statements--1.4.sql +index 58cdf600fce..8be57a996f6 100644 +--- a/contrib/pg_stat_statements/pg_stat_statements--1.4.sql ++++ b/contrib/pg_stat_statements/pg_stat_statements--1.4.sql +@@ -46,3 +46,12 @@ GRANT SELECT ON pg_stat_statements TO PUBLIC; + + -- Don't want this to be available to non-superusers. + REVOKE ALL ON FUNCTION pg_stat_statements_reset() FROM PUBLIC; ++ ++DO $$ ++DECLARE ++ privileged_role_name text; ++BEGIN ++ privileged_role_name := current_setting('neon.privileged_role_name'); ++ ++ EXECUTE format('GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO %I', privileged_role_name); ++END $$; +diff --git a/contrib/pg_stat_statements/pg_stat_statements--1.6--1.7.sql b/contrib/pg_stat_statements/pg_stat_statements--1.6--1.7.sql +index 6fc3fed4c93..256345a8f79 100644 +--- a/contrib/pg_stat_statements/pg_stat_statements--1.6--1.7.sql ++++ b/contrib/pg_stat_statements/pg_stat_statements--1.6--1.7.sql +@@ -20,3 +20,12 @@ LANGUAGE C STRICT PARALLEL SAFE; + + -- Don't want this to be available to non-superusers. + REVOKE ALL ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) FROM PUBLIC; ++ ++DO $$ ++DECLARE ++ privileged_role_name text; ++BEGIN ++ privileged_role_name := current_setting('neon.privileged_role_name'); ++ ++ EXECUTE format('GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO %I', privileged_role_name); ++END $$; diff --git a/compute/patches/postgres_fdw.patch b/compute/patches/postgres_fdw.patch new file mode 100644 index 0000000000..d0007ffea5 --- /dev/null +++ b/compute/patches/postgres_fdw.patch @@ -0,0 +1,17 @@ +diff --git a/contrib/postgres_fdw/postgres_fdw--1.0.sql b/contrib/postgres_fdw/postgres_fdw--1.0.sql +index a0f0fc1bf45..ee077f2eea6 100644 +--- a/contrib/postgres_fdw/postgres_fdw--1.0.sql ++++ b/contrib/postgres_fdw/postgres_fdw--1.0.sql +@@ -16,3 +16,12 @@ LANGUAGE C STRICT; + CREATE FOREIGN DATA WRAPPER postgres_fdw + HANDLER postgres_fdw_handler + VALIDATOR postgres_fdw_validator; ++ ++DO $$ ++DECLARE ++ privileged_role_name text; ++BEGIN ++ privileged_role_name := current_setting('neon.privileged_role_name'); ++ ++ EXECUTE format('GRANT USAGE ON FOREIGN DATA WRAPPER postgres_fdw TO %I', privileged_role_name); ++END $$; diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 910bae3bda..496471acc7 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -27,7 +27,10 @@ fail.workspace = true flate2.workspace = true futures.workspace = true http.workspace = true +http-body-util.workspace = true hostname-validator = "1.1" +hyper.workspace = true +hyper-util.workspace = true indexmap.workspace = true itertools.workspace = true jsonwebtoken.workspace = true @@ -44,6 +47,7 @@ postgres.workspace = true regex.workspace = true reqwest = { workspace = true, features = ["json"] } ring = "0.17" +scopeguard.workspace = true serde.workspace = true serde_with.workspace = true serde_json.workspace = true diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index db7746b8eb..04723d6f3d 
100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -87,6 +87,14 @@ struct Cli { #[arg(short = 'C', long, value_name = "DATABASE_URL")] pub connstr: String, + #[arg( + long, + default_value = "neon_superuser", + value_name = "PRIVILEGED_ROLE_NAME", + value_parser = Self::parse_privileged_role_name + )] + pub privileged_role_name: String, + #[cfg(target_os = "linux")] #[arg(long, default_value = "neon-postgres")] pub cgroup: String, @@ -130,6 +138,12 @@ struct Cli { /// Run in development mode, skipping VM-specific operations like process termination #[arg(long, action = clap::ArgAction::SetTrue)] pub dev: bool, + + #[arg(long)] + pub pg_init_timeout: Option, + + #[arg(long, default_value_t = false, action = clap::ArgAction::Set)] + pub lakebase_mode: bool, } impl Cli { @@ -149,6 +163,21 @@ impl Cli { Ok(url) } + + /// For simplicity, we do not escape `privileged_role_name` anywhere in the code. + /// Since it's a system role, which we fully control, that's fine. Still, let's + /// validate it to avoid any surprises. + fn parse_privileged_role_name(value: &str) -> Result { + use regex::Regex; + + let pattern = Regex::new(r"^[a-z_]+$").unwrap(); + + if !pattern.is_match(value) { + bail!("--privileged-role-name can only contain lowercase letters and underscores") + } + + Ok(value.to_string()) + } } fn main() -> Result<()> { @@ -165,7 +194,7 @@ fn main() -> Result<()> { .build()?; let _rt_guard = runtime.enter(); - runtime.block_on(init(cli.dev))?; + let tracing_provider = init(cli.dev)?; // enable core dumping for all child processes setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?; @@ -178,6 +207,7 @@ fn main() -> Result<()> { ComputeNodeParams { compute_id: cli.compute_id, connstr, + privileged_role_name: cli.privileged_role_name.clone(), pgdata: cli.pgdata.clone(), pgbin: cli.pgbin.clone(), pgversion: get_pg_version_string(&cli.pgbin), @@ -195,6 +225,8 @@ fn main() -> Result<()> { installed_extensions_collection_interval: Arc::new(AtomicU64::new( cli.installed_extensions_collection_interval, )), + pg_init_timeout: cli.pg_init_timeout.map(Duration::from_secs), + lakebase_mode: cli.lakebase_mode, }, config, )?; @@ -203,11 +235,11 @@ fn main() -> Result<()> { scenario.teardown(); - deinit_and_exit(exit_code); + deinit_and_exit(tracing_provider, exit_code); } -async fn init(dev_mode: bool) -> Result<()> { - init_tracing_and_logging(DEFAULT_LOG_LEVEL).await?; +fn init(dev_mode: bool) -> Result> { + let provider = init_tracing_and_logging(DEFAULT_LOG_LEVEL)?; let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?; thread::spawn(move || { @@ -218,7 +250,7 @@ async fn init(dev_mode: bool) -> Result<()> { info!("compute build_tag: {}", &BUILD_TAG.to_string()); - Ok(()) + Ok(provider) } fn get_config(cli: &Cli) -> Result { @@ -243,25 +275,27 @@ fn get_config(cli: &Cli) -> Result { } } -fn deinit_and_exit(exit_code: Option) -> ! { - // Shutdown trace pipeline gracefully, so that it has a chance to send any - // pending traces before we exit. Shutting down OTEL tracing provider may - // hang for quite some time, see, for example: - // - https://github.com/open-telemetry/opentelemetry-rust/issues/868 - // - and our problems with staging https://github.com/neondatabase/cloud/issues/3707#issuecomment-1493983636 - // - // Yet, we want computes to shut down fast enough, as we may need a new one - // for the same timeline ASAP. So wait no longer than 2s for the shutdown to - // complete, then just error out and exit the main thread. 
- info!("shutting down tracing"); - let (sender, receiver) = mpsc::channel(); - let _ = thread::spawn(move || { - tracing_utils::shutdown_tracing(); - sender.send(()).ok() - }); - let shutdown_res = receiver.recv_timeout(Duration::from_millis(2000)); - if shutdown_res.is_err() { - error!("timed out while shutting down tracing, exiting anyway"); +fn deinit_and_exit(tracing_provider: Option, exit_code: Option) -> ! { + if let Some(p) = tracing_provider { + // Shutdown trace pipeline gracefully, so that it has a chance to send any + // pending traces before we exit. Shutting down OTEL tracing provider may + // hang for quite some time, see, for example: + // - https://github.com/open-telemetry/opentelemetry-rust/issues/868 + // - and our problems with staging https://github.com/neondatabase/cloud/issues/3707#issuecomment-1493983636 + // + // Yet, we want computes to shut down fast enough, as we may need a new one + // for the same timeline ASAP. So wait no longer than 2s for the shutdown to + // complete, then just error out and exit the main thread. + info!("shutting down tracing"); + let (sender, receiver) = mpsc::channel(); + let _ = thread::spawn(move || { + _ = p.shutdown(); + sender.send(()).ok() + }); + let shutdown_res = receiver.recv_timeout(Duration::from_millis(2000)); + if shutdown_res.is_err() { + error!("timed out while shutting down tracing, exiting anyway"); + } } info!("shutting down"); @@ -327,4 +361,49 @@ mod test { ]) .expect_err("URL parameters are not allowed"); } + + #[test] + fn verify_privileged_role_name() { + // Valid name + let cli = Cli::parse_from([ + "compute_ctl", + "--pgdata=test", + "--connstr=test", + "--compute-id=test", + "--privileged-role-name", + "my_superuser", + ]); + assert_eq!(cli.privileged_role_name, "my_superuser"); + + // Invalid names + Cli::try_parse_from([ + "compute_ctl", + "--pgdata=test", + "--connstr=test", + "--compute-id=test", + "--privileged-role-name", + "NeonSuperuser", + ]) + .expect_err("uppercase letters are not allowed"); + + Cli::try_parse_from([ + "compute_ctl", + "--pgdata=test", + "--connstr=test", + "--compute-id=test", + "--privileged-role-name", + "$'neon_superuser", + ]) + .expect_err("special characters are not allowed"); + + Cli::try_parse_from([ + "compute_ctl", + "--pgdata=test", + "--connstr=test", + "--compute-id=test", + "--privileged-role-name", + "", + ]) + .expect_err("empty name is not allowed"); + } } diff --git a/compute_tools/src/communicator_socket_client.rs b/compute_tools/src/communicator_socket_client.rs new file mode 100644 index 0000000000..806e0a21e3 --- /dev/null +++ b/compute_tools/src/communicator_socket_client.rs @@ -0,0 +1,98 @@ +//! Client for making request to a running Postgres server's communicator control socket. +//! +//! The storage communicator process that runs inside Postgres exposes an HTTP endpoint in +//! a Unix Domain Socket in the Postgres data directory. This provides access to it. + +use std::path::Path; + +use anyhow::Context; +use hyper::client::conn::http1::SendRequest; +use hyper_util::rt::TokioIo; + +/// Name of the socket within the Postgres data directory. This better match that in +/// `pgxn/neon/communicator/src/lib.rs`. +const NEON_COMMUNICATOR_SOCKET_NAME: &str = "neon-communicator.socket"; + +/// Open a connection to the communicator's control socket, prepare to send requests to it +/// with hyper. 
+pub async fn connect_communicator_socket<B>(pgdata: &Path) -> anyhow::Result<SendRequest<B>> +where + B: hyper::body::Body + 'static + Send, + B::Data: Send, + B::Error: Into<Box<dyn std::error::Error + Send + Sync>>, +{ + let socket_path = pgdata.join(NEON_COMMUNICATOR_SOCKET_NAME); + let socket_path_len = socket_path.display().to_string().len(); + + // There is a limit of around 100 bytes (108 on Linux?) on the length of the path to a + // Unix Domain socket. The limit is on the connect(2) function used to open the + // socket, not on the absolute path itself. Postgres changes the current directory to + // the data directory and uses a relative path to bind to the socket, and the relative + // path "./neon-communicator.socket" is always short, but when compute_ctl needs to + // open the socket, we need to use a full path, which can be arbitrarily long. + // + // There are a few ways we could work around this: + // + // 1. Change the current directory to the Postgres data directory and use a relative + // path in the connect(2) call. That's problematic because the current directory + // applies to the whole process. We could change the current directory early in + // compute_ctl startup, and that might be a good idea anyway for other reasons too: + // it would be more robust if the data directory is moved around or unlinked for + // some reason, and you would be less likely to accidentally litter other parts of + // the filesystem with e.g. temporary files. However, that's a pretty invasive + // change. + // + // 2. On Linux, you could open() the data directory, and refer to the socket + // inside it as "/proc/self/fd/<fd>/neon-communicator.socket". But that's + // Linux-only. + // + // 3. Create a symbolic link to the socket with a shorter path, and use that. + // + // We use the symbolic link approach here. Hopefully the paths we use in production + // are short enough to open the socket directly, so that this hack is needed + // only in development. + let connect_result = if socket_path_len < 100 { + // We can open the path directly with no hacks. + tokio::net::UnixStream::connect(socket_path).await + } else { + // The path to the socket is too long. Create a symlink to it with a shorter path. + let short_path = std::env::temp_dir().join(format!( + "compute_ctl.short-socket.{}.{}", + std::process::id(), + tokio::task::id() + )); + std::os::unix::fs::symlink(&socket_path, &short_path)?; + + // Delete the symlink as soon as we have connected to it. There's a small chance + // of leaking if the process dies before we remove it, so try to keep that window + // as small as possible. + scopeguard::defer!
{ + if let Err(err) = std::fs::remove_file(&short_path) { + tracing::warn!("could not remove symlink \"{}\" created for socket: {}", + short_path.display(), err); + } + } + + tracing::info!( + "created symlink \"{}\" for socket \"{}\", opening it now", + short_path.display(), + socket_path.display() + ); + + tokio::net::UnixStream::connect(&short_path).await + }; + + let stream = connect_result.context("connecting to communicator control socket")?; + + let io = TokioIo::new(stream); + let (request_sender, connection) = hyper::client::conn::http1::handshake(io).await?; + + // spawn a task to poll the connection and drive the HTTP state + tokio::spawn(async move { + if let Err(err) = connection.await { + eprintln!("Error in connection: {err}"); + } + }); + + Ok(request_sender) +} diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 8f42cf699b..b4d7a6fca9 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -74,12 +74,20 @@ const DEFAULT_INSTALLED_EXTENSIONS_COLLECTION_INTERVAL: u64 = 3600; /// Static configuration params that don't change after startup. These mostly /// come from the CLI args, or are derived from them. +#[derive(Clone, Debug)] pub struct ComputeNodeParams { /// The ID of the compute pub compute_id: String, - // Url type maintains proper escaping + + /// Url type maintains proper escaping pub connstr: url::Url, + /// The name of the 'weak' superuser role, which we give to the users. + /// It follows the allow list approach, i.e., we take a standard role + /// and grant it extra permissions with explicit GRANTs here and there, + /// and core patches. + pub privileged_role_name: String, + pub resize_swap_on_bind: bool, pub set_disk_quota_for_fs: Option, @@ -105,6 +113,11 @@ pub struct ComputeNodeParams { /// Interval for installed extensions collection pub installed_extensions_collection_interval: Arc, + + /// Timeout of PG compute startup in the Init state. + pub pg_init_timeout: Option, + + pub lakebase_mode: bool, } type TaskHandle = Mutex>>; @@ -146,6 +159,7 @@ pub struct RemoteExtensionMetrics { #[derive(Clone, Debug)] pub struct ComputeState { pub start_time: DateTime, + pub pg_start_time: Option>, pub status: ComputeStatus, /// Timestamp of the last Postgres activity. It could be `None` if /// compute wasn't used since start. @@ -183,6 +197,7 @@ impl ComputeState { pub fn new() -> Self { Self { start_time: Utc::now(), + pg_start_time: None, status: ComputeStatus::Empty, last_active: None, error: None, @@ -640,6 +655,9 @@ impl ComputeNode { }; _this_entered = start_compute_span.enter(); + // Hadron: Record postgres start time (used to enforce pg_init_timeout). + state_guard.pg_start_time.replace(Utc::now()); + state_guard.set_status(ComputeStatus::Init, &self.state_changed); compute_state = state_guard.clone() } @@ -1286,9 +1304,7 @@ impl ComputeNode { // In case of error, log and fail the check, but don't crash. // We're playing it safe because these errors could be transient - // and we don't yet retry. Also being careful here allows us to - // be backwards compatible with safekeepers that don't have the - // TIMELINE_STATUS API yet. + // and we don't yet retry. if responses.len() < quorum { error!( "failed sync safekeepers check {:?} {:?} {:?}", @@ -1391,6 +1407,7 @@ impl ComputeNode { self.create_pgdata()?; config::write_postgres_conf( pgdata_path, + &self.params, &pspec.spec, self.params.internal_http_port, tls_config, @@ -1434,7 +1451,7 @@ impl ComputeNode { })?; // Update pg_hba.conf received with basebackup. 
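For illustration only (not part of this patch): a caller uses the `SendRequest` returned by `connect_communicator_socket` to speak plain HTTP/1 over the Unix socket. The sketch below assumes `axum::body::Body` as the request body type and a hypothetical `fetch_from_communicator` helper; the real consumer is the `/autoscaling_metrics` handler added later in this diff.

```rust
// Minimal sketch, assuming connect_communicator_socket is generic over the body type.
use std::path::Path;

use axum::body::Body;
use http_body_util::BodyExt;
use hyper::Request;

use crate::communicator_socket_client::connect_communicator_socket;

async fn fetch_from_communicator(pgdata: &Path, uri: &str) -> anyhow::Result<String> {
    // Open (or symlink-and-open) the socket and get an HTTP/1 request sender.
    let mut sender = connect_communicator_socket::<Body>(pgdata).await?;

    let request = Request::builder()
        .method("GET")
        .uri(uri)
        .header("Host", "localhost") // hyper requires a Host header even over a UDS
        .body(Body::empty())?;

    let response = sender.send_request(request).await?;
    let bytes = response.into_body().collect().await?.to_bytes();
    Ok(String::from_utf8_lossy(&bytes).into_owned())
}
```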
- update_pg_hba(pgdata_path)?; + update_pg_hba(pgdata_path, None)?; // Place pg_dynshmem under /dev/shm. This allows us to use // 'dynamic_shared_memory_type = mmap' so that the files are placed in @@ -1739,6 +1756,8 @@ impl ComputeNode { } // Run migrations separately to not hold up cold starts + let lakebase_mode = self.params.lakebase_mode; + let params = self.params.clone(); tokio::spawn(async move { let mut conf = conf.as_ref().clone(); conf.application_name("compute_ctl:migrations"); @@ -1750,7 +1769,7 @@ impl ComputeNode { eprintln!("connection error: {e}"); } }); - if let Err(e) = handle_migrations(&mut client).await { + if let Err(e) = handle_migrations(params, &mut client, lakebase_mode).await { error!("Failed to run migrations: {}", e); } } @@ -1829,6 +1848,7 @@ impl ComputeNode { let pgdata_path = Path::new(&self.params.pgdata); config::write_postgres_conf( pgdata_path, + &self.params, &spec, self.params.internal_http_port, tls_config, @@ -2441,14 +2461,31 @@ LIMIT 100", pub fn spawn_lfc_offload_task(self: &Arc, interval: Duration) { self.terminate_lfc_offload_task(); let secs = interval.as_secs(); - info!("spawning lfc offload worker with {secs}s interval"); let this = self.clone(); + + info!("spawning LFC offload worker with {secs}s interval"); let handle = spawn(async move { let mut interval = time::interval(interval); interval.tick().await; // returns immediately loop { interval.tick().await; - this.offload_lfc_async().await; + + let prewarm_state = this.state.lock().unwrap().lfc_prewarm_state.clone(); + // Do not offload LFC state if we are currently prewarming or any issue occurred. + // If we'd do that, we might override the LFC state in endpoint storage with some + // incomplete state. Imagine a situation: + // 1. Endpoint started with `autoprewarm: true` + // 2. While prewarming is not completed, we upload the new incomplete state + // 3. Compute gets interrupted and restarts + // 4. We start again and try to prewarm with the state from 2. instead of the previous complete state + if matches!( + prewarm_state, + LfcPrewarmState::Completed + | LfcPrewarmState::NotPrewarmed + | LfcPrewarmState::Skipped + ) { + this.offload_lfc_async().await; + } } }); *self.lfc_offload_task.lock().unwrap() = Some(handle); diff --git a/compute_tools/src/compute_prewarm.rs b/compute_tools/src/compute_prewarm.rs index d014a5bb72..07b4a596cc 100644 --- a/compute_tools/src/compute_prewarm.rs +++ b/compute_tools/src/compute_prewarm.rs @@ -89,7 +89,7 @@ impl ComputeNode { self.state.lock().unwrap().lfc_offload_state.clone() } - /// If there is a prewarm request ongoing, return false, true otherwise + /// If there is a prewarm request ongoing, return `false`, `true` otherwise. 
pub fn prewarm_lfc(self: &Arc, from_endpoint: Option) -> bool { { let state = &mut self.state.lock().unwrap().lfc_prewarm_state; @@ -101,15 +101,25 @@ impl ComputeNode { let cloned = self.clone(); spawn(async move { - let Err(err) = cloned.prewarm_impl(from_endpoint).await else { - cloned.state.lock().unwrap().lfc_prewarm_state = LfcPrewarmState::Completed; - return; - }; - crate::metrics::LFC_PREWARM_ERRORS.inc(); - error!(%err, "prewarming lfc"); - cloned.state.lock().unwrap().lfc_prewarm_state = LfcPrewarmState::Failed { - error: err.to_string(), + let state = match cloned.prewarm_impl(from_endpoint).await { + Ok(true) => LfcPrewarmState::Completed, + Ok(false) => { + info!( + "skipping LFC prewarm because LFC state is not found in endpoint storage" + ); + LfcPrewarmState::Skipped + } + Err(err) => { + crate::metrics::LFC_PREWARM_ERRORS.inc(); + error!(%err, "could not prewarm LFC"); + + LfcPrewarmState::Failed { + error: err.to_string(), + } + } }; + + cloned.state.lock().unwrap().lfc_prewarm_state = state; }); true } @@ -120,15 +130,21 @@ impl ComputeNode { EndpointStoragePair::from_spec_and_endpoint(state.pspec.as_ref().unwrap(), from_endpoint) } - async fn prewarm_impl(&self, from_endpoint: Option) -> Result<()> { + /// Request LFC state from endpoint storage and load corresponding pages into Postgres. + /// Returns a result with `false` if the LFC state is not found in endpoint storage. + async fn prewarm_impl(&self, from_endpoint: Option) -> Result { let EndpointStoragePair { url, token } = self.endpoint_storage_pair(from_endpoint)?; - info!(%url, "requesting LFC state from endpoint storage"); + info!(%url, "requesting LFC state from endpoint storage"); let request = Client::new().get(&url).bearer_auth(token); let res = request.send().await.context("querying endpoint storage")?; let status = res.status(); - if status != StatusCode::OK { - bail!("{status} querying endpoint storage") + match status { + StatusCode::OK => (), + StatusCode::NOT_FOUND => { + return Ok(false); + } + _ => bail!("{status} querying endpoint storage"), } let mut uncompressed = Vec::new(); @@ -141,7 +157,8 @@ impl ComputeNode { .await .context("decoding LFC state")?; let uncompressed_len = uncompressed.len(); - info!(%url, "downloaded LFC state, uncompressed size {uncompressed_len}, loading into postgres"); + + info!(%url, "downloaded LFC state, uncompressed size {uncompressed_len}, loading into Postgres"); ComputeNode::get_maintenance_client(&self.tokio_conn_conf) .await @@ -149,7 +166,9 @@ impl ComputeNode { .query_one("select neon.prewarm_local_cache($1)", &[&uncompressed]) .await .context("loading LFC state into postgres") - .map(|_| ()) + .map(|_| ())?; + + Ok(true) } /// If offload request is ongoing, return false, true otherwise @@ -177,12 +196,14 @@ impl ComputeNode { async fn offload_lfc_with_state_update(&self) { crate::metrics::LFC_OFFLOADS.inc(); + let Err(err) = self.offload_lfc_impl().await else { self.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Completed; return; }; + crate::metrics::LFC_OFFLOAD_ERRORS.inc(); - error!(%err, "offloading lfc"); + error!(%err, "could not offload LFC state to endpoint storage"); self.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Failed { error: err.to_string(), }; @@ -190,7 +211,7 @@ impl ComputeNode { async fn offload_lfc_impl(&self) -> Result<()> { let EndpointStoragePair { url, token } = self.endpoint_storage_pair(None)?; - info!(%url, "requesting LFC state from postgres"); + info!(%url, "requesting LFC state from Postgres"); let mut 
compressed = Vec::new(); ComputeNode::get_maintenance_client(&self.tokio_conn_conf) @@ -205,13 +226,17 @@ impl ComputeNode { .read_to_end(&mut compressed) .await .context("compressing LFC state")?; + let compressed_len = compressed.len(); info!(%url, "downloaded LFC state, compressed size {compressed_len}, writing to endpoint storage"); let request = Client::new().put(url).bearer_auth(token).body(compressed); match request.send().await { Ok(res) if res.status() == StatusCode::OK => Ok(()), - Ok(res) => bail!("Error writing to endpoint storage: {}", res.status()), + Ok(res) => bail!( + "Request to endpoint storage failed with status: {}", + res.status() + ), Err(err) => Err(err).context("writing to endpoint storage"), } } diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index 169de5c963..dd46353343 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -9,6 +9,7 @@ use std::path::Path; use compute_api::responses::TlsConfig; use compute_api::spec::{ComputeAudit, ComputeMode, ComputeSpec, GenericOption}; +use crate::compute::ComputeNodeParams; use crate::pg_helpers::{ GenericOptionExt, GenericOptionsSearch, PgOptionsSerialize, escape_conf_value, }; @@ -41,6 +42,7 @@ pub fn line_in_file(path: &Path, line: &str) -> Result { /// Create or completely rewrite configuration file specified by `path` pub fn write_postgres_conf( pgdata_path: &Path, + params: &ComputeNodeParams, spec: &ComputeSpec, extension_server_port: u16, tls_config: &Option, @@ -54,14 +56,15 @@ pub fn write_postgres_conf( writeln!(file, "{conf}")?; } + // Stripe size GUC should be defined prior to connection string + if let Some(stripe_size) = spec.shard_stripe_size { + writeln!(file, "neon.stripe_size={stripe_size}")?; + } // Add options for connecting to storage writeln!(file, "# Neon storage settings")?; if let Some(s) = &spec.pageserver_connstring { writeln!(file, "neon.pageserver_connstring={}", escape_conf_value(s))?; } - if let Some(stripe_size) = spec.shard_stripe_size { - writeln!(file, "neon.stripe_size={stripe_size}")?; - } if !spec.safekeeper_connstrings.is_empty() { let mut neon_safekeepers_value = String::new(); tracing::info!( @@ -161,6 +164,12 @@ pub fn write_postgres_conf( } } + writeln!( + file, + "neon.privileged_role_name={}", + escape_conf_value(params.privileged_role_name.as_str()) + )?; + // If there are any extra options in the 'settings' field, append those if spec.cluster.settings.is_some() { writeln!(file, "# Managed by compute_ctl: begin")?; diff --git a/compute_tools/src/hadron_metrics.rs b/compute_tools/src/hadron_metrics.rs new file mode 100644 index 0000000000..17c4e82622 --- /dev/null +++ b/compute_tools/src/hadron_metrics.rs @@ -0,0 +1,60 @@ +use metrics::{ + IntCounter, IntGaugeVec, core::Collector, proto::MetricFamily, register_int_counter, + register_int_gauge_vec, +}; +use once_cell::sync::Lazy; + +// Counter keeping track of the number of PageStream request errors reported by Postgres. +// An error is registered every time Postgres calls compute_ctl's /refresh_configuration API. +// Postgres will invoke this API if it detected trouble with PageStream requests (get_page@lsn, +// get_base_backup, etc.) it sends to any pageserver. An increase in this counter value typically +// indicates Postgres downtime, as PageStream requests are critical for Postgres to function. 
+pub static POSTGRES_PAGESTREAM_REQUEST_ERRORS: Lazy = Lazy::new(|| { + register_int_counter!( + "pg_cctl_pagestream_request_errors_total", + "Number of PageStream request errors reported by the postgres process" + ) + .expect("failed to define a metric") +}); + +// Counter keeping track of the number of compute configuration errors due to Postgres statement +// timeouts. An error is registered every time `ComputeNode::reconfigure()` fails due to Postgres +// error code 57014 (query cancelled). This statement timeout typically occurs when postgres is +// stuck in a problematic retry loop when the PS is reject its connection requests (usually due +// to PG pointing at the wrong PS). We should investigate the root cause when this counter value +// increases by checking PG and PS logs. +pub static COMPUTE_CONFIGURE_STATEMENT_TIMEOUT_ERRORS: Lazy = Lazy::new(|| { + register_int_counter!( + "pg_cctl_configure_statement_timeout_errors_total", + "Number of compute configuration errors due to Postgres statement timeouts." + ) + .expect("failed to define a metric") +}); + +pub static COMPUTE_ATTACHED: Lazy = Lazy::new(|| { + register_int_gauge_vec!( + "pg_cctl_attached", + "Compute node attached status (1 if attached)", + &[ + "pg_compute_id", + "pg_instance_id", + "tenant_id", + "timeline_id" + ] + ) + .expect("failed to define a metric") +}); + +pub fn collect() -> Vec { + let mut metrics = Vec::new(); + metrics.extend(POSTGRES_PAGESTREAM_REQUEST_ERRORS.collect()); + metrics.extend(COMPUTE_CONFIGURE_STATEMENT_TIMEOUT_ERRORS.collect()); + metrics.extend(COMPUTE_ATTACHED.collect()); + metrics +} + +pub fn initialize_metrics() { + Lazy::force(&POSTGRES_PAGESTREAM_REQUEST_ERRORS); + Lazy::force(&COMPUTE_CONFIGURE_STATEMENT_TIMEOUT_ERRORS); + Lazy::force(&COMPUTE_ATTACHED); +} diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml index 93a357e160..3cf5ea7c51 100644 --- a/compute_tools/src/http/openapi_spec.yaml +++ b/compute_tools/src/http/openapi_spec.yaml @@ -613,11 +613,11 @@ components: - skipped properties: status: - description: Lfc prewarm status - enum: [not_prewarmed, prewarming, completed, failed] + description: LFC prewarm status + enum: [not_prewarmed, prewarming, completed, failed, skipped] type: string error: - description: Lfc prewarm error, if any + description: LFC prewarm error, if any type: string total: description: Total pages processed @@ -635,11 +635,11 @@ components: - status properties: status: - description: Lfc offload status + description: LFC offload status enum: [not_offloaded, offloading, completed, failed] type: string error: - description: Lfc offload error, if any + description: LFC offload error, if any type: string PromoteState: diff --git a/compute_tools/src/http/routes/metrics.rs b/compute_tools/src/http/routes/metrics.rs index da8d8b20a5..96b464fd12 100644 --- a/compute_tools/src/http/routes/metrics.rs +++ b/compute_tools/src/http/routes/metrics.rs @@ -1,10 +1,18 @@ +use std::path::Path; +use std::sync::Arc; + +use anyhow::Context; use axum::body::Body; +use axum::extract::State; use axum::response::Response; -use http::StatusCode; use http::header::CONTENT_TYPE; +use http_body_util::BodyExt; +use hyper::{Request, StatusCode}; use metrics::proto::MetricFamily; use metrics::{Encoder, TextEncoder}; +use crate::communicator_socket_client::connect_communicator_socket; +use crate::compute::ComputeNode; use crate::http::JsonResponse; use crate::metrics::collect; @@ -31,3 +39,42 @@ pub(in crate::http) async fn get_metrics() -> 
Response { .body(Body::from(buffer)) .unwrap() } + +/// Fetch and forward metrics from the Postgres neon extension's metrics +/// exporter that are used by autoscaling-agent. +/// +/// The neon extension exposes these metrics over a Unix domain socket +/// in the data directory. That's not accessible directly from the outside +/// world, so we have this endpoint in compute_ctl to expose it +pub(in crate::http) async fn get_autoscaling_metrics( + State(compute): State>, +) -> Result { + let pgdata = Path::new(&compute.params.pgdata); + + // Connect to the communicator process's metrics socket + let mut metrics_client = connect_communicator_socket(pgdata) + .await + .map_err(|e| JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, format!("{e:#}")))?; + + // Make a request for /autoscaling_metrics + let request = Request::builder() + .method("GET") + .uri("/autoscaling_metrics") + .header("Host", "localhost") // hyper requires Host, even though the server won't care + .body(Body::from("")) + .unwrap(); + let resp = metrics_client + .send_request(request) + .await + .context("fetching metrics from Postgres metrics service") + .map_err(|e| JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, format!("{e:#}")))?; + + // Build a response that just forwards the response we got. + let mut response = Response::builder(); + response = response.status(resp.status()); + if let Some(content_type) = resp.headers().get(CONTENT_TYPE) { + response = response.header(CONTENT_TYPE, content_type); + } + let body = tonic::service::AxumBody::from_stream(resp.into_body().into_data_stream()); + Ok(response.body(body).unwrap()) +} diff --git a/compute_tools/src/http/server.rs b/compute_tools/src/http/server.rs index 17939e39d4..f0fbca8263 100644 --- a/compute_tools/src/http/server.rs +++ b/compute_tools/src/http/server.rs @@ -81,8 +81,12 @@ impl From<&Server> for Router> { Server::External { config, compute_id, .. } => { - let unauthenticated_router = - Router::>::new().route("/metrics", get(metrics::get_metrics)); + let unauthenticated_router = Router::>::new() + .route("/metrics", get(metrics::get_metrics)) + .route( + "/autoscaling_metrics", + get(metrics::get_autoscaling_metrics), + ); let authenticated_router = Router::>::new() .route("/lfc/prewarm", get(lfc::prewarm_state).post(lfc::prewarm)) diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index 2d5d4565b7..5ffa2f004a 100644 --- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -4,6 +4,7 @@ #![deny(clippy::undocumented_unsafe_blocks)] pub mod checker; +pub mod communicator_socket_client; pub mod config; pub mod configurator; pub mod http; @@ -15,6 +16,7 @@ pub mod compute_prewarm; pub mod compute_promote; pub mod disk_quota; pub mod extension_server; +pub mod hadron_metrics; pub mod installed_extensions; pub mod local_proxy; pub mod lsn_lease; diff --git a/compute_tools/src/logger.rs b/compute_tools/src/logger.rs index c36f302f99..cd076472a6 100644 --- a/compute_tools/src/logger.rs +++ b/compute_tools/src/logger.rs @@ -13,7 +13,9 @@ use tracing_subscriber::prelude::*; /// set `OTEL_EXPORTER_OTLP_ENDPOINT=http://jaeger:4318`. See /// `tracing-utils` package description. 
/// -pub async fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> { +pub fn init_tracing_and_logging( + default_log_level: &str, +) -> anyhow::Result> { // Initialize Logging let env_filter = tracing_subscriber::EnvFilter::try_from_default_env() .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_log_level)); @@ -24,8 +26,9 @@ pub async fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result .with_writer(std::io::stderr); // Initialize OpenTelemetry - let otlp_layer = - tracing_utils::init_tracing("compute_ctl", tracing_utils::ExportConfig::default()).await; + let provider = + tracing_utils::init_tracing("compute_ctl", tracing_utils::ExportConfig::default()); + let otlp_layer = provider.as_ref().map(tracing_utils::layer); // Put it all together tracing_subscriber::registry() @@ -37,7 +40,7 @@ pub async fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result utils::logging::replace_panic_hook_with_tracing_panic_hook().forget(); - Ok(()) + Ok(provider) } /// Replace all newline characters with a special character to make it diff --git a/compute_tools/src/migration.rs b/compute_tools/src/migration.rs index c5e05822c0..88d870df97 100644 --- a/compute_tools/src/migration.rs +++ b/compute_tools/src/migration.rs @@ -9,15 +9,20 @@ use crate::metrics::DB_MIGRATION_FAILED; pub(crate) struct MigrationRunner<'m> { client: &'m mut Client, migrations: &'m [&'m str], + lakebase_mode: bool, } impl<'m> MigrationRunner<'m> { /// Create a new migration runner - pub fn new(client: &'m mut Client, migrations: &'m [&'m str]) -> Self { + pub fn new(client: &'m mut Client, migrations: &'m [&'m str], lakebase_mode: bool) -> Self { // The neon_migration.migration_id::id column is a bigint, which is equivalent to an i64 assert!(migrations.len() + 1 < i64::MAX as usize); - Self { client, migrations } + Self { + client, + migrations, + lakebase_mode, + } } /// Get the current value neon_migration.migration_id @@ -130,8 +135,13 @@ impl<'m> MigrationRunner<'m> { // ID is also the next index let migration_id = (current_migration + 1) as i64; let migration = self.migrations[current_migration]; + let migration = if self.lakebase_mode { + migration.replace("neon_superuser", "databricks_superuser") + } else { + migration.to_string() + }; - match Self::run_migration(self.client, migration_id, migration).await { + match Self::run_migration(self.client, migration_id, &migration).await { Ok(_) => { info!("Finished migration id={}", migration_id); } diff --git a/compute_tools/src/migrations/0001-add_bypass_rls_to_privileged_role.sql b/compute_tools/src/migrations/0001-add_bypass_rls_to_privileged_role.sql new file mode 100644 index 0000000000..6443645336 --- /dev/null +++ b/compute_tools/src/migrations/0001-add_bypass_rls_to_privileged_role.sql @@ -0,0 +1 @@ +ALTER ROLE {privileged_role_name} BYPASSRLS; diff --git a/compute_tools/src/migrations/0001-neon_superuser_bypass_rls.sql b/compute_tools/src/migrations/0001-neon_superuser_bypass_rls.sql deleted file mode 100644 index 73b36a37f6..0000000000 --- a/compute_tools/src/migrations/0001-neon_superuser_bypass_rls.sql +++ /dev/null @@ -1 +0,0 @@ -ALTER ROLE neon_superuser BYPASSRLS; diff --git a/compute_tools/src/migrations/0002-alter_roles.sql b/compute_tools/src/migrations/0002-alter_roles.sql index 8fc371eb8f..367356e6eb 100644 --- a/compute_tools/src/migrations/0002-alter_roles.sql +++ b/compute_tools/src/migrations/0002-alter_roles.sql @@ -15,7 +15,7 @@ DO $$ DECLARE role_name text; BEGIN - FOR role_name IN SELECT 
rolname FROM pg_roles WHERE pg_has_role(rolname, 'neon_superuser', 'member') + FOR role_name IN SELECT rolname FROM pg_roles WHERE pg_has_role(rolname, '{privileged_role_name}', 'member') LOOP RAISE NOTICE 'EXECUTING ALTER ROLE % INHERIT', quote_ident(role_name); EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' INHERIT'; @@ -23,7 +23,7 @@ BEGIN FOR role_name IN SELECT rolname FROM pg_roles WHERE - NOT pg_has_role(rolname, 'neon_superuser', 'member') AND NOT starts_with(rolname, 'pg_') + NOT pg_has_role(rolname, '{privileged_role_name}', 'member') AND NOT starts_with(rolname, 'pg_') LOOP RAISE NOTICE 'EXECUTING ALTER ROLE % NOBYPASSRLS', quote_ident(role_name); EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOBYPASSRLS'; diff --git a/compute_tools/src/migrations/0003-grant_pg_create_subscription_to_neon_superuser.sql b/compute_tools/src/migrations/0003-grant_pg_create_subscription_to_privileged_role.sql similarity index 63% rename from compute_tools/src/migrations/0003-grant_pg_create_subscription_to_neon_superuser.sql rename to compute_tools/src/migrations/0003-grant_pg_create_subscription_to_privileged_role.sql index 37f0ce211f..adf159dc06 100644 --- a/compute_tools/src/migrations/0003-grant_pg_create_subscription_to_neon_superuser.sql +++ b/compute_tools/src/migrations/0003-grant_pg_create_subscription_to_privileged_role.sql @@ -1,6 +1,6 @@ DO $$ BEGIN IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN - EXECUTE 'GRANT pg_create_subscription TO neon_superuser'; + EXECUTE 'GRANT pg_create_subscription TO {privileged_role_name}'; END IF; END $$; diff --git a/compute_tools/src/migrations/0004-grant_pg_monitor_to_neon_superuser.sql b/compute_tools/src/migrations/0004-grant_pg_monitor_to_neon_superuser.sql deleted file mode 100644 index 11afd3b635..0000000000 --- a/compute_tools/src/migrations/0004-grant_pg_monitor_to_neon_superuser.sql +++ /dev/null @@ -1 +0,0 @@ -GRANT pg_monitor TO neon_superuser WITH ADMIN OPTION; diff --git a/compute_tools/src/migrations/0004-grant_pg_monitor_to_privileged_role.sql b/compute_tools/src/migrations/0004-grant_pg_monitor_to_privileged_role.sql new file mode 100644 index 0000000000..6a7ed4007f --- /dev/null +++ b/compute_tools/src/migrations/0004-grant_pg_monitor_to_privileged_role.sql @@ -0,0 +1 @@ +GRANT pg_monitor TO {privileged_role_name} WITH ADMIN OPTION; diff --git a/compute_tools/src/migrations/0005-grant_all_on_tables_to_neon_superuser.sql b/compute_tools/src/migrations/0005-grant_all_on_tables_to_privileged_role.sql similarity index 58% rename from compute_tools/src/migrations/0005-grant_all_on_tables_to_neon_superuser.sql rename to compute_tools/src/migrations/0005-grant_all_on_tables_to_privileged_role.sql index 8abe052494..c31f99f3cb 100644 --- a/compute_tools/src/migrations/0005-grant_all_on_tables_to_neon_superuser.sql +++ b/compute_tools/src/migrations/0005-grant_all_on_tables_to_privileged_role.sql @@ -1,4 +1,4 @@ -- SKIP: Deemed insufficient for allowing relations created by extensions to be --- interacted with by neon_superuser without permission issues. +-- interacted with by {privileged_role_name} without permission issues. 
-ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser; +ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO {privileged_role_name}; diff --git a/compute_tools/src/migrations/0006-grant_all_on_sequences_to_neon_superuser.sql b/compute_tools/src/migrations/0006-grant_all_on_sequences_to_privileged_role.sql similarity index 57% rename from compute_tools/src/migrations/0006-grant_all_on_sequences_to_neon_superuser.sql rename to compute_tools/src/migrations/0006-grant_all_on_sequences_to_privileged_role.sql index 5bcb026e0c..fadac9ac3b 100644 --- a/compute_tools/src/migrations/0006-grant_all_on_sequences_to_neon_superuser.sql +++ b/compute_tools/src/migrations/0006-grant_all_on_sequences_to_privileged_role.sql @@ -1,4 +1,4 @@ -- SKIP: Deemed insufficient for allowing relations created by extensions to be --- interacted with by neon_superuser without permission issues. +-- interacted with by {privileged_role_name} without permission issues. -ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser; +ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO {privileged_role_name}; diff --git a/compute_tools/src/migrations/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql b/compute_tools/src/migrations/0007-grant_all_on_tables_with_grant_option_to_privileged_role.sql similarity index 73% rename from compute_tools/src/migrations/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql rename to compute_tools/src/migrations/0007-grant_all_on_tables_with_grant_option_to_privileged_role.sql index ce7c96753e..5caa9b7829 100644 --- a/compute_tools/src/migrations/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql +++ b/compute_tools/src/migrations/0007-grant_all_on_tables_with_grant_option_to_privileged_role.sql @@ -1,3 +1,3 @@ -- SKIP: Moved inline to the handle_grants() functions. -ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION; +ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO {privileged_role_name} WITH GRANT OPTION; diff --git a/compute_tools/src/migrations/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql b/compute_tools/src/migrations/0008-grant_all_on_sequences_with_grant_option_to_privileged_role.sql similarity index 72% rename from compute_tools/src/migrations/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql rename to compute_tools/src/migrations/0008-grant_all_on_sequences_with_grant_option_to_privileged_role.sql index 72baf920cd..03de0c37ac 100644 --- a/compute_tools/src/migrations/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql +++ b/compute_tools/src/migrations/0008-grant_all_on_sequences_with_grant_option_to_privileged_role.sql @@ -1,3 +1,3 @@ -- SKIP: Moved inline to the handle_grants() functions. 
-ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser WITH GRANT OPTION; +ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO {privileged_role_name} WITH GRANT OPTION; diff --git a/compute_tools/src/migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql b/compute_tools/src/migrations/0010-grant_snapshot_synchronization_funcs_to_privileged_role.sql similarity index 82% rename from compute_tools/src/migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql rename to compute_tools/src/migrations/0010-grant_snapshot_synchronization_funcs_to_privileged_role.sql index 28750e00dd..84fcb36391 100644 --- a/compute_tools/src/migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql +++ b/compute_tools/src/migrations/0010-grant_snapshot_synchronization_funcs_to_privileged_role.sql @@ -1,7 +1,7 @@ DO $$ BEGIN IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN - EXECUTE 'GRANT EXECUTE ON FUNCTION pg_export_snapshot TO neon_superuser'; - EXECUTE 'GRANT EXECUTE ON FUNCTION pg_log_standby_snapshot TO neon_superuser'; + EXECUTE 'GRANT EXECUTE ON FUNCTION pg_export_snapshot TO {privileged_role_name}'; + EXECUTE 'GRANT EXECUTE ON FUNCTION pg_log_standby_snapshot TO {privileged_role_name}'; END IF; END $$; diff --git a/compute_tools/src/migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql b/compute_tools/src/migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql deleted file mode 100644 index 425ed8cd3d..0000000000 --- a/compute_tools/src/migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql +++ /dev/null @@ -1 +0,0 @@ -GRANT EXECUTE ON FUNCTION pg_show_replication_origin_status TO neon_superuser; diff --git a/compute_tools/src/migrations/0011-grant_pg_show_replication_origin_status_to_privileged_role.sql b/compute_tools/src/migrations/0011-grant_pg_show_replication_origin_status_to_privileged_role.sql new file mode 100644 index 0000000000..125a9f463f --- /dev/null +++ b/compute_tools/src/migrations/0011-grant_pg_show_replication_origin_status_to_privileged_role.sql @@ -0,0 +1 @@ +GRANT EXECUTE ON FUNCTION pg_show_replication_origin_status TO {privileged_role_name}; diff --git a/compute_tools/src/migrations/0012-grant_pg_signal_backend_to_neon_superuser.sql b/compute_tools/src/migrations/0012-grant_pg_signal_backend_to_neon_superuser.sql deleted file mode 100644 index 36e31544be..0000000000 --- a/compute_tools/src/migrations/0012-grant_pg_signal_backend_to_neon_superuser.sql +++ /dev/null @@ -1 +0,0 @@ -GRANT pg_signal_backend TO neon_superuser WITH ADMIN OPTION; diff --git a/compute_tools/src/migrations/0012-grant_pg_signal_backend_to_privileged_role.sql b/compute_tools/src/migrations/0012-grant_pg_signal_backend_to_privileged_role.sql new file mode 100644 index 0000000000..1b54ec8a3b --- /dev/null +++ b/compute_tools/src/migrations/0012-grant_pg_signal_backend_to_privileged_role.sql @@ -0,0 +1 @@ +GRANT pg_signal_backend TO {privileged_role_name} WITH ADMIN OPTION; diff --git a/compute_tools/src/migrations/tests/0001-neon_superuser_bypass_rls.sql b/compute_tools/src/migrations/tests/0001-add_bypass_rls_to_privileged_role.sql similarity index 100% rename from compute_tools/src/migrations/tests/0001-neon_superuser_bypass_rls.sql rename to compute_tools/src/migrations/tests/0001-add_bypass_rls_to_privileged_role.sql diff --git 
a/compute_tools/src/migrations/tests/0003-grant_pg_create_subscription_to_neon_superuser.sql b/compute_tools/src/migrations/tests/0003-grant_pg_create_subscription_to_privileged_role.sql similarity index 100% rename from compute_tools/src/migrations/tests/0003-grant_pg_create_subscription_to_neon_superuser.sql rename to compute_tools/src/migrations/tests/0003-grant_pg_create_subscription_to_privileged_role.sql diff --git a/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_neon_superuser.sql b/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_privileged_role.sql similarity index 100% rename from compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_neon_superuser.sql rename to compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_privileged_role.sql diff --git a/compute_tools/src/migrations/tests/0005-grant_all_on_tables_to_neon_superuser.sql b/compute_tools/src/migrations/tests/0005-grant_all_on_tables_to_privileged_role.sql similarity index 100% rename from compute_tools/src/migrations/tests/0005-grant_all_on_tables_to_neon_superuser.sql rename to compute_tools/src/migrations/tests/0005-grant_all_on_tables_to_privileged_role.sql diff --git a/compute_tools/src/migrations/tests/0006-grant_all_on_sequences_to_neon_superuser.sql b/compute_tools/src/migrations/tests/0006-grant_all_on_sequences_to_privileged_role.sql similarity index 100% rename from compute_tools/src/migrations/tests/0006-grant_all_on_sequences_to_neon_superuser.sql rename to compute_tools/src/migrations/tests/0006-grant_all_on_sequences_to_privileged_role.sql diff --git a/compute_tools/src/migrations/tests/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql b/compute_tools/src/migrations/tests/0007-grant_all_on_tables_with_grant_option_to_privileged_role.sql similarity index 100% rename from compute_tools/src/migrations/tests/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql rename to compute_tools/src/migrations/tests/0007-grant_all_on_tables_with_grant_option_to_privileged_role.sql diff --git a/compute_tools/src/migrations/tests/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql b/compute_tools/src/migrations/tests/0008-grant_all_on_sequences_with_grant_option_to_privileged_role.sql similarity index 100% rename from compute_tools/src/migrations/tests/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql rename to compute_tools/src/migrations/tests/0008-grant_all_on_sequences_with_grant_option_to_privileged_role.sql diff --git a/compute_tools/src/migrations/tests/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql b/compute_tools/src/migrations/tests/0010-grant_snapshot_synchronization_funcs_to_privileged_role.sql similarity index 100% rename from compute_tools/src/migrations/tests/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql rename to compute_tools/src/migrations/tests/0010-grant_snapshot_synchronization_funcs_to_privileged_role.sql diff --git a/compute_tools/src/migrations/tests/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql b/compute_tools/src/migrations/tests/0011-grant_pg_show_replication_origin_status_to_privileged_role.sql similarity index 100% rename from compute_tools/src/migrations/tests/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql rename to compute_tools/src/migrations/tests/0011-grant_pg_show_replication_origin_status_to_privileged_role.sql diff --git a/compute_tools/src/migrations/tests/0012-grant_pg_signal_backend_to_neon_superuser.sql 
b/compute_tools/src/migrations/tests/0012-grant_pg_signal_backend_to_privileged_role.sql
similarity index 100%
rename from compute_tools/src/migrations/tests/0012-grant_pg_signal_backend_to_neon_superuser.sql
rename to compute_tools/src/migrations/tests/0012-grant_pg_signal_backend_to_privileged_role.sql
diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs
index fa01545856..e164f15dba 100644
--- a/compute_tools/src/monitor.rs
+++ b/compute_tools/src/monitor.rs
@@ -11,6 +11,7 @@ use tracing::{Level, error, info, instrument, span};
 use crate::compute::ComputeNode;
 use crate::metrics::{PG_CURR_DOWNTIME_MS, PG_TOTAL_DOWNTIME_MS};
+const PG_DEFAULT_INIT_TIMEOUT: Duration = Duration::from_secs(60);
 const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500);
 /// Struct to store runtime state of the compute monitor thread.
@@ -352,13 +353,47 @@ impl ComputeMonitor {
 // Hang on condition variable waiting until the compute status is `Running`.
 fn wait_for_postgres_start(compute: &ComputeNode) {
     let mut state = compute.state.lock().unwrap();
+    let pg_init_timeout = compute
+        .params
+        .pg_init_timeout
+        .unwrap_or(PG_DEFAULT_INIT_TIMEOUT);
+
     while state.status != ComputeStatus::Running {
         info!("compute is not running, waiting before monitoring activity");
-        state = compute.state_changed.wait(state).unwrap();
+        if !compute.params.lakebase_mode {
+            state = compute.state_changed.wait(state).unwrap();
-        if state.status == ComputeStatus::Running {
-            break;
+            if state.status == ComputeStatus::Running {
+                break;
+            }
+            continue;
         }
+
+        if state.pg_start_time.is_some()
+            && Utc::now()
+                .signed_duration_since(state.pg_start_time.unwrap())
+                .to_std()
+                .unwrap_or_default()
+                > pg_init_timeout
+        {
+            // If Postgres isn't up and running with working PS/SK connections within pg_init_timeout, it is
+            // possible that we started Postgres with a wrong spec (so it is talking to the wrong PS/SK nodes). To prevent
+            // dead ends, we simply exit the compute node so it can restart with the latest spec.
+            //
+            // NB: We skip this check if we have not attempted to start PG yet (indicated by state.pg_start_time == None).
+            // This is to make sure more appropriate errors are surfaced if we encounter issues before we even attempt
+            // to start PG (e.g., if we can't pull the spec, can't sync safekeepers, or can't get the basebackup).
+ error!( + "compute did not enter Running state in {} seconds, exiting", + pg_init_timeout.as_secs() + ); + std::process::exit(1); + } + state = compute + .state_changed + .wait_timeout(state, Duration::from_secs(5)) + .unwrap() + .0; } } diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 0a3ceed2fa..09bbe89b41 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -11,7 +11,9 @@ use std::time::{Duration, Instant}; use anyhow::{Result, bail}; use compute_api::responses::TlsConfig; -use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role}; +use compute_api::spec::{ + Database, DatabricksSettings, GenericOption, GenericOptions, PgIdent, Role, +}; use futures::StreamExt; use indexmap::IndexMap; use ini::Ini; @@ -184,6 +186,42 @@ impl DatabaseExt for Database { } } +pub trait DatabricksSettingsExt { + fn as_pg_settings(&self) -> String; +} + +impl DatabricksSettingsExt for DatabricksSettings { + fn as_pg_settings(&self) -> String { + // Postgres GUCs rendered from DatabricksSettings + vec![ + // ssl_ca_file + Some(format!( + "ssl_ca_file = '{}'", + self.pg_compute_tls_settings.ca_file + )), + // [Optional] databricks.workspace_url + Some(format!( + "databricks.workspace_url = '{}'", + &self.databricks_workspace_host + )), + // todo(vikas.jain): these are not required anymore as they are moved to static + // conf but keeping these to avoid image mismatch between hcc and pg. + // Once hcc and pg are in sync, we can remove these. + // + // databricks.enable_databricks_identity_login + Some("databricks.enable_databricks_identity_login = true".to_string()), + // databricks.enable_sql_restrictions + Some("databricks.enable_sql_restrictions = true".to_string()), + ] + .into_iter() + // Removes `None`s + .flatten() + .collect::>() + .join("\n") + + "\n" + } +} + /// Generic trait used to provide quoting / encoding for strings used in the /// Postgres SQL queries and DATABASE_URL. pub trait Escaping { diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index b6382b2f56..d00f86a2c0 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -1,4 +1,6 @@ use std::fs::File; +use std::fs::{self, Permissions}; +use std::os::unix::fs::PermissionsExt; use std::path::Path; use anyhow::{Result, anyhow, bail}; @@ -9,6 +11,7 @@ use reqwest::StatusCode; use tokio_postgres::Client; use tracing::{error, info, instrument}; +use crate::compute::ComputeNodeParams; use crate::config; use crate::metrics::{CPLANE_REQUESTS_TOTAL, CPlaneRequestRPC, UNKNOWN_HTTP_STATUS}; use crate::migration::MigrationRunner; @@ -132,10 +135,25 @@ pub fn get_config_from_control_plane(base_uri: &str, compute_id: &str) -> Result } /// Check `pg_hba.conf` and update if needed to allow external connections. -pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> { +pub fn update_pg_hba(pgdata_path: &Path, databricks_pg_hba: Option<&String>) -> Result<()> { // XXX: consider making it a part of config.json let pghba_path = pgdata_path.join("pg_hba.conf"); + // Update pg_hba to contains databricks specfic settings before adding neon settings + // PG uses the first record that matches to perform authentication, so we need to have + // our rules before the default ones from neon. + // See https://www.postgresql.org/docs/16/auth-pg-hba-conf.html + if let Some(databricks_pg_hba) = databricks_pg_hba { + if config::line_in_file( + &pghba_path, + &format!("include_if_exists {}\n", *databricks_pg_hba), + )? 
{
+            info!("updated pg_hba.conf to include databricks_pg_hba.conf");
+        } else {
+            info!("pg_hba.conf already included databricks_pg_hba.conf");
+        }
+    }
+
     if config::line_in_file(&pghba_path, PG_HBA_ALL_MD5)? {
         info!("updated pg_hba.conf to allow external connections");
     } else {
@@ -145,6 +163,59 @@
     Ok(())
 }
+/// Check `pg_ident.conf` and update it if needed to include the Databricks config.
+pub fn update_pg_ident(pgdata_path: &Path, databricks_pg_ident: Option<&String>) -> Result<()> {
+    info!("checking pg_ident.conf");
+    let pg_ident_path = pgdata_path.join("pg_ident.conf");
+
+    // Update pg_ident to contain Databricks-specific settings
+    if let Some(databricks_pg_ident) = databricks_pg_ident {
+        if config::line_in_file(
+            &pg_ident_path,
+            &format!("include_if_exists {}\n", *databricks_pg_ident),
+        )? {
+            info!("updated pg_ident.conf to include databricks_pg_ident.conf");
+        } else {
+            info!("pg_ident.conf already included databricks_pg_ident.conf");
+        }
+    }
+
+    Ok(())
+}
+
+/// Copy the TLS key_file and cert_file from the k8s secret mount directory
+/// to pgdata and set the private key file permissions expected by Postgres.
+/// K8s secret mounts on the dblet do not honor the permissions and ownership
+/// specified in the Volume or VolumeMount, so we need to explicitly copy the files and set the permissions.
+pub fn copy_tls_certificates(
+    key_file: &String,
+    cert_file: &String,
+    pgdata_path: &Path,
+) -> Result<()> {
+    let files = [cert_file, key_file];
+    for file in files.iter() {
+        let source = Path::new(file);
+        let dest = pgdata_path.join(source.file_name().unwrap());
+        if !dest.exists() {
+            std::fs::copy(source, &dest)?;
+            info!(
+                "Copying TLS file: {} to {}",
+                &source.display(),
+                &dest.display()
+            );
+        }
+        if *file == key_file {
+            // Postgres requires the private key to be readable only by the owner,
+            // i.e. to have chmod 600 permissions.
+            let permissions = Permissions::from_mode(0o600);
+            fs::set_permissions(&dest, permissions)?;
+            info!("Setting permission on {}.", &dest.display());
+        }
+    }
+    Ok(())
+}
+
 /// Create a standby.signal file
 pub fn add_standby_signal(pgdata_path: &Path) -> Result<()> {
     // XXX: consider making it a part of config.json
@@ -169,7 +240,11 @@ pub async fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> {
 }
 #[instrument(skip_all)]
-pub async fn handle_migrations(client: &mut Client) -> Result<()> {
+pub async fn handle_migrations(
+    params: ComputeNodeParams,
+    client: &mut Client,
+    lakebase_mode: bool,
+) -> Result<()> {
     info!("handle migrations");
     // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
@@ -178,29 +253,62 @@ pub async fn handle_migrations(client: &mut Client) -> Result<()> {
     // Add new migrations in numerical order.
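The migration list that follows substitutes the privileged role name into the SQL templates at runtime: `include_str!` expands to a string literal at compile time, so it can be used directly as the `format!` format string, and the `{privileged_role_name}` placeholders in the .sql files are filled in then. Note that this also means any literal `{` or `}` in a migration file would have to be escaped as `{{` / `}}`. A minimal standalone illustration of the same pattern (not part of the patch):

```rust
// The referenced .sql file contains a {privileged_role_name} placeholder, e.g.:
//   GRANT pg_monitor TO {privileged_role_name} WITH ADMIN OPTION;
fn render_migration(privileged_role_name: &str) -> String {
    format!(
        include_str!("./migrations/0004-grant_pg_monitor_to_privileged_role.sql"),
        privileged_role_name = privileged_role_name
    )
}
```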
let migrations = [ - include_str!("./migrations/0001-neon_superuser_bypass_rls.sql"), - include_str!("./migrations/0002-alter_roles.sql"), - include_str!("./migrations/0003-grant_pg_create_subscription_to_neon_superuser.sql"), - include_str!("./migrations/0004-grant_pg_monitor_to_neon_superuser.sql"), - include_str!("./migrations/0005-grant_all_on_tables_to_neon_superuser.sql"), - include_str!("./migrations/0006-grant_all_on_sequences_to_neon_superuser.sql"), - include_str!( - "./migrations/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql" + &format!( + include_str!("./migrations/0001-add_bypass_rls_to_privileged_role.sql"), + privileged_role_name = params.privileged_role_name ), - include_str!( - "./migrations/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql" + &format!( + include_str!("./migrations/0002-alter_roles.sql"), + privileged_role_name = params.privileged_role_name + ), + &format!( + include_str!("./migrations/0003-grant_pg_create_subscription_to_privileged_role.sql"), + privileged_role_name = params.privileged_role_name + ), + &format!( + include_str!("./migrations/0004-grant_pg_monitor_to_privileged_role.sql"), + privileged_role_name = params.privileged_role_name + ), + &format!( + include_str!("./migrations/0005-grant_all_on_tables_to_privileged_role.sql"), + privileged_role_name = params.privileged_role_name + ), + &format!( + include_str!("./migrations/0006-grant_all_on_sequences_to_privileged_role.sql"), + privileged_role_name = params.privileged_role_name + ), + &format!( + include_str!( + "./migrations/0007-grant_all_on_tables_with_grant_option_to_privileged_role.sql" + ), + privileged_role_name = params.privileged_role_name + ), + &format!( + include_str!( + "./migrations/0008-grant_all_on_sequences_with_grant_option_to_privileged_role.sql" + ), + privileged_role_name = params.privileged_role_name ), include_str!("./migrations/0009-revoke_replication_for_previously_allowed_roles.sql"), - include_str!( - "./migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql" + &format!( + include_str!( + "./migrations/0010-grant_snapshot_synchronization_funcs_to_privileged_role.sql" + ), + privileged_role_name = params.privileged_role_name ), - include_str!( - "./migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql" + &format!( + include_str!( + "./migrations/0011-grant_pg_show_replication_origin_status_to_privileged_role.sql" + ), + privileged_role_name = params.privileged_role_name + ), + &format!( + include_str!("./migrations/0012-grant_pg_signal_backend_to_privileged_role.sql"), + privileged_role_name = params.privileged_role_name ), - include_str!("./migrations/0012-grant_pg_signal_backend_to_neon_superuser.sql"), ]; - MigrationRunner::new(client, &migrations) + MigrationRunner::new(client, &migrations, lakebase_mode) .run_migrations() .await?; diff --git a/compute_tools/src/spec_apply.rs b/compute_tools/src/spec_apply.rs index fcd072263a..47bf61ae1b 100644 --- a/compute_tools/src/spec_apply.rs +++ b/compute_tools/src/spec_apply.rs @@ -13,14 +13,14 @@ use tokio_postgres::Client; use tokio_postgres::error::SqlState; use tracing::{Instrument, debug, error, info, info_span, instrument, warn}; -use crate::compute::{ComputeNode, ComputeState}; +use crate::compute::{ComputeNode, ComputeNodeParams, ComputeState}; use crate::pg_helpers::{ DatabaseExt, Escaping, GenericOptionsSearch, RoleExt, get_existing_dbs_async, get_existing_roles_async, }; use crate::spec_apply::ApplySpecPhase::{ - 
CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreateNeonSuperuser, - CreatePgauditExtension, CreatePgauditlogtofileExtension, CreateSchemaNeon, + CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreatePgauditExtension, + CreatePgauditlogtofileExtension, CreatePrivilegedRole, CreateSchemaNeon, DisablePostgresDBPgAudit, DropInvalidDatabases, DropRoles, FinalizeDropLogicalSubscriptions, HandleNeonExtension, HandleOtherExtensions, RenameAndDeleteDatabases, RenameRoles, RunInEachDatabase, @@ -49,6 +49,7 @@ impl ComputeNode { // Proceed with post-startup configuration. Note, that order of operations is important. let client = Self::get_maintenance_client(&conf).await?; let spec = spec.clone(); + let params = Arc::new(self.params.clone()); let databases = get_existing_dbs_async(&client).await?; let roles = get_existing_roles_async(&client) @@ -157,6 +158,7 @@ impl ComputeNode { let conf = Arc::new(conf); let fut = Self::apply_spec_sql_db( + params.clone(), spec.clone(), conf, ctx.clone(), @@ -185,7 +187,7 @@ impl ComputeNode { } for phase in [ - CreateNeonSuperuser, + CreatePrivilegedRole, DropInvalidDatabases, RenameRoles, CreateAndAlterRoles, @@ -195,6 +197,7 @@ impl ComputeNode { ] { info!("Applying phase {:?}", &phase); apply_operations( + params.clone(), spec.clone(), ctx.clone(), jwks_roles.clone(), @@ -243,6 +246,7 @@ impl ComputeNode { } let fut = Self::apply_spec_sql_db( + params.clone(), spec.clone(), conf, ctx.clone(), @@ -293,6 +297,7 @@ impl ComputeNode { for phase in phases { debug!("Applying phase {:?}", &phase); apply_operations( + params.clone(), spec.clone(), ctx.clone(), jwks_roles.clone(), @@ -313,7 +318,9 @@ impl ComputeNode { /// May opt to not connect to databases that don't have any scheduled /// operations. The function is concurrency-controlled with the provided /// semaphore. The caller has to make sure the semaphore isn't exhausted. + #[allow(clippy::too_many_arguments)] // TODO: needs bigger refactoring async fn apply_spec_sql_db( + params: Arc, spec: Arc, conf: Arc, ctx: Arc>, @@ -328,6 +335,7 @@ impl ComputeNode { for subphase in subphases { apply_operations( + params.clone(), spec.clone(), ctx.clone(), jwks_roles.clone(), @@ -403,7 +411,8 @@ impl ComputeNode { .map(|limit| match limit { 0..10 => limit, 10..30 => 10, - 30.. => limit / 3, + 30..300 => limit / 3, + 300.. => 100, }) // If we didn't find max_connections, default to 10 concurrent connections. .unwrap_or(10) @@ -467,7 +476,7 @@ pub enum PerDatabasePhase { #[derive(Clone, Debug)] pub enum ApplySpecPhase { - CreateNeonSuperuser, + CreatePrivilegedRole, DropInvalidDatabases, RenameRoles, CreateAndAlterRoles, @@ -510,6 +519,7 @@ pub struct MutableApplyContext { /// - No timeouts have (yet) been implemented. /// - The caller is responsible for limiting and/or applying concurrency. pub async fn apply_operations<'a, Fut, F>( + params: Arc, spec: Arc, ctx: Arc>, jwks_roles: Arc>, @@ -527,7 +537,7 @@ where debug!("Processing phase {:?}", &apply_spec_phase); let ctx = ctx; - let mut ops = get_operations(&spec, &ctx, &jwks_roles, &apply_spec_phase) + let mut ops = get_operations(¶ms, &spec, &ctx, &jwks_roles, &apply_spec_phase) .await? .peekable(); @@ -588,14 +598,18 @@ where /// sort/merge/batch execution, but for now this is a nice way to improve /// batching behavior of the commands. 
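The revised `max_connections` mapping above bounds the number of concurrent connections used while applying the spec: values below 10 are used as-is, 10-29 map to 10, 30-299 are divided by three, and anything from 300 up is now capped at 100 (previously it kept dividing by three). A self-contained restatement with a few spot checks, purely for illustration:

```rust
// Mirrors the match arms added above (exclusive range patterns, Rust 1.80+).
fn concurrency_for(max_connections: usize) -> usize {
    match max_connections {
        0..10 => max_connections,
        10..30 => 10,
        30..300 => max_connections / 3,
        300.. => 100,
    }
}

#[test]
fn concurrency_examples() {
    assert_eq!(concurrency_for(8), 8);
    assert_eq!(concurrency_for(25), 10);
    assert_eq!(concurrency_for(90), 30);
    assert_eq!(concurrency_for(600), 100); // previously 600 / 3 = 200
}
```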
async fn get_operations<'a>( + params: &'a ComputeNodeParams, spec: &'a ComputeSpec, ctx: &'a RwLock, jwks_roles: &'a HashSet, apply_spec_phase: &'a ApplySpecPhase, ) -> Result + 'a + Send>> { match apply_spec_phase { - ApplySpecPhase::CreateNeonSuperuser => Ok(Box::new(once(Operation { - query: include_str!("sql/create_neon_superuser.sql").to_string(), + ApplySpecPhase::CreatePrivilegedRole => Ok(Box::new(once(Operation { + query: format!( + include_str!("sql/create_privileged_role.sql"), + privileged_role_name = params.privileged_role_name + ), comment: None, }))), ApplySpecPhase::DropInvalidDatabases => { @@ -697,8 +711,9 @@ async fn get_operations<'a>( None => { let query = if !jwks_roles.contains(role.name.as_str()) { format!( - "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser {}", + "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE {} {}", role.name.pg_quote(), + params.privileged_role_name, role.to_pg_options(), ) } else { @@ -849,8 +864,9 @@ async fn get_operations<'a>( // ALL PRIVILEGES grants CREATE, CONNECT, and TEMPORARY on the database // (see https://www.postgresql.org/docs/current/ddl-priv.html) query: format!( - "GRANT ALL PRIVILEGES ON DATABASE {} TO neon_superuser", - db.name.pg_quote() + "GRANT ALL PRIVILEGES ON DATABASE {} TO {}", + db.name.pg_quote(), + params.privileged_role_name ), comment: None, }, diff --git a/compute_tools/src/sql/create_neon_superuser.sql b/compute_tools/src/sql/create_neon_superuser.sql deleted file mode 100644 index 300645627b..0000000000 --- a/compute_tools/src/sql/create_neon_superuser.sql +++ /dev/null @@ -1,8 +0,0 @@ -DO $$ - BEGIN - IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser') - THEN - CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN REPLICATION BYPASSRLS IN ROLE pg_read_all_data, pg_write_all_data; - END IF; - END -$$; diff --git a/compute_tools/src/sql/create_privileged_role.sql b/compute_tools/src/sql/create_privileged_role.sql new file mode 100644 index 0000000000..df27ac32fc --- /dev/null +++ b/compute_tools/src/sql/create_privileged_role.sql @@ -0,0 +1,8 @@ +DO $$ + BEGIN + IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = '{privileged_role_name}') + THEN + CREATE ROLE {privileged_role_name} CREATEDB CREATEROLE NOLOGIN REPLICATION BYPASSRLS IN ROLE pg_read_all_data, pg_write_all_data; + END IF; + END +$$; diff --git a/control_plane/README.md b/control_plane/README.md index aa6f935e27..60c6120d82 100644 --- a/control_plane/README.md +++ b/control_plane/README.md @@ -8,10 +8,10 @@ code changes locally, but not suitable for running production systems. ## Example: Start with Postgres 16 -To create and start a local development environment with Postgres 16, you will need to provide `--pg-version` flag to 3 of the start-up commands. +To create and start a local development environment with Postgres 16, you will need to provide `--pg-version` flag to 2 of the start-up commands. 
```shell -cargo neon init --pg-version 16 +cargo neon init cargo neon start cargo neon tenant create --set-default --pg-version 16 cargo neon endpoint create main --pg-version 16 diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 6021933d6a..e036e9d44b 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -407,6 +407,12 @@ struct StorageControllerStartCmdArgs { help = "Base port for the storage controller instance idenfified by instance-id (defaults to pageserver cplane api)" )] base_port: Option, + + #[clap( + long, + help = "Whether the storage controller should handle pageserver-reported local disk loss events." + )] + handle_ps_local_disk_loss: Option, } #[derive(clap::Args)] @@ -631,6 +637,10 @@ struct EndpointCreateCmdArgs { help = "Allow multiple primary endpoints running on the same branch. Shouldn't be used normally, but useful for tests." )] allow_multiple: bool, + + /// Only allow changing it on creation + #[clap(long, help = "Name of the privileged role for the endpoint")] + privileged_role_name: Option, } #[derive(clap::Args)] @@ -1480,6 +1490,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res args.grpc, !args.update_catalog, false, + args.privileged_role_name.clone(), )?; } EndpointCmd::Start(args) => { @@ -1804,6 +1815,7 @@ async fn handle_storage_controller( instance_id: args.instance_id, base_port: args.base_port, start_timeout: args.start_timeout, + handle_ps_local_disk_loss: args.handle_ps_local_disk_loss, }; if let Err(e) = svc.start(start_args).await { diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 91a62b0ca4..4c569d7005 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -65,7 +65,6 @@ use jsonwebtoken::jwk::{ OctetKeyPairParameters, OctetKeyPairType, PublicKeyUse, }; use nix::sys::signal::{Signal, kill}; -use pageserver_api::shard::ShardStripeSize; use pem::Pem; use reqwest::header::CONTENT_TYPE; use safekeeper_api::PgMajorVersion; @@ -77,6 +76,7 @@ use spki::{SubjectPublicKeyInfo, SubjectPublicKeyInfoRef}; use tracing::debug; use url::Host; use utils::id::{NodeId, TenantId, TimelineId}; +use utils::shard::ShardStripeSize; use crate::local_env::LocalEnv; use crate::postgresql_conf::PostgresConf; @@ -99,6 +99,7 @@ pub struct EndpointConf { features: Vec, cluster: Option, compute_ctl_config: ComputeCtlConfig, + privileged_role_name: Option, } // @@ -199,6 +200,7 @@ impl ComputeControlPlane { grpc: bool, skip_pg_catalog_updates: bool, drop_subscriptions_before_start: bool, + privileged_role_name: Option, ) -> Result> { let pg_port = pg_port.unwrap_or_else(|| self.get_port()); let external_http_port = external_http_port.unwrap_or_else(|| self.get_port() + 1); @@ -236,6 +238,7 @@ impl ComputeControlPlane { features: vec![], cluster: None, compute_ctl_config: compute_ctl_config.clone(), + privileged_role_name: privileged_role_name.clone(), }); ep.create_endpoint_dir()?; @@ -257,6 +260,7 @@ impl ComputeControlPlane { features: vec![], cluster: None, compute_ctl_config, + privileged_role_name, })?, )?; std::fs::write( @@ -332,6 +336,9 @@ pub struct Endpoint { /// The compute_ctl config for the endpoint's compute. compute_ctl_config: ComputeCtlConfig, + + /// The name of the privileged role for the endpoint. 
+ privileged_role_name: Option, } #[derive(PartialEq, Eq)] @@ -432,6 +439,7 @@ impl Endpoint { features: conf.features, cluster: conf.cluster, compute_ctl_config: conf.compute_ctl_config, + privileged_role_name: conf.privileged_role_name, }) } @@ -464,7 +472,7 @@ impl Endpoint { conf.append("max_connections", "100"); conf.append("wal_level", "logical"); // wal_sender_timeout is the maximum time to wait for WAL replication. - // It also defines how often the walreciever will send a feedback message to the wal sender. + // It also defines how often the walreceiver will send a feedback message to the wal sender. conf.append("wal_sender_timeout", "5s"); conf.append("listen_addresses", &self.pg_address.ip().to_string()); conf.append("port", &self.pg_address.port().to_string()); @@ -870,6 +878,10 @@ impl Endpoint { cmd.arg("--dev"); } + if let Some(privileged_role_name) = self.privileged_role_name.clone() { + cmd.args(["--privileged-role-name", &privileged_role_name]); + } + let child = cmd.spawn()?; // set up a scopeguard to kill & wait for the child in case we panic or bail below let child = scopeguard::guard(child, |mut child| { diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index f996f39967..35a197112e 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -56,6 +56,7 @@ pub struct NeonStorageControllerStartArgs { pub instance_id: u8, pub base_port: Option, pub start_timeout: humantime::Duration, + pub handle_ps_local_disk_loss: Option, } impl NeonStorageControllerStartArgs { @@ -64,6 +65,7 @@ impl NeonStorageControllerStartArgs { instance_id: 1, base_port: None, start_timeout, + handle_ps_local_disk_loss: None, } } } @@ -669,6 +671,10 @@ impl StorageController { println!("Starting storage controller at {scheme}://{host}:{listen_port}"); + if start_args.handle_ps_local_disk_loss.unwrap_or_default() { + args.push("--handle-ps-local-disk-loss".to_string()); + } + background_process::start_process( COMMAND, &instance_dir, diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index fcc5549beb..a4d1030488 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -76,6 +76,12 @@ enum Command { NodeStartDelete { #[arg(long)] node_id: NodeId, + /// When `force` is true, skip waiting for shards to prewarm during migration. + /// This can significantly speed up node deletion since prewarming all shards + /// can take considerable time, but may result in slower initial access to + /// migrated shards until they warm up naturally. + #[arg(long)] + force: bool, }, /// Cancel deletion of the specified pageserver and wait for `timeout` /// for the operation to be canceled. May be retried. 
@@ -952,13 +958,14 @@ async fn main() -> anyhow::Result<()> { .dispatch::<(), ()>(Method::DELETE, format!("control/v1/node/{node_id}"), None) .await?; } - Command::NodeStartDelete { node_id } => { + Command::NodeStartDelete { node_id, force } => { + let query = if force { + format!("control/v1/node/{node_id}/delete?force=true") + } else { + format!("control/v1/node/{node_id}/delete") + }; storcon_client - .dispatch::<(), ()>( - Method::PUT, - format!("control/v1/node/{node_id}/delete"), - None, - ) + .dispatch::<(), ()>(Method::PUT, query, None) .await?; println!("Delete started for {node_id}"); } diff --git a/deny.toml b/deny.toml index be1c6a2f2c..7afd05a837 100644 --- a/deny.toml +++ b/deny.toml @@ -35,6 +35,7 @@ reason = "The paste crate is a build-only dependency with no runtime components. # More documentation for the licenses section can be found here: # https://embarkstudios.github.io/cargo-deny/checks/licenses/cfg.html [licenses] +version = 2 allow = [ "0BSD", "Apache-2.0", diff --git a/docs/pageserver-services.md b/docs/pageserver-services.md index 11d984eb08..3c430c6236 100644 --- a/docs/pageserver-services.md +++ b/docs/pageserver-services.md @@ -75,7 +75,7 @@ CLI examples: * AWS S3 : `env AWS_ACCESS_KEY_ID='SOMEKEYAAAAASADSAH*#' AWS_SECRET_ACCESS_KEY='SOMEsEcReTsd292v' ${PAGESERVER_BIN} -c "remote_storage={bucket_name='some-sample-bucket',bucket_region='eu-north-1', prefix_in_bucket='/test_prefix/'}"` For Amazon AWS S3, a key id and secret access key could be located in `~/.aws/credentials` if awscli was ever configured to work with the desired bucket, on the AWS Settings page for a certain user. Also note, that the bucket names does not contain any protocols when used on AWS. -For local S3 installations, refer to the their documentation for name format and credentials. +For local S3 installations, refer to their documentation for name format and credentials. Similar to other pageserver settings, toml config file can be used to configure either of the storages as backup targets. Required sections are: diff --git a/endpoint_storage/src/app.rs b/endpoint_storage/src/app.rs index a7a18743ef..64c21cc8b9 100644 --- a/endpoint_storage/src/app.rs +++ b/endpoint_storage/src/app.rs @@ -233,7 +233,7 @@ mod tests { .unwrap() .as_millis(); use rand::Rng; - let random = rand::thread_rng().r#gen::(); + let random = rand::rng().random::(); let s3_config = remote_storage::S3Config { bucket_name: var(REAL_S3_BUCKET).unwrap(), diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index 2fe233214a..5b8fc49750 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -46,16 +46,33 @@ pub struct ExtensionInstallResponse { pub version: ExtVersion, } +/// Status of the LFC prewarm process. The same state machine is reused for +/// both autoprewarm (prewarm after compute/Postgres start using the previously +/// stored LFC state) and explicit prewarming via API. #[derive(Serialize, Default, Debug, Clone, PartialEq)] #[serde(tag = "status", rename_all = "snake_case")] pub enum LfcPrewarmState { + /// Default value when compute boots up. #[default] NotPrewarmed, + /// Prewarming thread is active and loading pages into LFC. Prewarming, + /// We found requested LFC state in the endpoint storage and + /// completed prewarming successfully. Completed, - Failed { - error: String, - }, + /// Unexpected error happened during prewarming. 
Note, `Not Found 404` + /// response from the endpoint storage is explicitly excluded here + /// because it can normally happen on the first compute start, + /// since LFC state is not available yet. + Failed { error: String }, + /// We tried to fetch the corresponding LFC state from the endpoint storage, + /// but received `Not Found 404`. This should normally happen only during the + /// first endpoint start after creation with `autoprewarm: true`. + /// + /// During the orchestrated prewarm via API, when a caller explicitly + /// provides the LFC state key to prewarm from, it's the caller responsibility + /// to handle this status as an error state in this case. + Skipped, } impl Display for LfcPrewarmState { @@ -64,6 +81,7 @@ impl Display for LfcPrewarmState { LfcPrewarmState::NotPrewarmed => f.write_str("NotPrewarmed"), LfcPrewarmState::Prewarming => f.write_str("Prewarming"), LfcPrewarmState::Completed => f.write_str("Completed"), + LfcPrewarmState::Skipped => f.write_str("Skipped"), LfcPrewarmState::Failed { error } => write!(f, "Error({error})"), } } diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 0eeab2bebc..061ac3e66d 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -416,6 +416,32 @@ pub struct GenericOption { pub vartype: String, } +/// Postgres compute TLS settings. +#[derive(Clone, Debug, Deserialize, Serialize, PartialEq)] +pub struct PgComputeTlsSettings { + // Absolute path to the certificate file for server-side TLS. + pub cert_file: String, + // Absolute path to the private key file for server-side TLS. + pub key_file: String, + // Absolute path to the certificate authority file for verifying client certificates. + pub ca_file: String, +} + +/// Databricks specific options for compute instance. +/// This is used to store any other settings that needs to be propagate to Compute +/// but should not be persisted to ComputeSpec in the database. +#[derive(Clone, Debug, Deserialize, Serialize, PartialEq)] +pub struct DatabricksSettings { + pub pg_compute_tls_settings: PgComputeTlsSettings, + // Absolute file path to databricks_pg_hba.conf file. + pub databricks_pg_hba: String, + // Absolute file path to databricks_pg_ident.conf file. + pub databricks_pg_ident: String, + // Hostname portion of the Databricks workspace URL of the endpoint, or empty string if not known. + // A valid hostname is required for the compute instance to support PAT logins. + pub databricks_workspace_host: String, +} + /// Optional collection of `GenericOption`'s. Type alias allows us to /// declare a `trait` on it. pub type GenericOptions = Option>; diff --git a/libs/consumption_metrics/src/lib.rs b/libs/consumption_metrics/src/lib.rs index 448134f31a..aeb33bdfc2 100644 --- a/libs/consumption_metrics/src/lib.rs +++ b/libs/consumption_metrics/src/lib.rs @@ -90,7 +90,7 @@ impl<'a> IdempotencyKey<'a> { IdempotencyKey { now: Utc::now(), node_id, - nonce: rand::thread_rng().gen_range(0..=9999), + nonce: rand::rng().random_range(0..=9999), } } diff --git a/libs/desim/src/node_os.rs b/libs/desim/src/node_os.rs index e0cde7b284..6517c2001e 100644 --- a/libs/desim/src/node_os.rs +++ b/libs/desim/src/node_os.rs @@ -41,7 +41,7 @@ impl NodeOs { /// Generate a random number in range [0, max). pub fn random(&self, max: u64) -> u64 { - self.internal.rng.lock().gen_range(0..max) + self.internal.rng.lock().random_range(0..max) } /// Append a new event to the world event log. 
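The scattered `rand` changes in this patch (endpoint_storage, consumption_metrics, and the desim hunks that follow) are a mechanical migration to the rand 0.9 API. A minimal sketch of the rename mapping, with illustrative values only, not code from the patch:

```rust
use rand::prelude::*;

fn rand_09_examples() {
    // rand 0.8: rand::thread_rng()  ->  rand 0.9: rand::rng()
    let mut rng = rand::rng();
    // gen::<T>()       ->  random::<T>()
    let x: u32 = rng.random();
    // gen_range(a..=b) ->  random_range(a..=b)
    let nonce = rng.random_range(0..=9999);
    // gen_bool(p)      ->  random_bool(p)
    let dropped = rng.random_bool(0.1);
    // StdRng::from_rng no longer returns a Result in 0.9, so the .unwrap() goes away.
    let _child = StdRng::from_rng(&mut rng);
    let _ = (x, nonce, dropped);
}
```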
diff --git a/libs/desim/src/options.rs b/libs/desim/src/options.rs index 9b1a42fd28..d5da008ef1 100644 --- a/libs/desim/src/options.rs +++ b/libs/desim/src/options.rs @@ -32,10 +32,10 @@ impl Delay { /// Generate a random delay in range [min, max]. Return None if the /// message should be dropped. pub fn delay(&self, rng: &mut StdRng) -> Option { - if rng.gen_bool(self.fail_prob) { + if rng.random_bool(self.fail_prob) { return None; } - Some(rng.gen_range(self.min..=self.max)) + Some(rng.random_range(self.min..=self.max)) } } diff --git a/libs/desim/src/world.rs b/libs/desim/src/world.rs index 576ba89cd7..690d45f373 100644 --- a/libs/desim/src/world.rs +++ b/libs/desim/src/world.rs @@ -69,7 +69,7 @@ impl World { /// Create a new random number generator. pub fn new_rng(&self) -> StdRng { let mut rng = self.rng.lock(); - StdRng::from_rng(rng.deref_mut()).unwrap() + StdRng::from_rng(rng.deref_mut()) } /// Create a new node. diff --git a/libs/metrics/Cargo.toml b/libs/metrics/Cargo.toml index f87e7b8e3a..1718ddfae2 100644 --- a/libs/metrics/Cargo.toml +++ b/libs/metrics/Cargo.toml @@ -17,5 +17,5 @@ procfs.workspace = true measured-process.workspace = true [dev-dependencies] -rand = "0.8" -rand_distr = "0.4.3" +rand.workspace = true +rand_distr = "0.5" diff --git a/libs/metrics/src/hll.rs b/libs/metrics/src/hll.rs index 1a7d7a7e44..81e5bafbdf 100644 --- a/libs/metrics/src/hll.rs +++ b/libs/metrics/src/hll.rs @@ -260,7 +260,7 @@ mod tests { #[test] fn test_cardinality_small() { - let (actual, estimate) = test_cardinality(100, Zipf::new(100, 1.2f64).unwrap()); + let (actual, estimate) = test_cardinality(100, Zipf::new(100.0, 1.2f64).unwrap()); assert_eq!(actual, [46, 30, 32]); assert!(51.3 < estimate[0] && estimate[0] < 51.4); @@ -270,7 +270,7 @@ mod tests { #[test] fn test_cardinality_medium() { - let (actual, estimate) = test_cardinality(10000, Zipf::new(10000, 1.2f64).unwrap()); + let (actual, estimate) = test_cardinality(10000, Zipf::new(10000.0, 1.2f64).unwrap()); assert_eq!(actual, [2529, 1618, 1629]); assert!(2309.1 < estimate[0] && estimate[0] < 2309.2); @@ -280,7 +280,8 @@ mod tests { #[test] fn test_cardinality_large() { - let (actual, estimate) = test_cardinality(1_000_000, Zipf::new(1_000_000, 1.2f64).unwrap()); + let (actual, estimate) = + test_cardinality(1_000_000, Zipf::new(1_000_000.0, 1.2f64).unwrap()); assert_eq!(actual, [129077, 79579, 79630]); assert!(126067.2 < estimate[0] && estimate[0] < 126067.3); @@ -290,7 +291,7 @@ mod tests { #[test] fn test_cardinality_small2() { - let (actual, estimate) = test_cardinality(100, Zipf::new(200, 0.8f64).unwrap()); + let (actual, estimate) = test_cardinality(100, Zipf::new(200.0, 0.8f64).unwrap()); assert_eq!(actual, [92, 58, 60]); assert!(116.1 < estimate[0] && estimate[0] < 116.2); @@ -300,7 +301,7 @@ mod tests { #[test] fn test_cardinality_medium2() { - let (actual, estimate) = test_cardinality(10000, Zipf::new(20000, 0.8f64).unwrap()); + let (actual, estimate) = test_cardinality(10000, Zipf::new(20000.0, 0.8f64).unwrap()); assert_eq!(actual, [8201, 5131, 5051]); assert!(6846.4 < estimate[0] && estimate[0] < 6846.5); @@ -310,7 +311,8 @@ mod tests { #[test] fn test_cardinality_large2() { - let (actual, estimate) = test_cardinality(1_000_000, Zipf::new(2_000_000, 0.8f64).unwrap()); + let (actual, estimate) = + test_cardinality(1_000_000, Zipf::new(2_000_000.0, 0.8f64).unwrap()); assert_eq!(actual, [777847, 482069, 482246]); assert!(699437.4 < estimate[0] && estimate[0] < 699437.5); diff --git a/libs/metrics/src/lib.rs 
b/libs/metrics/src/lib.rs index 5d028ee041..41873cdcd6 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -4,12 +4,14 @@ //! a default registry. #![deny(clippy::undocumented_unsafe_blocks)] +use std::sync::RwLock; + use measured::label::{LabelGroupSet, LabelGroupVisitor, LabelName, NoLabels}; use measured::metric::counter::CounterState; use measured::metric::gauge::GaugeState; use measured::metric::group::Encoding; use measured::metric::name::{MetricName, MetricNameEncoder}; -use measured::metric::{MetricEncoding, MetricFamilyEncoding}; +use measured::metric::{MetricEncoding, MetricFamilyEncoding, MetricType}; use measured::{FixedCardinalityLabel, LabelGroup, MetricGroup}; use once_cell::sync::Lazy; use prometheus::Registry; @@ -116,12 +118,52 @@ pub fn pow2_buckets(start: usize, end: usize) -> Vec { .collect() } +pub struct InfoMetric { + label: RwLock, + metric: M, +} + +impl InfoMetric { + pub fn new(label: L) -> Self { + Self::with_metric(label, GaugeState::new(1)) + } +} + +impl> InfoMetric { + pub fn with_metric(label: L, metric: M) -> Self { + Self { + label: RwLock::new(label), + metric, + } + } + + pub fn set_label(&self, label: L) { + *self.label.write().unwrap() = label; + } +} + +impl MetricFamilyEncoding for InfoMetric +where + L: LabelGroup, + M: MetricEncoding, + E: Encoding, +{ + fn collect_family_into( + &self, + name: impl measured::metric::name::MetricNameEncoder, + enc: &mut E, + ) -> Result<(), E::Err> { + M::write_type(&name, enc)?; + self.metric + .collect_into(&(), &*self.label.read().unwrap(), name, enc) + } +} + pub struct BuildInfo { pub revision: &'static str, pub build_tag: &'static str, } -// todo: allow label group without the set impl LabelGroup for BuildInfo { fn visit_values(&self, v: &mut impl LabelGroupVisitor) { const REVISION: &LabelName = LabelName::from_str("revision"); @@ -131,24 +173,6 @@ impl LabelGroup for BuildInfo { } } -impl MetricFamilyEncoding for BuildInfo -where - GaugeState: MetricEncoding, -{ - fn collect_family_into( - &self, - name: impl measured::metric::name::MetricNameEncoder, - enc: &mut T, - ) -> Result<(), T::Err> { - enc.write_help(&name, "Build/version information")?; - GaugeState::write_type(&name, enc)?; - GaugeState { - count: std::sync::atomic::AtomicI64::new(1), - } - .collect_into(&(), self, name, enc) - } -} - #[derive(MetricGroup)] #[metric(new(build_info: BuildInfo))] pub struct NeonMetrics { @@ -165,8 +189,8 @@ pub struct NeonMetrics { #[derive(MetricGroup)] #[metric(new(build_info: BuildInfo))] pub struct LibMetrics { - #[metric(init = build_info)] - build_info: BuildInfo, + #[metric(init = InfoMetric::new(build_info))] + build_info: InfoMetric, #[metric(flatten)] rusage: Rusage, diff --git a/libs/neon-shmem/Cargo.toml b/libs/neon-shmem/Cargo.toml index 2a636bec40..1cdc9c0c67 100644 --- a/libs/neon-shmem/Cargo.toml +++ b/libs/neon-shmem/Cargo.toml @@ -8,6 +8,13 @@ license.workspace = true thiserror.workspace = true nix.workspace=true workspace_hack = { version = "0.1", path = "../../workspace_hack" } +libc.workspace = true +lock_api.workspace = true +rustc-hash.workspace = true [target.'cfg(target_os = "macos")'.dependencies] tempfile = "3.14.0" + +[dev-dependencies] +rand.workspace = true +rand_distr = "0.5.1" diff --git a/libs/neon-shmem/src/hash.rs b/libs/neon-shmem/src/hash.rs new file mode 100644 index 0000000000..58726b9ba3 --- /dev/null +++ b/libs/neon-shmem/src/hash.rs @@ -0,0 +1,583 @@ +//! 
Resizable hash table implementation on top of byte-level storage (either a [`ShmemHandle`] or a fixed byte array). +//! +//! This hash table has two major components: the bucket array and the dictionary. Each bucket within the +//! bucket array contains a `Option<(K, V)>` and an index of another bucket. In this way there is both an +//! implicit freelist within the bucket array (`None` buckets point to other `None` entries) and various hash +//! chains within the bucket array (a Some bucket will point to other Some buckets that had the same hash). +//! +//! Buckets are never moved unless they are within a region that is being shrunk, and so the actual hash- +//! dependent component is done with the dictionary. When a new key is inserted into the map, a position +//! within the dictionary is decided based on its hash, the data is inserted into an empty bucket based +//! off of the freelist, and then the index of said bucket is placed in the dictionary. +//! +//! This map is resizable (if initialized on top of a [`ShmemHandle`]). Both growing and shrinking happen +//! in-place and are at a high level achieved by expanding/reducing the bucket array and rebuilding the +//! dictionary by rehashing all keys. +//! +//! Concurrency is managed very simply: the entire map is guarded by one shared-memory RwLock. + +use std::hash::{BuildHasher, Hash}; +use std::mem::MaybeUninit; + +use crate::shmem::ShmemHandle; +use crate::{shmem, sync::*}; + +mod core; +pub mod entry; + +#[cfg(test)] +mod tests; + +use core::{Bucket, CoreHashMap, INVALID_POS}; +use entry::{Entry, OccupiedEntry, PrevPos, VacantEntry}; + +use thiserror::Error; + +/// Error type for a hashmap shrink operation. +#[derive(Error, Debug)] +pub enum HashMapShrinkError { + /// There was an error encountered while resizing the memory area. + #[error("shmem resize failed: {0}")] + ResizeError(shmem::Error), + /// Occupied entries in to-be-shrunk space were encountered beginning at the given index. + #[error("occupied entry in deallocated space found at {0}")] + RemainingEntries(usize), +} + +/// This represents a hash table that (possibly) lives in shared memory. +/// If a new process is launched with fork(), the child process inherits +/// this struct. +#[must_use] +pub struct HashMapInit<'a, K, V, S = rustc_hash::FxBuildHasher> { + shmem_handle: Option, + shared_ptr: *mut HashMapShared<'a, K, V>, + shared_size: usize, + hasher: S, + num_buckets: u32, +} + +/// This is a per-process handle to a hash table that (possibly) lives in shared memory. +/// If a child process is launched with fork(), the child process should +/// get its own HashMapAccess by calling HashMapInit::attach_writer/reader(). +/// +/// XXX: We're not making use of it at the moment, but this struct could +/// hold process-local information in the future. +pub struct HashMapAccess<'a, K, V, S = rustc_hash::FxBuildHasher> { + shmem_handle: Option, + shared_ptr: *mut HashMapShared<'a, K, V>, + hasher: S, +} + +unsafe impl Sync for HashMapAccess<'_, K, V, S> {} +unsafe impl Send for HashMapAccess<'_, K, V, S> {} + +impl<'a, K: Clone + Hash + Eq, V, S> HashMapInit<'a, K, V, S> { + /// Change the 'hasher' used by the hash table. + /// + /// NOTE: This must be called right after creating the hash table, + /// before inserting any entries and before calling attach_writer/reader. + /// Otherwise different accessors could be using different hash function, + /// with confusing results. 
+ pub fn with_hasher(self, hasher: T) -> HashMapInit<'a, K, V, T> { + HashMapInit { + hasher, + shmem_handle: self.shmem_handle, + shared_ptr: self.shared_ptr, + shared_size: self.shared_size, + num_buckets: self.num_buckets, + } + } + + /// Loosely (over)estimate the size needed to store a hash table with `num_buckets` buckets. + pub fn estimate_size(num_buckets: u32) -> usize { + // add some margin to cover alignment etc. + CoreHashMap::::estimate_size(num_buckets) + size_of::>() + 1000 + } + + fn new( + num_buckets: u32, + shmem_handle: Option, + area_ptr: *mut u8, + area_size: usize, + hasher: S, + ) -> Self { + let mut ptr: *mut u8 = area_ptr; + let end_ptr: *mut u8 = unsafe { ptr.add(area_size) }; + + // carve out area for the One Big Lock (TM) and the HashMapShared. + ptr = unsafe { ptr.add(ptr.align_offset(align_of::())) }; + let raw_lock_ptr = ptr; + ptr = unsafe { ptr.add(size_of::()) }; + ptr = unsafe { ptr.add(ptr.align_offset(align_of::>())) }; + let shared_ptr: *mut HashMapShared = ptr.cast(); + ptr = unsafe { ptr.add(size_of::>()) }; + + // carve out the buckets + ptr = unsafe { ptr.byte_add(ptr.align_offset(align_of::>())) }; + let buckets_ptr = ptr; + ptr = unsafe { ptr.add(size_of::>() * num_buckets as usize) }; + + // use remaining space for the dictionary + ptr = unsafe { ptr.byte_add(ptr.align_offset(align_of::())) }; + assert!(ptr.addr() < end_ptr.addr()); + let dictionary_ptr = ptr; + let dictionary_size = unsafe { end_ptr.byte_offset_from(ptr) / size_of::() as isize }; + assert!(dictionary_size > 0); + + let buckets = + unsafe { std::slice::from_raw_parts_mut(buckets_ptr.cast(), num_buckets as usize) }; + let dictionary = unsafe { + std::slice::from_raw_parts_mut(dictionary_ptr.cast(), dictionary_size as usize) + }; + + let hashmap = CoreHashMap::new(buckets, dictionary); + unsafe { + let lock = RwLock::from_raw(PthreadRwLock::new(raw_lock_ptr.cast()), hashmap); + std::ptr::write(shared_ptr, lock); + } + + Self { + num_buckets, + shmem_handle, + shared_ptr, + shared_size: area_size, + hasher, + } + } + + /// Attach to a hash table for writing. + pub fn attach_writer(self) -> HashMapAccess<'a, K, V, S> { + HashMapAccess { + shmem_handle: self.shmem_handle, + shared_ptr: self.shared_ptr, + hasher: self.hasher, + } + } + + /// Initialize a table for reading. Currently identical to [`HashMapInit::attach_writer`]. + /// + /// This is a holdover from a previous implementation and is being kept around for + /// backwards compatibility reasons. + pub fn attach_reader(self) -> HashMapAccess<'a, K, V, S> { + self.attach_writer() + } +} + +/// Hash table data that is actually stored in the shared memory area. +/// +/// NOTE: We carve out the parts from a contiguous chunk. Growing and shrinking the hash table +/// relies on the memory layout! The data structures are laid out in the contiguous shared memory +/// area as follows: +/// +/// [`libc::pthread_rwlock_t`] +/// [`HashMapShared`] +/// buckets +/// dictionary +/// +/// In between the above parts, there can be padding bytes to align the parts correctly. +type HashMapShared<'a, K, V> = RwLock>; + +impl<'a, K, V> HashMapInit<'a, K, V, rustc_hash::FxBuildHasher> +where + K: Clone + Hash + Eq, +{ + /// Place the hash table within a user-supplied fixed memory area. 
+ pub fn with_fixed(num_buckets: u32, area: &'a mut [MaybeUninit]) -> Self { + Self::new( + num_buckets, + None, + area.as_mut_ptr().cast(), + area.len(), + rustc_hash::FxBuildHasher, + ) + } + + /// Place a new hash map in the given shared memory area + /// + /// # Panics + /// Will panic on failure to resize area to expected map size. + pub fn with_shmem(num_buckets: u32, shmem: ShmemHandle) -> Self { + let size = Self::estimate_size(num_buckets); + shmem + .set_size(size) + .expect("could not resize shared memory area"); + let ptr = shmem.data_ptr.as_ptr().cast(); + Self::new( + num_buckets, + Some(shmem), + ptr, + size, + rustc_hash::FxBuildHasher, + ) + } + + /// Make a resizable hash map within a new shared memory area with the given name. + pub fn new_resizeable_named(num_buckets: u32, max_buckets: u32, name: &str) -> Self { + let size = Self::estimate_size(num_buckets); + let max_size = Self::estimate_size(max_buckets); + let shmem = + ShmemHandle::new(name, size, max_size).expect("failed to make shared memory area"); + let ptr = shmem.data_ptr.as_ptr().cast(); + + Self::new( + num_buckets, + Some(shmem), + ptr, + size, + rustc_hash::FxBuildHasher, + ) + } + + /// Make a resizable hash map within a new anonymous shared memory area. + pub fn new_resizeable(num_buckets: u32, max_buckets: u32) -> Self { + use std::sync::atomic::{AtomicUsize, Ordering}; + static COUNTER: AtomicUsize = AtomicUsize::new(0); + let val = COUNTER.fetch_add(1, Ordering::Relaxed); + let name = format!("neon_shmem_hmap{val}"); + Self::new_resizeable_named(num_buckets, max_buckets, &name) + } +} + +impl<'a, K, V, S: BuildHasher> HashMapAccess<'a, K, V, S> +where + K: Clone + Hash + Eq, +{ + /// Hash a key using the map's hasher. + #[inline] + fn get_hash_value(&self, key: &K) -> u64 { + self.hasher.hash_one(key) + } + + fn entry_with_hash(&self, key: K, hash: u64) -> Entry<'a, '_, K, V> { + let mut map = unsafe { self.shared_ptr.as_ref() }.unwrap().write(); + let dict_pos = hash as usize % map.dictionary.len(); + let first = map.dictionary[dict_pos]; + if first == INVALID_POS { + // no existing entry + return Entry::Vacant(VacantEntry { + map, + key, + dict_pos: dict_pos as u32, + }); + } + + let mut prev_pos = PrevPos::First(dict_pos as u32); + let mut next = first; + loop { + let bucket = &mut map.buckets[next as usize]; + let (bucket_key, _bucket_value) = bucket.inner.as_mut().expect("entry is in use"); + if *bucket_key == key { + // found existing entry + return Entry::Occupied(OccupiedEntry { + map, + _key: key, + prev_pos, + bucket_pos: next, + }); + } + + if bucket.next == INVALID_POS { + // No existing entry + return Entry::Vacant(VacantEntry { + map, + key, + dict_pos: dict_pos as u32, + }); + } + prev_pos = PrevPos::Chained(next); + next = bucket.next; + } + } + + /// Get a reference to the corresponding value for a key. + pub fn get<'e>(&'e self, key: &K) -> Option> { + let hash = self.get_hash_value(key); + let map = unsafe { self.shared_ptr.as_ref() }.unwrap().read(); + RwLockReadGuard::try_map(map, |m| m.get_with_hash(key, hash)).ok() + } + + /// Get a reference to the entry containing a key. + /// + /// NB: THis takes a write lock as there's no way to distinguish whether the intention + /// is to use the entry for reading or for writing in advance. + pub fn entry(&self, key: K) -> Entry<'a, '_, K, V> { + let hash = self.get_hash_value(&key); + self.entry_with_hash(key, hash) + } + + /// Remove a key given its hash. Returns the associated value if it existed. 
+ pub fn remove(&self, key: &K) -> Option { + let hash = self.get_hash_value(key); + match self.entry_with_hash(key.clone(), hash) { + Entry::Occupied(e) => Some(e.remove()), + Entry::Vacant(_) => None, + } + } + + /// Insert/update a key. Returns the previous associated value if it existed. + /// + /// # Errors + /// Will return [`core::FullError`] if there is no more space left in the map. + pub fn insert(&self, key: K, value: V) -> Result, core::FullError> { + let hash = self.get_hash_value(&key); + match self.entry_with_hash(key.clone(), hash) { + Entry::Occupied(mut e) => Ok(Some(e.insert(value))), + Entry::Vacant(e) => { + _ = e.insert(value)?; + Ok(None) + } + } + } + + /// Optionally return the entry for a bucket at a given index if it exists. + /// + /// Has more overhead than one would intuitively expect: performs both a clone of the key + /// due to the [`OccupiedEntry`] type owning the key and also a hash of the key in order + /// to enable repairing the hash chain if the entry is removed. + pub fn entry_at_bucket(&self, pos: usize) -> Option> { + let map = unsafe { self.shared_ptr.as_mut() }.unwrap().write(); + if pos >= map.buckets.len() { + return None; + } + + let entry = map.buckets[pos].inner.as_ref(); + match entry { + Some((key, _)) => Some(OccupiedEntry { + _key: key.clone(), + bucket_pos: pos as u32, + prev_pos: entry::PrevPos::Unknown(self.get_hash_value(key)), + map, + }), + _ => None, + } + } + + /// Returns the number of buckets in the table. + pub fn get_num_buckets(&self) -> usize { + let map = unsafe { self.shared_ptr.as_ref() }.unwrap().read(); + map.get_num_buckets() + } + + /// Return the key and value stored in bucket with given index. This can be used to + /// iterate through the hash map. + // TODO: An Iterator might be nicer. The communicator's clock algorithm needs to + // _slowly_ iterate through all buckets with its clock hand, without holding a lock. + // If we switch to an Iterator, it must not hold the lock. + pub fn get_at_bucket(&self, pos: usize) -> Option> { + let map = unsafe { self.shared_ptr.as_ref() }.unwrap().read(); + if pos >= map.buckets.len() { + return None; + } + RwLockReadGuard::try_map(map, |m| m.buckets[pos].inner.as_ref()).ok() + } + + /// Returns the index of the bucket a given value corresponds to. + pub fn get_bucket_for_value(&self, val_ptr: *const V) -> usize { + let map = unsafe { self.shared_ptr.as_ref() }.unwrap().read(); + + let origin = map.buckets.as_ptr(); + let idx = (val_ptr as usize - origin as usize) / size_of::>(); + assert!(idx < map.buckets.len()); + + idx + } + + /// Returns the number of occupied buckets in the table. + pub fn get_num_buckets_in_use(&self) -> usize { + let map = unsafe { self.shared_ptr.as_ref() }.unwrap().read(); + map.buckets_in_use as usize + } + + /// Clears all entries in a table. Does not reset any shrinking operations. + pub fn clear(&self) { + let mut map = unsafe { self.shared_ptr.as_mut() }.unwrap().write(); + map.clear(); + } + + /// Perform an in-place rehash of some region (0..`rehash_buckets`) of the table and reset + /// the `buckets` and `dictionary` slices to be as long as `num_buckets`. Resets the freelist + /// in the process. 
+ fn rehash_dict( + &self, + inner: &mut CoreHashMap<'a, K, V>, + buckets_ptr: *mut core::Bucket, + end_ptr: *mut u8, + num_buckets: u32, + rehash_buckets: u32, + ) { + inner.free_head = INVALID_POS; + + let buckets; + let dictionary; + unsafe { + let buckets_end_ptr = buckets_ptr.add(num_buckets as usize); + let dictionary_ptr: *mut u32 = buckets_end_ptr + .byte_add(buckets_end_ptr.align_offset(align_of::())) + .cast(); + let dictionary_size: usize = + end_ptr.byte_offset_from(buckets_end_ptr) as usize / size_of::(); + + buckets = std::slice::from_raw_parts_mut(buckets_ptr, num_buckets as usize); + dictionary = std::slice::from_raw_parts_mut(dictionary_ptr, dictionary_size); + } + for e in dictionary.iter_mut() { + *e = INVALID_POS; + } + + for (i, bucket) in buckets.iter_mut().enumerate().take(rehash_buckets as usize) { + if bucket.inner.is_none() { + bucket.next = inner.free_head; + inner.free_head = i as u32; + continue; + } + + let hash = self.hasher.hash_one(&bucket.inner.as_ref().unwrap().0); + let pos: usize = (hash % dictionary.len() as u64) as usize; + bucket.next = dictionary[pos]; + dictionary[pos] = i as u32; + } + + inner.dictionary = dictionary; + inner.buckets = buckets; + } + + /// Rehash the map without growing or shrinking. + pub fn shuffle(&self) { + let mut map = unsafe { self.shared_ptr.as_mut() }.unwrap().write(); + let num_buckets = map.get_num_buckets() as u32; + let size_bytes = HashMapInit::::estimate_size(num_buckets); + let end_ptr: *mut u8 = unsafe { self.shared_ptr.byte_add(size_bytes).cast() }; + let buckets_ptr = map.buckets.as_mut_ptr(); + self.rehash_dict(&mut map, buckets_ptr, end_ptr, num_buckets, num_buckets); + } + + /// Grow the number of buckets within the table. + /// + /// 1. Grows the underlying shared memory area + /// 2. Initializes new buckets and overwrites the current dictionary + /// 3. Rehashes the dictionary + /// + /// # Panics + /// Panics if called on a map initialized with [`HashMapInit::with_fixed`]. + /// + /// # Errors + /// Returns an [`shmem::Error`] if any errors occur resizing the memory region. + pub fn grow(&self, num_buckets: u32) -> Result<(), shmem::Error> { + let mut map = unsafe { self.shared_ptr.as_mut() }.unwrap().write(); + let old_num_buckets = map.buckets.len() as u32; + + assert!( + num_buckets >= old_num_buckets, + "grow called with a smaller number of buckets" + ); + if num_buckets == old_num_buckets { + return Ok(()); + } + let shmem_handle = self + .shmem_handle + .as_ref() + .expect("grow called on a fixed-size hash table"); + + let size_bytes = HashMapInit::::estimate_size(num_buckets); + shmem_handle.set_size(size_bytes)?; + let end_ptr: *mut u8 = unsafe { shmem_handle.data_ptr.as_ptr().add(size_bytes) }; + + // Initialize new buckets. The new buckets are linked to the free list. + // NB: This overwrites the dictionary! + let buckets_ptr = map.buckets.as_mut_ptr(); + unsafe { + for i in old_num_buckets..num_buckets { + let bucket = buckets_ptr.add(i as usize); + bucket.write(core::Bucket { + next: if i < num_buckets - 1 { + i + 1 + } else { + map.free_head + }, + inner: None, + }); + } + } + + self.rehash_dict(&mut map, buckets_ptr, end_ptr, num_buckets, old_num_buckets); + map.free_head = old_num_buckets; + + Ok(()) + } + + /// Begin a shrink, limiting all new allocations to be in buckets with index below `num_buckets`. + /// + /// # Panics + /// Panics if called on a map initialized with [`HashMapInit::with_fixed`] or if `num_buckets` is + /// greater than the number of buckets in the map. 
+ pub fn begin_shrink(&mut self, num_buckets: u32) { + let mut map = unsafe { self.shared_ptr.as_mut() }.unwrap().write(); + assert!( + num_buckets <= map.get_num_buckets() as u32, + "shrink called with a larger number of buckets" + ); + _ = self + .shmem_handle + .as_ref() + .expect("shrink called on a fixed-size hash table"); + map.alloc_limit = num_buckets; + } + + /// If a shrink operation is underway, returns the target size of the map. Otherwise, returns None. + pub fn shrink_goal(&self) -> Option { + let map = unsafe { self.shared_ptr.as_mut() }.unwrap().read(); + let goal = map.alloc_limit; + if goal == INVALID_POS { + None + } else { + Some(goal as usize) + } + } + + /// Complete a shrink after caller has evicted entries, removing the unused buckets and rehashing. + /// + /// # Panics + /// The following cases result in a panic: + /// - Calling this function on a map initialized with [`HashMapInit::with_fixed`]. + /// - Calling this function on a map when no shrink operation is in progress. + pub fn finish_shrink(&self) -> Result<(), HashMapShrinkError> { + let mut map = unsafe { self.shared_ptr.as_mut() }.unwrap().write(); + assert!( + map.alloc_limit != INVALID_POS, + "called finish_shrink when no shrink is in progress" + ); + + let num_buckets = map.alloc_limit; + + if map.get_num_buckets() == num_buckets as usize { + return Ok(()); + } + + assert!( + map.buckets_in_use <= num_buckets, + "called finish_shrink before enough entries were removed" + ); + + for i in (num_buckets as usize)..map.buckets.len() { + if map.buckets[i].inner.is_some() { + return Err(HashMapShrinkError::RemainingEntries(i)); + } + } + + let shmem_handle = self + .shmem_handle + .as_ref() + .expect("shrink called on a fixed-size hash table"); + + let size_bytes = HashMapInit::::estimate_size(num_buckets); + if let Err(e) = shmem_handle.set_size(size_bytes) { + return Err(HashMapShrinkError::ResizeError(e)); + } + let end_ptr: *mut u8 = unsafe { shmem_handle.data_ptr.as_ptr().add(size_bytes) }; + let buckets_ptr = map.buckets.as_mut_ptr(); + self.rehash_dict(&mut map, buckets_ptr, end_ptr, num_buckets, num_buckets); + map.alloc_limit = INVALID_POS; + + Ok(()) + } +} diff --git a/libs/neon-shmem/src/hash/core.rs b/libs/neon-shmem/src/hash/core.rs new file mode 100644 index 0000000000..4665c36adb --- /dev/null +++ b/libs/neon-shmem/src/hash/core.rs @@ -0,0 +1,174 @@ +//! Simple hash table with chaining. + +use std::hash::Hash; +use std::mem::MaybeUninit; + +use crate::hash::entry::*; + +/// Invalid position within the map (either within the dictionary or bucket array). +pub(crate) const INVALID_POS: u32 = u32::MAX; + +/// Fundamental storage unit within the hash table. Either empty or contains a key-value pair. +/// Always part of a chain of some kind (either a freelist if empty or a hash chain if full). +pub(crate) struct Bucket { + /// Index of next bucket in the chain. + pub(crate) next: u32, + /// Key-value pair contained within bucket. + pub(crate) inner: Option<(K, V)>, +} + +/// Core hash table implementation. +pub(crate) struct CoreHashMap<'a, K, V> { + /// Dictionary used to map hashes to bucket indices. + pub(crate) dictionary: &'a mut [u32], + /// Buckets containing key-value pairs. + pub(crate) buckets: &'a mut [Bucket], + /// Head of the freelist. + pub(crate) free_head: u32, + /// Maximum index of a bucket allowed to be allocated. [`INVALID_POS`] if no limit. + pub(crate) alloc_limit: u32, + /// The number of currently occupied buckets. 
+ pub(crate) buckets_in_use: u32, +} + +/// Error for when there are no empty buckets left but one is needed. +#[derive(Debug, PartialEq)] +pub struct FullError; + +impl<'a, K: Clone + Hash + Eq, V> CoreHashMap<'a, K, V> { + const FILL_FACTOR: f32 = 0.60; + + /// Estimate the size of data contained within the the hash map. + pub fn estimate_size(num_buckets: u32) -> usize { + let mut size = 0; + + // buckets + size += size_of::>() * num_buckets as usize; + + // dictionary + size += (f32::ceil((size_of::() * num_buckets as usize) as f32 / Self::FILL_FACTOR)) + as usize; + + size + } + + pub fn new( + buckets: &'a mut [MaybeUninit>], + dictionary: &'a mut [MaybeUninit], + ) -> Self { + // Initialize the buckets + for i in 0..buckets.len() { + buckets[i].write(Bucket { + next: if i < buckets.len() - 1 { + i as u32 + 1 + } else { + INVALID_POS + }, + inner: None, + }); + } + + // Initialize the dictionary + for e in dictionary.iter_mut() { + e.write(INVALID_POS); + } + + // TODO: use std::slice::assume_init_mut() once it stabilizes + let buckets = + unsafe { std::slice::from_raw_parts_mut(buckets.as_mut_ptr().cast(), buckets.len()) }; + let dictionary = unsafe { + std::slice::from_raw_parts_mut(dictionary.as_mut_ptr().cast(), dictionary.len()) + }; + + Self { + dictionary, + buckets, + free_head: 0, + buckets_in_use: 0, + alloc_limit: INVALID_POS, + } + } + + /// Get the value associated with a key (if it exists) given its hash. + pub fn get_with_hash(&self, key: &K, hash: u64) -> Option<&V> { + let mut next = self.dictionary[hash as usize % self.dictionary.len()]; + loop { + if next == INVALID_POS { + return None; + } + + let bucket = &self.buckets[next as usize]; + let (bucket_key, bucket_value) = bucket.inner.as_ref().expect("entry is in use"); + if bucket_key == key { + return Some(bucket_value); + } + next = bucket.next; + } + } + + /// Get number of buckets in map. + pub fn get_num_buckets(&self) -> usize { + self.buckets.len() + } + + /// Clears all entries from the hashmap. + /// + /// Does not reset any allocation limits, but does clear any entries beyond them. + pub fn clear(&mut self) { + for i in 0..self.buckets.len() { + self.buckets[i] = Bucket { + next: if i < self.buckets.len() - 1 { + i as u32 + 1 + } else { + INVALID_POS + }, + inner: None, + } + } + for i in 0..self.dictionary.len() { + self.dictionary[i] = INVALID_POS; + } + + self.free_head = 0; + self.buckets_in_use = 0; + } + + /// Find the position of an unused bucket via the freelist and initialize it. + pub(crate) fn alloc_bucket(&mut self, key: K, value: V) -> Result { + let mut pos = self.free_head; + + // Find the first bucket we're *allowed* to use. + let mut prev = PrevPos::First(self.free_head); + while pos != INVALID_POS && pos >= self.alloc_limit { + let bucket = &mut self.buckets[pos as usize]; + prev = PrevPos::Chained(pos); + pos = bucket.next; + } + if pos == INVALID_POS { + return Err(FullError); + } + + // Repair the freelist. + match prev { + PrevPos::First(_) => { + let next_pos = self.buckets[pos as usize].next; + self.free_head = next_pos; + } + PrevPos::Chained(p) => { + if p != INVALID_POS { + let next_pos = self.buckets[pos as usize].next; + self.buckets[p as usize].next = next_pos; + } + } + _ => unreachable!(), + } + + // Initialize the bucket. 
+ let bucket = &mut self.buckets[pos as usize]; + self.buckets_in_use += 1; + bucket.next = INVALID_POS; + bucket.inner = Some((key, value)); + + Ok(pos) + } +} diff --git a/libs/neon-shmem/src/hash/entry.rs b/libs/neon-shmem/src/hash/entry.rs new file mode 100644 index 0000000000..560a20db1d --- /dev/null +++ b/libs/neon-shmem/src/hash/entry.rs @@ -0,0 +1,130 @@ +//! Equivalent of [`std::collections::hash_map::Entry`] for this hashmap. + +use crate::hash::core::{CoreHashMap, FullError, INVALID_POS}; +use crate::sync::{RwLockWriteGuard, ValueWriteGuard}; + +use std::hash::Hash; +use std::mem; + +pub enum Entry<'a, 'b, K, V> { + Occupied(OccupiedEntry<'a, 'b, K, V>), + Vacant(VacantEntry<'a, 'b, K, V>), +} + +/// Enum representing the previous position within a chain. +#[derive(Clone, Copy)] +pub(crate) enum PrevPos { + /// Starting index within the dictionary. + First(u32), + /// Regular index within the buckets. + Chained(u32), + /// Unknown - e.g. the associated entry was retrieved by index instead of chain. + Unknown(u64), +} + +pub struct OccupiedEntry<'a, 'b, K, V> { + /// Mutable reference to the map containing this entry. + pub(crate) map: RwLockWriteGuard<'b, CoreHashMap<'a, K, V>>, + /// The key of the occupied entry + pub(crate) _key: K, + /// The index of the previous entry in the chain. + pub(crate) prev_pos: PrevPos, + /// The position of the bucket in the [`CoreHashMap`] bucket array. + pub(crate) bucket_pos: u32, +} + +impl OccupiedEntry<'_, '_, K, V> { + pub fn get(&self) -> &V { + &self.map.buckets[self.bucket_pos as usize] + .inner + .as_ref() + .unwrap() + .1 + } + + pub fn get_mut(&mut self) -> &mut V { + &mut self.map.buckets[self.bucket_pos as usize] + .inner + .as_mut() + .unwrap() + .1 + } + + /// Inserts a value into the entry, replacing (and returning) the existing value. + pub fn insert(&mut self, value: V) -> V { + let bucket = &mut self.map.buckets[self.bucket_pos as usize]; + // This assumes inner is Some, which it must be for an OccupiedEntry + mem::replace(&mut bucket.inner.as_mut().unwrap().1, value) + } + + /// Removes the entry from the hash map, returning the value originally stored within it. + /// + /// This may result in multiple bucket accesses if the entry was obtained by index as the + /// previous chain entry needs to be discovered in this case. + pub fn remove(mut self) -> V { + // If this bucket was queried by index, go ahead and follow its chain from the start. + let prev = if let PrevPos::Unknown(hash) = self.prev_pos { + let dict_idx = hash as usize % self.map.dictionary.len(); + let mut prev = PrevPos::First(dict_idx as u32); + let mut curr = self.map.dictionary[dict_idx]; + while curr != self.bucket_pos { + assert!(curr != INVALID_POS); + prev = PrevPos::Chained(curr); + curr = self.map.buckets[curr as usize].next; + } + prev + } else { + self.prev_pos + }; + + // CoreHashMap::remove returns Option<(K, V)>. We know it's Some for an OccupiedEntry. 
+ let bucket = &mut self.map.buckets[self.bucket_pos as usize]; + + // unlink it from the chain + match prev { + PrevPos::First(dict_pos) => { + self.map.dictionary[dict_pos as usize] = bucket.next; + } + PrevPos::Chained(bucket_pos) => { + self.map.buckets[bucket_pos as usize].next = bucket.next; + } + _ => unreachable!(), + } + + // and add it to the freelist + let free = self.map.free_head; + let bucket = &mut self.map.buckets[self.bucket_pos as usize]; + let old_value = bucket.inner.take(); + bucket.next = free; + self.map.free_head = self.bucket_pos; + self.map.buckets_in_use -= 1; + + old_value.unwrap().1 + } +} + +/// An abstract view into a vacant entry within the map. +pub struct VacantEntry<'a, 'b, K, V> { + /// Mutable reference to the map containing this entry. + pub(crate) map: RwLockWriteGuard<'b, CoreHashMap<'a, K, V>>, + /// The key to be inserted into this entry. + pub(crate) key: K, + /// The position within the dictionary corresponding to the key's hash. + pub(crate) dict_pos: u32, +} + +impl<'b, K: Clone + Hash + Eq, V> VacantEntry<'_, 'b, K, V> { + /// Insert a value into the vacant entry, finding and populating an empty bucket in the process. + /// + /// # Errors + /// Will return [`FullError`] if there are no unoccupied buckets in the map. + pub fn insert(mut self, value: V) -> Result, FullError> { + let pos = self.map.alloc_bucket(self.key, value)?; + self.map.buckets[pos as usize].next = self.map.dictionary[self.dict_pos as usize]; + self.map.dictionary[self.dict_pos as usize] = pos; + + Ok(RwLockWriteGuard::map(self.map, |m| { + &mut m.buckets[pos as usize].inner.as_mut().unwrap().1 + })) + } +} diff --git a/libs/neon-shmem/src/hash/tests.rs b/libs/neon-shmem/src/hash/tests.rs new file mode 100644 index 0000000000..92233e8140 --- /dev/null +++ b/libs/neon-shmem/src/hash/tests.rs @@ -0,0 +1,428 @@ +use std::collections::BTreeMap; +use std::collections::HashSet; +use std::fmt::Debug; +use std::mem::MaybeUninit; + +use crate::hash::Entry; +use crate::hash::HashMapAccess; +use crate::hash::HashMapInit; +use crate::hash::core::FullError; + +use rand::seq::SliceRandom; +use rand::{Rng, RngCore}; +use rand_distr::Zipf; + +const TEST_KEY_LEN: usize = 16; + +#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)] +struct TestKey([u8; TEST_KEY_LEN]); + +impl From<&TestKey> for u128 { + fn from(val: &TestKey) -> u128 { + u128::from_be_bytes(val.0) + } +} + +impl From for TestKey { + fn from(val: u128) -> TestKey { + TestKey(val.to_be_bytes()) + } +} + +impl<'a> From<&'a [u8]> for TestKey { + fn from(bytes: &'a [u8]) -> TestKey { + TestKey(bytes.try_into().unwrap()) + } +} + +fn test_inserts + Copy>(keys: &[K]) { + let w = HashMapInit::::new_resizeable_named(100000, 120000, "test_inserts") + .attach_writer(); + + for (idx, k) in keys.iter().enumerate() { + let res = w.entry((*k).into()); + match res { + Entry::Occupied(mut e) => { + e.insert(idx); + } + Entry::Vacant(e) => { + let res = e.insert(idx); + assert!(res.is_ok()); + } + }; + } + + for (idx, k) in keys.iter().enumerate() { + let x = w.get(&(*k).into()); + let value = x.as_deref().copied(); + assert_eq!(value, Some(idx)); + } +} + +#[test] +fn dense() { + // This exercises splitting a node with prefix + let keys: &[u128] = &[0, 1, 2, 3, 256]; + test_inserts(keys); + + // Dense keys + let mut keys: Vec = (0..10000).collect(); + test_inserts(&keys); + + // Do the same in random orders + for _ in 1..10 { + keys.shuffle(&mut rand::rng()); + test_inserts(&keys); + } +} + +#[test] +fn sparse() { + // sparse 
keys + let mut keys: Vec = Vec::new(); + let mut used_keys = HashSet::new(); + for _ in 0..10000 { + loop { + let key = rand::random::(); + if used_keys.contains(&key) { + continue; + } + used_keys.insert(key); + keys.push(key.into()); + break; + } + } + test_inserts(&keys); +} + +#[derive(Clone, Debug)] +struct TestOp(TestKey, Option); + +fn apply_op( + op: &TestOp, + map: &mut HashMapAccess, + shadow: &mut BTreeMap, +) { + // apply the change to the shadow tree first + let shadow_existing = if let Some(v) = op.1 { + shadow.insert(op.0, v) + } else { + shadow.remove(&op.0) + }; + + let entry = map.entry(op.0); + let hash_existing = match op.1 { + Some(new) => match entry { + Entry::Occupied(mut e) => Some(e.insert(new)), + Entry::Vacant(e) => { + _ = e.insert(new).unwrap(); + None + } + }, + None => match entry { + Entry::Occupied(e) => Some(e.remove()), + Entry::Vacant(_) => None, + }, + }; + + assert_eq!(shadow_existing, hash_existing); +} + +fn do_random_ops( + num_ops: usize, + size: u32, + del_prob: f64, + writer: &mut HashMapAccess, + shadow: &mut BTreeMap, + rng: &mut rand::rngs::ThreadRng, +) { + for i in 0..num_ops { + let key: TestKey = ((rng.next_u32() % size) as u128).into(); + let op = TestOp( + key, + if rng.random_bool(del_prob) { + Some(i) + } else { + None + }, + ); + apply_op(&op, writer, shadow); + } +} + +fn do_deletes( + num_ops: usize, + writer: &mut HashMapAccess, + shadow: &mut BTreeMap, +) { + for _ in 0..num_ops { + let (k, _) = shadow.pop_first().unwrap(); + writer.remove(&k); + } +} + +fn do_shrink( + writer: &mut HashMapAccess, + shadow: &mut BTreeMap, + from: u32, + to: u32, +) { + assert!(writer.shrink_goal().is_none()); + writer.begin_shrink(to); + assert_eq!(writer.shrink_goal(), Some(to as usize)); + for i in to..from { + if let Some(entry) = writer.entry_at_bucket(i as usize) { + shadow.remove(&entry._key); + entry.remove(); + } + } + let old_usage = writer.get_num_buckets_in_use(); + writer.finish_shrink().unwrap(); + assert!(writer.shrink_goal().is_none()); + assert_eq!(writer.get_num_buckets_in_use(), old_usage); +} + +#[test] +fn random_ops() { + let mut writer = + HashMapInit::::new_resizeable_named(100000, 120000, "test_random") + .attach_writer(); + let mut shadow: std::collections::BTreeMap = BTreeMap::new(); + + let distribution = Zipf::new(u128::MAX as f64, 1.1).unwrap(); + let mut rng = rand::rng(); + for i in 0..100000 { + let key: TestKey = (rng.sample(distribution) as u128).into(); + + let op = TestOp(key, if rng.random_bool(0.75) { Some(i) } else { None }); + + apply_op(&op, &mut writer, &mut shadow); + } +} + +#[test] +fn test_shuffle() { + let mut writer = HashMapInit::::new_resizeable_named(1000, 1200, "test_shuf") + .attach_writer(); + let mut shadow: std::collections::BTreeMap = BTreeMap::new(); + let mut rng = rand::rng(); + + do_random_ops(10000, 1000, 0.75, &mut writer, &mut shadow, &mut rng); + writer.shuffle(); + do_random_ops(10000, 1000, 0.75, &mut writer, &mut shadow, &mut rng); +} + +#[test] +fn test_grow() { + let mut writer = HashMapInit::::new_resizeable_named(1000, 2000, "test_grow") + .attach_writer(); + let mut shadow: std::collections::BTreeMap = BTreeMap::new(); + let mut rng = rand::rng(); + + do_random_ops(10000, 1000, 0.75, &mut writer, &mut shadow, &mut rng); + let old_usage = writer.get_num_buckets_in_use(); + writer.grow(1500).unwrap(); + assert_eq!(writer.get_num_buckets_in_use(), old_usage); + assert_eq!(writer.get_num_buckets(), 1500); + do_random_ops(10000, 1500, 0.75, &mut writer, &mut shadow, &mut rng); +} 
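For orientation, a minimal writer-side sketch of the public API these tests exercise. The crate/module path and the names `example_map` and `hash_map_sketch` are assumptions for illustration; error handling is elided:

```rust
use neon_shmem::hash::HashMapInit; // path assumed from the libs/neon-shmem layout

fn hash_map_sketch() {
    // Resizable map backed by an anonymous shared memory area:
    // 1000 buckets now, growable in place up to 2000.
    let writer = HashMapInit::<u64, u64>::new_resizeable_named(1000, 2000, "example_map")
        .attach_writer();

    // insert() returns the previous value (if any), or FullError when no bucket is free.
    writer.insert(1, 10).unwrap();

    // Reads go through a lock guard; deref to reach the value.
    assert_eq!(*writer.get(&1).unwrap(), 10);

    // remove() hands back the old value.
    assert_eq!(writer.remove(&1), Some(10));

    // Grow in place, up to the maximum reserved at creation time.
    writer.grow(1500).unwrap();
    assert_eq!(writer.get_num_buckets(), 1500);
}
```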
+ +#[test] +fn test_clear() { + let mut writer = HashMapInit::::new_resizeable_named(1500, 2000, "test_clear") + .attach_writer(); + let mut shadow: std::collections::BTreeMap = BTreeMap::new(); + let mut rng = rand::rng(); + do_random_ops(2000, 1500, 0.75, &mut writer, &mut shadow, &mut rng); + writer.clear(); + assert_eq!(writer.get_num_buckets_in_use(), 0); + assert_eq!(writer.get_num_buckets(), 1500); + while let Some((key, _)) = shadow.pop_first() { + assert!(writer.get(&key).is_none()); + } + do_random_ops(2000, 1500, 0.75, &mut writer, &mut shadow, &mut rng); + for i in 0..(1500 - writer.get_num_buckets_in_use()) { + writer.insert((1500 + i as u128).into(), 0).unwrap(); + } + assert_eq!(writer.insert(5000.into(), 0), Err(FullError {})); + writer.clear(); + assert!(writer.insert(5000.into(), 0).is_ok()); +} + +#[test] +fn test_idx_remove() { + let mut writer = HashMapInit::::new_resizeable_named(1500, 2000, "test_clear") + .attach_writer(); + let mut shadow: std::collections::BTreeMap = BTreeMap::new(); + let mut rng = rand::rng(); + do_random_ops(2000, 1500, 0.25, &mut writer, &mut shadow, &mut rng); + for _ in 0..100 { + let idx = (rng.next_u32() % 1500) as usize; + if let Some(e) = writer.entry_at_bucket(idx) { + shadow.remove(&e._key); + e.remove(); + } + } + while let Some((key, val)) = shadow.pop_first() { + assert_eq!(*writer.get(&key).unwrap(), val); + } +} + +#[test] +fn test_idx_get() { + let mut writer = HashMapInit::::new_resizeable_named(1500, 2000, "test_clear") + .attach_writer(); + let mut shadow: std::collections::BTreeMap = BTreeMap::new(); + let mut rng = rand::rng(); + do_random_ops(2000, 1500, 0.25, &mut writer, &mut shadow, &mut rng); + for _ in 0..100 { + let idx = (rng.next_u32() % 1500) as usize; + if let Some(pair) = writer.get_at_bucket(idx) { + { + let v: *const usize = &pair.1; + assert_eq!(writer.get_bucket_for_value(v), idx); + } + { + let v: *const usize = &pair.1; + assert_eq!(writer.get_bucket_for_value(v), idx); + } + } + } +} + +#[test] +fn test_shrink() { + let mut writer = HashMapInit::::new_resizeable_named(1500, 2000, "test_shrink") + .attach_writer(); + let mut shadow: std::collections::BTreeMap = BTreeMap::new(); + let mut rng = rand::rng(); + + do_random_ops(10000, 1500, 0.75, &mut writer, &mut shadow, &mut rng); + do_shrink(&mut writer, &mut shadow, 1500, 1000); + assert_eq!(writer.get_num_buckets(), 1000); + do_deletes(500, &mut writer, &mut shadow); + do_random_ops(10000, 500, 0.75, &mut writer, &mut shadow, &mut rng); + assert!(writer.get_num_buckets_in_use() <= 1000); +} + +#[test] +fn test_shrink_grow_seq() { + let mut writer = + HashMapInit::::new_resizeable_named(1000, 20000, "test_grow_seq") + .attach_writer(); + let mut shadow: std::collections::BTreeMap = BTreeMap::new(); + let mut rng = rand::rng(); + + do_random_ops(500, 1000, 0.1, &mut writer, &mut shadow, &mut rng); + eprintln!("Shrinking to 750"); + do_shrink(&mut writer, &mut shadow, 1000, 750); + do_random_ops(200, 1000, 0.5, &mut writer, &mut shadow, &mut rng); + eprintln!("Growing to 1500"); + writer.grow(1500).unwrap(); + do_random_ops(600, 1500, 0.1, &mut writer, &mut shadow, &mut rng); + eprintln!("Shrinking to 200"); + while shadow.len() > 100 { + do_deletes(1, &mut writer, &mut shadow); + } + do_shrink(&mut writer, &mut shadow, 1500, 200); + do_random_ops(50, 1500, 0.25, &mut writer, &mut shadow, &mut rng); + eprintln!("Growing to 10k"); + writer.grow(10000).unwrap(); + do_random_ops(10000, 5000, 0.25, &mut writer, &mut shadow, &mut rng); +} + +#[test] +fn 
test_bucket_ops() { + let writer = HashMapInit::::new_resizeable_named(1000, 1200, "test_bucket_ops") + .attach_writer(); + match writer.entry(1.into()) { + Entry::Occupied(mut e) => { + e.insert(2); + } + Entry::Vacant(e) => { + _ = e.insert(2).unwrap(); + } + } + assert_eq!(writer.get_num_buckets_in_use(), 1); + assert_eq!(writer.get_num_buckets(), 1000); + assert_eq!(*writer.get(&1.into()).unwrap(), 2); + let pos = match writer.entry(1.into()) { + Entry::Occupied(e) => { + assert_eq!(e._key, 1.into()); + e.bucket_pos as usize + } + Entry::Vacant(_) => { + panic!("Insert didn't affect entry"); + } + }; + assert_eq!(writer.entry_at_bucket(pos).unwrap()._key, 1.into()); + assert_eq!(*writer.get_at_bucket(pos).unwrap(), (1.into(), 2)); + { + let ptr: *const usize = &*writer.get(&1.into()).unwrap(); + assert_eq!(writer.get_bucket_for_value(ptr), pos); + } + writer.remove(&1.into()); + assert!(writer.get(&1.into()).is_none()); +} + +#[test] +fn test_shrink_zero() { + let mut writer = + HashMapInit::::new_resizeable_named(1500, 2000, "test_shrink_zero") + .attach_writer(); + writer.begin_shrink(0); + for i in 0..1500 { + writer.entry_at_bucket(i).map(|x| x.remove()); + } + writer.finish_shrink().unwrap(); + assert_eq!(writer.get_num_buckets_in_use(), 0); + let entry = writer.entry(1.into()); + if let Entry::Vacant(v) = entry { + assert!(v.insert(2).is_err()); + } else { + panic!("Somehow got non-vacant entry in empty map.") + } + writer.grow(50).unwrap(); + let entry = writer.entry(1.into()); + if let Entry::Vacant(v) = entry { + assert!(v.insert(2).is_ok()); + } else { + panic!("Somehow got non-vacant entry in empty map.") + } + assert_eq!(writer.get_num_buckets_in_use(), 1); +} + +#[test] +#[should_panic] +fn test_grow_oom() { + let writer = HashMapInit::::new_resizeable_named(1500, 2000, "test_grow_oom") + .attach_writer(); + writer.grow(20000).unwrap(); +} + +#[test] +#[should_panic] +fn test_shrink_bigger() { + let mut writer = + HashMapInit::::new_resizeable_named(1500, 2500, "test_shrink_bigger") + .attach_writer(); + writer.begin_shrink(2000); +} + +#[test] +#[should_panic] +fn test_shrink_early_finish() { + let writer = + HashMapInit::::new_resizeable_named(1500, 2500, "test_shrink_early_finish") + .attach_writer(); + writer.finish_shrink().unwrap(); +} + +#[test] +#[should_panic] +fn test_shrink_fixed_size() { + let mut area = [MaybeUninit::uninit(); 10000]; + let init_struct = HashMapInit::::with_fixed(3, &mut area); + let mut writer = init_struct.attach_writer(); + writer.begin_shrink(1); +} diff --git a/libs/neon-shmem/src/lib.rs b/libs/neon-shmem/src/lib.rs index c689959b68..226cc0c22d 100644 --- a/libs/neon-shmem/src/lib.rs +++ b/libs/neon-shmem/src/lib.rs @@ -1,418 +1,3 @@ -//! Shared memory utilities for neon communicator - -use std::num::NonZeroUsize; -use std::os::fd::{AsFd, BorrowedFd, OwnedFd}; -use std::ptr::NonNull; -use std::sync::atomic::{AtomicUsize, Ordering}; - -use nix::errno::Errno; -use nix::sys::mman::MapFlags; -use nix::sys::mman::ProtFlags; -use nix::sys::mman::mmap as nix_mmap; -use nix::sys::mman::munmap as nix_munmap; -use nix::unistd::ftruncate as nix_ftruncate; - -/// ShmemHandle represents a shared memory area that can be shared by processes over fork(). -/// Unlike shared memory allocated by Postgres, this area is resizable, up to 'max_size' that's -/// specified at creation. -/// -/// The area is backed by an anonymous file created with memfd_create(). 
The full address space for -/// 'max_size' is reserved up-front with mmap(), but whenever you call [`ShmemHandle::set_size`], -/// the underlying file is resized. Do not access the area beyond the current size. Currently, that -/// will cause the file to be expanded, but we might use mprotect() etc. to enforce that in the -/// future. -pub struct ShmemHandle { - /// memfd file descriptor - fd: OwnedFd, - - max_size: usize, - - // Pointer to the beginning of the shared memory area. The header is stored there. - shared_ptr: NonNull, - - // Pointer to the beginning of the user data - pub data_ptr: NonNull, -} - -/// This is stored at the beginning in the shared memory area. -struct SharedStruct { - max_size: usize, - - /// Current size of the backing file. The high-order bit is used for the RESIZE_IN_PROGRESS flag - current_size: AtomicUsize, -} - -const RESIZE_IN_PROGRESS: usize = 1 << 63; - -const HEADER_SIZE: usize = std::mem::size_of::(); - -/// Error type returned by the ShmemHandle functions. -#[derive(thiserror::Error, Debug)] -#[error("{msg}: {errno}")] -pub struct Error { - pub msg: String, - pub errno: Errno, -} - -impl Error { - fn new(msg: &str, errno: Errno) -> Error { - Error { - msg: msg.to_string(), - errno, - } - } -} - -impl ShmemHandle { - /// Create a new shared memory area. To communicate between processes, the processes need to be - /// fork()'d after calling this, so that the ShmemHandle is inherited by all processes. - /// - /// If the ShmemHandle is dropped, the memory is unmapped from the current process. Other - /// processes can continue using it, however. - pub fn new(name: &str, initial_size: usize, max_size: usize) -> Result { - // create the backing anonymous file. - let fd = create_backing_file(name)?; - - Self::new_with_fd(fd, initial_size, max_size) - } - - fn new_with_fd( - fd: OwnedFd, - initial_size: usize, - max_size: usize, - ) -> Result { - // We reserve the high-order bit for the RESIZE_IN_PROGRESS flag, and the actual size - // is a little larger than this because of the SharedStruct header. Make the upper limit - // somewhat smaller than that, because with anything close to that, you'll run out of - // memory anyway. - if max_size >= 1 << 48 { - panic!("max size {max_size} too large"); - } - if initial_size > max_size { - panic!("initial size {initial_size} larger than max size {max_size}"); - } - - // The actual initial / max size is the one given by the caller, plus the size of - // 'SharedStruct'. - let initial_size = HEADER_SIZE + initial_size; - let max_size = NonZeroUsize::new(HEADER_SIZE + max_size).unwrap(); - - // Reserve address space for it with mmap - // - // TODO: Use MAP_HUGETLB if possible - let start_ptr = unsafe { - nix_mmap( - None, - max_size, - ProtFlags::PROT_READ | ProtFlags::PROT_WRITE, - MapFlags::MAP_SHARED, - &fd, - 0, - ) - } - .map_err(|e| Error::new("mmap failed: {e}", e))?; - - // Reserve space for the initial size - enlarge_file(fd.as_fd(), initial_size as u64)?; - - // Initialize the header - let shared: NonNull = start_ptr.cast(); - unsafe { - shared.write(SharedStruct { - max_size: max_size.into(), - current_size: AtomicUsize::new(initial_size), - }) - }; - - // The user data begins after the header - let data_ptr = unsafe { start_ptr.cast().add(HEADER_SIZE) }; - - Ok(ShmemHandle { - fd, - max_size: max_size.into(), - shared_ptr: shared, - data_ptr, - }) - } - - // return reference to the header - fn shared(&self) -> &SharedStruct { - unsafe { self.shared_ptr.as_ref() } - } - - /// Resize the shared memory area. 
'new_size' must not be larger than the 'max_size' specified - /// when creating the area. - /// - /// This may only be called from one process/thread concurrently. We detect that case - /// and return an Error. - pub fn set_size(&self, new_size: usize) -> Result<(), Error> { - let new_size = new_size + HEADER_SIZE; - let shared = self.shared(); - - if new_size > self.max_size { - panic!( - "new size ({} is greater than max size ({})", - new_size, self.max_size - ); - } - assert_eq!(self.max_size, shared.max_size); - - // Lock the area by setting the bit in 'current_size' - // - // Ordering::Relaxed would probably be sufficient here, as we don't access any other memory - // and the posix_fallocate/ftruncate call is surely a synchronization point anyway. But - // since this is not performance-critical, better safe than sorry . - let mut old_size = shared.current_size.load(Ordering::Acquire); - loop { - if (old_size & RESIZE_IN_PROGRESS) != 0 { - return Err(Error::new( - "concurrent resize detected", - Errno::UnknownErrno, - )); - } - match shared.current_size.compare_exchange( - old_size, - new_size, - Ordering::Acquire, - Ordering::Relaxed, - ) { - Ok(_) => break, - Err(x) => old_size = x, - } - } - - // Ok, we got the lock. - // - // NB: If anything goes wrong, we *must* clear the bit! - let result = { - use std::cmp::Ordering::{Equal, Greater, Less}; - match new_size.cmp(&old_size) { - Less => nix_ftruncate(&self.fd, new_size as i64).map_err(|e| { - Error::new("could not shrink shmem segment, ftruncate failed: {e}", e) - }), - Equal => Ok(()), - Greater => enlarge_file(self.fd.as_fd(), new_size as u64), - } - }; - - // Unlock - shared.current_size.store( - if result.is_ok() { new_size } else { old_size }, - Ordering::Release, - ); - - result - } - - /// Returns the current user-visible size of the shared memory segment. - /// - /// NOTE: a concurrent set_size() call can change the size at any time. It is the caller's - /// responsibility not to access the area beyond the current size. - pub fn current_size(&self) -> usize { - let total_current_size = - self.shared().current_size.load(Ordering::Relaxed) & !RESIZE_IN_PROGRESS; - total_current_size - HEADER_SIZE - } -} - -impl Drop for ShmemHandle { - fn drop(&mut self) { - // SAFETY: The pointer was obtained from mmap() with the given size. - // We unmap the entire region. - let _ = unsafe { nix_munmap(self.shared_ptr.cast(), self.max_size) }; - // The fd is dropped automatically by OwnedFd. - } -} - -/// Create a "backing file" for the shared memory area. On Linux, use memfd_create(), to create an -/// anonymous in-memory file. One macos, fall back to a regular file. That's good enough for -/// development and testing, but in production we want the file to stay in memory. -/// -/// disable 'unused_variables' warnings, because in the macos path, 'name' is unused. -#[allow(unused_variables)] -fn create_backing_file(name: &str) -> Result { - #[cfg(not(target_os = "macos"))] - { - nix::sys::memfd::memfd_create(name, nix::sys::memfd::MFdFlags::empty()) - .map_err(|e| Error::new("memfd_create failed: {e}", e)) - } - #[cfg(target_os = "macos")] - { - let file = tempfile::tempfile().map_err(|e| { - Error::new( - "could not create temporary file to back shmem area: {e}", - nix::errno::Errno::from_raw(e.raw_os_error().unwrap_or(0)), - ) - })?; - Ok(OwnedFd::from(file)) - } -} - -fn enlarge_file(fd: BorrowedFd, size: u64) -> Result<(), Error> { - // Use posix_fallocate() to enlarge the file. 
It reserves the space correctly, so that - // we don't get a segfault later when trying to actually use it. - #[cfg(not(target_os = "macos"))] - { - nix::fcntl::posix_fallocate(fd, 0, size as i64).map_err(|e| { - Error::new( - "could not grow shmem segment, posix_fallocate failed: {e}", - e, - ) - }) - } - // As a fallback on macos, which doesn't have posix_fallocate, use plain 'fallocate' - #[cfg(target_os = "macos")] - { - nix::unistd::ftruncate(fd, size as i64) - .map_err(|e| Error::new("could not grow shmem segment, ftruncate failed: {e}", e)) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - use nix::unistd::ForkResult; - use std::ops::Range; - - /// check that all bytes in given range have the expected value. - fn assert_range(ptr: *const u8, expected: u8, range: Range) { - for i in range { - let b = unsafe { *(ptr.add(i)) }; - assert_eq!(expected, b, "unexpected byte at offset {i}"); - } - } - - /// Write 'b' to all bytes in the given range - fn write_range(ptr: *mut u8, b: u8, range: Range) { - unsafe { std::ptr::write_bytes(ptr.add(range.start), b, range.end - range.start) }; - } - - // simple single-process test of growing and shrinking - #[test] - fn test_shmem_resize() -> Result<(), Error> { - let max_size = 1024 * 1024; - let init_struct = ShmemHandle::new("test_shmem_resize", 0, max_size)?; - - assert_eq!(init_struct.current_size(), 0); - - // Initial grow - let size1 = 10000; - init_struct.set_size(size1).unwrap(); - assert_eq!(init_struct.current_size(), size1); - - // Write some data - let data_ptr = init_struct.data_ptr.as_ptr(); - write_range(data_ptr, 0xAA, 0..size1); - assert_range(data_ptr, 0xAA, 0..size1); - - // Shrink - let size2 = 5000; - init_struct.set_size(size2).unwrap(); - assert_eq!(init_struct.current_size(), size2); - - // Grow again - let size3 = 20000; - init_struct.set_size(size3).unwrap(); - assert_eq!(init_struct.current_size(), size3); - - // Try to read it. The area that was shrunk and grown again should read as all zeros now - assert_range(data_ptr, 0xAA, 0..5000); - assert_range(data_ptr, 0, 5000..size1); - - // Try to grow beyond max_size - //let size4 = max_size + 1; - //assert!(init_struct.set_size(size4).is_err()); - - // Dropping init_struct should unmap the memory - drop(init_struct); - - Ok(()) - } - - /// This is used in tests to coordinate between test processes. It's like std::sync::Barrier, - /// but is stored in the shared memory area and works across processes. It's implemented by - /// polling, because e.g. standard rust mutexes are not guaranteed to work across processes. - struct SimpleBarrier { - num_procs: usize, - count: AtomicUsize, - } - - impl SimpleBarrier { - unsafe fn init(ptr: *mut SimpleBarrier, num_procs: usize) { - unsafe { - *ptr = SimpleBarrier { - num_procs, - count: AtomicUsize::new(0), - } - } - } - - pub fn wait(&self) { - let old = self.count.fetch_add(1, Ordering::Relaxed); - - let generation = old / self.num_procs; - - let mut current = old + 1; - while current < (generation + 1) * self.num_procs { - std::thread::sleep(std::time::Duration::from_millis(10)); - current = self.count.load(Ordering::Relaxed); - } - } - } - - #[test] - fn test_multi_process() { - // Initialize - let max_size = 1_000_000_000_000; - let init_struct = ShmemHandle::new("test_multi_process", 0, max_size).unwrap(); - let ptr = init_struct.data_ptr.as_ptr(); - - // Store the SimpleBarrier in the first 1k of the area. 
- init_struct.set_size(10000).unwrap(); - let barrier_ptr: *mut SimpleBarrier = unsafe { - ptr.add(ptr.align_offset(std::mem::align_of::())) - .cast() - }; - unsafe { SimpleBarrier::init(barrier_ptr, 2) }; - let barrier = unsafe { barrier_ptr.as_ref().unwrap() }; - - // Fork another test process. The code after this runs in both processes concurrently. - let fork_result = unsafe { nix::unistd::fork().unwrap() }; - - // In the parent, fill bytes between 1000..2000. In the child, between 2000..3000 - if fork_result.is_parent() { - write_range(ptr, 0xAA, 1000..2000); - } else { - write_range(ptr, 0xBB, 2000..3000); - } - barrier.wait(); - // Verify the contents. (in both processes) - assert_range(ptr, 0xAA, 1000..2000); - assert_range(ptr, 0xBB, 2000..3000); - - // Grow, from the child this time - let size = 10_000_000; - if !fork_result.is_parent() { - init_struct.set_size(size).unwrap(); - } - barrier.wait(); - - // make some writes at the end - if fork_result.is_parent() { - write_range(ptr, 0xAA, (size - 10)..size); - } else { - write_range(ptr, 0xBB, (size - 20)..(size - 10)); - } - barrier.wait(); - - // Verify the contents. (This runs in both processes) - assert_range(ptr, 0, (size - 1000)..(size - 20)); - assert_range(ptr, 0xBB, (size - 20)..(size - 10)); - assert_range(ptr, 0xAA, (size - 10)..size); - - if let ForkResult::Parent { child } = fork_result { - nix::sys::wait::waitpid(child, None).unwrap(); - } - } -} +pub mod hash; +pub mod shmem; +pub mod sync; diff --git a/libs/neon-shmem/src/shmem.rs b/libs/neon-shmem/src/shmem.rs new file mode 100644 index 0000000000..f19f402859 --- /dev/null +++ b/libs/neon-shmem/src/shmem.rs @@ -0,0 +1,409 @@ +//! Dynamically resizable contiguous chunk of shared memory + +use std::num::NonZeroUsize; +use std::os::fd::{AsFd, BorrowedFd, OwnedFd}; +use std::ptr::NonNull; +use std::sync::atomic::{AtomicUsize, Ordering}; + +use nix::errno::Errno; +use nix::sys::mman::MapFlags; +use nix::sys::mman::ProtFlags; +use nix::sys::mman::mmap as nix_mmap; +use nix::sys::mman::munmap as nix_munmap; +use nix::unistd::ftruncate as nix_ftruncate; + +/// `ShmemHandle` represents a shared memory area that can be shared by processes over `fork()`. +/// Unlike shared memory allocated by Postgres, this area is resizable, up to `max_size` that's +/// specified at creation. +/// +/// The area is backed by an anonymous file created with `memfd_create()`. The full address space for +/// `max_size` is reserved up-front with `mmap()`, but whenever you call [`ShmemHandle::set_size`], +/// the underlying file is resized. Do not access the area beyond the current size. Currently, that +/// will cause the file to be expanded, but we might use `mprotect()` etc. to enforce that in the +/// future. +pub struct ShmemHandle { + /// memfd file descriptor + fd: OwnedFd, + + max_size: usize, + + // Pointer to the beginning of the shared memory area. The header is stored there. + shared_ptr: NonNull, + + // Pointer to the beginning of the user data + pub data_ptr: NonNull, +} + +/// This is stored at the beginning in the shared memory area. +struct SharedStruct { + max_size: usize, + + /// Current size of the backing file. The high-order bit is used for the [`RESIZE_IN_PROGRESS`] flag. + current_size: AtomicUsize, +} + +const RESIZE_IN_PROGRESS: usize = 1 << 63; + +const HEADER_SIZE: usize = std::mem::size_of::(); + +/// Error type returned by the [`ShmemHandle`] functions. 
+#[derive(thiserror::Error, Debug)] +#[error("{msg}: {errno}")] +pub struct Error { + pub msg: String, + pub errno: Errno, +} + +impl Error { + fn new(msg: &str, errno: Errno) -> Self { + Self { + msg: msg.to_string(), + errno, + } + } +} + +impl ShmemHandle { + /// Create a new shared memory area. To communicate between processes, the processes need to be + /// `fork()`'d after calling this, so that the `ShmemHandle` is inherited by all processes. + /// + /// If the `ShmemHandle` is dropped, the memory is unmapped from the current process. Other + /// processes can continue using it, however. + pub fn new(name: &str, initial_size: usize, max_size: usize) -> Result { + // create the backing anonymous file. + let fd = create_backing_file(name)?; + + Self::new_with_fd(fd, initial_size, max_size) + } + + fn new_with_fd(fd: OwnedFd, initial_size: usize, max_size: usize) -> Result { + // We reserve the high-order bit for the `RESIZE_IN_PROGRESS` flag, and the actual size + // is a little larger than this because of the SharedStruct header. Make the upper limit + // somewhat smaller than that, because with anything close to that, you'll run out of + // memory anyway. + assert!(max_size < 1 << 48, "max size {max_size} too large"); + + assert!( + initial_size <= max_size, + "initial size {initial_size} larger than max size {max_size}" + ); + + // The actual initial / max size is the one given by the caller, plus the size of + // 'SharedStruct'. + let initial_size = HEADER_SIZE + initial_size; + let max_size = NonZeroUsize::new(HEADER_SIZE + max_size).unwrap(); + + // Reserve address space for it with mmap + // + // TODO: Use MAP_HUGETLB if possible + let start_ptr = unsafe { + nix_mmap( + None, + max_size, + ProtFlags::PROT_READ | ProtFlags::PROT_WRITE, + MapFlags::MAP_SHARED, + &fd, + 0, + ) + } + .map_err(|e| Error::new("mmap failed", e))?; + + // Reserve space for the initial size + enlarge_file(fd.as_fd(), initial_size as u64)?; + + // Initialize the header + let shared: NonNull = start_ptr.cast(); + unsafe { + shared.write(SharedStruct { + max_size: max_size.into(), + current_size: AtomicUsize::new(initial_size), + }); + } + + // The user data begins after the header + let data_ptr = unsafe { start_ptr.cast().add(HEADER_SIZE) }; + + Ok(Self { + fd, + max_size: max_size.into(), + shared_ptr: shared, + data_ptr, + }) + } + + // return reference to the header + fn shared(&self) -> &SharedStruct { + unsafe { self.shared_ptr.as_ref() } + } + + /// Resize the shared memory area. `new_size` must not be larger than the `max_size` specified + /// when creating the area. + /// + /// This may only be called from one process/thread concurrently. We detect that case + /// and return an [`shmem::Error`](Error). + pub fn set_size(&self, new_size: usize) -> Result<(), Error> { + let new_size = new_size + HEADER_SIZE; + let shared = self.shared(); + + assert!( + new_size <= self.max_size, + "new size ({new_size}) is greater than max size ({})", + self.max_size + ); + + assert_eq!(self.max_size, shared.max_size); + + // Lock the area by setting the bit in `current_size` + // + // Ordering::Relaxed would probably be sufficient here, as we don't access any other memory + // and the `posix_fallocate`/`ftruncate` call is surely a synchronization point anyway. But + // since this is not performance-critical, better safe than sorry. 
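(As an aside, the idea the loop below relies on — an atomic size word whose high-order bit doubles as a "resize in progress" flag — can be sketched in isolation. The snippet is illustrative only: the explicit flag-claiming step and all names are assumptions for the sketch, not the exact code of `set_size`.)

```rust
use std::sync::atomic::{AtomicUsize, Ordering};

const BUSY: usize = 1 << 63;

/// Illustrative sketch: claim the size word, resize the backing storage, publish the new size.
fn resize_once(size_word: &AtomicUsize, new_size: usize) -> Result<(), &'static str> {
    assert!(new_size & BUSY == 0);
    let old = size_word.load(Ordering::Acquire);
    if old & BUSY != 0 {
        return Err("concurrent resize detected");
    }
    // Claim the word; if another thread or process raced us, report it rather than spin.
    if size_word
        .compare_exchange(old, old | BUSY, Ordering::Acquire, Ordering::Relaxed)
        .is_err()
    {
        return Err("concurrent resize detected");
    }
    // ... grow or shrink the backing file here ...
    size_word.store(new_size, Ordering::Release); // publishing the new size also clears BUSY
    Ok(())
}
```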
+ let mut old_size = shared.current_size.load(Ordering::Acquire); + loop { + if (old_size & RESIZE_IN_PROGRESS) != 0 { + return Err(Error::new( + "concurrent resize detected", + Errno::UnknownErrno, + )); + } + match shared.current_size.compare_exchange( + old_size, + new_size, + Ordering::Acquire, + Ordering::Relaxed, + ) { + Ok(_) => break, + Err(x) => old_size = x, + } + } + + // Ok, we got the lock. + // + // NB: If anything goes wrong, we *must* clear the bit! + let result = { + use std::cmp::Ordering::{Equal, Greater, Less}; + match new_size.cmp(&old_size) { + Less => nix_ftruncate(&self.fd, new_size as i64) + .map_err(|e| Error::new("could not shrink shmem segment, ftruncate failed", e)), + Equal => Ok(()), + Greater => enlarge_file(self.fd.as_fd(), new_size as u64), + } + }; + + // Unlock + shared.current_size.store( + if result.is_ok() { new_size } else { old_size }, + Ordering::Release, + ); + + result + } + + /// Returns the current user-visible size of the shared memory segment. + /// + /// NOTE: a concurrent [`ShmemHandle::set_size()`] call can change the size at any time. + /// It is the caller's responsibility not to access the area beyond the current size. + pub fn current_size(&self) -> usize { + let total_current_size = + self.shared().current_size.load(Ordering::Relaxed) & !RESIZE_IN_PROGRESS; + total_current_size - HEADER_SIZE + } +} + +impl Drop for ShmemHandle { + fn drop(&mut self) { + // SAFETY: The pointer was obtained from mmap() with the given size. + // We unmap the entire region. + let _ = unsafe { nix_munmap(self.shared_ptr.cast(), self.max_size) }; + // The fd is dropped automatically by OwnedFd. + } +} + +/// Create a "backing file" for the shared memory area. On Linux, use `memfd_create()`, to create an +/// anonymous in-memory file. One macos, fall back to a regular file. That's good enough for +/// development and testing, but in production we want the file to stay in memory. +/// +/// Disable unused variables warnings because `name` is unused in the macos path. +#[allow(unused_variables)] +fn create_backing_file(name: &str) -> Result { + #[cfg(not(target_os = "macos"))] + { + nix::sys::memfd::memfd_create(name, nix::sys::memfd::MFdFlags::empty()) + .map_err(|e| Error::new("memfd_create failed", e)) + } + #[cfg(target_os = "macos")] + { + let file = tempfile::tempfile().map_err(|e| { + Error::new( + "could not create temporary file to back shmem area", + nix::errno::Errno::from_raw(e.raw_os_error().unwrap_or(0)), + ) + })?; + Ok(OwnedFd::from(file)) + } +} + +fn enlarge_file(fd: BorrowedFd, size: u64) -> Result<(), Error> { + // Use posix_fallocate() to enlarge the file. It reserves the space correctly, so that + // we don't get a segfault later when trying to actually use it. + #[cfg(not(target_os = "macos"))] + { + nix::fcntl::posix_fallocate(fd, 0, size as i64) + .map_err(|e| Error::new("could not grow shmem segment, posix_fallocate failed", e)) + } + // As a fallback on macos, which doesn't have posix_fallocate, use plain 'fallocate' + #[cfg(target_os = "macos")] + { + nix::unistd::ftruncate(fd, size as i64) + .map_err(|e| Error::new("could not grow shmem segment, ftruncate failed", e)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use nix::unistd::ForkResult; + use std::ops::Range; + + /// check that all bytes in given range have the expected value. 
+ fn assert_range(ptr: *const u8, expected: u8, range: Range) { + for i in range { + let b = unsafe { *(ptr.add(i)) }; + assert_eq!(expected, b, "unexpected byte at offset {i}"); + } + } + + /// Write 'b' to all bytes in the given range + fn write_range(ptr: *mut u8, b: u8, range: Range) { + unsafe { std::ptr::write_bytes(ptr.add(range.start), b, range.end - range.start) }; + } + + // simple single-process test of growing and shrinking + #[test] + fn test_shmem_resize() -> Result<(), Error> { + let max_size = 1024 * 1024; + let init_struct = ShmemHandle::new("test_shmem_resize", 0, max_size)?; + + assert_eq!(init_struct.current_size(), 0); + + // Initial grow + let size1 = 10000; + init_struct.set_size(size1).unwrap(); + assert_eq!(init_struct.current_size(), size1); + + // Write some data + let data_ptr = init_struct.data_ptr.as_ptr(); + write_range(data_ptr, 0xAA, 0..size1); + assert_range(data_ptr, 0xAA, 0..size1); + + // Shrink + let size2 = 5000; + init_struct.set_size(size2).unwrap(); + assert_eq!(init_struct.current_size(), size2); + + // Grow again + let size3 = 20000; + init_struct.set_size(size3).unwrap(); + assert_eq!(init_struct.current_size(), size3); + + // Try to read it. The area that was shrunk and grown again should read as all zeros now + assert_range(data_ptr, 0xAA, 0..5000); + assert_range(data_ptr, 0, 5000..size1); + + // Try to grow beyond max_size + //let size4 = max_size + 1; + //assert!(init_struct.set_size(size4).is_err()); + + // Dropping init_struct should unmap the memory + drop(init_struct); + + Ok(()) + } + + /// This is used in tests to coordinate between test processes. It's like `std::sync::Barrier`, + /// but is stored in the shared memory area and works across processes. It's implemented by + /// polling, because e.g. standard rust mutexes are not guaranteed to work across processes. + struct SimpleBarrier { + num_procs: usize, + count: AtomicUsize, + } + + impl SimpleBarrier { + unsafe fn init(ptr: *mut SimpleBarrier, num_procs: usize) { + unsafe { + *ptr = SimpleBarrier { + num_procs, + count: AtomicUsize::new(0), + } + } + } + + pub fn wait(&self) { + let old = self.count.fetch_add(1, Ordering::Relaxed); + + let generation = old / self.num_procs; + + let mut current = old + 1; + while current < (generation + 1) * self.num_procs { + std::thread::sleep(std::time::Duration::from_millis(10)); + current = self.count.load(Ordering::Relaxed); + } + } + } + + #[test] + fn test_multi_process() { + // Initialize + let max_size = 1_000_000_000_000; + let init_struct = ShmemHandle::new("test_multi_process", 0, max_size).unwrap(); + let ptr = init_struct.data_ptr.as_ptr(); + + // Store the SimpleBarrier in the first 1k of the area. + init_struct.set_size(10000).unwrap(); + let barrier_ptr: *mut SimpleBarrier = unsafe { + ptr.add(ptr.align_offset(std::mem::align_of::())) + .cast() + }; + unsafe { SimpleBarrier::init(barrier_ptr, 2) }; + let barrier = unsafe { barrier_ptr.as_ref().unwrap() }; + + // Fork another test process. The code after this runs in both processes concurrently. + let fork_result = unsafe { nix::unistd::fork().unwrap() }; + + // In the parent, fill bytes between 1000..2000. In the child, between 2000..3000 + if fork_result.is_parent() { + write_range(ptr, 0xAA, 1000..2000); + } else { + write_range(ptr, 0xBB, 2000..3000); + } + barrier.wait(); + // Verify the contents. 
(in both processes) + assert_range(ptr, 0xAA, 1000..2000); + assert_range(ptr, 0xBB, 2000..3000); + + // Grow, from the child this time + let size = 10_000_000; + if !fork_result.is_parent() { + init_struct.set_size(size).unwrap(); + } + barrier.wait(); + + // make some writes at the end + if fork_result.is_parent() { + write_range(ptr, 0xAA, (size - 10)..size); + } else { + write_range(ptr, 0xBB, (size - 20)..(size - 10)); + } + barrier.wait(); + + // Verify the contents. (This runs in both processes) + assert_range(ptr, 0, (size - 1000)..(size - 20)); + assert_range(ptr, 0xBB, (size - 20)..(size - 10)); + assert_range(ptr, 0xAA, (size - 10)..size); + + if let ForkResult::Parent { child } = fork_result { + nix::sys::wait::waitpid(child, None).unwrap(); + } + } +} diff --git a/libs/neon-shmem/src/sync.rs b/libs/neon-shmem/src/sync.rs new file mode 100644 index 0000000000..95719778ba --- /dev/null +++ b/libs/neon-shmem/src/sync.rs @@ -0,0 +1,111 @@ +//! Simple utilities akin to what's in [`std::sync`] but designed to work with shared memory. + +use std::mem::MaybeUninit; +use std::ptr::NonNull; + +use nix::errno::Errno; + +pub type RwLock = lock_api::RwLock; +pub type RwLockReadGuard<'a, T> = lock_api::RwLockReadGuard<'a, PthreadRwLock, T>; +pub type RwLockWriteGuard<'a, T> = lock_api::RwLockWriteGuard<'a, PthreadRwLock, T>; +pub type ValueReadGuard<'a, T> = lock_api::MappedRwLockReadGuard<'a, PthreadRwLock, T>; +pub type ValueWriteGuard<'a, T> = lock_api::MappedRwLockWriteGuard<'a, PthreadRwLock, T>; + +/// Shared memory read-write lock. +pub struct PthreadRwLock(Option>); + +/// Simple macro that calls a function in the libc namespace and panics if return value is nonzero. +macro_rules! libc_checked { + ($fn_name:ident ( $($arg:expr),* )) => {{ + let res = libc::$fn_name($($arg),*); + if res != 0 { + panic!("{} failed with {}", stringify!($fn_name), Errno::from_raw(res)); + } + }}; +} + +impl PthreadRwLock { + /// Creates a new `PthreadRwLock` on top of a pointer to a pthread rwlock. + /// + /// # Safety + /// `lock` must be non-null. Every unsafe operation will panic in the event of an error. + pub unsafe fn new(lock: *mut libc::pthread_rwlock_t) -> Self { + unsafe { + let mut attrs = MaybeUninit::uninit(); + libc_checked!(pthread_rwlockattr_init(attrs.as_mut_ptr())); + libc_checked!(pthread_rwlockattr_setpshared( + attrs.as_mut_ptr(), + libc::PTHREAD_PROCESS_SHARED + )); + libc_checked!(pthread_rwlock_init(lock, attrs.as_mut_ptr())); + // Safety: POSIX specifies that "any function affecting the attributes + // object (including destruction) shall not affect any previously + // initialized read-write locks". 
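(The attribute dance above — initialize the attributes, mark them `PTHREAD_PROCESS_SHARED`, initialize the lock, then destroy the attributes — is the same for any process-shared pthread primitive. As a standalone, hedged illustration, not part of this module, here is the equivalent for a pthread mutex, assuming the memory behind `mutex` is already mapped into every participating process.)

```rust
use std::mem::MaybeUninit;

/// Illustrative sketch: initialize a process-shared pthread mutex placed in shared memory.
///
/// # Safety
/// `mutex` must be non-null, properly aligned, and backed by memory mapped into
/// every process that will take the lock.
unsafe fn init_pshared_mutex(mutex: *mut libc::pthread_mutex_t) {
    unsafe {
        let mut attr = MaybeUninit::<libc::pthread_mutexattr_t>::uninit();
        assert_eq!(libc::pthread_mutexattr_init(attr.as_mut_ptr()), 0);
        assert_eq!(
            libc::pthread_mutexattr_setpshared(attr.as_mut_ptr(), libc::PTHREAD_PROCESS_SHARED),
            0
        );
        assert_eq!(libc::pthread_mutex_init(mutex, attr.as_ptr()), 0);
        // As in the rwlock case above, destroying the attributes object does not
        // affect locks that were already initialized from it.
        assert_eq!(libc::pthread_mutexattr_destroy(attr.as_mut_ptr()), 0);
    }
}
```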
+ libc_checked!(pthread_rwlockattr_destroy(attrs.as_mut_ptr())); + Self(Some(NonNull::new_unchecked(lock))) + } + } + + fn inner(&self) -> NonNull { + match self.0 { + None => { + panic!("PthreadRwLock constructed badly - something likely used RawRwLock::INIT") + } + Some(x) => x, + } + } +} + +unsafe impl lock_api::RawRwLock for PthreadRwLock { + type GuardMarker = lock_api::GuardSend; + const INIT: Self = Self(None); + + fn try_lock_shared(&self) -> bool { + unsafe { + let res = libc::pthread_rwlock_tryrdlock(self.inner().as_ptr()); + match res { + 0 => true, + libc::EAGAIN => false, + _ => panic!( + "pthread_rwlock_tryrdlock failed with {}", + Errno::from_raw(res) + ), + } + } + } + + fn try_lock_exclusive(&self) -> bool { + unsafe { + let res = libc::pthread_rwlock_trywrlock(self.inner().as_ptr()); + match res { + 0 => true, + libc::EAGAIN => false, + _ => panic!("try_wrlock failed with {}", Errno::from_raw(res)), + } + } + } + + fn lock_shared(&self) { + unsafe { + libc_checked!(pthread_rwlock_rdlock(self.inner().as_ptr())); + } + } + + fn lock_exclusive(&self) { + unsafe { + libc_checked!(pthread_rwlock_wrlock(self.inner().as_ptr())); + } + } + + unsafe fn unlock_exclusive(&self) { + unsafe { + libc_checked!(pthread_rwlock_unlock(self.inner().as_ptr())); + } + } + + unsafe fn unlock_shared(&self) { + unsafe { + libc_checked!(pthread_rwlock_unlock(self.inner().as_ptr())); + } + } +} diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index f01c65d1bd..2a8d05f51e 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -394,7 +394,7 @@ impl From<&OtelExporterConfig> for tracing_utils::ExportConfig { tracing_utils::ExportConfig { endpoint: Some(val.endpoint.clone()), protocol: val.protocol.into(), - timeout: val.timeout, + timeout: Some(val.timeout), } } } diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index 8f86b03f72..1248be0b5c 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -596,6 +596,7 @@ pub struct TimelineImportRequest { pub timeline_id: TimelineId, pub start_lsn: Lsn, pub sk_set: Vec, + pub force_upsert: bool, } #[derive(serde::Serialize, serde::Deserialize, Clone)] diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 102bbee879..4e8fabfa72 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -981,12 +981,12 @@ mod tests { let mut rng = rand::rngs::StdRng::seed_from_u64(42); let key = Key { - field1: rng.r#gen(), - field2: rng.r#gen(), - field3: rng.r#gen(), - field4: rng.r#gen(), - field5: rng.r#gen(), - field6: rng.r#gen(), + field1: rng.random(), + field2: rng.random(), + field3: rng.random(), + field4: rng.random(), + field5: rng.random(), + field6: rng.random(), }; assert_eq!(key, Key::from_str(&format!("{key}")).unwrap()); diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 11e02a8550..7c7c65fb70 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -443,9 +443,9 @@ pub struct ImportPgdataIdempotencyKey(pub String); impl ImportPgdataIdempotencyKey { pub fn random() -> Self { use rand::Rng; - use rand::distributions::Alphanumeric; + use rand::distr::Alphanumeric; Self( - rand::thread_rng() + rand::rng() .sample_iter(&Alphanumeric) .take(20) .map(char::from) diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index d6f4cd5e66..74f5f14f87 
100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -69,22 +69,6 @@ impl Hash for ShardIdentity { } } -/// Stripe size in number of pages -#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)] -pub struct ShardStripeSize(pub u32); - -impl Default for ShardStripeSize { - fn default() -> Self { - DEFAULT_STRIPE_SIZE - } -} - -impl std::fmt::Display for ShardStripeSize { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - self.0.fmt(f) - } -} - /// Layout version: for future upgrades where we might change how the key->shard mapping works #[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Hash, Debug)] pub struct ShardLayout(u8); diff --git a/libs/pageserver_api/src/upcall_api.rs b/libs/pageserver_api/src/upcall_api.rs index 07cada2eb1..fa2c896edb 100644 --- a/libs/pageserver_api/src/upcall_api.rs +++ b/libs/pageserver_api/src/upcall_api.rs @@ -21,6 +21,14 @@ pub struct ReAttachRequest { /// if the node already has a node_id set. #[serde(skip_serializing_if = "Option::is_none", default)] pub register: Option, + + /// Hadron: Optional flag to indicate whether the node is starting with an empty local disk. + /// Will be set to true if the node couldn't find any local tenant data on startup, which could + /// be due to the node starting for the first time or due to a local SSD failure/disk wipe event. + /// The flag may be used by the storage controller to update its observed state of the world + /// to make sure that it sends explicit location_config calls to the node following the + /// re-attach request. + pub empty_local_disk: Option, } #[derive(Serialize, Deserialize, Debug)] diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 851d824291..20afa8bb46 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -749,7 +749,18 @@ impl PostgresBackend { trace!("got query {query_string:?}"); if let Err(e) = handler.process_query(self, query_string).await { match e { - QueryError::Shutdown => return Ok(ProcessMsgResult::Break), + err @ QueryError::Shutdown => { + // Notify postgres of the connection shutdown at the libpq + // protocol level. This saves postgres from having to tell an idle + // connection apart from a stale one, which is bug-prone.
+ let shutdown_error = short_error(&err); + self.write_message_noflush(&BeMessage::ErrorResponse( + &shutdown_error, + Some(err.pg_error_code()), + ))?; + + return Ok(ProcessMsgResult::Break); + } QueryError::SimulatedConnectionError => { return Err(QueryError::SimulatedConnectionError); } diff --git a/libs/postgres_ffi/build.rs b/libs/postgres_ffi/build.rs index cdebd43f6f..190d9a78c4 100644 --- a/libs/postgres_ffi/build.rs +++ b/libs/postgres_ffi/build.rs @@ -110,7 +110,6 @@ fn main() -> anyhow::Result<()> { .allowlist_type("XLogRecPtr") .allowlist_type("XLogSegNo") .allowlist_type("TimeLineID") - .allowlist_type("TimestampTz") .allowlist_type("MultiXactId") .allowlist_type("MultiXactOffset") .allowlist_type("MultiXactStatus") diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index 9297ac46c9..a88b520a41 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -227,8 +227,7 @@ pub mod walrecord; // Export some widely used datatypes that are unlikely to change across Postgres versions pub use v14::bindings::{ BlockNumber, CheckPoint, ControlFileData, MultiXactId, OffsetNumber, Oid, PageHeaderData, - RepOriginId, TimeLineID, TimestampTz, TransactionId, XLogRecPtr, XLogRecord, XLogSegNo, uint32, - uint64, + RepOriginId, TimeLineID, TransactionId, XLogRecPtr, XLogRecord, XLogSegNo, uint32, uint64, }; // Likewise for these, although the assumption that these don't change is a little more iffy. pub use v14::bindings::{MultiXactOffset, MultiXactStatus}; diff --git a/libs/postgres_ffi/src/walrecord.rs b/libs/postgres_ffi/src/walrecord.rs index d593123dc0..7ed07b0e77 100644 --- a/libs/postgres_ffi/src/walrecord.rs +++ b/libs/postgres_ffi/src/walrecord.rs @@ -4,13 +4,14 @@ //! TODO: Generate separate types for each supported PG version use bytes::{Buf, Bytes}; +use postgres_ffi_types::TimestampTz; use serde::{Deserialize, Serialize}; use utils::bin_ser::DeserializeError; use utils::lsn::Lsn; use crate::{ BLCKSZ, BlockNumber, MultiXactId, MultiXactOffset, MultiXactStatus, Oid, PgMajorVersion, - RepOriginId, TimestampTz, TransactionId, XLOG_SIZE_OF_XLOG_RECORD, XLogRecord, pg_constants, + RepOriginId, TransactionId, XLOG_SIZE_OF_XLOG_RECORD, XLogRecord, pg_constants, }; #[repr(C)] @@ -863,7 +864,8 @@ pub mod v17 { XlHeapDelete, XlHeapInsert, XlHeapLock, XlHeapMultiInsert, XlHeapUpdate, XlParameterChange, rm_neon, }; - pub use crate::{TimeLineID, TimestampTz}; + pub use crate::TimeLineID; + pub use postgres_ffi_types::TimestampTz; #[repr(C)] #[derive(Debug)] diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index f7b6296053..134baf5ff7 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -9,10 +9,11 @@ use super::super::waldecoder::WalStreamDecoder; use super::bindings::{ - CheckPoint, ControlFileData, DBState_DB_SHUTDOWNED, FullTransactionId, TimeLineID, TimestampTz, + CheckPoint, ControlFileData, DBState_DB_SHUTDOWNED, FullTransactionId, TimeLineID, XLogLongPageHeaderData, XLogPageHeaderData, XLogRecPtr, XLogRecord, XLogSegNo, XLOG_PAGE_MAGIC, MY_PGVERSION }; +use postgres_ffi_types::TimestampTz; use super::wal_generator::LogicalMessageGenerator; use crate::pg_constants; use crate::PG_TLI; diff --git a/libs/postgres_ffi_types/src/lib.rs b/libs/postgres_ffi_types/src/lib.rs index 84ef499b9f..86e8259e8a 100644 --- a/libs/postgres_ffi_types/src/lib.rs +++ b/libs/postgres_ffi_types/src/lib.rs @@ -11,3 +11,4 @@ pub mod forknum; pub type Oid = u32; pub type RepOriginId = u16; +pub 
type TimestampTz = i64; diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index 482dd9a298..5ecb4badf1 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -203,12 +203,12 @@ impl fmt::Display for CancelKeyData { } } -use rand::distributions::{Distribution, Standard}; -impl Distribution for Standard { +use rand::distr::{Distribution, StandardUniform}; +impl Distribution for StandardUniform { fn sample(&self, rng: &mut R) -> CancelKeyData { CancelKeyData { - backend_pid: rng.r#gen(), - cancel_key: rng.r#gen(), + backend_pid: rng.random(), + cancel_key: rng.random(), } } } diff --git a/libs/proxy/postgres-protocol2/src/authentication/sasl.rs b/libs/proxy/postgres-protocol2/src/authentication/sasl.rs index 274c81c500..cfa59a34f4 100644 --- a/libs/proxy/postgres-protocol2/src/authentication/sasl.rs +++ b/libs/proxy/postgres-protocol2/src/authentication/sasl.rs @@ -155,10 +155,10 @@ pub struct ScramSha256 { fn nonce() -> String { // rand 0.5's ThreadRng is cryptographically secure - let mut rng = rand::thread_rng(); + let mut rng = rand::rng(); (0..NONCE_LENGTH) .map(|_| { - let mut v = rng.gen_range(0x21u8..0x7e); + let mut v = rng.random_range(0x21u8..0x7e); if v == 0x2c { v = 0x7e } diff --git a/libs/proxy/postgres-protocol2/src/message/backend.rs b/libs/proxy/postgres-protocol2/src/message/backend.rs index 3fc9a9335c..b1728ef37d 100644 --- a/libs/proxy/postgres-protocol2/src/message/backend.rs +++ b/libs/proxy/postgres-protocol2/src/message/backend.rs @@ -74,7 +74,6 @@ impl Header { } /// An enum representing Postgres backend messages. -#[non_exhaustive] pub enum Message { AuthenticationCleartextPassword, AuthenticationGss, @@ -145,16 +144,7 @@ impl Message { PARSE_COMPLETE_TAG => Message::ParseComplete, BIND_COMPLETE_TAG => Message::BindComplete, CLOSE_COMPLETE_TAG => Message::CloseComplete, - NOTIFICATION_RESPONSE_TAG => { - let process_id = buf.read_i32::()?; - let channel = buf.read_cstr()?; - let message = buf.read_cstr()?; - Message::NotificationResponse(NotificationResponseBody { - process_id, - channel, - message, - }) - } + NOTIFICATION_RESPONSE_TAG => Message::NotificationResponse(NotificationResponseBody {}), COPY_DONE_TAG => Message::CopyDone, COMMAND_COMPLETE_TAG => { let tag = buf.read_cstr()?; @@ -543,28 +533,7 @@ impl NoticeResponseBody { } } -pub struct NotificationResponseBody { - process_id: i32, - channel: Bytes, - message: Bytes, -} - -impl NotificationResponseBody { - #[inline] - pub fn process_id(&self) -> i32 { - self.process_id - } - - #[inline] - pub fn channel(&self) -> io::Result<&str> { - get_str(&self.channel) - } - - #[inline] - pub fn message(&self) -> io::Result<&str> { - get_str(&self.message) - } -} +pub struct NotificationResponseBody {} pub struct ParameterDescriptionBody { storage: Bytes, diff --git a/libs/proxy/postgres-protocol2/src/password/mod.rs b/libs/proxy/postgres-protocol2/src/password/mod.rs index e00ca1e34c..8926710225 100644 --- a/libs/proxy/postgres-protocol2/src/password/mod.rs +++ b/libs/proxy/postgres-protocol2/src/password/mod.rs @@ -28,7 +28,7 @@ const SCRAM_DEFAULT_SALT_LEN: usize = 16; /// special characters that would require escaping in an SQL command. 
pub async fn scram_sha_256(password: &[u8]) -> String { let mut salt: [u8; SCRAM_DEFAULT_SALT_LEN] = [0; SCRAM_DEFAULT_SALT_LEN]; - let mut rng = rand::thread_rng(); + let mut rng = rand::rng(); rng.fill_bytes(&mut salt); scram_sha_256_salt(password, salt).await } diff --git a/libs/proxy/tokio-postgres2/src/client.rs b/libs/proxy/tokio-postgres2/src/client.rs index 828884ffd8..068566e955 100644 --- a/libs/proxy/tokio-postgres2/src/client.rs +++ b/libs/proxy/tokio-postgres2/src/client.rs @@ -13,7 +13,7 @@ use serde::{Deserialize, Serialize}; use tokio::sync::mpsc; use crate::cancel_token::RawCancelToken; -use crate::codec::{BackendMessages, FrontendMessage}; +use crate::codec::{BackendMessages, FrontendMessage, RecordNotices}; use crate::config::{Host, SslMode}; use crate::query::RowStream; use crate::simple_query::SimpleQueryStream; @@ -221,6 +221,18 @@ impl Client { &mut self.inner } + pub fn record_notices(&mut self, limit: usize) -> mpsc::UnboundedReceiver> { + let (tx, rx) = mpsc::unbounded_channel(); + + let notices = RecordNotices { sender: tx, limit }; + self.inner + .sender + .send(FrontendMessage::RecordNotices(notices)) + .ok(); + + rx + } + /// Pass text directly to the Postgres backend to allow it to sort out typing itself and /// to save a roundtrip pub async fn query_raw_txt( diff --git a/libs/proxy/tokio-postgres2/src/codec.rs b/libs/proxy/tokio-postgres2/src/codec.rs index daa5371426..813faa0e35 100644 --- a/libs/proxy/tokio-postgres2/src/codec.rs +++ b/libs/proxy/tokio-postgres2/src/codec.rs @@ -3,10 +3,17 @@ use std::io; use bytes::{Bytes, BytesMut}; use fallible_iterator::FallibleIterator; use postgres_protocol2::message::backend; +use tokio::sync::mpsc::UnboundedSender; use tokio_util::codec::{Decoder, Encoder}; pub enum FrontendMessage { Raw(Bytes), + RecordNotices(RecordNotices), +} + +pub struct RecordNotices { + pub sender: UnboundedSender>, + pub limit: usize, } pub enum BackendMessage { @@ -33,14 +40,11 @@ impl FallibleIterator for BackendMessages { pub struct PostgresCodec; -impl Encoder for PostgresCodec { +impl Encoder for PostgresCodec { type Error = io::Error; - fn encode(&mut self, item: FrontendMessage, dst: &mut BytesMut) -> io::Result<()> { - match item { - FrontendMessage::Raw(buf) => dst.extend_from_slice(&buf), - } - + fn encode(&mut self, item: Bytes, dst: &mut BytesMut) -> io::Result<()> { + dst.extend_from_slice(&item); Ok(()) } } diff --git a/libs/proxy/tokio-postgres2/src/connect.rs b/libs/proxy/tokio-postgres2/src/connect.rs index 4a07eccf9a..2f718e1e7d 100644 --- a/libs/proxy/tokio-postgres2/src/connect.rs +++ b/libs/proxy/tokio-postgres2/src/connect.rs @@ -1,11 +1,9 @@ use std::net::IpAddr; -use postgres_protocol2::message::backend::Message; use tokio::net::TcpStream; use tokio::sync::mpsc; use crate::client::SocketConfig; -use crate::codec::BackendMessage; use crate::config::Host; use crate::connect_raw::connect_raw; use crate::connect_socket::connect_socket; @@ -48,8 +46,8 @@ where let stream = connect_tls(socket, config.ssl_mode, tls).await?; let RawConnection { stream, - parameters, - delayed_notice, + parameters: _, + delayed_notice: _, process_id, secret_key, } = connect_raw(stream, config).await?; @@ -72,13 +70,7 @@ where secret_key, ); - // delayed notices are always sent as "Async" messages. 
- let delayed = delayed_notice - .into_iter() - .map(|m| BackendMessage::Async(Message::NoticeResponse(m))) - .collect(); - - let connection = Connection::new(stream, delayed, parameters, conn_tx, conn_rx); + let connection = Connection::new(stream, conn_tx, conn_rx); Ok((client, connection)) } diff --git a/libs/proxy/tokio-postgres2/src/connect_raw.rs b/libs/proxy/tokio-postgres2/src/connect_raw.rs index b89a600a2e..462e1be1aa 100644 --- a/libs/proxy/tokio-postgres2/src/connect_raw.rs +++ b/libs/proxy/tokio-postgres2/src/connect_raw.rs @@ -3,7 +3,7 @@ use std::io; use std::pin::Pin; use std::task::{Context, Poll}; -use bytes::BytesMut; +use bytes::{Bytes, BytesMut}; use fallible_iterator::FallibleIterator; use futures_util::{Sink, SinkExt, Stream, TryStreamExt, ready}; use postgres_protocol2::authentication::sasl; @@ -14,7 +14,7 @@ use tokio::io::{AsyncRead, AsyncWrite}; use tokio_util::codec::Framed; use crate::Error; -use crate::codec::{BackendMessage, BackendMessages, FrontendMessage, PostgresCodec}; +use crate::codec::{BackendMessage, BackendMessages, PostgresCodec}; use crate::config::{self, AuthKeys, Config}; use crate::maybe_tls_stream::MaybeTlsStream; use crate::tls::TlsStream; @@ -25,7 +25,7 @@ pub struct StartupStream { delayed_notice: Vec, } -impl Sink for StartupStream +impl Sink for StartupStream where S: AsyncRead + AsyncWrite + Unpin, T: AsyncRead + AsyncWrite + Unpin, @@ -36,7 +36,7 @@ where Pin::new(&mut self.inner).poll_ready(cx) } - fn start_send(mut self: Pin<&mut Self>, item: FrontendMessage) -> io::Result<()> { + fn start_send(mut self: Pin<&mut Self>, item: Bytes) -> io::Result<()> { Pin::new(&mut self.inner).start_send(item) } @@ -120,10 +120,7 @@ where let mut buf = BytesMut::new(); frontend::startup_message(&config.server_params, &mut buf).map_err(Error::encode)?; - stream - .send(FrontendMessage::Raw(buf.freeze())) - .await - .map_err(Error::io) + stream.send(buf.freeze()).await.map_err(Error::io) } async fn authenticate(stream: &mut StartupStream, config: &Config) -> Result<(), Error> @@ -191,10 +188,7 @@ where let mut buf = BytesMut::new(); frontend::password_message(password, &mut buf).map_err(Error::encode)?; - stream - .send(FrontendMessage::Raw(buf.freeze())) - .await - .map_err(Error::io) + stream.send(buf.freeze()).await.map_err(Error::io) } async fn authenticate_sasl( @@ -253,10 +247,7 @@ where let mut buf = BytesMut::new(); frontend::sasl_initial_response(mechanism, scram.message(), &mut buf).map_err(Error::encode)?; - stream - .send(FrontendMessage::Raw(buf.freeze())) - .await - .map_err(Error::io)?; + stream.send(buf.freeze()).await.map_err(Error::io)?; let body = match stream.try_next().await.map_err(Error::io)? { Some(Message::AuthenticationSaslContinue(body)) => body, @@ -272,10 +263,7 @@ where let mut buf = BytesMut::new(); frontend::sasl_response(scram.message(), &mut buf).map_err(Error::encode)?; - stream - .send(FrontendMessage::Raw(buf.freeze())) - .await - .map_err(Error::io)?; + stream.send(buf.freeze()).await.map_err(Error::io)?; let body = match stream.try_next().await.map_err(Error::io)? 
{ Some(Message::AuthenticationSaslFinal(body)) => body, diff --git a/libs/proxy/tokio-postgres2/src/connection.rs b/libs/proxy/tokio-postgres2/src/connection.rs index fe0372b266..c43a22ffe7 100644 --- a/libs/proxy/tokio-postgres2/src/connection.rs +++ b/libs/proxy/tokio-postgres2/src/connection.rs @@ -1,22 +1,23 @@ -use std::collections::{HashMap, VecDeque}; use std::future::Future; use std::pin::Pin; use std::task::{Context, Poll}; use bytes::BytesMut; -use futures_util::{Sink, Stream, ready}; -use postgres_protocol2::message::backend::Message; +use fallible_iterator::FallibleIterator; +use futures_util::{Sink, StreamExt, ready}; +use postgres_protocol2::message::backend::{Message, NoticeResponseBody}; use postgres_protocol2::message::frontend; use tokio::io::{AsyncRead, AsyncWrite}; use tokio::sync::mpsc; use tokio_util::codec::Framed; use tokio_util::sync::PollSender; -use tracing::{info, trace}; +use tracing::trace; -use crate::codec::{BackendMessage, BackendMessages, FrontendMessage, PostgresCodec}; -use crate::error::DbError; +use crate::Error; +use crate::codec::{ + BackendMessage, BackendMessages, FrontendMessage, PostgresCodec, RecordNotices, +}; use crate::maybe_tls_stream::MaybeTlsStream; -use crate::{AsyncMessage, Error, Notification}; #[derive(PartialEq, Debug)] enum State { @@ -33,18 +34,18 @@ enum State { /// occurred, or because its associated `Client` has dropped and all outstanding work has completed. #[must_use = "futures do nothing unless polled"] pub struct Connection { - /// HACK: we need this in the Neon Proxy. - pub stream: Framed, PostgresCodec>, - /// HACK: we need this in the Neon Proxy to forward params. - pub parameters: HashMap, + stream: Framed, PostgresCodec>, sender: PollSender, receiver: mpsc::UnboundedReceiver, + notices: Option, - pending_responses: VecDeque, + pending_response: Option, state: State, } +pub enum Never {} + impl Connection where S: AsyncRead + AsyncWrite + Unpin, @@ -52,70 +53,42 @@ where { pub(crate) fn new( stream: Framed, PostgresCodec>, - pending_responses: VecDeque, - parameters: HashMap, sender: mpsc::Sender, receiver: mpsc::UnboundedReceiver, ) -> Connection { Connection { stream, - parameters, sender: PollSender::new(sender), receiver, - pending_responses, + notices: None, + pending_response: None, state: State::Active, } } - fn poll_response( - &mut self, - cx: &mut Context<'_>, - ) -> Poll>> { - if let Some(message) = self.pending_responses.pop_front() { - trace!("retrying pending response"); - return Poll::Ready(Some(Ok(message))); - } - - Pin::new(&mut self.stream) - .poll_next(cx) - .map(|o| o.map(|r| r.map_err(Error::io))) - } - /// Read and process messages from the connection to postgres. /// client <- postgres - fn poll_read(&mut self, cx: &mut Context<'_>) -> Poll> { + fn poll_read(&mut self, cx: &mut Context<'_>) -> Poll> { loop { - let message = match self.poll_response(cx)? 
{ - Poll::Ready(Some(message)) => message, - Poll::Ready(None) => return Poll::Ready(Err(Error::closed())), - Poll::Pending => { - trace!("poll_read: waiting on response"); - return Poll::Pending; - } - }; - - let messages = match message { - BackendMessage::Async(Message::NoticeResponse(body)) => { - let error = DbError::parse(&mut body.fields()).map_err(Error::parse)?; - return Poll::Ready(Ok(AsyncMessage::Notice(error))); - } - BackendMessage::Async(Message::NotificationResponse(body)) => { - let notification = Notification { - process_id: body.process_id(), - channel: body.channel().map_err(Error::parse)?.to_string(), - payload: body.message().map_err(Error::parse)?.to_string(), + let messages = match self.pending_response.take() { + Some(messages) => messages, + None => { + let message = match self.stream.poll_next_unpin(cx) { + Poll::Pending => return Poll::Pending, + Poll::Ready(None) => return Poll::Ready(Err(Error::closed())), + Poll::Ready(Some(Err(e))) => return Poll::Ready(Err(Error::io(e))), + Poll::Ready(Some(Ok(message))) => message, }; - return Poll::Ready(Ok(AsyncMessage::Notification(notification))); + + match message { + BackendMessage::Async(Message::NoticeResponse(body)) => { + self.handle_notice(body)?; + continue; + } + BackendMessage::Async(_) => continue, + BackendMessage::Normal { messages } => messages, + } } - BackendMessage::Async(Message::ParameterStatus(body)) => { - self.parameters.insert( - body.name().map_err(Error::parse)?.to_string(), - body.value().map_err(Error::parse)?.to_string(), - ); - continue; - } - BackendMessage::Async(_) => unreachable!(), - BackendMessage::Normal { messages } => messages, }; match self.sender.poll_reserve(cx) { @@ -126,8 +99,7 @@ where return Poll::Ready(Err(Error::closed())); } Poll::Pending => { - self.pending_responses - .push_back(BackendMessage::Normal { messages }); + self.pending_response = Some(messages); trace!("poll_read: waiting on sender"); return Poll::Pending; } @@ -135,6 +107,31 @@ where } } + fn handle_notice(&mut self, body: NoticeResponseBody) -> Result<(), Error> { + let Some(notices) = &mut self.notices else { + return Ok(()); + }; + + let mut fields = body.fields(); + while let Some(field) = fields.next().map_err(Error::parse)? { + // loop until we find the message field + if field.type_() == b'M' { + // if the message field is within the limit, send it. + if let Some(new_limit) = notices.limit.checked_sub(field.value().len()) { + match notices.sender.send(field.value().into()) { + // set the new limit. + Ok(()) => notices.limit = new_limit, + // closed. + Err(_) => self.notices = None, + } + } + break; + } + } + + Ok(()) + } + /// Fetch the next client request and enqueue the response sender. fn poll_request(&mut self, cx: &mut Context<'_>) -> Poll> { if self.receiver.is_closed() { @@ -168,21 +165,23 @@ where match self.poll_request(cx) { // send the message to postgres - Poll::Ready(Some(request)) => { + Poll::Ready(Some(FrontendMessage::Raw(request))) => { Pin::new(&mut self.stream) .start_send(request) .map_err(Error::io)?; } + Poll::Ready(Some(FrontendMessage::RecordNotices(notices))) => { + self.notices = Some(notices) + } // No more messages from the client, and no more responses to wait for. 
// Send a terminate message to postgres Poll::Ready(None) => { trace!("poll_write: at eof, terminating"); let mut request = BytesMut::new(); frontend::terminate(&mut request); - let request = FrontendMessage::Raw(request.freeze()); Pin::new(&mut self.stream) - .start_send(request) + .start_send(request.freeze()) .map_err(Error::io)?; trace!("poll_write: sent eof, closing"); @@ -231,34 +230,17 @@ where } } - /// Returns the value of a runtime parameter for this connection. - pub fn parameter(&self, name: &str) -> Option<&str> { - self.parameters.get(name).map(|s| &**s) - } - - /// Polls for asynchronous messages from the server. - /// - /// The server can send notices as well as notifications asynchronously to the client. Applications that wish to - /// examine those messages should use this method to drive the connection rather than its `Future` implementation. - pub fn poll_message( - &mut self, - cx: &mut Context<'_>, - ) -> Poll>> { + fn poll_message(&mut self, cx: &mut Context<'_>) -> Poll>> { if self.state != State::Closing { // if the state is still active, try read from and write to postgres. - let message = self.poll_read(cx)?; - let closing = self.poll_write(cx)?; - if let Poll::Ready(()) = closing { + let Poll::Pending = self.poll_read(cx)?; + if self.poll_write(cx)?.is_ready() { self.state = State::Closing; } - if let Poll::Ready(message) = message { - return Poll::Ready(Some(Ok(message))); - } - // poll_read returned Pending. - // poll_write returned Pending or Ready(WriteReady::WaitingOnRead). - // if poll_write returned Ready(WriteReady::WaitingOnRead), then we are waiting to read more data from postgres. + // poll_write returned Pending or Ready(()). + // if poll_write returned Ready(()), then we are waiting to read more data from postgres. if self.state != State::Closing { return Poll::Pending; } @@ -280,11 +262,9 @@ where type Output = Result<(), Error>; fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - while let Some(message) = ready!(self.poll_message(cx)?) { - if let AsyncMessage::Notice(notice) = message { - info!("{}: {}", notice.severity(), notice.message()); - } + match self.poll_message(cx)? { + Poll::Ready(None) => Poll::Ready(Ok(())), + Poll::Pending => Poll::Pending, } - Poll::Ready(Ok(())) } } diff --git a/libs/proxy/tokio-postgres2/src/lib.rs b/libs/proxy/tokio-postgres2/src/lib.rs index 791c93b972..e3dd6d9261 100644 --- a/libs/proxy/tokio-postgres2/src/lib.rs +++ b/libs/proxy/tokio-postgres2/src/lib.rs @@ -8,7 +8,6 @@ pub use crate::client::{Client, SocketConfig}; pub use crate::config::Config; pub use crate::connect_raw::RawConnection; pub use crate::connection::Connection; -use crate::error::DbError; pub use crate::error::Error; pub use crate::generic_client::GenericClient; pub use crate::query::RowStream; @@ -93,21 +92,6 @@ impl Notification { } } -/// An asynchronous message from the server. -#[allow(clippy::large_enum_variant)] -#[derive(Debug, Clone)] -#[non_exhaustive] -pub enum AsyncMessage { - /// A notice. - /// - /// Notices use the same format as errors, but aren't "errors" per-se. - Notice(DbError), - /// A notification. - /// - /// Connections can subscribe to notifications with the `LISTEN` command. - Notification(Notification), -} - /// Message returned by the `SimpleQuery` stream. 
#[derive(Debug)] #[non_exhaustive] diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index 0ae13552b8..ea06725cfd 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -43,7 +43,7 @@ itertools.workspace = true sync_wrapper = { workspace = true, features = ["futures"] } byteorder = "1.4" -rand = "0.8.5" +rand.workspace = true [dev-dependencies] camino-tempfile.workspace = true diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs index e895380192..f35d2a3081 100644 --- a/libs/remote_storage/src/simulate_failures.rs +++ b/libs/remote_storage/src/simulate_failures.rs @@ -81,7 +81,7 @@ impl UnreliableWrapper { /// fn attempt(&self, op: RemoteOp) -> anyhow::Result { let mut attempts = self.attempts.lock().unwrap(); - let mut rng = rand::thread_rng(); + let mut rng = rand::rng(); match attempts.entry(op) { Entry::Occupied(mut e) => { @@ -94,7 +94,7 @@ impl UnreliableWrapper { /* BEGIN_HADRON */ // If there are more attempts to fail, fail the request by probability. if (attempts_before_this < self.attempts_to_fail) - && (rng.gen_range(0..=100) < self.attempt_failure_probability) + && (rng.random_range(0..=100) < self.attempt_failure_probability) { let error = anyhow::anyhow!("simulated failure of remote operation {:?}", e.key()); diff --git a/libs/remote_storage/tests/test_real_azure.rs b/libs/remote_storage/tests/test_real_azure.rs index 4d7caabd39..949035b8c3 100644 --- a/libs/remote_storage/tests/test_real_azure.rs +++ b/libs/remote_storage/tests/test_real_azure.rs @@ -208,7 +208,7 @@ async fn create_azure_client( .as_millis(); // because nanos can be the same for two threads so can millis, add randomness - let random = rand::thread_rng().r#gen::(); + let random = rand::rng().random::(); let remote_storage_config = RemoteStorageConfig { storage: RemoteStorageKind::AzureContainer(AzureConfig { diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index 6b893edf75..f5c81bf45d 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -385,7 +385,7 @@ async fn create_s3_client( .as_millis(); // because nanos can be the same for two threads so can millis, add randomness - let random = rand::thread_rng().r#gen::(); + let random = rand::rng().random::(); let remote_storage_config = RemoteStorageConfig { storage: RemoteStorageKind::AwsS3(S3Config { diff --git a/libs/safekeeper_api/Cargo.toml b/libs/safekeeper_api/Cargo.toml index 928e583b0b..1d09d6fc6d 100644 --- a/libs/safekeeper_api/Cargo.toml +++ b/libs/safekeeper_api/Cargo.toml @@ -9,7 +9,7 @@ anyhow.workspace = true const_format.workspace = true serde.workspace = true serde_json.workspace = true -postgres_ffi.workspace = true +postgres_ffi_types.workspace = true postgres_versioninfo.workspace = true pq_proto.workspace = true tokio.workspace = true diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index 59e112654b..a300c8464f 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -3,7 +3,7 @@ use std::net::SocketAddr; use pageserver_api::shard::ShardIdentity; -use postgres_ffi::TimestampTz; +use postgres_ffi_types::TimestampTz; use postgres_versioninfo::PgVersionId; use serde::{Deserialize, Serialize}; use tokio::time::Instant; diff --git a/libs/tracing-utils/src/lib.rs b/libs/tracing-utils/src/lib.rs index 0893aa173b..76782339da 100644 --- a/libs/tracing-utils/src/lib.rs +++ 
b/libs/tracing-utils/src/lib.rs @@ -1,11 +1,5 @@ //! Helper functions to set up OpenTelemetry tracing. //! -//! This comes in two variants, depending on whether you have a Tokio runtime available. -//! If you do, call `init_tracing()`. It sets up the trace processor and exporter to use -//! the current tokio runtime. If you don't have a runtime available, or you don't want -//! to share the runtime with the tracing tasks, call `init_tracing_without_runtime()` -//! instead. It sets up a dedicated single-threaded Tokio runtime for the tracing tasks. -//! //! Example: //! //! ```rust,no_run @@ -21,7 +15,8 @@ //! .with_writer(std::io::stderr); //! //! // Initialize OpenTelemetry. Exports tracing spans as OpenTelemetry traces -//! let otlp_layer = tracing_utils::init_tracing("my_application", tracing_utils::ExportConfig::default()).await; +//! let provider = tracing_utils::init_tracing("my_application", tracing_utils::ExportConfig::default()); +//! let otlp_layer = provider.as_ref().map(tracing_utils::layer); //! //! // Put it all together //! tracing_subscriber::registry() @@ -36,16 +31,18 @@ pub mod http; pub mod perf_span; -use opentelemetry::KeyValue; use opentelemetry::trace::TracerProvider; use opentelemetry_otlp::WithExportConfig; pub use opentelemetry_otlp::{ExportConfig, Protocol}; +use opentelemetry_sdk::trace::SdkTracerProvider; use tracing::level_filters::LevelFilter; use tracing::{Dispatch, Subscriber}; use tracing_subscriber::Layer; use tracing_subscriber::layer::SubscriberExt; use tracing_subscriber::registry::LookupSpan; +pub type Provider = SdkTracerProvider; + /// Set up OpenTelemetry exporter, using configuration from environment variables. /// /// `service_name` is set as the OpenTelemetry 'service.name' resource (see @@ -70,16 +67,7 @@ use tracing_subscriber::registry::LookupSpan; /// If you need some other setting, please test if it works first. And perhaps /// add a comment in the list above to save the effort of testing for the next /// person. -/// -/// This doesn't block, but is marked as 'async' to hint that this must be called in -/// asynchronous execution context. -pub async fn init_tracing( - service_name: &str, - export_config: ExportConfig, -) -> Option> -where - S: Subscriber + for<'span> LookupSpan<'span>, -{ +pub fn init_tracing(service_name: &str, export_config: ExportConfig) -> Option { if std::env::var("OTEL_SDK_DISABLED") == Ok("true".to_string()) { return None; }; @@ -89,52 +77,14 @@ where )) } -/// Like `init_tracing`, but creates a separate tokio Runtime for the tracing -/// tasks. -pub fn init_tracing_without_runtime( - service_name: &str, - export_config: ExportConfig, -) -> Option> +pub fn layer(p: &Provider) -> impl Layer where S: Subscriber + for<'span> LookupSpan<'span>, { - if std::env::var("OTEL_SDK_DISABLED") == Ok("true".to_string()) { - return None; - }; - - // The opentelemetry batch processor and the OTLP exporter needs a Tokio - // runtime. Create a dedicated runtime for them. One thread should be - // enough. - // - // (Alternatively, instead of batching, we could use the "simple - // processor", which doesn't need Tokio, and use "reqwest-blocking" - // feature for the OTLP exporter, which also doesn't need Tokio. However, - // batching is considered best practice, and also I have the feeling that - // the non-Tokio codepaths in the opentelemetry crate are less used and - // might be more buggy, so better to stay on the well-beaten path.) - // - // We leak the runtime so that it keeps running after we exit the - // function. 
- let runtime = Box::leak(Box::new( - tokio::runtime::Builder::new_multi_thread() - .enable_all() - .thread_name("otlp runtime thread") - .worker_threads(1) - .build() - .unwrap(), - )); - let _guard = runtime.enter(); - - Some(init_tracing_internal( - service_name.to_string(), - export_config, - )) + tracing_opentelemetry::layer().with_tracer(p.tracer("global")) } -fn init_tracing_internal(service_name: String, export_config: ExportConfig) -> impl Layer -where - S: Subscriber + for<'span> LookupSpan<'span>, -{ +fn init_tracing_internal(service_name: String, export_config: ExportConfig) -> Provider { // Sets up exporter from the provided [`ExportConfig`] parameter. // If the endpoint is not specified, it is loaded from the // OTEL_EXPORTER_OTLP_ENDPOINT environment variable. @@ -153,22 +103,14 @@ where opentelemetry_sdk::propagation::TraceContextPropagator::new(), ); - let tracer = opentelemetry_sdk::trace::TracerProvider::builder() - .with_batch_exporter(exporter, opentelemetry_sdk::runtime::Tokio) - .with_resource(opentelemetry_sdk::Resource::new(vec![KeyValue::new( - opentelemetry_semantic_conventions::resource::SERVICE_NAME, - service_name, - )])) + Provider::builder() + .with_batch_exporter(exporter) + .with_resource( + opentelemetry_sdk::Resource::builder() + .with_service_name(service_name) + .build(), + ) .build() - .tracer("global"); - - tracing_opentelemetry::layer().with_tracer(tracer) -} - -// Shutdown trace pipeline gracefully, so that it has a chance to send any -// pending traces before we exit. -pub fn shutdown_tracing() { - opentelemetry::global::shutdown_tracer_provider(); } pub enum OtelEnablement { @@ -176,17 +118,17 @@ pub enum OtelEnablement { Enabled { service_name: String, export_config: ExportConfig, - runtime: &'static tokio::runtime::Runtime, }, } pub struct OtelGuard { + provider: Provider, pub dispatch: Dispatch, } impl Drop for OtelGuard { fn drop(&mut self) { - shutdown_tracing(); + _ = self.provider.shutdown(); } } @@ -199,22 +141,19 @@ impl Drop for OtelGuard { /// The lifetime of the guard should match taht of the application. On drop, it tears down the /// OTEL infra. 
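For orientation, here is how a binary wires this up after the change, mirroring the updated doc example earlier in this file: `init_tracing` is now synchronous and returns the provider, `layer` turns it into a subscriber layer, and an explicit `shutdown()` on the provider replaces the removed `shutdown_tracing()` (a sketch with illustrative names, not code from the patch):

```rust
use tracing_subscriber::layer::SubscriberExt;
use tracing_subscriber::util::SubscriberInitExt;

fn main() {
    let log_layer = tracing_subscriber::fmt::layer()
        .with_target(false)
        .with_writer(std::io::stderr);

    // Returns None when OTEL_SDK_DISABLED=true.
    let provider = tracing_utils::init_tracing(
        "my_application",
        tracing_utils::ExportConfig::default(),
    );
    let otlp_layer = provider.as_ref().map(tracing_utils::layer);

    tracing_subscriber::registry()
        .with(log_layer)
        .with(otlp_layer)
        .init();

    // ... run the application ...

    // Flush pending spans before exit; OtelGuard does the same in its Drop impl.
    if let Some(provider) = provider {
        let _ = provider.shutdown();
    }
}
```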
pub fn init_performance_tracing(otel_enablement: OtelEnablement) -> Option { - let otel_subscriber = match otel_enablement { + match otel_enablement { OtelEnablement::Disabled => None, OtelEnablement::Enabled { service_name, export_config, - runtime, } => { - let otel_layer = runtime - .block_on(init_tracing(&service_name, export_config)) - .with_filter(LevelFilter::INFO); + let provider = init_tracing(&service_name, export_config)?; + + let otel_layer = layer(&provider).with_filter(LevelFilter::INFO); let otel_subscriber = tracing_subscriber::registry().with(otel_layer); - let otel_dispatch = Dispatch::new(otel_subscriber); + let dispatch = Dispatch::new(otel_subscriber); - Some(otel_dispatch) + Some(OtelGuard { dispatch, provider }) } - }; - - otel_subscriber.map(|dispatch| OtelGuard { dispatch }) + } } diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 7b1dc56071..4b326949d7 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -47,6 +47,7 @@ tracing-subscriber = { workspace = true, features = ["json", "registry"] } tracing-utils.workspace = true rand.workspace = true scopeguard.workspace = true +uuid.workspace = true strum.workspace = true strum_macros.workspace = true walkdir.workspace = true diff --git a/libs/utils/src/auth.rs b/libs/utils/src/auth.rs index de3a964d23..b2aade15de 100644 --- a/libs/utils/src/auth.rs +++ b/libs/utils/src/auth.rs @@ -12,7 +12,8 @@ use jsonwebtoken::{ Algorithm, DecodingKey, EncodingKey, Header, TokenData, Validation, decode, encode, }; use pem::Pem; -use serde::{Deserialize, Serialize, de::DeserializeOwned}; +use serde::{Deserialize, Deserializer, Serialize, de::DeserializeOwned}; +use uuid::Uuid; use crate::id::TenantId; @@ -25,6 +26,11 @@ pub enum Scope { /// Provides access to all data for a specific tenant (specified in `struct Claims` below) // TODO: join these two? Tenant, + /// Provides access to all data for a specific tenant, but based on endpoint ID. This token scope + /// is only used by compute to fetch the spec for a specific endpoint. The spec contains a Tenant-scoped + /// token authorizing access to all data of a tenant, so the spec-fetch API requires a TenantEndpoint + /// scope token to ensure that untrusted compute nodes can't fetch spec for arbitrary endpoints. + TenantEndpoint, /// Provides blanket access to all tenants on the pageserver plus pageserver-wide APIs. /// Should only be used e.g. for status check/tenant creation/list. PageServerApi, @@ -51,17 +57,43 @@ pub enum Scope { ControllerPeer, } +fn deserialize_empty_string_as_none_uuid<'de, D>(deserializer: D) -> Result, D::Error> +where + D: Deserializer<'de>, +{ + let opt = Option::::deserialize(deserializer)?; + match opt.as_deref() { + Some("") => Ok(None), + Some(s) => Uuid::parse_str(s) + .map(Some) + .map_err(serde::de::Error::custom), + None => Ok(None), + } +} + /// JWT payload. See docs/authentication.md for the format #[derive(Debug, Serialize, Deserialize, Clone, PartialEq)] pub struct Claims { #[serde(default)] pub tenant_id: Option, + #[serde( + default, + skip_serializing_if = "Option::is_none", + // Neon control plane includes this field as empty in the claims. + // Consider it None in those cases. 
+ deserialize_with = "deserialize_empty_string_as_none_uuid" + )] + pub endpoint_id: Option, pub scope: Scope, } impl Claims { pub fn new(tenant_id: Option, scope: Scope) -> Self { - Self { tenant_id, scope } + Self { + tenant_id, + scope, + endpoint_id: None, + } } } @@ -212,6 +244,7 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH let expected_claims = Claims { tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081").unwrap()), scope: Scope::Tenant, + endpoint_id: None, }; // A test token containing the following payload, signed using TEST_PRIV_KEY_ED25519: @@ -240,6 +273,7 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH let claims = Claims { tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081").unwrap()), scope: Scope::Tenant, + endpoint_id: None, }; let pem = pem::parse(TEST_PRIV_KEY_ED25519).unwrap(); diff --git a/libs/utils/src/id.rs b/libs/utils/src/id.rs index e3037aec21..d63bba75a3 100644 --- a/libs/utils/src/id.rs +++ b/libs/utils/src/id.rs @@ -104,7 +104,7 @@ impl Id { pub fn generate() -> Self { let mut tli_buf = [0u8; 16]; - rand::thread_rng().fill(&mut tli_buf); + rand::rng().fill(&mut tli_buf); Id::from(tli_buf) } diff --git a/libs/utils/src/lsn.rs b/libs/utils/src/lsn.rs index 31e1dda23d..1abb63817b 100644 --- a/libs/utils/src/lsn.rs +++ b/libs/utils/src/lsn.rs @@ -364,42 +364,37 @@ impl MonotonicCounter for RecordLsn { } } -/// Implements [`rand::distributions::uniform::UniformSampler`] so we can sample [`Lsn`]s. +/// Implements [`rand::distr::uniform::UniformSampler`] so we can sample [`Lsn`]s. /// /// This is used by the `pagebench` pageserver benchmarking tool. -pub struct LsnSampler(::Sampler); +pub struct LsnSampler(::Sampler); -impl rand::distributions::uniform::SampleUniform for Lsn { +impl rand::distr::uniform::SampleUniform for Lsn { type Sampler = LsnSampler; } -impl rand::distributions::uniform::UniformSampler for LsnSampler { +impl rand::distr::uniform::UniformSampler for LsnSampler { type X = Lsn; - fn new(low: B1, high: B2) -> Self + fn new(low: B1, high: B2) -> Result where - B1: rand::distributions::uniform::SampleBorrow + Sized, - B2: rand::distributions::uniform::SampleBorrow + Sized, + B1: rand::distr::uniform::SampleBorrow + Sized, + B2: rand::distr::uniform::SampleBorrow + Sized, { - Self( - ::Sampler::new( - low.borrow().0, - high.borrow().0, - ), - ) + ::Sampler::new(low.borrow().0, high.borrow().0) + .map(Self) } - fn new_inclusive(low: B1, high: B2) -> Self + fn new_inclusive(low: B1, high: B2) -> Result where - B1: rand::distributions::uniform::SampleBorrow + Sized, - B2: rand::distributions::uniform::SampleBorrow + Sized, + B1: rand::distr::uniform::SampleBorrow + Sized, + B2: rand::distr::uniform::SampleBorrow + Sized, { - Self( - ::Sampler::new_inclusive( - low.borrow().0, - high.borrow().0, - ), + ::Sampler::new_inclusive( + low.borrow().0, + high.borrow().0, ) + .map(Self) } fn sample(&self, rng: &mut R) -> Self::X { diff --git a/libs/utils/src/shard.rs b/libs/utils/src/shard.rs index 5a0edf8cea..6ad6cab3a8 100644 --- a/libs/utils/src/shard.rs +++ b/libs/utils/src/shard.rs @@ -25,6 +25,12 @@ pub struct ShardIndex { pub shard_count: ShardCount, } +/// Stripe size as number of pages. +/// +/// NB: don't implement Default, so callers don't lazily use it by mistake. See DEFAULT_STRIPE_SIZE. +#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)] +pub struct ShardStripeSize(pub u32); + /// Formatting helper, for generating the `shard_id` label in traces. 
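Many of the hunks in this patch are the mechanical rand 0.8 to 0.9 migration: `thread_rng()` becomes `rng()`, the `gen_*` methods become `random_*`, `rand::distributions` becomes `rand::distr`, and slice selection moves from `SliceRandom` to `IndexedRandom`. A compact reference sketch using only the replacements visible in this diff:

```rust
use rand::Rng;
use rand::seq::IndexedRandom; // was rand::seq::SliceRandom for choose()

fn rand_09_examples() {
    let mut rng = rand::rng(); // was rand::thread_rng()

    let raw: u64 = rng.random();          // was rng.r#gen::<u64>()
    let pct = rng.random_range(0..=100);  // was rng.gen_range(0..=100)
    let coin = rng.random_bool(0.9);      // was rng.gen_bool(0.9)

    let items = [1, 2, 3];
    let picked = items.choose(&mut rng);  // choose() now comes from IndexedRandom

    // was rand::distributions::weighted::WeightedIndex
    let weights = rand::distr::weighted::WeightedIndex::new([2, 1, 1]).unwrap();

    let _ = (raw, pct, coin, picked, weights);
}
```

Note also that `UniformSampler::new`/`new_inclusive` now return `Result`, which is why the `LsnSampler` impl above maps the inner sampler through `.map(Self)`.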
pub struct ShardSlug<'a>(&'a TenantShardId); @@ -177,6 +183,12 @@ impl std::fmt::Display for ShardCount { } } +impl std::fmt::Display for ShardStripeSize { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.0.fmt(f) + } +} + impl std::fmt::Display for ShardSlug<'_> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( diff --git a/libs/wal_decoder/src/models/record.rs b/libs/wal_decoder/src/models/record.rs index 51659ed904..a37e1473e0 100644 --- a/libs/wal_decoder/src/models/record.rs +++ b/libs/wal_decoder/src/models/record.rs @@ -2,7 +2,8 @@ use bytes::Bytes; use postgres_ffi::walrecord::{MultiXactMember, describe_postgres_wal_record}; -use postgres_ffi::{MultiXactId, MultiXactOffset, TimestampTz, TransactionId}; +use postgres_ffi::{MultiXactId, MultiXactOffset, TransactionId}; +use postgres_ffi_types::TimestampTz; use serde::{Deserialize, Serialize}; use utils::bin_ser::DeserializeError; diff --git a/libs/walproposer/src/api_bindings.rs b/libs/walproposer/src/api_bindings.rs index 5f856a44d4..825a137d0f 100644 --- a/libs/walproposer/src/api_bindings.rs +++ b/libs/walproposer/src/api_bindings.rs @@ -431,7 +431,7 @@ pub fn empty_shmem() -> crate::bindings::WalproposerShmemState { let empty_wal_rate_limiter = crate::bindings::WalRateLimiter { should_limit: crate::bindings::pg_atomic_uint32 { value: 0 }, sent_bytes: 0, - last_recorded_time_us: 0, + last_recorded_time_us: crate::bindings::pg_atomic_uint64 { value: 0 }, }; crate::bindings::WalproposerShmemState { diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs index e1444778b8..284cc4d67d 100644 --- a/pageserver/benches/bench_layer_map.rs +++ b/pageserver/benches/bench_layer_map.rs @@ -11,7 +11,8 @@ use pageserver::tenant::layer_map::LayerMap; use pageserver::tenant::storage_layer::{LayerName, PersistentLayerDesc}; use pageserver_api::key::Key; use pageserver_api::shard::TenantShardId; -use rand::prelude::{SeedableRng, SliceRandom, StdRng}; +use rand::prelude::{SeedableRng, StdRng}; +use rand::seq::IndexedRandom; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index fe1ddc2e7d..3867e536f4 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -873,6 +873,22 @@ impl Client { .map_err(Error::ReceiveBody) } + pub async fn reset_alert_gauges(&self) -> Result<()> { + let uri = format!( + "{}/hadron-internal/reset_alert_gauges", + self.mgmt_api_endpoint + ); + self.start_request(Method::POST, uri) + .send() + .await + .map_err(Error::SendRequest)? + .error_from_body() + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } + pub async fn wait_lsn( &self, tenant_shard_id: TenantShardId, diff --git a/pageserver/client_grpc/src/client.rs b/pageserver/client_grpc/src/client.rs index 3a9edc7092..e6a90fb582 100644 --- a/pageserver/client_grpc/src/client.rs +++ b/pageserver/client_grpc/src/client.rs @@ -16,10 +16,9 @@ use crate::pool::{ChannelPool, ClientGuard, ClientPool, StreamGuard, StreamPool} use crate::retry::Retry; use crate::split::GetPageSplitter; use compute_api::spec::PageserverProtocol; -use pageserver_api::shard::ShardStripeSize; use pageserver_page_api as page_api; use utils::id::{TenantId, TimelineId}; -use utils::shard::{ShardCount, ShardIndex, ShardNumber}; +use utils::shard::{ShardCount, ShardIndex, ShardNumber, ShardStripeSize}; /// Max number of concurrent clients per channel (i.e. TCP connection). 
New channels will be spun up /// when full. @@ -141,8 +140,8 @@ impl PageserverClient { if !old.count.is_unsharded() && shard_spec.stripe_size != old.stripe_size { return Err(anyhow!( "can't change stripe size from {} to {}", - old.stripe_size, - shard_spec.stripe_size + old.stripe_size.expect("always Some when sharded"), + shard_spec.stripe_size.expect("always Some when sharded") )); } @@ -157,23 +156,6 @@ impl PageserverClient { Ok(()) } - /// Returns whether a relation exists. - #[instrument(skip_all, fields(rel=%req.rel, lsn=%req.read_lsn))] - pub async fn check_rel_exists( - &self, - req: page_api::CheckRelExistsRequest, - ) -> tonic::Result { - debug!("sending request: {req:?}"); - let resp = Self::with_retries(CALL_TIMEOUT, async |_| { - // Relation metadata is only available on shard 0. - let mut client = self.shards.load_full().get_zero().client().await?; - Self::with_timeout(REQUEST_TIMEOUT, client.check_rel_exists(req)).await - }) - .await?; - debug!("received response: {resp:?}"); - Ok(resp) - } - /// Returns the total size of a database, as # of bytes. #[instrument(skip_all, fields(db_oid=%req.db_oid, lsn=%req.read_lsn))] pub async fn get_db_size( @@ -249,13 +231,15 @@ impl PageserverClient { // Fast path: request is for a single shard. if let Some(shard_id) = GetPageSplitter::for_single_shard(&req, shards.count, shards.stripe_size) + .map_err(|err| tonic::Status::internal(err.to_string()))? { return Self::get_page_with_shard(req, shards.get(shard_id)?).await; } // Request spans multiple shards. Split it, dispatch concurrent per-shard requests, and // reassemble the responses. - let mut splitter = GetPageSplitter::split(req, shards.count, shards.stripe_size); + let mut splitter = GetPageSplitter::split(req, shards.count, shards.stripe_size) + .map_err(|err| tonic::Status::internal(err.to_string()))?; let mut shard_requests = FuturesUnordered::new(); for (shard_id, shard_req) in splitter.drain_requests() { @@ -265,10 +249,14 @@ impl PageserverClient { } while let Some((shard_id, shard_response)) = shard_requests.next().await.transpose()? { - splitter.add_response(shard_id, shard_response)?; + splitter + .add_response(shard_id, shard_response) + .map_err(|err| tonic::Status::internal(err.to_string()))?; } - splitter.get_response() + splitter + .get_response() + .map_err(|err| tonic::Status::internal(err.to_string())) } /// Fetches pages on the given shard. Does not retry internally. @@ -396,12 +384,14 @@ pub struct ShardSpec { /// NB: this is 0 for unsharded tenants, following `ShardIndex::unsharded()` convention. count: ShardCount, /// The stripe size for these shards. - stripe_size: ShardStripeSize, + /// + /// INVARIANT: None for unsharded tenants, Some for sharded. + stripe_size: Option, } impl ShardSpec { /// Creates a new shard spec with the given URLs and stripe size. All shards must be given. - /// The stripe size may be omitted for unsharded tenants. + /// The stripe size must be Some for sharded tenants, or None for unsharded tenants. pub fn new( urls: HashMap, stripe_size: Option, @@ -414,11 +404,13 @@ impl ShardSpec { n => ShardCount::new(n as u8), }; - // Determine the stripe size. It doesn't matter for unsharded tenants. + // Validate the stripe size. 
if stripe_size.is_none() && !count.is_unsharded() { return Err(anyhow!("stripe size must be given for sharded tenants")); } - let stripe_size = stripe_size.unwrap_or_default(); + if stripe_size.is_some() && count.is_unsharded() { + return Err(anyhow!("stripe size can't be given for unsharded tenants")); + } // Validate the shard spec. for (shard_id, url) in &urls { @@ -458,8 +450,10 @@ struct Shards { /// /// NB: this is 0 for unsharded tenants, following `ShardIndex::unsharded()` convention. count: ShardCount, - /// The stripe size. Only used for sharded tenants. - stripe_size: ShardStripeSize, + /// The stripe size. + /// + /// INVARIANT: None for unsharded tenants, Some for sharded. + stripe_size: Option, } impl Shards { diff --git a/pageserver/client_grpc/src/split.rs b/pageserver/client_grpc/src/split.rs index b7539b900c..8631638686 100644 --- a/pageserver/client_grpc/src/split.rs +++ b/pageserver/client_grpc/src/split.rs @@ -1,11 +1,12 @@ use std::collections::HashMap; +use anyhow::anyhow; use bytes::Bytes; use pageserver_api::key::rel_block_to_key; -use pageserver_api::shard::{ShardStripeSize, key_to_shard_number}; +use pageserver_api::shard::key_to_shard_number; use pageserver_page_api as page_api; -use utils::shard::{ShardCount, ShardIndex, ShardNumber}; +use utils::shard::{ShardCount, ShardIndex, ShardStripeSize}; /// Splits GetPageRequests that straddle shard boundaries and assembles the responses. /// TODO: add tests for this. @@ -25,43 +26,54 @@ impl GetPageSplitter { pub fn for_single_shard( req: &page_api::GetPageRequest, count: ShardCount, - stripe_size: ShardStripeSize, - ) -> Option { + stripe_size: Option, + ) -> anyhow::Result> { // Fast path: unsharded tenant. if count.is_unsharded() { - return Some(ShardIndex::unsharded()); + return Ok(Some(ShardIndex::unsharded())); } - // Find the first page's shard, for comparison. If there are no pages, just return the first - // shard (caller likely checked already, otherwise the server will reject it). + let Some(stripe_size) = stripe_size else { + return Err(anyhow!("stripe size must be given for sharded tenants")); + }; + + // Find the first page's shard, for comparison. let Some(&first_page) = req.block_numbers.first() else { - return Some(ShardIndex::new(ShardNumber(0), count)); + return Err(anyhow!("no block numbers in request")); }; let key = rel_block_to_key(req.rel, first_page); let shard_number = key_to_shard_number(count, stripe_size, &key); - req.block_numbers + Ok(req + .block_numbers .iter() .skip(1) // computed above .all(|&blkno| { let key = rel_block_to_key(req.rel, blkno); key_to_shard_number(count, stripe_size, &key) == shard_number }) - .then_some(ShardIndex::new(shard_number, count)) + .then_some(ShardIndex::new(shard_number, count))) } /// Splits the given request. pub fn split( req: page_api::GetPageRequest, count: ShardCount, - stripe_size: ShardStripeSize, - ) -> Self { + stripe_size: Option, + ) -> anyhow::Result { // The caller should make sure we don't split requests unnecessarily. debug_assert!( - Self::for_single_shard(&req, count, stripe_size).is_none(), + Self::for_single_shard(&req, count, stripe_size)?.is_none(), "unnecessary request split" ); + if count.is_unsharded() { + return Err(anyhow!("unsharded tenant, no point in splitting request")); + } + let Some(stripe_size) = stripe_size else { + return Err(anyhow!("stripe size must be given for sharded tenants")); + }; + // Split the requests by shard index. 
let mut requests = HashMap::with_capacity(2); // common case let mut block_shards = Vec::with_capacity(req.block_numbers.len()); @@ -103,11 +115,11 @@ impl GetPageSplitter { .collect(), }; - Self { + Ok(Self { requests, response, block_shards, - } + }) } /// Drains the per-shard requests, moving them out of the splitter to avoid extra allocations. @@ -124,21 +136,30 @@ &mut self, shard_id: ShardIndex, response: page_api::GetPageResponse, - ) -> tonic::Result<()> { + ) -> anyhow::Result<()> { // The caller should already have converted status codes into tonic::Status. if response.status_code != page_api::GetPageStatusCode::Ok { - return Err(tonic::Status::internal(format!( + return Err(anyhow!( "unexpected non-OK response for shard {shard_id}: {} {}", response.status_code, response.reason.unwrap_or_default() - ))); + )); } if response.request_id != self.response.request_id { - return Err(tonic::Status::internal(format!( + return Err(anyhow!( "response ID mismatch for shard {shard_id}: expected {}, got {}", - self.response.request_id, response.request_id - ))); + self.response.request_id, + response.request_id + )); + } // Place the shard response pages into the assembled response, in request order. @@ -150,27 +171,26 @@ } let Some(slot) = self.response.pages.get_mut(i) else { - return Err(tonic::Status::internal(format!( - "no block_shards slot {i} for shard {shard_id}" - ))); + return Err(anyhow!("no block_shards slot {i} for shard {shard_id}")); }; let Some(page) = pages.next() else { - return Err(tonic::Status::internal(format!( + return Err(anyhow!( "missing page {} in shard {shard_id} response", slot.block_number - ))); + )); }; if page.block_number != slot.block_number { - return Err(tonic::Status::internal(format!( + return Err(anyhow!( "shard {shard_id} returned wrong page at index {i}, expected {} got {}", - slot.block_number, page.block_number - ))); + slot.block_number, + page.block_number + )); } if !slot.image.is_empty() { - return Err(tonic::Status::internal(format!( + return Err(anyhow!( "shard {shard_id} returned duplicate page {} at index {i}", slot.block_number - ))); + )); } *slot = page; @@ -178,10 +198,10 @@ // Make sure we've consumed all pages from the shard response. if let Some(extra_page) = pages.next() { - return Err(tonic::Status::internal(format!( + return Err(anyhow!( "shard {shard_id} returned extra page: {}", extra_page.block_number - ))); + )); } Ok(()) @@ -189,18 +209,18 @@ /// Fetches the final, assembled response. #[allow(clippy::result_large_err)] - pub fn get_response(self) -> tonic::Result { + pub fn get_response(self) -> anyhow::Result { // Check that the response is complete.
for (i, page) in self.response.pages.iter().enumerate() { if page.image.is_empty() { - return Err(tonic::Status::internal(format!( + return Err(anyhow!( "missing page {} for shard {}", page.block_number, self.block_shards .get(i) .map(|s| s.to_string()) .unwrap_or_else(|| "?".to_string()) - ))); + )); } } diff --git a/pageserver/compaction/src/bin/compaction-simulator.rs b/pageserver/compaction/src/bin/compaction-simulator.rs index dd35417333..6211b86809 100644 --- a/pageserver/compaction/src/bin/compaction-simulator.rs +++ b/pageserver/compaction/src/bin/compaction-simulator.rs @@ -89,7 +89,7 @@ async fn simulate(cmd: &SimulateCmd, results_path: &Path) -> anyhow::Result<()> let cold_key_range = splitpoint..key_range.end; for i in 0..cmd.num_records { - let chosen_range = if rand::thread_rng().gen_bool(0.9) { + let chosen_range = if rand::rng().random_bool(0.9) { &hot_key_range } else { &cold_key_range diff --git a/pageserver/compaction/src/simulator.rs b/pageserver/compaction/src/simulator.rs index bf9f6f2658..44507c335b 100644 --- a/pageserver/compaction/src/simulator.rs +++ b/pageserver/compaction/src/simulator.rs @@ -300,9 +300,9 @@ impl MockTimeline { key_range: &Range, ) -> anyhow::Result<()> { crate::helpers::union_to_keyspace(&mut self.keyspace, vec![key_range.clone()]); - let mut rng = rand::thread_rng(); + let mut rng = rand::rng(); for _ in 0..num_records { - self.ingest_record(rng.gen_range(key_range.clone()), len); + self.ingest_record(rng.random_range(key_range.clone()), len); self.wal_ingested += len; } Ok(()) diff --git a/pageserver/ctl/src/key.rs b/pageserver/ctl/src/key.rs index c4daafdfd0..75bab94757 100644 --- a/pageserver/ctl/src/key.rs +++ b/pageserver/ctl/src/key.rs @@ -4,7 +4,7 @@ use anyhow::Context; use clap::Parser; use pageserver_api::key::Key; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; -use pageserver_api::shard::{ShardCount, ShardStripeSize}; +use pageserver_api::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardStripeSize}; #[derive(Parser)] pub(super) struct DescribeKeyCommand { @@ -128,7 +128,9 @@ impl DescribeKeyCommand { // seeing the sharding placement might be confusing, so leave it out unless shard // count was given. - let stripe_size = stripe_size.map(ShardStripeSize).unwrap_or_default(); + let stripe_size = stripe_size + .map(ShardStripeSize) + .unwrap_or(DEFAULT_STRIPE_SIZE); println!( "# placement with shard_count: {} and stripe_size: {}:", shard_count.0, stripe_size.0 diff --git a/pageserver/page_api/proto/page_service.proto b/pageserver/page_api/proto/page_service.proto index d113a04a42..aaccbd5ef0 100644 --- a/pageserver/page_api/proto/page_service.proto +++ b/pageserver/page_api/proto/page_service.proto @@ -17,11 +17,11 @@ // grpcurl \ // -plaintext \ // -H "neon-tenant-id: 7c4a1f9e3bd6470c8f3e21a65bd2e980" \ -// -H "neon-shard-id: 0b10" \ +// -H "neon-shard-id: 0000" \ // -H "neon-timeline-id: f08c4e9a2d5f76b1e3a7c2d8910f4b3e" \ // -H "authorization: Bearer $JWT" \ -// -d '{"read_lsn": {"request_lsn": 1234567890}, "rel": {"spc_oid": 1663, "db_oid": 1234, "rel_number": 5678, "fork_number": 0}}' -// localhost:51051 page_api.PageService/CheckRelExists +// -d '{"read_lsn": {"request_lsn": 100000000, "not_modified_since_lsn": 1}, "db_oid": 1}' \ +// localhost:51051 page_api.PageService/GetDbSize // ``` // // TODO: consider adding neon-compute-mode ("primary", "static", "replica"). @@ -38,8 +38,8 @@ package page_api; import "google/protobuf/timestamp.proto"; service PageService { - // Returns whether a relation exists. 
- rpc CheckRelExists(CheckRelExistsRequest) returns (CheckRelExistsResponse); + // NB: unlike libpq, there is no CheckRelExists in gRPC, at the compute team's request. Instead, + // use GetRelSize with allow_missing=true to check existence. // Fetches a base backup. rpc GetBaseBackup (GetBaseBackupRequest) returns (stream GetBaseBackupResponseChunk); @@ -97,17 +97,6 @@ message RelTag { uint32 fork_number = 4; } -// Checks whether a relation exists, at the given LSN. Only valid on shard 0, -// other shards will error. -message CheckRelExistsRequest { - ReadLsn read_lsn = 1; - RelTag rel = 2; -} - -message CheckRelExistsResponse { - bool exists = 1; -} - // Requests a base backup. message GetBaseBackupRequest { // The LSN to fetch the base backup at. 0 or absent means the latest LSN known to the Pageserver. @@ -260,10 +249,15 @@ enum GetPageStatusCode { message GetRelSizeRequest { ReadLsn read_lsn = 1; RelTag rel = 2; + // If true, return missing=true for missing relations instead of a NotFound error. + bool allow_missing = 3; } message GetRelSizeResponse { + // The number of blocks in the relation. uint32 num_blocks = 1; + // If allow_missing=true, this is true for missing relations. + bool missing = 2; } // Requests an SLRU segment. Only valid on shard 0, other shards will error. diff --git a/pageserver/page_api/src/client.rs b/pageserver/page_api/src/client.rs index f70d0e7b28..fc27ea448b 100644 --- a/pageserver/page_api/src/client.rs +++ b/pageserver/page_api/src/client.rs @@ -69,16 +69,6 @@ impl Client { Ok(Self { inner }) } - /// Returns whether a relation exists. - pub async fn check_rel_exists( - &mut self, - req: CheckRelExistsRequest, - ) -> tonic::Result { - let req = proto::CheckRelExistsRequest::from(req); - let resp = self.inner.check_rel_exists(req).await?.into_inner(); - Ok(resp.into()) - } - /// Fetches a base backup. pub async fn get_base_backup( &mut self, @@ -114,7 +104,8 @@ impl Client { Ok(resps.and_then(|resp| ready(GetPageResponse::try_from(resp).map_err(|err| err.into())))) } - /// Returns the size of a relation, as # of blocks. + /// Returns the size of a relation as # of blocks, or None if allow_missing=true and the + /// relation does not exist. pub async fn get_rel_size( &mut self, req: GetRelSizeRequest, diff --git a/pageserver/page_api/src/model.rs b/pageserver/page_api/src/model.rs index a3286ecf15..6375c47998 100644 --- a/pageserver/page_api/src/model.rs +++ b/pageserver/page_api/src/model.rs @@ -139,50 +139,6 @@ impl From for proto::RelTag { } } -/// Checks whether a relation exists, at the given LSN. Only valid on shard 0, other shards error. -#[derive(Clone, Copy, Debug)] -pub struct CheckRelExistsRequest { - pub read_lsn: ReadLsn, - pub rel: RelTag, -} - -impl TryFrom for CheckRelExistsRequest { - type Error = ProtocolError; - - fn try_from(pb: proto::CheckRelExistsRequest) -> Result { - Ok(Self { - read_lsn: pb - .read_lsn - .ok_or(ProtocolError::Missing("read_lsn"))? 
- .try_into()?, - rel: pb.rel.ok_or(ProtocolError::Missing("rel"))?.try_into()?, - }) - } -} - -impl From for proto::CheckRelExistsRequest { - fn from(request: CheckRelExistsRequest) -> Self { - Self { - read_lsn: Some(request.read_lsn.into()), - rel: Some(request.rel.into()), - } - } -} - -pub type CheckRelExistsResponse = bool; - -impl From for CheckRelExistsResponse { - fn from(pb: proto::CheckRelExistsResponse) -> Self { - pb.exists - } -} - -impl From for proto::CheckRelExistsResponse { - fn from(exists: CheckRelExistsResponse) -> Self { - Self { exists } - } -} - /// Requests a base backup. #[derive(Clone, Copy, Debug)] pub struct GetBaseBackupRequest { @@ -707,6 +663,8 @@ impl From for tonic::Code { pub struct GetRelSizeRequest { pub read_lsn: ReadLsn, pub rel: RelTag, + /// If true, return missing=true for missing relations instead of a NotFound error. + pub allow_missing: bool, } impl TryFrom for GetRelSizeRequest { @@ -719,6 +677,7 @@ impl TryFrom for GetRelSizeRequest { .ok_or(ProtocolError::Missing("read_lsn"))? .try_into()?, rel: proto.rel.ok_or(ProtocolError::Missing("rel"))?.try_into()?, + allow_missing: proto.allow_missing, }) } } @@ -728,21 +687,29 @@ impl From for proto::GetRelSizeRequest { Self { read_lsn: Some(request.read_lsn.into()), rel: Some(request.rel.into()), + allow_missing: request.allow_missing, } } } -pub type GetRelSizeResponse = u32; +/// The size of a relation as number of blocks, or None if `allow_missing=true` and the relation +/// does not exist. +/// +/// INVARIANT: never None if `allow_missing=false` (returns `NotFound` error instead). +pub type GetRelSizeResponse = Option; impl From for GetRelSizeResponse { - fn from(proto: proto::GetRelSizeResponse) -> Self { - proto.num_blocks + fn from(pb: proto::GetRelSizeResponse) -> Self { + (!pb.missing).then_some(pb.num_blocks) } } impl From for proto::GetRelSizeResponse { - fn from(num_blocks: GetRelSizeResponse) -> Self { - Self { num_blocks } + fn from(resp: GetRelSizeResponse) -> Self { + Self { + num_blocks: resp.unwrap_or_default(), + missing: resp.is_none(), + } } } diff --git a/pageserver/pagebench/src/cmd/basebackup.rs b/pageserver/pagebench/src/cmd/basebackup.rs index c14bb73136..01875f74b9 100644 --- a/pageserver/pagebench/src/cmd/basebackup.rs +++ b/pageserver/pagebench/src/cmd/basebackup.rs @@ -188,9 +188,9 @@ async fn main_impl( start_work_barrier.wait().await; loop { let (timeline, work) = { - let mut rng = rand::thread_rng(); + let mut rng = rand::rng(); let target = all_targets.choose(&mut rng).unwrap(); - let lsn = target.lsn_range.clone().map(|r| rng.gen_range(r)); + let lsn = target.lsn_range.clone().map(|r| rng.random_range(r)); (target.timeline, Work { lsn }) }; let sender = work_senders.get(&timeline).unwrap(); diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs index 30b30d36f6..ed7fe9c4ea 100644 --- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -326,8 +326,7 @@ async fn main_impl( .cloned() .collect(); let weights = - rand::distributions::weighted::WeightedIndex::new(ranges.iter().map(|v| v.len())) - .unwrap(); + rand::distr::weighted::WeightedIndex::new(ranges.iter().map(|v| v.len())).unwrap(); Box::pin(async move { let scheme = match Url::parse(&args.page_service_connstring) { @@ -427,7 +426,7 @@ async fn run_worker( cancel: CancellationToken, rps_period: Option, ranges: Vec, - weights: rand::distributions::weighted::WeightedIndex, + weights: 
rand::distr::weighted::WeightedIndex, ) { shared_state.start_work_barrier.wait().await; let client_start = Instant::now(); @@ -469,9 +468,9 @@ async fn run_worker( } // Pick a random page from a random relation. - let mut rng = rand::thread_rng(); + let mut rng = rand::rng(); let r = &ranges[weights.sample(&mut rng)]; - let key: i128 = rng.gen_range(r.start..r.end); + let key: i128 = rng.random_range(r.start..r.end); let (rel_tag, block_no) = key_to_block(key); let mut blks = VecDeque::with_capacity(batch_size); @@ -502,7 +501,7 @@ async fn run_worker( // We assume that the entire batch can fit within the relation. assert_eq!(blks.len(), batch_size, "incomplete batch"); - let req_lsn = if rng.gen_bool(args.req_latest_probability) { + let req_lsn = if rng.random_bool(args.req_latest_probability) { Lsn::MAX } else { r.timeline_lsn diff --git a/pageserver/pagebench/src/cmd/ondemand_download_churn.rs b/pageserver/pagebench/src/cmd/ondemand_download_churn.rs index 9ff1e638c4..8fbb452140 100644 --- a/pageserver/pagebench/src/cmd/ondemand_download_churn.rs +++ b/pageserver/pagebench/src/cmd/ondemand_download_churn.rs @@ -7,7 +7,7 @@ use std::time::{Duration, Instant}; use pageserver_api::models::HistoricLayerInfo; use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api; -use rand::seq::SliceRandom; +use rand::seq::IndexedMutRandom; use tokio::sync::{OwnedSemaphorePermit, mpsc}; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; @@ -260,7 +260,7 @@ async fn timeline_actor( loop { let layer_tx = { - let mut rng = rand::thread_rng(); + let mut rng = rand::rng(); timeline.layers.choose_mut(&mut rng).expect("no layers") }; match layer_tx.try_send(permit.take().unwrap()) { diff --git a/pageserver/src/auth.rs b/pageserver/src/auth.rs index 4075427ab4..9e97fdaba8 100644 --- a/pageserver/src/auth.rs +++ b/pageserver/src/auth.rs @@ -20,7 +20,8 @@ pub fn check_permission(claims: &Claims, tenant_id: Option) -> Result< | Scope::GenerationsApi | Scope::Infra | Scope::Scrubber - | Scope::ControllerPeer, + | Scope::ControllerPeer + | Scope::TenantEndpoint, _, ) => Err(AuthError( format!( diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 1a44c80e2d..1f1a3f8157 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -11,6 +11,7 @@ //! from data stored in object storage. //! use std::fmt::Write as FmtWrite; +use std::sync::Arc; use std::time::{Instant, SystemTime}; use anyhow::{Context, anyhow}; @@ -420,12 +421,16 @@ where } let mut min_restart_lsn: Lsn = Lsn::MAX; + + let mut dbdir_cnt = 0; + let mut rel_cnt = 0; + // Create tablespace directories for ((spcnode, dbnode), has_relmap_file) in self.timeline.list_dbdirs(self.lsn, self.ctx).await? { self.add_dbdir(spcnode, dbnode, has_relmap_file).await?; - + dbdir_cnt += 1; // If full backup is requested, include all relation files. // Otherwise only include init forks of unlogged relations. let rels = self @@ -433,6 +438,7 @@ where .list_rels(spcnode, dbnode, Version::at(self.lsn), self.ctx) .await?; for &rel in rels.iter() { + rel_cnt += 1; // Send init fork as main fork to provide well formed empty // contents of UNLOGGED relations. Postgres copies it in // `reinit.c` during recovery. 
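Circling back to the `CheckRelExists` removal in the gRPC protocol above: per the new proto comment, existence checks now go through `GetRelSize` with `allow_missing=true`, where `GetRelSizeResponse` is `Option<u32>`. A sketch of the client-side pattern, assuming the `page_api` types from this patch and that `Client::get_rel_size` returns `tonic::Result<GetRelSizeResponse>`:

```rust
use pageserver_page_api as page_api;

/// Probes relation existence via GetRelSize, replacing the removed CheckRelExists RPC.
/// A sketch: field and method names follow this patch, error handling is left to the caller.
async fn rel_exists(
    client: &mut page_api::Client,
    read_lsn: page_api::ReadLsn,
    rel: page_api::RelTag,
) -> tonic::Result<bool> {
    let resp = client
        .get_rel_size(page_api::GetRelSizeRequest {
            read_lsn,
            rel,
            allow_missing: true, // missing relations yield None instead of a NotFound error
        })
        .await?;
    // GetRelSizeResponse = Option<u32>: None means the relation does not exist.
    Ok(resp.is_some())
}
```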
@@ -455,6 +461,10 @@ where } } + self.timeline + .db_rel_count + .store(Some(Arc::new((dbdir_cnt, rel_cnt)))); + let start_time = Instant::now(); let aux_files = self .timeline diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index dfb8b437c3..855af7009c 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -126,7 +126,6 @@ fn main() -> anyhow::Result<()> { Some(cfg) => tracing_utils::OtelEnablement::Enabled { service_name: "pageserver".to_string(), export_config: (&cfg.export_config).into(), - runtime: *COMPUTE_REQUEST_RUNTIME, }, None => tracing_utils::OtelEnablement::Disabled, }; diff --git a/pageserver/src/controller_upcall_client.rs b/pageserver/src/controller_upcall_client.rs index be1de43d18..96829bd6ea 100644 --- a/pageserver/src/controller_upcall_client.rs +++ b/pageserver/src/controller_upcall_client.rs @@ -1,4 +1,5 @@ use std::collections::HashMap; +use std::net::IpAddr; use futures::Future; use pageserver_api::config::NodeMetadata; @@ -16,7 +17,7 @@ use tokio_util::sync::CancellationToken; use url::Url; use utils::generation::Generation; use utils::id::{NodeId, TimelineId}; -use utils::{backoff, failpoint_support}; +use utils::{backoff, failpoint_support, ip_address}; use crate::config::PageServerConf; use crate::virtual_file::on_fatal_io_error; @@ -27,6 +28,7 @@ pub struct StorageControllerUpcallClient { http_client: reqwest::Client, base_url: Url, node_id: NodeId, + node_ip_addr: Option, cancel: CancellationToken, } @@ -40,6 +42,7 @@ pub trait StorageControllerUpcallApi { fn re_attach( &self, conf: &PageServerConf, + empty_local_disk: bool, ) -> impl Future< Output = Result, RetryForeverError>, > + Send; @@ -91,11 +94,18 @@ impl StorageControllerUpcallClient { ); } + // Intentionally panics if we encountered any errors parsing or reading the IP address. + // Note that if the required environment variable is not set, `read_node_ip_addr_from_env` returns `Ok(None)` + // instead of an error. 
+ let node_ip_addr = + ip_address::read_node_ip_addr_from_env().expect("Error reading node IP address."); + Self { http_client: client.build().expect("Failed to construct HTTP client"), base_url: url, node_id: conf.id, cancel: cancel.clone(), + node_ip_addr, } } @@ -146,6 +156,7 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient { async fn re_attach( &self, conf: &PageServerConf, + empty_local_disk: bool, ) -> Result, RetryForeverError> { let url = self .base_url @@ -193,8 +204,8 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient { listen_http_addr: m.http_host, listen_http_port: m.http_port, listen_https_port: m.https_port, + node_ip_addr: self.node_ip_addr, availability_zone_id: az_id.expect("Checked above"), - node_ip_addr: None, }) } Err(e) => { @@ -217,6 +228,7 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient { let request = ReAttachRequest { node_id: self.node_id, register: register.clone(), + empty_local_disk: Some(empty_local_disk), }; let response: ReAttachResponse = self diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index 7854fd9e36..51581ccc2c 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -768,6 +768,7 @@ mod test { async fn re_attach( &self, _conf: &PageServerConf, + _empty_local_disk: bool, ) -> Result, RetryForeverError> { unimplemented!() } diff --git a/pageserver/src/deletion_queue/validator.rs b/pageserver/src/deletion_queue/validator.rs index 363b1427f5..c9bfbd8adc 100644 --- a/pageserver/src/deletion_queue/validator.rs +++ b/pageserver/src/deletion_queue/validator.rs @@ -1,5 +1,5 @@ //! The validator is responsible for validating DeletionLists for execution, -//! based on whethe the generation in the DeletionList is still the latest +//! based on whether the generation in the DeletionList is still the latest //! generation for a tenant. //! //! The purpose of validation is to ensure split-brain safety in the cluster diff --git a/pageserver/src/feature_resolver.rs b/pageserver/src/feature_resolver.rs index f0178fd9b3..678d7e052b 100644 --- a/pageserver/src/feature_resolver.rs +++ b/pageserver/src/feature_resolver.rs @@ -155,7 +155,9 @@ impl FeatureResolver { ); let tenant_properties = PerTenantProperties { - remote_size_mb: Some(rand::thread_rng().gen_range(100.0..1000000.00)), + remote_size_mb: Some(rand::rng().random_range(100.0..1000000.00)), + db_count_max: Some(rand::rng().random_range(1..1000)), + rel_count_max: Some(rand::rng().random_range(1..1000)), } .into_posthog_properties(); @@ -344,6 +346,8 @@ impl FeatureResolver { struct PerTenantProperties { pub remote_size_mb: Option, + pub db_count_max: Option, + pub rel_count_max: Option, } impl PerTenantProperties { @@ -355,6 +359,18 @@ impl PerTenantProperties { PostHogFlagFilterPropertyValue::Number(remote_size_mb), ); } + if let Some(db_count) = self.db_count_max { + properties.insert( + "tenant_db_count_max".to_string(), + PostHogFlagFilterPropertyValue::Number(db_count as f64), + ); + } + if let Some(rel_count) = self.rel_count_max { + properties.insert( + "tenant_rel_count_max".to_string(), + PostHogFlagFilterPropertyValue::Number(rel_count as f64), + ); + } properties } } @@ -409,7 +425,11 @@ impl TenantFeatureResolver { /// Refresh the cached properties and flags on the critical path. pub fn refresh_properties_and_flags(&self, tenant_shard: &TenantShard) { + // Any of the remote size is none => this property is none. 
let mut remote_size_mb = Some(0.0); + // Any of the db or rel count is available => this property is available. + let mut db_count_max = None; + let mut rel_count_max = None; for timeline in tenant_shard.list_timelines() { let size = timeline.metrics.resident_physical_size_get(); if size == 0 { @@ -419,9 +439,25 @@ impl TenantFeatureResolver { if let Some(ref mut remote_size_mb) = remote_size_mb { *remote_size_mb += size as f64 / 1024.0 / 1024.0; } + if let Some(data) = timeline.db_rel_count.load_full() { + let (db_count, rel_count) = *data.as_ref(); + if db_count_max.is_none() { + db_count_max = Some(db_count); + } + if rel_count_max.is_none() { + rel_count_max = Some(rel_count); + } + db_count_max = db_count_max.map(|max| max.max(db_count)); + rel_count_max = rel_count_max.map(|max| max.max(rel_count)); + } } self.cached_tenant_properties.store(Arc::new( - PerTenantProperties { remote_size_mb }.into_posthog_properties(), + PerTenantProperties { + remote_size_mb, + db_count_max, + rel_count_max, + } + .into_posthog_properties(), )); // BEGIN: Update the feature flag on the critical path. diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 3e844a375d..3a08244d71 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -2357,6 +2357,7 @@ async fn timeline_compact_handler( flags, sub_compaction, sub_compaction_max_job_size_mb, + gc_compaction_do_metadata_compaction: false, }; let scheduled = compact_request diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 1fc7e4eac7..26a23da66f 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1636,9 +1636,10 @@ impl PageServerHandler { let (shard, ctx) = upgrade_handle_and_set_context!(shard); ( vec![ - Self::handle_get_nblocks_request(&shard, &req, &ctx) + Self::handle_get_nblocks_request(&shard, &req, false, &ctx) .instrument(span.clone()) .await + .map(|msg| msg.expect("allow_missing=false")) .map(|msg| (PagestreamBeMessage::Nblocks(msg), timer, ctx)) .map_err(|err| BatchedPageStreamError { err, req: req.hdr }), ], @@ -2303,12 +2304,16 @@ impl PageServerHandler { Ok(PagestreamExistsResponse { req: *req, exists }) } + /// If `allow_missing` is true, returns None instead of Err on missing relations. Otherwise, + /// never returns None. It is only supported by the gRPC protocol, so we pass it separately to + /// avoid changing the libpq protocol types. #[instrument(skip_all, fields(shard_id))] async fn handle_get_nblocks_request( timeline: &Timeline, req: &PagestreamNblocksRequest, + allow_missing: bool, ctx: &RequestContext, - ) -> Result { + ) -> Result, PageStreamError> { let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( timeline, @@ -2320,20 +2325,25 @@ impl PageServerHandler { .await?; let n_blocks = timeline - .get_rel_size( + .get_rel_size_in_reldir( req.rel, Version::LsnRange(LsnRange { effective_lsn: lsn, request_lsn: req.hdr.request_lsn, }), + None, + allow_missing, ctx, ) .await?; + let Some(n_blocks) = n_blocks else { + return Ok(None); + }; - Ok(PagestreamNblocksResponse { + Ok(Some(PagestreamNblocksResponse { req: *req, n_blocks, - }) + })) } #[instrument(skip_all, fields(shard_id))] @@ -3218,13 +3228,25 @@ where pub struct GrpcPageServiceHandler { tenant_manager: Arc, ctx: RequestContext, + + /// Cancelled to shut down the server. Tonic will shut down in response to this, but wait for + /// in-flight requests to complete. 
Any tasks we spawn ourselves must respect this token. + cancel: CancellationToken, + + /// Any tasks we spawn ourselves should clone this gate guard, so that we can wait for them to + /// complete during shutdown. Request handlers implicitly hold this guard already. gate_guard: GateGuard, + + /// `get_vectored` concurrency setting. get_vectored_concurrent_io: GetVectoredConcurrentIo, } impl GrpcPageServiceHandler { /// Spawns a gRPC server for the page service. /// + /// Returns a `CancellableTask` handle that can be used to shut down the server. It waits for + /// any in-flight requests and tasks to complete first. + /// /// TODO: this doesn't support TLS. We need TLS reloading via ReloadingCertificateResolver, so we /// need to reimplement the TCP+TLS accept loop ourselves. pub fn spawn( @@ -3234,12 +3256,15 @@ impl GrpcPageServiceHandler { get_vectored_concurrent_io: GetVectoredConcurrentIo, listener: std::net::TcpListener, ) -> anyhow::Result { + // Set up a cancellation token for shutting down the server, and a gate to wait for all + // requests and spawned tasks to complete. let cancel = CancellationToken::new(); + let gate = Gate::default(); + let ctx = RequestContextBuilder::new(TaskKind::PageRequestHandler) .download_behavior(DownloadBehavior::Download) .perf_span_dispatch(perf_trace_dispatch) .detached_child(); - let gate = Gate::default(); // Set up the TCP socket. We take a preconfigured TcpListener to bind the // port early during startup. @@ -3270,6 +3295,7 @@ impl GrpcPageServiceHandler { let page_service_handler = GrpcPageServiceHandler { tenant_manager, ctx, + cancel: cancel.clone(), gate_guard: gate.enter().expect("gate was just created"), get_vectored_concurrent_io, }; @@ -3306,19 +3332,20 @@ impl GrpcPageServiceHandler { .build_v1()?; let server = server.add_service(reflection_service); - // Spawn server task. + // Spawn server task. It runs until the cancellation token fires and in-flight requests and + // tasks complete. The `CancellableTask` will wait for the task's join handle, which + // implicitly waits for the gate to close. let task_cancel = cancel.clone(); let task = COMPUTE_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( - "grpc listener", + "grpc pageservice listener", async move { - let result = server + server .serve_with_incoming_shutdown(incoming, task_cancel.cancelled()) - .await; - if result.is_ok() { - // TODO: revisit shutdown logic once page service is implemented. - gate.close().await; - } - result + .await?; + // Server exited cleanly. All requests should have completed by now. Wait for any + // spawned tasks to complete as well (e.g. IoConcurrency sidecars) via the gate. + gate.close().await; + anyhow::Ok(()) }, )); @@ -3508,7 +3535,10 @@ impl GrpcPageServiceHandler { /// Implements the gRPC page service. /// -/// TODO: cancellation. +/// On client disconnect (e.g. timeout or client shutdown), Tonic will drop the request handler +/// futures, so the read path must be cancellation-safe. On server shutdown, Tonic will wait for +/// in-flight requests to complete. +/// /// TODO: when the libpq impl is removed, remove the Pagestream types and inline the handler code. 
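The handler comments above describe the shutdown protocol: fire the cancellation token, let Tonic drain in-flight requests, then wait on the gate for any tasks the handler spawned itself (basebackup streaming, IoConcurrency sidecars). The pattern in isolation looks roughly like this (a sketch; the `Gate` import path and the error handling are assumptions, not code from the patch):

```rust
use tokio_util::sync::CancellationToken;
use utils::sync::gate::Gate; // assumed path for the same Gate/GateGuard types used above

async fn do_work() { /* placeholder for the spawned task's body */ }

/// Each background task holds a GateGuard and watches the cancellation token.
fn spawn_background_task(cancel: CancellationToken, gate: &Gate) -> anyhow::Result<()> {
    let gate_guard = gate
        .enter()
        .map_err(|_| anyhow::anyhow!("shutting down"))?;
    tokio::spawn(async move {
        let _gate_guard = gate_guard; // keeps the gate open until this task finishes
        tokio::select! {
            _ = cancel.cancelled() => {} // stop promptly once shutdown starts
            _ = do_work() => {}
        }
    });
    Ok(())
}

/// Shutdown: cancel first so tasks and the server stop accepting work, then wait
/// for every outstanding GateGuard to drop.
async fn shutdown(cancel: &CancellationToken, gate: &Gate) {
    cancel.cancel();
    gate.close().await;
}
```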
#[tonic::async_trait] impl proto::PageService for GrpcPageServiceHandler { @@ -3519,39 +3549,6 @@ impl proto::PageService for GrpcPageServiceHandler { type GetPagesStream = Pin> + Send>>; - #[instrument(skip_all, fields(rel, lsn))] - async fn check_rel_exists( - &self, - req: tonic::Request, - ) -> Result, tonic::Status> { - let received_at = extract::(&req).0; - let timeline = self.get_request_timeline(&req).await?; - let ctx = self.ctx.with_scope_page_service_pagestream(&timeline); - - // Validate the request, decorate the span, and convert it to a Pagestream request. - Self::ensure_shard_zero(&timeline)?; - let req: page_api::CheckRelExistsRequest = req.into_inner().try_into()?; - - span_record!(rel=%req.rel, lsn=%req.read_lsn); - - let req = PagestreamExistsRequest { - hdr: Self::make_hdr(req.read_lsn, None), - rel: req.rel, - }; - - // Execute the request and convert the response. - let _timer = Self::record_op_start_and_throttle( - &timeline, - metrics::SmgrQueryType::GetRelExists, - received_at, - ) - .await?; - - let resp = PageServerHandler::handle_get_rel_exists_request(&timeline, &req, &ctx).await?; - let resp: page_api::CheckRelExistsResponse = resp.exists; - Ok(tonic::Response::new(resp.into())) - } - #[instrument(skip_all, fields(lsn))] async fn get_base_backup( &self, @@ -3593,8 +3590,14 @@ impl proto::PageService for GrpcPageServiceHandler { // Spawn a task to run the basebackup. let span = Span::current(); + let gate_guard = self + .gate_guard + .try_clone() + .map_err(|_| tonic::Status::unavailable("shutting down"))?; let (mut simplex_read, mut simplex_write) = tokio::io::simplex(CHUNK_SIZE); let jh = tokio::spawn(async move { + let _gate_guard = gate_guard; // keep gate open until task completes + let gzip_level = match req.compression { page_api::BaseBackupCompression::None => None, // NB: using fast compression because it's on the critical path for compute @@ -3718,15 +3721,17 @@ impl proto::PageService for GrpcPageServiceHandler { .await?; // Spawn an IoConcurrency sidecar, if enabled. - let Ok(gate_guard) = self.gate_guard.try_clone() else { - return Err(tonic::Status::unavailable("shutting down")); - }; + let gate_guard = self + .gate_guard + .try_clone() + .map_err(|_| tonic::Status::unavailable("shutting down"))?; let io_concurrency = IoConcurrency::spawn_from_conf(self.get_vectored_concurrent_io, gate_guard); - // Spawn a task to handle the GetPageRequest stream. + // Construct the GetPageRequest stream handler. let span = Span::current(); let ctx = self.ctx.attached_child(); + let cancel = self.cancel.clone(); let mut reqs = req.into_inner(); let resps = async_stream::try_stream! { @@ -3734,7 +3739,19 @@ impl proto::PageService for GrpcPageServiceHandler { .get(ttid.tenant_id, ttid.timeline_id, shard_selector) .await? .downgrade(); - while let Some(req) = reqs.message().await? { + loop { + // NB: Tonic considers the entire stream to be an in-flight request and will wait + // for it to complete before shutting down. React to cancellation between requests. + let req = tokio::select! 
{ + biased; + _ = cancel.cancelled() => Err(tonic::Status::unavailable("shutting down")), + + result = reqs.message() => match result { + Ok(Some(req)) => Ok(req), + Ok(None) => break, // client closed the stream + Err(err) => Err(err), + }, + }?; let req_id = req.request_id.map(page_api::RequestID::from).unwrap_or_default(); let result = Self::get_page(&ctx, &timeline, req, io_concurrency.clone()) .instrument(span.clone()) // propagate request span @@ -3758,7 +3775,7 @@ impl proto::PageService for GrpcPageServiceHandler { Ok(tonic::Response::new(Box::pin(resps))) } - #[instrument(skip_all, fields(rel, lsn))] + #[instrument(skip_all, fields(rel, lsn, allow_missing))] async fn get_rel_size( &self, req: tonic::Request, @@ -3770,8 +3787,9 @@ impl proto::PageService for GrpcPageServiceHandler { // Validate the request, decorate the span, and convert it to a Pagestream request. Self::ensure_shard_zero(&timeline)?; let req: page_api::GetRelSizeRequest = req.into_inner().try_into()?; + let allow_missing = req.allow_missing; - span_record!(rel=%req.rel, lsn=%req.read_lsn); + span_record!(rel=%req.rel, lsn=%req.read_lsn, allow_missing=%req.allow_missing); let req = PagestreamNblocksRequest { hdr: Self::make_hdr(req.read_lsn, None), @@ -3786,8 +3804,11 @@ impl proto::PageService for GrpcPageServiceHandler { ) .await?; - let resp = PageServerHandler::handle_get_nblocks_request(&timeline, &req, &ctx).await?; - let resp: page_api::GetRelSizeResponse = resp.n_blocks; + let resp = + PageServerHandler::handle_get_nblocks_request(&timeline, &req, allow_missing, &ctx) + .await?; + let resp: page_api::GetRelSizeResponse = resp.map(|resp| resp.n_blocks); + Ok(tonic::Response::new(resp.into())) } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 8532a6938f..c9f3184188 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -8,6 +8,7 @@ //! use std::collections::{HashMap, HashSet, hash_map}; use std::ops::{ControlFlow, Range}; +use std::sync::Arc; use crate::walingest::{WalIngestError, WalIngestErrorKind}; use crate::{PERF_TRACE_TARGET, ensure_walingest}; @@ -25,9 +26,9 @@ use pageserver_api::keyspace::{KeySpaceRandomAccum, SparseKeySpace}; use pageserver_api::models::RelSizeMigration; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; use pageserver_api::shard::ShardIdentity; -use postgres_ffi::{BLCKSZ, PgMajorVersion, TimestampTz, TransactionId}; +use postgres_ffi::{BLCKSZ, PgMajorVersion, TransactionId}; use postgres_ffi_types::forknum::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; -use postgres_ffi_types::{Oid, RepOriginId}; +use postgres_ffi_types::{Oid, RepOriginId, TimestampTz}; use serde::{Deserialize, Serialize}; use strum::IntoEnumIterator; use tokio_util::sync::CancellationToken; @@ -286,6 +287,10 @@ impl Timeline { /// Like [`Self::get_rel_page_at_lsn`], but returns a batch of pages. /// /// The ordering of the returned vec corresponds to the ordering of `pages`. + /// + /// NB: the read path must be cancellation-safe. The Tonic gRPC service will drop the future + /// if the client goes away (e.g. due to timeout or cancellation). + /// TODO: verify that it actually is cancellation-safe. pub(crate) async fn get_rel_page_at_lsn_batched( &self, pages: impl ExactSizeIterator, @@ -500,8 +505,9 @@ impl Timeline { for rel in rels { let n_blocks = self - .get_rel_size_in_reldir(rel, version, Some((reldir_key, &reldir)), ctx) - .await?; + .get_rel_size_in_reldir(rel, version, Some((reldir_key, &reldir)), false, ctx) + .await? 
+ .expect("allow_missing=false"); total_blocks += n_blocks as usize; } Ok(total_blocks) @@ -517,10 +523,16 @@ impl Timeline { version: Version<'_>, ctx: &RequestContext, ) -> Result { - self.get_rel_size_in_reldir(tag, version, None, ctx).await + Ok(self + .get_rel_size_in_reldir(tag, version, None, false, ctx) + .await? + .expect("allow_missing=false")) } - /// Get size of a relation file. The relation must exist, otherwise an error is returned. + /// Get size of a relation file. If `allow_missing` is true, returns None for missing relations, + /// otherwise errors. + /// + /// INVARIANT: never returns None if `allow_missing=false`. /// /// See [`Self::get_rel_exists_in_reldir`] on why we need `deserialized_reldir_v1`. pub(crate) async fn get_rel_size_in_reldir( @@ -528,8 +540,9 @@ impl Timeline { tag: RelTag, version: Version<'_>, deserialized_reldir_v1: Option<(Key, &RelDirectory)>, + allow_missing: bool, ctx: &RequestContext, - ) -> Result { + ) -> Result, PageReconstructError> { if tag.relnode == 0 { return Err(PageReconstructError::Other( RelationError::InvalidRelnode.into(), @@ -537,7 +550,15 @@ impl Timeline { } if let Some(nblocks) = self.get_cached_rel_size(&tag, version) { - return Ok(nblocks); + return Ok(Some(nblocks)); + } + + if allow_missing + && !self + .get_rel_exists_in_reldir(tag, version, deserialized_reldir_v1, ctx) + .await? + { + return Ok(None); } if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM) @@ -549,7 +570,7 @@ impl Timeline { // FSM, and smgrnblocks() on it immediately afterwards, // without extending it. Tolerate that by claiming that // any non-existent FSM fork has size 0. - return Ok(0); + return Ok(Some(0)); } let key = rel_size_to_key(tag); @@ -558,7 +579,7 @@ impl Timeline { self.update_cached_rel_size(tag, version, nblocks); - Ok(nblocks) + Ok(Some(nblocks)) } /// Does the relation exist? @@ -813,6 +834,7 @@ impl Timeline { let gc_cutoff_lsn_guard = self.get_applied_gc_cutoff_lsn(); let gc_cutoff_planned = { let gc_info = self.gc_info.read().unwrap(); + info!(cutoffs=?gc_info.cutoffs, applied_cutoff=%*gc_cutoff_lsn_guard, "starting find_lsn_for_timestamp"); gc_info.min_cutoff() }; // Usually the planned cutoff is newer than the cutoff of the last gc run, @@ -1233,11 +1255,16 @@ impl Timeline { let dbdir = DbDirectory::des(&buf)?; let mut total_size: u64 = 0; + let mut dbdir_cnt = 0; + let mut rel_cnt = 0; + for (spcnode, dbnode) in dbdir.dbdirs.keys() { + dbdir_cnt += 1; for rel in self .list_rels(*spcnode, *dbnode, Version::at(lsn), ctx) .await? 
{ + rel_cnt += 1; if self.cancel.is_cancelled() { return Err(CalculateLogicalSizeError::Cancelled); } @@ -1248,6 +1275,10 @@ impl Timeline { total_size += relsize as u64; } } + + self.db_rel_count + .store(Some(Arc::new((dbdir_cnt, rel_cnt)))); + Ok(total_size * BLCKSZ as u64) } @@ -2907,9 +2938,8 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]); mod tests { use hex_literal::hex; use pageserver_api::models::ShardParameters; - use pageserver_api::shard::ShardStripeSize; use utils::id::TimelineId; - use utils::shard::{ShardCount, ShardNumber}; + use utils::shard::{ShardCount, ShardNumber, ShardStripeSize}; use super::*; use crate::DEFAULT_PG_VERSION; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 1a3016e7f1..4c8856c386 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -6161,11 +6161,11 @@ mod tests { use pageserver_api::keyspace::KeySpaceRandomAccum; use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings, LsnLease}; use pageserver_compaction::helpers::overlaps_with; + use rand::Rng; #[cfg(feature = "testing")] use rand::SeedableRng; #[cfg(feature = "testing")] use rand::rngs::StdRng; - use rand::{Rng, thread_rng}; #[cfg(feature = "testing")] use std::ops::Range; use storage_layer::{IoConcurrency, PersistentLayerKey}; @@ -6286,8 +6286,8 @@ mod tests { while lsn < lsn_range.end { let mut key = key_range.start; while key < key_range.end { - let gap = random.gen_range(1..=100) <= spec.gap_chance; - let will_init = random.gen_range(1..=100) <= spec.will_init_chance; + let gap = random.random_range(1..=100) <= spec.gap_chance; + let will_init = random.random_range(1..=100) <= spec.will_init_chance; if gap { continue; @@ -6330,8 +6330,8 @@ mod tests { while lsn < lsn_range.end { let mut key = key_range.start; while key < key_range.end { - let gap = random.gen_range(1..=100) <= spec.gap_chance; - let will_init = random.gen_range(1..=100) <= spec.will_init_chance; + let gap = random.random_range(1..=100) <= spec.gap_chance; + let will_init = random.random_range(1..=100) <= spec.will_init_chance; if gap { continue; @@ -7808,7 +7808,7 @@ mod tests { for _ in 0..50 { for _ in 0..NUM_KEYS { lsn = Lsn(lsn.0 + 0x10); - let blknum = thread_rng().gen_range(0..NUM_KEYS); + let blknum = rand::rng().random_range(0..NUM_KEYS); test_key.field6 = blknum as u32; let mut writer = tline.writer().await; writer @@ -7897,7 +7897,7 @@ mod tests { for _ in 0..NUM_KEYS { lsn = Lsn(lsn.0 + 0x10); - let blknum = thread_rng().gen_range(0..NUM_KEYS); + let blknum = rand::rng().random_range(0..NUM_KEYS); test_key.field6 = blknum as u32; let mut writer = tline.writer().await; writer @@ -7965,7 +7965,7 @@ mod tests { for _ in 0..NUM_KEYS { lsn = Lsn(lsn.0 + 0x10); - let blknum = thread_rng().gen_range(0..NUM_KEYS); + let blknum = rand::rng().random_range(0..NUM_KEYS); test_key.field6 = blknum as u32; let mut writer = tline.writer().await; writer @@ -8229,7 +8229,7 @@ mod tests { for _ in 0..NUM_KEYS { lsn = Lsn(lsn.0 + 0x10); - let blknum = thread_rng().gen_range(0..NUM_KEYS); + let blknum = rand::rng().random_range(0..NUM_KEYS); test_key.field6 = (blknum * STEP) as u32; let mut writer = tline.writer().await; writer @@ -8502,7 +8502,7 @@ mod tests { for iter in 1..=10 { for _ in 0..NUM_KEYS { lsn = Lsn(lsn.0 + 0x10); - let blknum = thread_rng().gen_range(0..NUM_KEYS); + let blknum = rand::rng().random_range(0..NUM_KEYS); test_key.field6 = (blknum * STEP) as u32; let mut writer = tline.writer().await; writer @@ -9216,7 +9216,11 @@ mod 
tests { let cancel = CancellationToken::new(); tline - .compact_with_gc(&cancel, CompactOptions::default(), &ctx) + .compact_with_gc( + &cancel, + CompactOptions::default_for_gc_compaction_unit_tests(), + &ctx, + ) .await .unwrap(); @@ -9299,7 +9303,11 @@ mod tests { guard.cutoffs.space = Lsn(0x40); } tline - .compact_with_gc(&cancel, CompactOptions::default(), &ctx) + .compact_with_gc( + &cancel, + CompactOptions::default_for_gc_compaction_unit_tests(), + &ctx, + ) .await .unwrap(); @@ -9836,7 +9844,11 @@ mod tests { let cancel = CancellationToken::new(); tline - .compact_with_gc(&cancel, CompactOptions::default(), &ctx) + .compact_with_gc( + &cancel, + CompactOptions::default_for_gc_compaction_unit_tests(), + &ctx, + ) .await .unwrap(); @@ -9871,7 +9883,11 @@ mod tests { guard.cutoffs.space = Lsn(0x40); } tline - .compact_with_gc(&cancel, CompactOptions::default(), &ctx) + .compact_with_gc( + &cancel, + CompactOptions::default_for_gc_compaction_unit_tests(), + &ctx, + ) .await .unwrap(); @@ -10446,7 +10462,7 @@ mod tests { &cancel, CompactOptions { flags: dryrun_flags, - ..Default::default() + ..CompactOptions::default_for_gc_compaction_unit_tests() }, &ctx, ) @@ -10457,14 +10473,22 @@ mod tests { verify_result().await; tline - .compact_with_gc(&cancel, CompactOptions::default(), &ctx) + .compact_with_gc( + &cancel, + CompactOptions::default_for_gc_compaction_unit_tests(), + &ctx, + ) .await .unwrap(); verify_result().await; // compact again tline - .compact_with_gc(&cancel, CompactOptions::default(), &ctx) + .compact_with_gc( + &cancel, + CompactOptions::default_for_gc_compaction_unit_tests(), + &ctx, + ) .await .unwrap(); verify_result().await; @@ -10483,14 +10507,22 @@ mod tests { guard.cutoffs.space = Lsn(0x38); } tline - .compact_with_gc(&cancel, CompactOptions::default(), &ctx) + .compact_with_gc( + &cancel, + CompactOptions::default_for_gc_compaction_unit_tests(), + &ctx, + ) .await .unwrap(); verify_result().await; // no wals between 0x30 and 0x38, so we should obtain the same result // not increasing the GC horizon and compact again tline - .compact_with_gc(&cancel, CompactOptions::default(), &ctx) + .compact_with_gc( + &cancel, + CompactOptions::default_for_gc_compaction_unit_tests(), + &ctx, + ) .await .unwrap(); verify_result().await; @@ -10695,7 +10727,7 @@ mod tests { &cancel, CompactOptions { flags: dryrun_flags, - ..Default::default() + ..CompactOptions::default_for_gc_compaction_unit_tests() }, &ctx, ) @@ -10706,14 +10738,22 @@ mod tests { verify_result().await; tline - .compact_with_gc(&cancel, CompactOptions::default(), &ctx) + .compact_with_gc( + &cancel, + CompactOptions::default_for_gc_compaction_unit_tests(), + &ctx, + ) .await .unwrap(); verify_result().await; // compact again tline - .compact_with_gc(&cancel, CompactOptions::default(), &ctx) + .compact_with_gc( + &cancel, + CompactOptions::default_for_gc_compaction_unit_tests(), + &ctx, + ) .await .unwrap(); verify_result().await; @@ -10913,7 +10953,11 @@ mod tests { let cancel = CancellationToken::new(); branch_tline - .compact_with_gc(&cancel, CompactOptions::default(), &ctx) + .compact_with_gc( + &cancel, + CompactOptions::default_for_gc_compaction_unit_tests(), + &ctx, + ) .await .unwrap(); @@ -10926,7 +10970,7 @@ mod tests { &cancel, CompactOptions { compact_lsn_range: Some(CompactLsnRange::above(Lsn(0x40))), - ..Default::default() + ..CompactOptions::default_for_gc_compaction_unit_tests() }, &ctx, ) @@ -11247,10 +11291,10 @@ mod tests { #[cfg(feature = "testing")] #[tokio::test] async fn test_read_path() -> 
anyhow::Result<()> { - use rand::seq::SliceRandom; + use rand::seq::IndexedRandom; let seed = if cfg!(feature = "fuzz-read-path") { - let seed: u64 = thread_rng().r#gen(); + let seed: u64 = rand::rng().random(); seed } else { // Use a hard-coded seed when not in fuzzing mode. @@ -11264,8 +11308,8 @@ mod tests { let (queries, will_init_chance, gap_chance) = if cfg!(feature = "fuzz-read-path") { const QUERIES: u64 = 5000; - let will_init_chance: u8 = random.gen_range(0..=10); - let gap_chance: u8 = random.gen_range(0..=50); + let will_init_chance: u8 = random.random_range(0..=10); + let gap_chance: u8 = random.random_range(0..=50); (QUERIES, will_init_chance, gap_chance) } else { @@ -11366,7 +11410,8 @@ mod tests { while used_keys.len() < tenant.conf.max_get_vectored_keys.get() { let selected_lsn = interesting_lsns.choose(&mut random).expect("not empty"); - let mut selected_key = start_key.add(random.gen_range(0..KEY_DIMENSION_SIZE)); + let mut selected_key = + start_key.add(random.random_range(0..KEY_DIMENSION_SIZE)); while used_keys.len() < tenant.conf.max_get_vectored_keys.get() { if used_keys.contains(&selected_key) @@ -11381,7 +11426,7 @@ mod tests { .add_key(selected_key); used_keys.insert(selected_key); - let pick_next = random.gen_range(0..=100) <= PICK_NEXT_CHANCE; + let pick_next = random.random_range(0..=100) <= PICK_NEXT_CHANCE; if pick_next { selected_key = selected_key.next(); } else { @@ -11594,7 +11639,7 @@ mod tests { CompactOptions { flags: EnumSet::new(), compact_key_range: Some((get_key(0)..get_key(2)).into()), - ..Default::default() + ..CompactOptions::default_for_gc_compaction_unit_tests() }, &ctx, ) @@ -11641,7 +11686,7 @@ mod tests { CompactOptions { flags: EnumSet::new(), compact_key_range: Some((get_key(2)..get_key(4)).into()), - ..Default::default() + ..CompactOptions::default_for_gc_compaction_unit_tests() }, &ctx, ) @@ -11693,7 +11738,7 @@ mod tests { CompactOptions { flags: EnumSet::new(), compact_key_range: Some((get_key(4)..get_key(9)).into()), - ..Default::default() + ..CompactOptions::default_for_gc_compaction_unit_tests() }, &ctx, ) @@ -11744,7 +11789,7 @@ mod tests { CompactOptions { flags: EnumSet::new(), compact_key_range: Some((get_key(9)..get_key(10)).into()), - ..Default::default() + ..CompactOptions::default_for_gc_compaction_unit_tests() }, &ctx, ) @@ -11800,7 +11845,7 @@ mod tests { CompactOptions { flags: EnumSet::new(), compact_key_range: Some((get_key(0)..get_key(10)).into()), - ..Default::default() + ..CompactOptions::default_for_gc_compaction_unit_tests() }, &ctx, ) @@ -12071,7 +12116,7 @@ mod tests { &cancel, CompactOptions { compact_lsn_range: Some(CompactLsnRange::above(Lsn(0x28))), - ..Default::default() + ..CompactOptions::default_for_gc_compaction_unit_tests() }, &ctx, ) @@ -12106,7 +12151,11 @@ mod tests { // compact again tline - .compact_with_gc(&cancel, CompactOptions::default(), &ctx) + .compact_with_gc( + &cancel, + CompactOptions::default_for_gc_compaction_unit_tests(), + &ctx, + ) .await .unwrap(); verify_result().await; @@ -12325,7 +12374,7 @@ mod tests { CompactOptions { compact_key_range: Some((get_key(0)..get_key(2)).into()), compact_lsn_range: Some((Lsn(0x20)..Lsn(0x28)).into()), - ..Default::default() + ..CompactOptions::default_for_gc_compaction_unit_tests() }, &ctx, ) @@ -12371,7 +12420,7 @@ mod tests { CompactOptions { compact_key_range: Some((get_key(3)..get_key(8)).into()), compact_lsn_range: Some((Lsn(0x28)..Lsn(0x40)).into()), - ..Default::default() + ..CompactOptions::default_for_gc_compaction_unit_tests() }, &ctx, 
) @@ -12419,7 +12468,7 @@ mod tests { CompactOptions { compact_key_range: Some((get_key(0)..get_key(5)).into()), compact_lsn_range: Some((Lsn(0x20)..Lsn(0x50)).into()), - ..Default::default() + ..CompactOptions::default_for_gc_compaction_unit_tests() }, &ctx, ) @@ -12454,7 +12503,11 @@ mod tests { // final full compaction tline - .compact_with_gc(&cancel, CompactOptions::default(), &ctx) + .compact_with_gc( + &cancel, + CompactOptions::default_for_gc_compaction_unit_tests(), + &ctx, + ) .await .unwrap(); verify_result().await; @@ -12564,7 +12617,7 @@ mod tests { CompactOptions { compact_key_range: None, compact_lsn_range: None, - ..Default::default() + ..CompactOptions::default_for_gc_compaction_unit_tests() }, &ctx, ) diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index ed541c4f12..29320f088c 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -535,8 +535,8 @@ pub(crate) mod tests { } pub(crate) fn random_array(len: usize) -> Vec { - let mut rng = rand::thread_rng(); - (0..len).map(|_| rng.r#gen()).collect::<_>() + let mut rng = rand::rng(); + (0..len).map(|_| rng.random()).collect::<_>() } #[tokio::test] @@ -588,9 +588,9 @@ pub(crate) mod tests { let mut rng = rand::rngs::StdRng::seed_from_u64(42); let blobs = (0..1024) .map(|_| { - let mut sz: u16 = rng.r#gen(); + let mut sz: u16 = rng.random(); // Make 50% of the arrays small - if rng.r#gen() { + if rng.random() { sz &= 63; } random_array(sz.into()) diff --git a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs index 419befa41b..40f405307c 100644 --- a/pageserver/src/tenant/disk_btree.rs +++ b/pageserver/src/tenant/disk_btree.rs @@ -1090,7 +1090,7 @@ pub(crate) mod tests { const NUM_KEYS: usize = 100000; let mut all_data: BTreeMap = BTreeMap::new(); for idx in 0..NUM_KEYS { - let u: f64 = rand::thread_rng().gen_range(0.0..1.0); + let u: f64 = rand::rng().random_range(0.0..1.0); let t = -(f64::ln(u)); let key_int = (t * 1000000.0) as u128; @@ -1116,7 +1116,7 @@ pub(crate) mod tests { // Test get() operations on random keys, most of which will not exist for _ in 0..100000 { - let key_int = rand::thread_rng().r#gen::(); + let key_int = rand::rng().random::(); let search_key = u128::to_be_bytes(key_int); assert!(reader.get(&search_key, &ctx).await? 
== all_data.get(&key_int).cloned()); } diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 203b5bf592..f2be129090 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -508,8 +508,8 @@ mod tests { let write_nbytes = cap * 2 + cap / 2; - let content: Vec = rand::thread_rng() - .sample_iter(rand::distributions::Standard) + let content: Vec = rand::rng() + .sample_iter(rand::distr::StandardUniform) .take(write_nbytes) .collect(); @@ -565,8 +565,8 @@ mod tests { let cap = writer.mutable().capacity(); drop(writer); - let content: Vec = rand::thread_rng() - .sample_iter(rand::distributions::Standard) + let content: Vec = rand::rng() + .sample_iter(rand::distr::StandardUniform) .take(cap * 2 + cap / 2) .collect(); @@ -614,8 +614,8 @@ mod tests { let cap = mutable.capacity(); let align = mutable.align(); drop(writer); - let content: Vec = rand::thread_rng() - .sample_iter(rand::distributions::Standard) + let content: Vec = rand::rng() + .sample_iter(rand::distr::StandardUniform) .take(cap * 2 + cap / 2) .collect(); diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 52f67abde5..b47bab16d8 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -19,7 +19,7 @@ use pageserver_api::shard::{ }; use pageserver_api::upcall_api::ReAttachResponseTenant; use rand::Rng; -use rand::distributions::Alphanumeric; +use rand::distr::Alphanumeric; use remote_storage::TimeoutOrCancel; use sysinfo::SystemExt; use tokio::fs; @@ -218,7 +218,7 @@ async fn safe_rename_tenant_dir(path: impl AsRef) -> std::io::Result TenantStartupMode::Attached(( alc.attach_mode, alc.generation, - ShardStripeSize::default(), + lc.shard.stripe_size, )), LocationMode::Secondary(_) => TenantStartupMode::Secondary, }, @@ -352,7 +352,8 @@ async fn init_load_generations( let client = StorageControllerUpcallClient::new(conf, cancel); info!("Calling {} API to re-attach tenants", client.base_url()); // If we are configured to use the control plane API, then it is the source of truth for what tenants to load. - match client.re_attach(conf).await { + let empty_local_disk = tenant_confs.is_empty(); + match client.re_attach(conf, empty_local_disk).await { Ok(tenants) => tenants .into_iter() .flat_map(|(id, rart)| { diff --git a/pageserver/src/tenant/remote_timeline_client/manifest.rs b/pageserver/src/tenant/remote_timeline_client/manifest.rs index 7dba4508e2..41e9647d8f 100644 --- a/pageserver/src/tenant/remote_timeline_client/manifest.rs +++ b/pageserver/src/tenant/remote_timeline_client/manifest.rs @@ -1,8 +1,8 @@ use chrono::NaiveDateTime; -use pageserver_api::shard::ShardStripeSize; use serde::{Deserialize, Serialize}; use utils::id::TimelineId; use utils::lsn::Lsn; +use utils::shard::ShardStripeSize; /// Tenant shard manifest, stored in remote storage. Contains offloaded timelines and other tenant /// shard-wide information that must be persisted in remote storage. 
diff --git a/pageserver/src/tenant/secondary/scheduler.rs b/pageserver/src/tenant/secondary/scheduler.rs index 62ca527bbc..8dc1d57b5d 100644 --- a/pageserver/src/tenant/secondary/scheduler.rs +++ b/pageserver/src/tenant/secondary/scheduler.rs @@ -25,7 +25,7 @@ pub(super) fn period_jitter(d: Duration, pct: u32) -> Duration { if d == Duration::ZERO { d } else { - rand::thread_rng().gen_range((d * (100 - pct)) / 100..(d * (100 + pct)) / 100) + rand::rng().random_range((d * (100 - pct)) / 100..(d * (100 + pct)) / 100) } } @@ -35,7 +35,7 @@ pub(super) fn period_warmup(period: Duration) -> Duration { if period == Duration::ZERO { period } else { - rand::thread_rng().gen_range(Duration::ZERO..period) + rand::rng().random_range(Duration::ZERO..period) } } diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index c2f76c859c..f963fdac92 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -1634,7 +1634,8 @@ pub(crate) mod test { use bytes::Bytes; use itertools::MinMaxResult; use postgres_ffi::PgMajorVersion; - use rand::prelude::{SeedableRng, SliceRandom, StdRng}; + use rand::prelude::{SeedableRng, StdRng}; + use rand::seq::IndexedRandom; use rand::{Rng, RngCore}; /// Construct an index for a fictional delta layer and and then @@ -1788,14 +1789,14 @@ pub(crate) mod test { let mut entries = Vec::new(); for _ in 0..constants::KEY_COUNT { - let count = rng.gen_range(1..constants::MAX_ENTRIES_PER_KEY); + let count = rng.random_range(1..constants::MAX_ENTRIES_PER_KEY); let mut lsns_iter = std::iter::successors(Some(Lsn(constants::LSN_OFFSET.0 + 0x08)), |lsn| { Some(Lsn(lsn.0 + 0x08)) }); let mut lsns = Vec::new(); while lsns.len() < count as usize { - let take = rng.gen_bool(0.5); + let take = rng.random_bool(0.5); let lsn = lsns_iter.next().unwrap(); if take { lsns.push(lsn); @@ -1869,12 +1870,13 @@ pub(crate) mod test { for _ in 0..constants::RANGES_COUNT { let mut range: Option> = Option::default(); while range.is_none() || keyspace.overlaps(range.as_ref().unwrap()) { - let range_start = rng.gen_range(start..end); + let range_start = rng.random_range(start..end); let range_end_offset = range_start + constants::MIN_RANGE_SIZE; if range_end_offset >= end { range = Some(Key::from_i128(range_start)..Key::from_i128(end)); } else { - let range_end = rng.gen_range((range_start + constants::MIN_RANGE_SIZE)..end); + let range_end = + rng.random_range((range_start + constants::MIN_RANGE_SIZE)..end); range = Some(Key::from_i128(range_start)..Key::from_i128(range_end)); } } diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs b/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs index 27fbc6f5fb..84f4386087 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs @@ -440,8 +440,8 @@ mod tests { impl InMemoryFile { fn new_random(len: usize) -> Self { Self { - content: rand::thread_rng() - .sample_iter(rand::distributions::Standard) + content: rand::rng() + .sample_iter(rand::distr::StandardUniform) .take(len) .collect(), } @@ -498,7 +498,7 @@ mod tests { len } }; - rand::Rng::fill(&mut rand::thread_rng(), &mut dst_slice[nread..]); // to discover bugs + rand::Rng::fill(&mut rand::rng(), &mut dst_slice[nread..]); // to discover bugs Ok((dst, nread)) } } @@ -763,7 +763,7 @@ mod tests { let len = std::cmp::min(dst.bytes_total(), 
mocked_bytes.len()); let dst_slice: &mut [u8] = dst.as_mut_rust_slice_full_zeroed(); dst_slice[..len].copy_from_slice(&mocked_bytes[..len]); - rand::Rng::fill(&mut rand::thread_rng(), &mut dst_slice[len..]); // to discover bugs + rand::Rng::fill(&mut rand::rng(), &mut dst_slice[len..]); // to discover bugs Ok((dst, len)) } Err(e) => Err(std::io::Error::other(e)), diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 08fc7d61a5..676b39e55b 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -515,7 +515,7 @@ pub(crate) async fn sleep_random_range( interval: RangeInclusive, cancel: &CancellationToken, ) -> Result { - let delay = rand::thread_rng().gen_range(interval); + let delay = rand::rng().random_range(interval); if delay == Duration::ZERO { return Ok(delay); } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 73d2d72b59..3ef07aa414 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -287,7 +287,7 @@ pub struct Timeline { ancestor_lsn: Lsn, // The LSN of gc-compaction that was last applied to this timeline. - gc_compaction_state: ArcSwap>, + gc_compaction_state: ArcSwapOption, pub(crate) metrics: Arc, @@ -448,7 +448,11 @@ pub struct Timeline { /// A channel to send async requests to prepare a basebackup for the basebackup cache. basebackup_cache: Arc, + #[expect(dead_code)] feature_resolver: Arc, + + /// Basebackup will collect the count and store it here. Used for reldirv2 rollout. + pub(crate) db_rel_count: ArcSwapOption<(usize, usize)>, } pub(crate) enum PreviousHeatmap { @@ -939,6 +943,20 @@ pub(crate) struct CompactOptions { /// Set job size for the GC compaction. /// This option is only used by GC compaction. pub sub_compaction_max_job_size_mb: Option, + /// Only for GC compaction. + /// If set, the compaction will compact the metadata layers. Should be only set to true in unit tests + /// because metadata compaction is not fully supported yet. + pub gc_compaction_do_metadata_compaction: bool, +} + +impl CompactOptions { + #[cfg(test)] + pub fn default_for_gc_compaction_unit_tests() -> Self { + Self { + gc_compaction_do_metadata_compaction: true, + ..Default::default() + } + } } impl std::fmt::Debug for Timeline { @@ -1310,6 +1328,9 @@ impl Timeline { /// /// This naive implementation will be replaced with a more efficient one /// which actually vectorizes the read path. + /// + /// NB: the read path must be cancellation-safe. The Tonic gRPC service will drop the future + /// if the client goes away (e.g. due to timeout or cancellation). 
pub(crate) async fn get_vectored( &self, query: VersionedKeySpaceQuery, @@ -2185,6 +2206,7 @@ impl Timeline { compact_lsn_range: None, sub_compaction: false, sub_compaction_max_job_size_mb: None, + gc_compaction_do_metadata_compaction: false, }, ctx, ) @@ -2808,7 +2830,7 @@ impl Timeline { if r.numerator == 0 { false } else { - rand::thread_rng().gen_range(0..r.denominator) < r.numerator + rand::rng().random_range(0..r.denominator) < r.numerator } } None => false, @@ -3218,7 +3240,7 @@ impl Timeline { }), disk_consistent_lsn: AtomicLsn::new(disk_consistent_lsn.0), - gc_compaction_state: ArcSwap::new(Arc::new(gc_compaction_state)), + gc_compaction_state: ArcSwapOption::from_pointee(gc_compaction_state), last_freeze_at: AtomicLsn::new(disk_consistent_lsn.0), last_freeze_ts: RwLock::new(Instant::now()), @@ -3323,6 +3345,8 @@ impl Timeline { basebackup_cache: resources.basebackup_cache, feature_resolver: resources.feature_resolver.clone(), + + db_rel_count: ArcSwapOption::from_pointee(None), }; result.repartition_threshold = @@ -3394,7 +3418,7 @@ impl Timeline { gc_compaction_state: GcCompactionState, ) -> anyhow::Result<()> { self.gc_compaction_state - .store(Arc::new(Some(gc_compaction_state.clone()))); + .store(Some(Arc::new(gc_compaction_state.clone()))); self.remote_client .schedule_index_upload_for_gc_compaction_state_update(gc_compaction_state) } @@ -3410,7 +3434,10 @@ impl Timeline { } pub(crate) fn get_gc_compaction_state(&self) -> Option { - self.gc_compaction_state.load_full().as_ref().clone() + self.gc_compaction_state + .load() + .as_ref() + .map(|x| x.as_ref().clone()) } /// Creates and starts the wal receiver. @@ -3890,7 +3917,7 @@ impl Timeline { // 1hour base (60_i64 * 60_i64) // 10min jitter - + rand::thread_rng().gen_range(-10 * 60..10 * 60), + + rand::rng().random_range(-10 * 60..10 * 60), ) .expect("10min < 1hour"), ); diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index aa1aa937b6..9bca952a46 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -396,6 +396,7 @@ impl GcCompactionQueue { }), compact_lsn_range: None, sub_compaction_max_job_size_mb: None, + gc_compaction_do_metadata_compaction: false, }, permit, ); @@ -512,6 +513,7 @@ impl GcCompactionQueue { compact_key_range: Some(job.compact_key_range.into()), compact_lsn_range: Some(job.compact_lsn_range.into()), sub_compaction_max_job_size_mb: None, + gc_compaction_do_metadata_compaction: false, }; pending_tasks.push(GcCompactionQueueItem::SubCompactionJob { options, @@ -785,6 +787,8 @@ pub(crate) struct GcCompactJob { /// as specified here. The true range being compacted is `min_lsn/max_lsn` in [`GcCompactionJobDescription`]. /// min_lsn will always <= the lower bound specified here, and max_lsn will always >= the upper bound specified here. pub compact_lsn_range: Range, + /// See [`CompactOptions::gc_compaction_do_metadata_compaction`]. 
+ pub do_metadata_compaction: bool, } impl GcCompactJob { @@ -799,6 +803,7 @@ impl GcCompactJob { .compact_lsn_range .map(|x| x.into()) .unwrap_or(Lsn::INVALID..Lsn::MAX), + do_metadata_compaction: options.gc_compaction_do_metadata_compaction, } } } @@ -1321,13 +1326,7 @@ impl Timeline { .max() }; - let (partition_mode, partition_lsn) = if cfg!(test) - || cfg!(feature = "testing") - || self - .feature_resolver - .evaluate_boolean("image-compaction-boundary") - .is_ok() - { + let (partition_mode, partition_lsn) = { let last_repartition_lsn = self.partitioning.read().1; let lsn = match l0_l1_boundary_lsn { Some(boundary) => gc_cutoff @@ -1343,8 +1342,6 @@ impl Timeline { } else { ("l0_l1_boundary", lsn) } - } else { - ("latest_record", self.get_last_record_lsn()) }; // 2. Repartition and create image layers if necessary @@ -3174,6 +3171,7 @@ impl Timeline { dry_run: job.dry_run, compact_key_range: start..end, compact_lsn_range: job.compact_lsn_range.start..compact_below_lsn, + do_metadata_compaction: false, }); current_start = Some(end); } @@ -3236,7 +3234,7 @@ impl Timeline { async fn compact_with_gc_inner( self: &Arc, cancel: &CancellationToken, - job: GcCompactJob, + mut job: GcCompactJob, ctx: &RequestContext, yield_for_l0: bool, ) -> Result { @@ -3244,6 +3242,28 @@ impl Timeline { // with legacy compaction tasks in the future. Always ensure the lock order is compaction -> gc. // Note that we already acquired the compaction lock when the outer `compact` function gets called. + // If the job is not configured to compact the metadata key range, shrink the key range + // to exclude the metadata key range. The check is done by checking if the end of the key range + // is larger than the start of the metadata key range. Note that metadata keys cover the entire + // second half of the keyspace, so it's enough to only check the end of the key range. + if !job.do_metadata_compaction + && job.compact_key_range.end > Key::metadata_key_range().start + { + tracing::info!( + "compaction for metadata key range is not supported yet, overriding compact_key_range from {} to {}", + job.compact_key_range.end, + Key::metadata_key_range().start + ); + // Shrink the key range to exclude the metadata key range. + job.compact_key_range.end = Key::metadata_key_range().start; + + // Skip the job if the key range completely lies within the metadata key range. 
+ if job.compact_key_range.start >= job.compact_key_range.end { + tracing::info!("compact_key_range is empty, skipping compaction"); + return Ok(CompactionOutcome::Done); + } + } + let timer = Instant::now(); let begin_timer = timer; diff --git a/pageserver/src/tenant/timeline/handle.rs b/pageserver/src/tenant/timeline/handle.rs index 7bca66190f..3570cab301 100644 --- a/pageserver/src/tenant/timeline/handle.rs +++ b/pageserver/src/tenant/timeline/handle.rs @@ -362,7 +362,7 @@ impl Cache { tokio::time::sleep(RETRY_BACKOFF).await; continue; } else { - tracing::warn!( + tracing::info!( "Failed to resolve tenant shard after {} attempts: {:?}", GET_MAX_RETRIES, e @@ -654,7 +654,7 @@ mod tests { use pageserver_api::key::{DBDIR_KEY, Key, rel_block_to_key}; use pageserver_api::models::ShardParameters; use pageserver_api::reltag::RelTag; - use pageserver_api::shard::ShardStripeSize; + use pageserver_api::shard::DEFAULT_STRIPE_SIZE; use utils::shard::ShardCount; use utils::sync::gate::GateGuard; @@ -955,7 +955,7 @@ mod tests { }); let child_params = ShardParameters { count: ShardCount(2), - stripe_size: ShardStripeSize::default(), + stripe_size: DEFAULT_STRIPE_SIZE, }; let child0 = Arc::new_cyclic(|myself| StubTimeline { gate: Default::default(), diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index aba94244a3..f33f47a956 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -184,7 +184,7 @@ pub(super) async fn connection_manager_loop_step( // If we've not received any updates from the broker from a while, are waiting for WAL // and have no safekeeper connection or connection candidates, then it might be that - // the broker subscription is wedged. Drop the currrent subscription and re-subscribe + // the broker subscription is wedged. Drop the current subscription and re-subscribe // with the goal of unblocking it. _ = broker_reset_interval.tick() => { let awaiting_lsn = wait_lsn_status.borrow().is_some(); @@ -192,7 +192,7 @@ pub(super) async fn connection_manager_loop_step( let no_connection = connection_manager_state.wal_connection.is_none(); if awaiting_lsn && no_candidates && no_connection { - tracing::warn!("No broker updates received for a while, but waiting for WAL. Re-setting stream ..."); + tracing::info!("No broker updates received for a while, but waiting for WAL. Re-setting stream ..."); broker_subscription = subscribe_for_timeline_updates(broker_client, id, cancel).await?; } }, diff --git a/pageserver/src/utilization.rs b/pageserver/src/utilization.rs index ccfad7a391..0dafa5c4bb 100644 --- a/pageserver/src/utilization.rs +++ b/pageserver/src/utilization.rs @@ -1,6 +1,6 @@ //! An utilization metric which is used to decide on which pageserver to put next tenant. //! -//! The metric is exposed via `GET /v1/utilization`. Refer and maintain it's openapi spec as the +//! The metric is exposed via `GET /v1/utilization`. Refer and maintain its openapi spec as the //! truth. 
use std::path::Path; diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 45b6e44c54..a7f0c5914a 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -1275,8 +1275,8 @@ mod tests { use std::sync::Arc; use owned_buffers_io::io_buf_ext::IoBufExt; + use rand::Rng; use rand::seq::SliceRandom; - use rand::{Rng, thread_rng}; use super::*; use crate::context::DownloadBehavior; @@ -1358,7 +1358,7 @@ mod tests { // Check that all the other FDs still work too. Use them in random order for // good measure. - file_b_dupes.as_mut_slice().shuffle(&mut thread_rng()); + file_b_dupes.as_mut_slice().shuffle(&mut rand::rng()); for vfile in file_b_dupes.iter_mut() { assert_first_512_eq(vfile, b"content_b").await; } @@ -1413,9 +1413,8 @@ mod tests { let ctx = ctx.detached_child(TaskKind::UnitTest, DownloadBehavior::Error); let hdl = rt.spawn(async move { let mut buf = IoBufferMut::with_capacity_zeroed(SIZE); - let mut rng = rand::rngs::OsRng; for _ in 1..1000 { - let f = &files[rng.gen_range(0..files.len())]; + let f = &files[rand::rng().random_range(0..files.len())]; buf = f .read_exact_at(buf.slice_full(), 0, &ctx) .await diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index f852051178..3acf98b020 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -32,9 +32,10 @@ use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; use pageserver_api::shard::ShardIdentity; use postgres_ffi::walrecord::*; use postgres_ffi::{ - PgMajorVersion, TimestampTz, TransactionId, dispatch_pgversion, enum_pgversion, - enum_pgversion_dispatch, fsm_logical_to_physical, pg_constants, + PgMajorVersion, TransactionId, dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch, + fsm_logical_to_physical, pg_constants, }; +use postgres_ffi_types::TimestampTz; use postgres_ffi_types::forknum::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM}; use tracing::*; use utils::bin_ser::{DeserializeError, SerializeError}; @@ -1069,7 +1070,7 @@ impl WalIngest { // NB: In PostgreSQL, the next-multi-xid stored in the control file is allowed to // go to 0, and it's fixed up by skipping to FirstMultiXactId in functions that // read it, like GetNewMultiXactId(). This is different from how nextXid is - // incremented! nextXid skips over < FirstNormalTransactionId when the the value + // incremented! nextXid skips over < FirstNormalTransactionId when the value // is stored, so it's never 0 in a checkpoint. // // I don't know why it's done that way, it seems less error-prone to skip over 0 diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile index bf7aeb4108..34cabaca62 100644 --- a/pgxn/neon/Makefile +++ b/pgxn/neon/Makefile @@ -5,6 +5,7 @@ MODULE_big = neon OBJS = \ $(WIN32RES) \ communicator.o \ + communicator_process.o \ extension_server.o \ file_cache.o \ hll.o \ @@ -29,6 +30,11 @@ PG_CPPFLAGS = -I$(libpq_srcdir) SHLIB_LINK_INTERNAL = $(libpq) SHLIB_LINK = -lcurl +UNAME_S := $(shell uname -s) +ifeq ($(UNAME_S), Darwin) + SHLIB_LINK += -framework Security -framework CoreFoundation -framework SystemConfiguration +endif + EXTENSION = neon DATA = \ neon--1.0.sql \ @@ -57,7 +63,8 @@ WALPROP_OBJS = \ # libcommunicator.a is built by cargo from the Rust sources under communicator/ # subdirectory. `cargo build` also generates communicator_bindings.h. 
-neon.o: communicator/communicator_bindings.h +communicator_process.o: communicator/communicator_bindings.h +file_cache.o: communicator/communicator_bindings.h $(NEON_CARGO_ARTIFACT_TARGET_DIR)/libcommunicator.a communicator/communicator_bindings.h &: (cd $(srcdir)/communicator && cargo build $(CARGO_BUILD_FLAGS) $(CARGO_PROFILE)) diff --git a/pgxn/neon/communicator.c b/pgxn/neon/communicator.c index 158b8940a3..5a08b3e331 100644 --- a/pgxn/neon/communicator.c +++ b/pgxn/neon/communicator.c @@ -1820,12 +1820,12 @@ nm_to_string(NeonMessage *msg) } case T_NeonGetPageResponse: { -#if 0 NeonGetPageResponse *msg_resp = (NeonGetPageResponse *) msg; -#endif appendStringInfoString(&s, "{\"type\": \"NeonGetPageResponse\""); - appendStringInfo(&s, ", \"page\": \"XXX\"}"); + appendStringInfo(&s, ", \"rinfo\": %u/%u/%u", RelFileInfoFmt(msg_resp->req.rinfo)); + appendStringInfo(&s, ", \"forknum\": %d", msg_resp->req.forknum); + appendStringInfo(&s, ", \"blkno\": %u", msg_resp->req.blkno); appendStringInfoChar(&s, '}'); break; } diff --git a/pgxn/neon/communicator/Cargo.toml b/pgxn/neon/communicator/Cargo.toml index e95a269d90..71cb5c7ae9 100644 --- a/pgxn/neon/communicator/Cargo.toml +++ b/pgxn/neon/communicator/Cargo.toml @@ -11,9 +11,19 @@ crate-type = ["staticlib"] # 'testing' feature is currently unused in the communicator, but we accept it for convenience of # calling build scripts, so that you can pass the same feature to all packages. testing = [] +# 'rest_broker' feature is currently unused in the communicator, but we accept it for convenience of +# calling build scripts, so that you can pass the same feature to all packages. +rest_broker = [] [dependencies] -neon-shmem.workspace = true +axum.workspace = true +http.workspace = true +tokio = { workspace = true, features = ["macros", "net", "io-util", "rt", "rt-multi-thread"] } +tracing.workspace = true +tracing-subscriber.workspace = true + +measured.workspace = true +utils.workspace = true workspace_hack = { version = "0.1", path = "../../../workspace_hack" } [build-dependencies] diff --git a/pgxn/neon/communicator/README.md b/pgxn/neon/communicator/README.md index 8169ae72b5..7ff4708171 100644 --- a/pgxn/neon/communicator/README.md +++ b/pgxn/neon/communicator/README.md @@ -1,7 +1,22 @@ -This package will evolve into a "compute-pageserver communicator" -process and machinery. For now, it's just a dummy that doesn't do -anything interesting, but it allows us to test the compilation and -linking of Rust code into the Postgres extensions. +# Communicator + +This package provides the so-called "compute-pageserver communicator", +or just "communicator" in short. The communicator is a separate +background worker process that runs in the PostgreSQL server. It's +part of the neon extension. Currently, it only provides an HTTP +endpoint for metrics, but in the future it will evolve to handle all +communications with the pageservers. + +## Source code view + +pgxn/neon/communicator_process.c + Contains code needed to start up the communicator process, and + the glue that interacts with PostgreSQL code and the Rust + code in the communicator process. + + +pgxn/neon/communicator/src/worker_process/ + Worker process main loop and glue code At compilation time, pgxn/neon/communicator/ produces a static library, libcommunicator.a. 
It is linked to the neon.so extension diff --git a/pgxn/neon/communicator/src/lib.rs b/pgxn/neon/communicator/src/lib.rs index 24c180d37d..9a3a46c95f 100644 --- a/pgxn/neon/communicator/src/lib.rs +++ b/pgxn/neon/communicator/src/lib.rs @@ -1,6 +1,5 @@ -/// dummy function, just to test linking Rust functions into the C -/// extension -#[unsafe(no_mangle)] -pub extern "C" fn communicator_dummy(arg: u32) -> u32 { - arg + 1 -} +mod worker_process; + +/// Name of the Unix Domain Socket that serves the metrics, and other APIs in the +/// future. This is within the Postgres data directory. +const NEON_COMMUNICATOR_SOCKET_NAME: &str = "neon-communicator.socket"; diff --git a/pgxn/neon/communicator/src/worker_process/callbacks.rs b/pgxn/neon/communicator/src/worker_process/callbacks.rs new file mode 100644 index 0000000000..70e8e12fea --- /dev/null +++ b/pgxn/neon/communicator/src/worker_process/callbacks.rs @@ -0,0 +1,51 @@ +//! C callbacks to PostgreSQL facilities that the neon extension needs to provide. These +//! are implemented in `neon/pgxn/communicator_process.c`. The function signatures better +//! match! +//! +//! These are called from the communicator threads! Careful what you do, most Postgres +//! functions are not safe to call in that context. + +#[cfg(not(test))] +unsafe extern "C" { + pub fn callback_set_my_latch_unsafe(); + pub fn callback_get_lfc_metrics_unsafe() -> LfcMetrics; +} + +// Compile unit tests with dummy versions of the functions. Unit tests cannot call back +// into the C code. (As of this writing, no unit tests even exists in the communicator +// package, but the code coverage build still builds these and tries to link with the +// external C code.) +#[cfg(test)] +unsafe fn callback_set_my_latch_unsafe() { + panic!("not usable in unit tests"); +} +#[cfg(test)] +unsafe fn callback_get_lfc_metrics_unsafe() -> LfcMetrics { + panic!("not usable in unit tests"); +} + +// safe wrappers + +pub(super) fn callback_set_my_latch() { + unsafe { callback_set_my_latch_unsafe() }; +} + +pub(super) fn callback_get_lfc_metrics() -> LfcMetrics { + unsafe { callback_get_lfc_metrics_unsafe() } +} + +/// Return type of the callback_get_lfc_metrics() function. +#[repr(C)] +pub struct LfcMetrics { + pub lfc_cache_size_limit: i64, + pub lfc_hits: i64, + pub lfc_misses: i64, + pub lfc_used: i64, + pub lfc_writes: i64, + + // working set size looking back 1..60 minutes. + // + // Index 0 is the size of the working set accessed within last 1 minute, + // index 59 is the size of the working set accessed within last 60 minutes. + pub lfc_approximate_working_set_size_windows: [i64; 60], +} diff --git a/pgxn/neon/communicator/src/worker_process/control_socket.rs b/pgxn/neon/communicator/src/worker_process/control_socket.rs new file mode 100644 index 0000000000..ef9d1f1529 --- /dev/null +++ b/pgxn/neon/communicator/src/worker_process/control_socket.rs @@ -0,0 +1,102 @@ +//! Communicator control socket. +//! +//! Currently, the control socket is used to provide information about the communicator +//! process, file cache etc. as prometheus metrics. In the future, it can be used to +//! expose more things. +//! +//! The exporter speaks HTTP, listens on a Unix Domain Socket under the Postgres +//! data directory. For debugging, you can access it with curl: +//! +//! ```sh +//! curl --unix-socket neon-communicator.socket http://localhost/metrics +//! ``` +//! 
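+//! Besides `/metrics`, the same socket serves `/autoscaling_metrics` (the subset of
+//! metrics consumed by the autoscaling agent) and `/debug/panic` (deliberately panics a
+//! handler task, for testing); see the router in `launch_control_socket_listener` below.
+//!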
+use axum::Router; +use axum::body::Body; +use axum::extract::State; +use axum::response::Response; +use http::StatusCode; +use http::header::CONTENT_TYPE; + +use measured::MetricGroup; +use measured::text::BufferedTextEncoder; + +use std::io::ErrorKind; + +use tokio::net::UnixListener; + +use crate::NEON_COMMUNICATOR_SOCKET_NAME; +use crate::worker_process::main_loop::CommunicatorWorkerProcessStruct; + +impl CommunicatorWorkerProcessStruct { + /// Launch the listener + pub(crate) async fn launch_control_socket_listener( + &'static self, + ) -> Result<(), std::io::Error> { + use axum::routing::get; + let app = Router::new() + .route("/metrics", get(get_metrics)) + .route("/autoscaling_metrics", get(get_autoscaling_metrics)) + .route("/debug/panic", get(handle_debug_panic)) + .with_state(self); + + // If the server is restarted, there might be an old socket still + // lying around. Remove it first. + match std::fs::remove_file(NEON_COMMUNICATOR_SOCKET_NAME) { + Ok(()) => { + tracing::warn!("removed stale control socket"); + } + Err(e) if e.kind() == ErrorKind::NotFound => {} + Err(e) => { + tracing::error!("could not remove stale control socket: {e:#}"); + // Try to proceed anyway. It will likely fail below though. + } + }; + + // Create the unix domain socket and start listening on it + let listener = UnixListener::bind(NEON_COMMUNICATOR_SOCKET_NAME)?; + + tokio::spawn(async { + tracing::info!("control socket listener spawned"); + axum::serve(listener, app) + .await + .expect("axum::serve never returns") + }); + + Ok(()) + } +} + +/// Expose all Prometheus metrics. +async fn get_metrics(State(state): State<&CommunicatorWorkerProcessStruct>) -> Response { + tracing::trace!("/metrics requested"); + metrics_to_response(&state).await +} + +/// Expose Prometheus metrics, for use by the autoscaling agent. +/// +/// This is a subset of all the metrics. 
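+/// Currently the subset is just the LFC metrics group (`state.lfc_metrics`).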
+async fn get_autoscaling_metrics(
+    State(state): State<&CommunicatorWorkerProcessStruct>,
+) -> Response {
+    tracing::trace!("/autoscaling_metrics requested");
+    metrics_to_response(&state.lfc_metrics).await
+}
+
+async fn handle_debug_panic(State(_state): State<&CommunicatorWorkerProcessStruct>) -> Response {
+    panic!("test HTTP handler task panic");
+}
+
+/// Helper function to convert Prometheus metrics to a text response
+async fn metrics_to_response(metrics: &(dyn MetricGroup<BufferedTextEncoder> + Sync)) -> Response {
+    let mut enc = BufferedTextEncoder::new();
+    metrics
+        .collect_group_into(&mut enc)
+        .unwrap_or_else(|never| match never {});
+
+    Response::builder()
+        .status(StatusCode::OK)
+        .header(CONTENT_TYPE, "application/text")
+        .body(Body::from(enc.finish()))
+        .unwrap()
+}
diff --git a/pgxn/neon/communicator/src/worker_process/lfc_metrics.rs b/pgxn/neon/communicator/src/worker_process/lfc_metrics.rs
new file mode 100644
index 0000000000..fcb291c71b
--- /dev/null
+++ b/pgxn/neon/communicator/src/worker_process/lfc_metrics.rs
@@ -0,0 +1,83 @@
+use measured::{
+    FixedCardinalityLabel, Gauge, GaugeVec, LabelGroup, MetricGroup,
+    label::{LabelName, LabelValue, StaticLabelSet},
+    metric::{MetricEncoding, gauge::GaugeState, group::Encoding},
+};
+
+use super::callbacks::callback_get_lfc_metrics;
+
+pub(crate) struct LfcMetricsCollector;
+
+#[derive(MetricGroup)]
+#[metric(new())]
+struct LfcMetricsGroup {
+    /// LFC cache size limit in bytes
+    lfc_cache_size_limit: Gauge,
+    /// LFC cache hits
+    lfc_hits: Gauge,
+    /// LFC cache misses
+    lfc_misses: Gauge,
+    /// LFC chunks used (chunk = 1MB)
+    lfc_used: Gauge,
+    /// LFC cache writes
+    lfc_writes: Gauge,
+    /// Approximate working set size in pages of 8192 bytes
+    #[metric(init = GaugeVec::dense())]
+    lfc_approximate_working_set_size_windows: GaugeVec<StaticLabelSet<MinuteAsSeconds>>,
+}
+
+impl<T: Encoding> MetricGroup<T> for LfcMetricsCollector
+where
+    GaugeState: MetricEncoding<T>,
+{
+    fn collect_group_into(&self, enc: &mut T) -> Result<(), <T as Encoding>::Err> {
+        let g = LfcMetricsGroup::new();
+
+        let lfc_metrics = callback_get_lfc_metrics();
+
+        g.lfc_cache_size_limit.set(lfc_metrics.lfc_cache_size_limit);
+        g.lfc_hits.set(lfc_metrics.lfc_hits);
+        g.lfc_misses.set(lfc_metrics.lfc_misses);
+        g.lfc_used.set(lfc_metrics.lfc_used);
+        g.lfc_writes.set(lfc_metrics.lfc_writes);
+
+        for i in 0..60 {
+            let val = lfc_metrics.lfc_approximate_working_set_size_windows[i];
+            g.lfc_approximate_working_set_size_windows
+                .set(MinuteAsSeconds(i), val);
+        }
+
+        g.collect_group_into(enc)
+    }
+}
+
+/// This stores the values in range 0..60,
+/// encodes them as seconds (60, 120, 180, ..., 3600)
+#[derive(Clone, Copy)]
+struct MinuteAsSeconds(usize);
+
+impl FixedCardinalityLabel for MinuteAsSeconds {
+    fn cardinality() -> usize {
+        60
+    }
+
+    fn encode(&self) -> usize {
+        self.0
+    }
+
+    fn decode(value: usize) -> Self {
+        Self(value)
+    }
+}
+
+impl LabelValue for MinuteAsSeconds {
+    fn visit(&self, v: V) -> V::Output {
+        v.write_int((self.0 + 1) as i64 * 60)
+    }
+}
+
+impl LabelGroup for MinuteAsSeconds {
+    fn visit_values(&self, v: &mut impl measured::label::LabelGroupVisitor) {
+        v.write_value(LabelName::from_str("duration_seconds"), self);
+    }
+}
diff --git a/pgxn/neon/communicator/src/worker_process/logging.rs b/pgxn/neon/communicator/src/worker_process/logging.rs
new file mode 100644
index 0000000000..1ae31cd0dd
--- /dev/null
+++ b/pgxn/neon/communicator/src/worker_process/logging.rs
@@ -0,0 +1,250 @@
+//! Glue code to hook up Rust logging with the `tracing` crate to the PostgreSQL log
+//!
+//! In the Rust threads, the log messages are written to an mpsc channel, and the Postgres
+//! process latch is raised. That wakes up the loop in the main thread, see
+//! `communicator_new_bgworker_main()`. It reads the message from the channel and
+//! ereport()s it. This ensures that only one thread, the main thread, calls the
+//! PostgreSQL logging routines at any time.
+
+use std::ffi::c_char;
+use std::sync::atomic::{AtomicU64, Ordering};
+use std::sync::mpsc::sync_channel;
+use std::sync::mpsc::{Receiver, SyncSender};
+use std::sync::mpsc::{TryRecvError, TrySendError};
+
+use tracing::info;
+use tracing::{Event, Level, Metadata, Subscriber};
+use tracing_subscriber::filter::LevelFilter;
+use tracing_subscriber::fmt::format::Writer;
+use tracing_subscriber::fmt::{FmtContext, FormatEvent, FormatFields, FormattedFields, MakeWriter};
+use tracing_subscriber::registry::LookupSpan;
+
+use crate::worker_process::callbacks::callback_set_my_latch;
+
+/// This handle is passed to the C code, and used by [`communicator_worker_poll_logging`]
+pub struct LoggingReceiver {
+    receiver: Receiver<FormattedEventWithMeta>,
+}
+
+/// This is passed to `tracing`
+struct LoggingSender {
+    sender: SyncSender<FormattedEventWithMeta>,
+}
+
+static DROPPED_EVENT_COUNT: AtomicU64 = AtomicU64::new(0);
+
+/// Called once, at worker process startup. The returned `LoggingReceiver` is passed back
+/// in subsequent calls to `pump_logging`. It is opaque to the C code.
+#[unsafe(no_mangle)]
+pub extern "C" fn communicator_worker_configure_logging() -> Box<LoggingReceiver> {
+    let (sender, receiver) = sync_channel(1000);
+
+    let receiver = LoggingReceiver { receiver };
+    let sender = LoggingSender { sender };
+
+    use tracing_subscriber::prelude::*;
+    let r = tracing_subscriber::registry();
+
+    let r = r.with(
+        tracing_subscriber::fmt::layer()
+            .with_ansi(false)
+            .event_format(SimpleFormatter)
+            .with_writer(sender)
+            // TODO: derive this from log_min_messages? Currently the code in
+            // communicator_process.c forces log_min_messages='INFO'.
+            .with_filter(LevelFilter::from_level(Level::INFO)),
+    );
+    r.init();
+
+    info!("communicator process logging started");
+
+    Box::new(receiver)
+}
+
+/// Read one message from the logging queue. This is essentially a wrapper around Receiver,
+/// with a C-friendly signature.
+///
+/// The message is copied into *errbuf, which is a caller-supplied buffer of size
+/// `errbuf_len`. If the message doesn't fit in the buffer, it is truncated. It is always
+/// NULL-terminated.
+///
+/// The error level is returned in *elevel_p. It's one of the PostgreSQL error levels, see
+/// elog.h.
+///
+/// If there was a message, *dropped_event_count_p is also updated with a counter of how
+/// many log messages in total have been dropped. By comparing that with the value from
+/// the previous call, you can tell how many were dropped since the last call.
+///
+/// Returns:
+///
+/// 0 if there were no messages
+/// 1 if there was a message. The message and its level are returned in
+/// *errbuf and *elevel_p. *dropped_event_count_p is also updated.
+/// -1 on error, i.e. the other end of the queue was disconnected
+#[unsafe(no_mangle)]
+pub extern "C" fn communicator_worker_poll_logging(
+    state: &mut LoggingReceiver,
+    errbuf: *mut c_char,
+    errbuf_len: u32,
+    elevel_p: &mut i32,
+    dropped_event_count_p: &mut u64,
+) -> i32 {
+    let msg = match state.receiver.try_recv() {
+        Err(TryRecvError::Empty) => return 0,
+        Err(TryRecvError::Disconnected) => return -1,
+        Ok(msg) => msg,
+    };
+
+    let src: &[u8] = &msg.message;
+    let dst: *mut u8 = errbuf.cast();
+    let len = std::cmp::min(src.len(), errbuf_len as usize - 1);
+    unsafe {
+        std::ptr::copy_nonoverlapping(src.as_ptr(), dst, len);
+        *(dst.add(len)) = b'\0'; // NULL terminator
+    }
+
+    // Map the tracing Level to PostgreSQL elevel.
+    //
+    // XXX: These levels are copied from PostgreSQL's elog.h. Introduce another enum to
+    // hide these?
+    *elevel_p = match msg.level {
+        Level::TRACE => 10, // DEBUG5
+        Level::DEBUG => 14, // DEBUG1
+        Level::INFO => 17, // INFO
+        Level::WARN => 19, // WARNING
+        Level::ERROR => 21, // ERROR
+    };
+
+    *dropped_event_count_p = DROPPED_EVENT_COUNT.load(Ordering::Relaxed);
+
+    1
+}
+
+//---- The following functions can be called from any thread ----
+
+#[derive(Clone)]
+struct FormattedEventWithMeta {
+    message: Vec<u8>,
+    level: tracing::Level,
+}
+
+impl Default for FormattedEventWithMeta {
+    fn default() -> Self {
+        FormattedEventWithMeta {
+            message: Vec::new(),
+            level: tracing::Level::DEBUG,
+        }
+    }
+}
+
+struct EventBuilder<'a> {
+    event: FormattedEventWithMeta,
+
+    sender: &'a LoggingSender,
+}
+
+impl std::io::Write for EventBuilder<'_> {
+    fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
+        self.event.message.write(buf)
+    }
+    fn flush(&mut self) -> std::io::Result<()> {
+        self.sender.send_event(self.event.clone());
+        Ok(())
+    }
+}
+
+impl Drop for EventBuilder<'_> {
+    fn drop(&mut self) {
+        let sender = self.sender;
+        let event = std::mem::take(&mut self.event);
+
+        sender.send_event(event);
+    }
+}
+
+impl<'a> MakeWriter<'a> for LoggingSender {
+    type Writer = EventBuilder<'a>;
+
+    fn make_writer(&'a self) -> Self::Writer {
+        panic!("not expected to be called when make_writer_for is implemented");
+    }
+
+    fn make_writer_for(&'a self, meta: &Metadata<'_>) -> Self::Writer {
+        EventBuilder {
+            event: FormattedEventWithMeta {
+                message: Vec::new(),
+                level: *meta.level(),
+            },
+            sender: self,
+        }
+    }
+}
+
+impl LoggingSender {
+    fn send_event(&self, e: FormattedEventWithMeta) {
+        match self.sender.try_send(e) {
+            Ok(()) => {
+                // notify the main thread
+                callback_set_my_latch();
+            }
+            Err(TrySendError::Disconnected(_)) => {}
+            Err(TrySendError::Full(_)) => {
+                // The queue is full, cannot send any more. To avoid blocking the tokio
+                // thread, simply drop the message. Better to lose some logs than get
+                // stuck if there's a problem with the logging.
+                //
+                // Record the fact that a message was dropped by incrementing the
+                // counter.
+                DROPPED_EVENT_COUNT.fetch_add(1, Ordering::Relaxed);
+            }
+        }
+    }
+}
+
+/// Simple formatter implementation for tracing_subscriber, which prints the log spans and
+/// message part like the default formatter, but with no timestamp or error level. The error
+/// level is captured separately by `FormattedEventWithMeta`, and when the error is
+/// printed by the main thread, with PostgreSQL ereport(), it gets a timestamp at that
+/// point.
(The timestamp printed will therefore lag behind the timestamp on the event +/// here, if the main thread doesn't process the log message promptly) +struct SimpleFormatter; + +impl FormatEvent for SimpleFormatter +where + S: Subscriber + for<'a> LookupSpan<'a>, + N: for<'a> FormatFields<'a> + 'static, +{ + fn format_event( + &self, + ctx: &FmtContext<'_, S, N>, + mut writer: Writer<'_>, + event: &Event<'_>, + ) -> std::fmt::Result { + // Format all the spans in the event's span context. + if let Some(scope) = ctx.event_scope() { + for span in scope.from_root() { + write!(writer, "{}", span.name())?; + + // `FormattedFields` is a formatted representation of the span's fields, + // which is stored in its extensions by the `fmt` layer's `new_span` + // method. The fields will have been formatted by the same field formatter + // that's provided to the event formatter in the `FmtContext`. + let ext = span.extensions(); + let fields = &ext + .get::>() + .expect("will never be `None`"); + + // Skip formatting the fields if the span had no fields. + if !fields.is_empty() { + write!(writer, "{{{fields}}}")?; + } + write!(writer, ": ")?; + } + } + + // Write fields on the event + ctx.field_format().format_fields(writer.by_ref(), event)?; + + Ok(()) + } +} diff --git a/pgxn/neon/communicator/src/worker_process/main_loop.rs b/pgxn/neon/communicator/src/worker_process/main_loop.rs new file mode 100644 index 0000000000..3147a3de63 --- /dev/null +++ b/pgxn/neon/communicator/src/worker_process/main_loop.rs @@ -0,0 +1,66 @@ +use std::str::FromStr as _; + +use crate::worker_process::lfc_metrics::LfcMetricsCollector; + +use measured::MetricGroup; +use measured::metric::MetricEncoding; +use measured::metric::gauge::GaugeState; +use measured::metric::group::Encoding; +use utils::id::{TenantId, TimelineId}; + +pub struct CommunicatorWorkerProcessStruct { + runtime: tokio::runtime::Runtime, + + /*** Metrics ***/ + pub(crate) lfc_metrics: LfcMetricsCollector, +} + +/// Launch the communicator process's Rust subsystems +pub(super) fn init( + tenant_id: Option<&str>, + timeline_id: Option<&str>, +) -> Result<&'static CommunicatorWorkerProcessStruct, String> { + // The caller validated these already + let _tenant_id = tenant_id + .map(TenantId::from_str) + .transpose() + .map_err(|e| format!("invalid tenant ID: {e}"))?; + let _timeline_id = timeline_id + .map(TimelineId::from_str) + .transpose() + .map_err(|e| format!("invalid timeline ID: {e}"))?; + + let runtime = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .thread_name("communicator thread") + .build() + .unwrap(); + + let worker_struct = CommunicatorWorkerProcessStruct { + // Note: it's important to not drop the runtime, or all the tasks are dropped + // too. Including it in the returned struct is one way to keep it around. 
+ runtime, + + // metrics + lfc_metrics: LfcMetricsCollector, + }; + let worker_struct = Box::leak(Box::new(worker_struct)); + + // Start the listener on the control socket + worker_struct + .runtime + .block_on(worker_struct.launch_control_socket_listener()) + .map_err(|e| e.to_string())?; + + Ok(worker_struct) +} + +impl<T> MetricGroup<T> for CommunicatorWorkerProcessStruct +where + T: Encoding, + GaugeState: MetricEncoding<T>, +{ + fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> { + self.lfc_metrics.collect_group_into(enc) + } +} diff --git a/pgxn/neon/communicator/src/worker_process/mod.rs b/pgxn/neon/communicator/src/worker_process/mod.rs new file mode 100644 index 0000000000..3602686779 --- /dev/null +++ b/pgxn/neon/communicator/src/worker_process/mod.rs @@ -0,0 +1,13 @@ +//! This code runs in the communicator worker process. This provides +//! the glue code to: +//! +//! - launch the main loop, +//! - receive IO requests from backends and process them, +//! - write results back to backends. + +mod callbacks; +mod control_socket; +mod lfc_metrics; +mod logging; +mod main_loop; +mod worker_interface; diff --git a/pgxn/neon/communicator/src/worker_process/worker_interface.rs b/pgxn/neon/communicator/src/worker_process/worker_interface.rs new file mode 100644 index 0000000000..1dfd6820d3 --- /dev/null +++ b/pgxn/neon/communicator/src/worker_process/worker_interface.rs @@ -0,0 +1,60 @@ +//! Functions called from the C code in the worker process + +use std::ffi::{CStr, CString, c_char}; + +use crate::worker_process::main_loop; +use crate::worker_process::main_loop::CommunicatorWorkerProcessStruct; + +/// Launch the communicator's tokio tasks, which do most of the work. +/// +/// The caller has initialized the process as a regular PostgreSQL background worker +/// process. +/// +/// Inputs: +/// `tenant_id` and `timeline_id` can be NULL, if we've been launched in "non-Neon" mode, +/// where we use local storage instead of connecting to remote neon storage. That's +/// currently only used in some unit tests. +/// +/// Result: +/// Returns a pointer to CommunicatorWorkerProcessStruct, which is a handle to running +/// Rust tasks. The C code can use it to interact with the Rust parts. On failure, returns +/// None/NULL, and an error message is returned in *error_p. +/// +/// This is called only once in the process, so the returned struct, and error message in +/// case of failure, are simply leaked. +#[unsafe(no_mangle)] +pub extern "C" fn communicator_worker_launch( + tenant_id: *const c_char, + timeline_id: *const c_char, + error_p: *mut *const c_char, +) -> Option<&'static CommunicatorWorkerProcessStruct> { + // Convert the arguments into more convenient Rust types + let tenant_id = if tenant_id.is_null() { + None + } else { + let cstr = unsafe { CStr::from_ptr(tenant_id) }; + Some(cstr.to_str().expect("assume UTF-8")) + }; + let timeline_id = if timeline_id.is_null() { + None + } else { + let cstr = unsafe { CStr::from_ptr(timeline_id) }; + Some(cstr.to_str().expect("assume UTF-8")) + }; + + // The `init` function does all the work. + let result = main_loop::init(tenant_id, timeline_id); + + // On failure, return the error message to the C caller in *error_p.
+ match result { + Ok(worker_struct) => Some(worker_struct), + Err(errmsg) => { + let errmsg = CString::new(errmsg).expect("no nuls within error message"); + let errmsg = Box::leak(errmsg.into_boxed_c_str()); + let p: *const c_char = errmsg.as_ptr(); + + unsafe { *error_p = p }; + None + } + } +} diff --git a/pgxn/neon/communicator_process.c b/pgxn/neon/communicator_process.c new file mode 100644 index 0000000000..fc734ce85b --- /dev/null +++ b/pgxn/neon/communicator_process.c @@ -0,0 +1,273 @@ +/*------------------------------------------------------------------------- + * + * communicator_process.c + * Functions for starting up the communicator background worker process. + * + * Currently, the communicator process only functions as a metrics + * exporter. It provides an HTTP endpoint for polling a limited set of + * metrics. TODO: In the future, it will do much more, i.e. handle all + * the communications with the pageservers. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include + +#include "miscadmin.h" +#include "postmaster/bgworker.h" +#include "postmaster/interrupt.h" +#include "postmaster/postmaster.h" +#include "replication/walsender.h" +#include "storage/ipc.h" +#include "storage/latch.h" +#include "storage/pmsignal.h" +#include "storage/procsignal.h" +#include "tcop/tcopprot.h" +#include "utils/timestamp.h" + +#include "communicator_process.h" +#include "file_cache.h" +#include "neon.h" +#include "neon_perf_counters.h" + +/* the rust bindings, generated by cbindgen */ +#include "communicator/communicator_bindings.h" + +static void pump_logging(struct LoggingReceiver *logging); +PGDLLEXPORT void communicator_new_bgworker_main(Datum main_arg); + +/**** Initialization functions. These run in postmaster ****/ + +void +pg_init_communicator_process(void) +{ + BackgroundWorker bgw; + + /* Initialize the background worker process */ + memset(&bgw, 0, sizeof(bgw)); + bgw.bgw_flags = BGWORKER_SHMEM_ACCESS; + bgw.bgw_start_time = BgWorkerStart_PostmasterStart; + snprintf(bgw.bgw_library_name, BGW_MAXLEN, "neon"); + snprintf(bgw.bgw_function_name, BGW_MAXLEN, "communicator_new_bgworker_main"); + snprintf(bgw.bgw_name, BGW_MAXLEN, "Storage communicator process"); + snprintf(bgw.bgw_type, BGW_MAXLEN, "Storage communicator process"); + bgw.bgw_restart_time = 5; + bgw.bgw_notify_pid = 0; + bgw.bgw_main_arg = (Datum) 0; + + RegisterBackgroundWorker(&bgw); +} + +/**** Worker process functions. These run in the communicator worker process ****/ + +/* + * Entry point for the communicator bgworker process + */ +void +communicator_new_bgworker_main(Datum main_arg) +{ + struct LoggingReceiver *logging; + const char *errmsg = NULL; + const struct CommunicatorWorkerProcessStruct *proc_handle; + + /* + * Pretend that this process is a WAL sender. That affects the shutdown + * sequence: WAL senders are shut down last, after the final checkpoint + * has been written. That's what we want for the communicator process too. + */ + am_walsender = true; + MarkPostmasterChildWalSender(); + + /* Establish signal handlers. 
*/ + pqsignal(SIGUSR1, procsignal_sigusr1_handler); + /* + * Postmaster sends us SIGUSR2 when all regular backends and bgworkers + * have exited, and it's time for us to exit too + */ + pqsignal(SIGUSR2, die); + pqsignal(SIGHUP, SignalHandlerForConfigReload); + pqsignal(SIGTERM, die); + + BackgroundWorkerUnblockSignals(); + + /* + * By default, INFO messages are not printed to the log. We want + * `tracing::info!` messages emitted from the communicator to be printed, + * however, so increase the log level. + * + * XXX: This overrides any user-set value from the config file. That's not + * great, but on the other hand, there should be little reason for user to + * control the verbosity of the communicator. It's not too verbose by + * default. + */ + SetConfigOption("log_min_messages", "INFO", PGC_SUSET, PGC_S_OVERRIDE); + + logging = communicator_worker_configure_logging(); + + proc_handle = communicator_worker_launch( + neon_tenant[0] == '\0' ? NULL : neon_tenant, + neon_timeline[0] == '\0' ? NULL : neon_timeline, + &errmsg + ); + if (proc_handle == NULL) + { + /* + * Something went wrong. Before exiting, forward any log messages that + * might've been generated during the failed launch. + */ + pump_logging(logging); + + elog(PANIC, "%s", errmsg); + } + + /* + * The Rust tokio runtime has been launched, and it's running in the + * background now. This loop in the main thread handles any interactions + * we need with the rest of PostgreSQL. + * + * NB: This process is now multi-threaded! The Rust threads do not call + * into any Postgres functions, but it's not entirely clear which Postgres + * functions are safe to call from this main thread either. Be very + * careful about adding anything non-trivial here. + * + * Also note that we try to react quickly to any log messages arriving + * from the Rust thread. Be careful to not do anything too expensive here + * that might cause delays. + */ + elog(LOG, "communicator threads started"); + for (;;) + { + TimestampTz before; + long duration; + + ResetLatch(MyLatch); + + /* + * Forward any log messages from the Rust threads into the normal + * Postgres logging facility. + */ + pump_logging(logging); + + /* + * Check interrupts like system shutdown or config reload + * + * We mustn't block for too long within this loop, or we risk the log + * queue to fill up and messages to be lost. Also, even if we can keep + * up, if there's a long delay between sending a message and printing + * it to the log, the timestamps on the messages get skewed, which is + * confusing. + * + * We expect processing interrupts to happen fast enough that it's OK, + * but measure it just in case, and print a warning if it takes longer + * than 100 ms. + */ +#define LOG_SKEW_WARNING_MS 100 + before = GetCurrentTimestamp(); + + CHECK_FOR_INTERRUPTS(); + if (ConfigReloadPending) + { + ConfigReloadPending = false; + ProcessConfigFile(PGC_SIGHUP); + } + + duration = TimestampDifferenceMilliseconds(before, GetCurrentTimestamp()); + if (duration > LOG_SKEW_WARNING_MS) + elog(WARNING, "handling interrupts took %ld ms, communicator log timestamps might be skewed", duration); + + /* + * Wait until we are woken up. The rust threads will set the latch + * when there's a log message to forward. 
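+ * (Waiting without a timeout should be fine here: WL_EXIT_ON_PM_DEATH makes this process exit immediately if the postmaster dies, and the die() handler installed above wakes the latch wait on SIGTERM/SIGUSR2.)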
+ */ + (void) WaitLatch(MyLatch, + WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, + 0, + PG_WAIT_EXTENSION); + } +} + +static void +pump_logging(struct LoggingReceiver *logging) +{ + char errbuf[1000]; + int elevel; + int32 rc; + static uint64_t last_dropped_event_count = 0; + uint64_t dropped_event_count; + uint64_t dropped_now; + + for (;;) + { + rc = communicator_worker_poll_logging(logging, + errbuf, + sizeof(errbuf), + &elevel, + &dropped_event_count); + if (rc == 0) + { + /* nothing to do */ + break; + } + else if (rc == 1) + { + /* Because we don't want to exit on error */ + + if (message_level_is_interesting(elevel)) + { + /* + * Prevent interrupts while cleaning up. + * + * (Not sure if this is required, but all the error handlers + * in Postgres that are installed as sigsetjmp() targets do + * this, so let's follow the example) + */ + HOLD_INTERRUPTS(); + + errstart(elevel, TEXTDOMAIN); + errmsg_internal("[COMMUNICATOR] %s", errbuf); + EmitErrorReport(); + FlushErrorState(); + + /* Now we can allow interrupts again */ + RESUME_INTERRUPTS(); + } + } + else if (rc == -1) + { + elog(ERROR, "logging channel was closed unexpectedly"); + } + } + + /* + * If the queue was full at any time since the last time we reported it, + * report how many messages were lost. We do this outside the loop, so + * that if the logging system is clogged, we don't exacerbate it by + * printing lots of warnings about dropped messages. + */ + dropped_now = dropped_event_count - last_dropped_event_count; + if (dropped_now != 0) + { + elog(WARNING, "%lu communicator log messages were dropped because the log buffer was full", + (unsigned long) dropped_now); + last_dropped_event_count = dropped_event_count; + } +} + +/**** + * Callbacks from the rust code, in the communicator process. + * + * NOTE: These must be thread-safe! It's very limited which PostgreSQL + * functions you can use!!! + * + * The signatures of these need to match those in the Rust code. 
+ */ + +void +callback_set_my_latch_unsafe(void) +{ + SetLatch(MyLatch); +} diff --git a/pgxn/neon/communicator_process.h b/pgxn/neon/communicator_process.h new file mode 100644 index 0000000000..95afc70153 --- /dev/null +++ b/pgxn/neon/communicator_process.h @@ -0,0 +1,17 @@ +/*------------------------------------------------------------------------- + * + * communicator_process.h + * Communicator process + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + *------------------------------------------------------------------------- + */ +#ifndef COMMUNICATOR_PROCESS_H +#define COMMUNICATOR_PROCESS_H + +extern void pg_init_communicator_process(void); + +#endif /* COMMUNICATOR_PROCESS_H */ diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 2c87f139af..4da6c176cd 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -52,6 +52,8 @@ #include "pagestore_client.h" #include "communicator.h" +#include "communicator/communicator_bindings.h" + #define CriticalAssert(cond) do if (!(cond)) elog(PANIC, "LFC: assertion %s failed at %s:%d: ", #cond, __FILE__, __LINE__); while (0) /* @@ -219,10 +221,6 @@ static char *lfc_path; static uint64 lfc_generation; static FileCacheControl *lfc_ctl; static bool lfc_do_prewarm; -static shmem_startup_hook_type prev_shmem_startup_hook; -#if PG_VERSION_NUM>=150000 -static shmem_request_hook_type prev_shmem_request_hook; -#endif bool lfc_store_prefetch_result; bool lfc_prewarm_update_ws_estimation; @@ -342,18 +340,14 @@ lfc_ensure_opened(void) return true; } -static void -lfc_shmem_startup(void) +void +LfcShmemInit(void) { bool found; static HASHCTL info; - if (prev_shmem_startup_hook) - { - prev_shmem_startup_hook(); - } - - LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); + if (lfc_max_size <= 0) + return; lfc_ctl = (FileCacheControl *) ShmemInitStruct("lfc", sizeof(FileCacheControl), &found); if (!found) @@ -398,19 +392,16 @@ lfc_shmem_startup(void) ConditionVariableInit(&lfc_ctl->cv[i]); } - LWLockRelease(AddinShmemInitLock); } -static void -lfc_shmem_request(void) +void +LfcShmemRequest(void) { -#if PG_VERSION_NUM>=150000 - if (prev_shmem_request_hook) - prev_shmem_request_hook(); -#endif - - RequestAddinShmemSpace(sizeof(FileCacheControl) + hash_estimate_size(SIZE_MB_TO_CHUNKS(lfc_max_size) + 1, FILE_CACHE_ENRTY_SIZE)); - RequestNamedLWLockTranche("lfc_lock", 1); + if (lfc_max_size > 0) + { + RequestAddinShmemSpace(sizeof(FileCacheControl) + hash_estimate_size(SIZE_MB_TO_CHUNKS(lfc_max_size) + 1, FILE_CACHE_ENRTY_SIZE)); + RequestNamedLWLockTranche("lfc_lock", 1); + } } static bool @@ -642,18 +633,6 @@ lfc_init(void) NULL, NULL, NULL); - - if (lfc_max_size == 0) - return; - - prev_shmem_startup_hook = shmem_startup_hook; - shmem_startup_hook = lfc_shmem_startup; -#if PG_VERSION_NUM>=150000 - prev_shmem_request_hook = shmem_request_hook; - shmem_request_hook = lfc_shmem_request; -#else - lfc_shmem_request(); -#endif } FileCacheState* @@ -2179,6 +2158,38 @@ lfc_approximate_working_set_size_seconds(time_t duration, bool reset) return dc; } +/* + * Get metrics, for the built-in metrics exporter that's part of the communicator + * process. + * + * NB: This is called from a Rust tokio task inside the communicator process. + * Acquiring lwlocks, elog(), allocating memory or anything else non-trivial + * is strictly prohibited here! 
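+ * + * For that reason the lfc_ctl counters below are read without taking any lock; a concurrent update may yield slightly stale or mutually inconsistent values, which should be tolerable for metrics.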
+ */ +struct LfcMetrics +callback_get_lfc_metrics_unsafe(void) +{ + struct LfcMetrics result = { + .lfc_cache_size_limit = (int64) lfc_size_limit * 1024 * 1024, + .lfc_hits = lfc_ctl ? lfc_ctl->hits : 0, + .lfc_misses = lfc_ctl ? lfc_ctl->misses : 0, + .lfc_used = lfc_ctl ? lfc_ctl->used : 0, + .lfc_writes = lfc_ctl ? lfc_ctl->writes : 0, + }; + + if (lfc_ctl) + { + for (int minutes = 1; minutes <= 60; minutes++) + { + result.lfc_approximate_working_set_size_windows[minutes - 1] = + lfc_approximate_working_set_size_seconds(minutes * 60, false); + } + } + + return result; +} + + PG_FUNCTION_INFO_V1(get_local_cache_state); Datum diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 05ba6da663..caffdc9612 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -90,6 +90,7 @@ typedef struct { char connstring[MAX_SHARDS][MAX_PAGESERVER_CONNSTRING_SIZE]; size_t num_shards; + size_t stripe_size; } ShardMap; /* @@ -110,6 +111,11 @@ typedef struct * has changed since last access, and to detect and retry copying the value if * the postmaster changes the value concurrently. (Postmaster doesn't have a * PGPROC entry and therefore cannot use LWLocks.) + * + * stripe_size is now also part of ShardMap, although it is defined by separate GUC. + * Postgres doesn't provide any mechanism to enforce dependencies between GUCs, + * that it we we have to rely on order of GUC definition in config file. + * "neon.stripe_size" should be defined prior to "neon.pageserver_connstring" */ typedef struct { @@ -118,10 +124,6 @@ typedef struct ShardMap shard_map; } PagestoreShmemState; -#if PG_VERSION_NUM >= 150000 -static shmem_request_hook_type prev_shmem_request_hook = NULL; -#endif -static shmem_startup_hook_type prev_shmem_startup_hook; static PagestoreShmemState *pagestore_shared; static uint64 pagestore_local_counter = 0; @@ -176,6 +178,8 @@ static PageServer page_servers[MAX_SHARDS]; static bool pageserver_flush(shardno_t shard_no); static void pageserver_disconnect(shardno_t shard_no); static void pageserver_disconnect_shard(shardno_t shard_no); +// HADRON +shardno_t get_num_shards(void); static bool PagestoreShmemIsValid(void) @@ -234,7 +238,10 @@ ParseShardMap(const char *connstr, ShardMap *result) p = sep + 1; } if (result) + { result->num_shards = nshards; + result->stripe_size = stripe_size; + } return true; } @@ -281,6 +288,22 @@ AssignPageserverConnstring(const char *newval, void *extra) } } +/* BEGIN_HADRON */ +/** + * Return the total number of shards seen in the shard map. + */ +shardno_t get_num_shards(void) +{ + const ShardMap *shard_map; + + Assert(pagestore_shared); + shard_map = &pagestore_shared->shard_map; + + Assert(shard_map != NULL); + return shard_map->num_shards; +} +/* END_HADRON */ + /* * Get the current number of shards, and/or the connection string for a * particular shard from the shard map in shared memory. @@ -295,12 +318,13 @@ AssignPageserverConnstring(const char *newval, void *extra) * last call, terminates all existing connections to all pageservers. 
*/ static void -load_shard_map(shardno_t shard_no, char *connstr_p, shardno_t *num_shards_p) +load_shard_map(shardno_t shard_no, char *connstr_p, shardno_t *num_shards_p, size_t* stripe_size_p) { uint64 begin_update_counter; uint64 end_update_counter; ShardMap *shard_map = &pagestore_shared->shard_map; shardno_t num_shards; + size_t stripe_size; /* * Postmaster can update the shared memory values concurrently, in which @@ -315,6 +339,7 @@ load_shard_map(shardno_t shard_no, char *connstr_p, shardno_t *num_shards_p) end_update_counter = pg_atomic_read_u64(&pagestore_shared->end_update_counter); num_shards = shard_map->num_shards; + stripe_size = shard_map->stripe_size; if (connstr_p && shard_no < MAX_SHARDS) strlcpy(connstr_p, shard_map->connstring[shard_no], MAX_PAGESERVER_CONNSTRING_SIZE); pg_memory_barrier(); @@ -349,6 +374,8 @@ load_shard_map(shardno_t shard_no, char *connstr_p, shardno_t *num_shards_p) if (num_shards_p) *num_shards_p = num_shards; + if (stripe_size_p) + *stripe_size_p = stripe_size; } #define MB (1024*1024) @@ -357,9 +384,10 @@ shardno_t get_shard_number(BufferTag *tag) { shardno_t n_shards; + size_t stripe_size; uint32 hash; - load_shard_map(0, NULL, &n_shards); + load_shard_map(0, NULL, &n_shards, &stripe_size); #if PG_MAJORVERSION_NUM < 16 hash = murmurhash32(tag->rnode.relNode); @@ -412,7 +440,7 @@ pageserver_connect(shardno_t shard_no, int elevel) * Note that connstr is used both during connection start, and when we * log the successful connection. */ - load_shard_map(shard_no, connstr, NULL); + load_shard_map(shard_no, connstr, NULL, NULL); switch (shard->state) { @@ -1284,18 +1312,12 @@ check_neon_id(char **newval, void **extra, GucSource source) return **newval == '\0' || HexDecodeString(id, *newval, 16); } -static Size -PagestoreShmemSize(void) -{ - return add_size(sizeof(PagestoreShmemState), NeonPerfCountersShmemSize()); -} -static bool +void PagestoreShmemInit(void) { bool found; - LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); pagestore_shared = ShmemInitStruct("libpagestore shared state", sizeof(PagestoreShmemState), &found); @@ -1306,44 +1328,12 @@ PagestoreShmemInit(void) memset(&pagestore_shared->shard_map, 0, sizeof(ShardMap)); AssignPageserverConnstring(page_server_connstring, NULL); } - - NeonPerfCountersShmemInit(); - - LWLockRelease(AddinShmemInitLock); - return found; } -static void -pagestore_shmem_startup_hook(void) +void +PagestoreShmemRequest(void) { - if (prev_shmem_startup_hook) - prev_shmem_startup_hook(); - - PagestoreShmemInit(); -} - -static void -pagestore_shmem_request(void) -{ -#if PG_VERSION_NUM >= 150000 - if (prev_shmem_request_hook) - prev_shmem_request_hook(); -#endif - - RequestAddinShmemSpace(PagestoreShmemSize()); -} - -static void -pagestore_prepare_shmem(void) -{ -#if PG_VERSION_NUM >= 150000 - prev_shmem_request_hook = shmem_request_hook; - shmem_request_hook = pagestore_shmem_request; -#else - pagestore_shmem_request(); -#endif - prev_shmem_startup_hook = shmem_startup_hook; - shmem_startup_hook = pagestore_shmem_startup_hook; + RequestAddinShmemSpace(sizeof(PagestoreShmemState)); } /* @@ -1352,8 +1342,6 @@ pagestore_prepare_shmem(void) void pg_init_libpagestore(void) { - pagestore_prepare_shmem(); - DefineCustomStringVariable("neon.pageserver_connstring", "connection string to the page server", NULL, @@ -1504,8 +1492,6 @@ pg_init_libpagestore(void) 0, NULL, NULL, NULL); - relsize_hash_init(); - if (page_server != NULL) neon_log(ERROR, "libpagestore already loaded"); diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index 
7b749f1080..5b9c7d600c 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -22,6 +22,7 @@ #include "replication/slot.h" #include "replication/walsender.h" #include "storage/proc.h" +#include "storage/ipc.h" #include "funcapi.h" #include "access/htup_details.h" #include "utils/builtins.h" @@ -30,6 +31,7 @@ #include "utils/guc_tables.h" #include "communicator.h" +#include "communicator_process.h" #include "extension_server.h" #include "file_cache.h" #include "neon.h" @@ -43,9 +45,6 @@ #include "storage/ipc.h" #endif -/* the rust bindings, generated by cbindgen */ -#include "communicator/communicator_bindings.h" - PG_MODULE_MAGIC; void _PG_init(void); @@ -59,11 +58,15 @@ static ExecutorEnd_hook_type prev_ExecutorEnd = NULL; static void neon_ExecutorStart(QueryDesc *queryDesc, int eflags); static void neon_ExecutorEnd(QueryDesc *queryDesc); -#if PG_MAJORVERSION_NUM >= 16 static shmem_startup_hook_type prev_shmem_startup_hook; - static void neon_shmem_startup_hook(void); +static void neon_shmem_request_hook(void); + +#if PG_MAJORVERSION_NUM >= 15 +static shmem_request_hook_type prev_shmem_request_hook = NULL; #endif + + #if PG_MAJORVERSION_NUM >= 17 uint32 WAIT_EVENT_NEON_LFC_MAINTENANCE; uint32 WAIT_EVENT_NEON_LFC_READ; @@ -450,19 +453,47 @@ _PG_init(void) */ #if PG_VERSION_NUM >= 160000 load_file("$libdir/neon_rmgr", false); - - prev_shmem_startup_hook = shmem_startup_hook; - shmem_startup_hook = neon_shmem_startup_hook; #endif - /* dummy call to a Rust function in the communicator library, to check that it works */ - (void) communicator_dummy(123); + /* + * Initializing a pre-loaded Postgres extension happens in three stages: + * + * 1. _PG_init() is called early at postmaster startup. In this stage, no + * shared memory has been allocated yet. Core Postgres GUCs have been + * initialized from the config files, but notably, MaxBackends has not + * been calculated yet. In this stage, we must register any extension GUCs + * and can do other early initialization that doesn't depend on shared + * memory. In this stage we must also register "shmem request" and + * "shmem startup" hooks, to be called in stages 2 and 3. + * + * 2. After MaxBackends has been calculated, the "shmem request" hooks + * are called. The hooks can reserve shared memory by calling + * RequestAddinShmemSpace and RequestNamedLWLockTranche(). The "shmem + * request hooks" are a new mechanism in Postgres v15. In v14 and + * below, you had to make those Requests in stage 1 already, which + * means they could not depend on MaxBackends. (See hack in + * NeonPerfCountersShmemRequest()) + * + * 3. After some more runtime-computed GUCs that affect the amount of + * shared memory needed have been calculated, the "shmem startup" hooks + * are called. In this stage, we allocate any shared memory, LWLocks + * and other shared resources. + * + * Here, in the 'neon' extension, we register just one shmem request hook + * and one startup hook, which call into functions in all the subsystems + * that are part of the extension. On v14, the ShmemRequest functions are + * called in stage 1, and on v15 onwards they are called in stage 2.
+ */ + /* Stage 1: Define GUCs, and other early intialization */ pg_init_libpagestore(); + relsize_hash_init(); lfc_init(); pg_init_walproposer(); init_lwlsncache(); + pg_init_communicator_process(); + pg_init_communicator(); Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines; @@ -543,6 +574,15 @@ _PG_init(void) PGC_POSTMASTER, 0, NULL, NULL, NULL); + + DefineCustomStringVariable( + "neon.privileged_role_name", + "Name of the 'weak' superuser role, which we give to the users", + NULL, + &privileged_role_name, + "neon_superuser", + PGC_POSTMASTER, 0, NULL, NULL, NULL); + /* * Important: This must happen after other parts of the extension are * loaded, otherwise any settings to GUCs that were set before the @@ -552,6 +592,22 @@ _PG_init(void) ReportSearchPath(); + /* + * Register initialization hooks for stage 2. (On v14, there's no "shmem + * request" hooks, so call the ShmemRequest functions immediately.) + */ +#if PG_VERSION_NUM >= 150000 + prev_shmem_request_hook = shmem_request_hook; + shmem_request_hook = neon_shmem_request_hook; +#else + neon_shmem_request_hook(); +#endif + + /* Register hooks for stage 3 */ + prev_shmem_startup_hook = shmem_startup_hook; + shmem_startup_hook = neon_shmem_startup_hook; + + /* Other misc initialization */ prev_ExecutorStart = ExecutorStart_hook; ExecutorStart_hook = neon_ExecutorStart; prev_ExecutorEnd = ExecutorEnd_hook; @@ -637,7 +693,34 @@ approximate_working_set_size(PG_FUNCTION_ARGS) PG_RETURN_INT32(dc); } -#if PG_MAJORVERSION_NUM >= 16 +/* + * Initialization stage 2: make requests for the amount of shared memory we + * will need. + * + * For a high-level explanation of the initialization process, see _PG_init(). + */ +static void +neon_shmem_request_hook(void) +{ +#if PG_VERSION_NUM >= 150000 + if (prev_shmem_request_hook) + prev_shmem_request_hook(); +#endif + + LfcShmemRequest(); + NeonPerfCountersShmemRequest(); + PagestoreShmemRequest(); + RelsizeCacheShmemRequest(); + WalproposerShmemRequest(); + LwLsnCacheShmemRequest(); +} + + +/* + * Initialization stage 3: Initialize shared memory. + * + * For a high-level explanation of the initialization process, see _PG_init(). 
+ */ static void neon_shmem_startup_hook(void) { @@ -645,6 +728,15 @@ neon_shmem_startup_hook(void) if (prev_shmem_startup_hook) prev_shmem_startup_hook(); + LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); + + LfcShmemInit(); + NeonPerfCountersShmemInit(); + PagestoreShmemInit(); + RelsizeCacheShmemInit(); + WalproposerShmemInit(); + LwLsnCacheShmemInit(); + #if PG_MAJORVERSION_NUM >= 17 WAIT_EVENT_NEON_LFC_MAINTENANCE = WaitEventExtensionNew("Neon/FileCache_Maintenance"); WAIT_EVENT_NEON_LFC_READ = WaitEventExtensionNew("Neon/FileCache_Read"); @@ -657,8 +749,9 @@ neon_shmem_startup_hook(void) WAIT_EVENT_NEON_PS_READ = WaitEventExtensionNew("Neon/PS_ReadIO"); WAIT_EVENT_NEON_WAL_DL = WaitEventExtensionNew("Neon/WAL_Download"); #endif + + LWLockRelease(AddinShmemInitLock); } -#endif /* * ExecutorStart hook: start up tracking if needed diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h index 431dacb708..20c850864a 100644 --- a/pgxn/neon/neon.h +++ b/pgxn/neon/neon.h @@ -16,7 +16,6 @@ extern char *neon_auth_token; extern char *neon_timeline; extern char *neon_tenant; - extern char *wal_acceptors_list; extern int wal_acceptor_reconnect_timeout; extern int wal_acceptor_connection_timeout; @@ -71,4 +70,19 @@ extern PGDLLEXPORT void WalProposerSync(int argc, char *argv[]); extern PGDLLEXPORT void WalProposerMain(Datum main_arg); extern PGDLLEXPORT void LogicalSlotsMonitorMain(Datum main_arg); +extern void LfcShmemRequest(void); +extern void PagestoreShmemRequest(void); +extern void RelsizeCacheShmemRequest(void); +extern void WalproposerShmemRequest(void); +extern void LwLsnCacheShmemRequest(void); +extern void NeonPerfCountersShmemRequest(void); + +extern void LfcShmemInit(void); +extern void PagestoreShmemInit(void); +extern void RelsizeCacheShmemInit(void); +extern void WalproposerShmemInit(void); +extern void LwLsnCacheShmemInit(void); +extern void NeonPerfCountersShmemInit(void); + + #endif /* NEON_H */ diff --git a/pgxn/neon/neon_ddl_handler.c b/pgxn/neon/neon_ddl_handler.c index 1f03e52c67..74a90ea4d4 100644 --- a/pgxn/neon/neon_ddl_handler.c +++ b/pgxn/neon/neon_ddl_handler.c @@ -13,7 +13,7 @@ * accumulate changes. On subtransaction commit, the top of the stack * is merged with the table below it. 
* - * Support event triggers for neon_superuser + * Support event triggers for {privileged_role_name} * * IDENTIFICATION * contrib/neon/neon_dll_handler.c @@ -49,6 +49,7 @@ #include "neon_ddl_handler.h" #include "neon_utils.h" +#include "neon.h" static ProcessUtility_hook_type PreviousProcessUtilityHook = NULL; static fmgr_hook_type next_fmgr_hook = NULL; @@ -541,11 +542,11 @@ NeonXactCallback(XactEvent event, void *arg) } static bool -RoleIsNeonSuperuser(const char *role_name) +IsPrivilegedRole(const char *role_name) { Assert(role_name); - return strcmp(role_name, "neon_superuser") == 0; + return strcmp(role_name, privileged_role_name) == 0; } static void @@ -578,8 +579,9 @@ HandleCreateDb(CreatedbStmt *stmt) { const char *owner_name = defGetString(downer); - if (RoleIsNeonSuperuser(owner_name)) - elog(ERROR, "can't create a database with owner neon_superuser"); + if (IsPrivilegedRole(owner_name)) + elog(ERROR, "could not create a database with owner %s", privileged_role_name); + entry->owner = get_role_oid(owner_name, false); } else @@ -609,8 +611,9 @@ HandleAlterOwner(AlterOwnerStmt *stmt) memset(entry->old_name, 0, sizeof(entry->old_name)); new_owner = get_rolespec_name(stmt->newowner); - if (RoleIsNeonSuperuser(new_owner)) - elog(ERROR, "can't alter owner to neon_superuser"); + if (IsPrivilegedRole(new_owner)) + elog(ERROR, "could not alter owner to %s", privileged_role_name); + entry->owner = get_role_oid(new_owner, false); entry->type = Op_Set; } @@ -716,8 +719,8 @@ HandleAlterRole(AlterRoleStmt *stmt) InitRoleTableIfNeeded(); role_name = get_rolespec_name(stmt->role); - if (RoleIsNeonSuperuser(role_name) && !superuser()) - elog(ERROR, "can't ALTER neon_superuser"); + if (IsPrivilegedRole(role_name) && !superuser()) + elog(ERROR, "could not ALTER %s", privileged_role_name); dpass = NULL; foreach(option, stmt->options) @@ -831,7 +834,7 @@ HandleRename(RenameStmt *stmt) * * In vanilla only superuser can create Event Triggers. * - * We allow it for neon_superuser by temporary switching to superuser. But as + * We allow it for {privileged_role_name} by temporary switching to superuser. But as * far as event trigger can fire in superuser context we should protect * superuser from execution of arbitrary user's code. * @@ -891,7 +894,7 @@ force_noop(FmgrInfo *finfo) * Also skip executing Event Triggers when GUC neon.event_triggers has been * set to false. This might be necessary to be able to connect again after a * LOGIN Event Trigger has been installed that would prevent connections as - * neon_superuser. + * {privileged_role_name}. */ static void neon_fmgr_hook(FmgrHookEventType event, FmgrInfo *flinfo, Datum *private) @@ -910,24 +913,24 @@ neon_fmgr_hook(FmgrHookEventType event, FmgrInfo *flinfo, Datum *private) } /* - * The neon_superuser role can use the GUC neon.event_triggers to disable + * The {privileged_role_name} role can use the GUC neon.event_triggers to disable * firing Event Trigger. * * SET neon.event_triggers TO false; * - * This only applies to the neon_superuser role though, and only allows - * skipping Event Triggers owned by neon_superuser, which we check by - * proxy of the Event Trigger function being owned by neon_superuser. + * This only applies to the {privileged_role_name} role though, and only allows + * skipping Event Triggers owned by {privileged_role_name}, which we check by + * proxy of the Event Trigger function being owned by {privileged_role_name}. 
* - * A role that is created in role neon_superuser should be allowed to also + * A role that is created in role {privileged_role_name} should be allowed to also * benefit from the neon_event_triggers GUC, and will be considered the - * same as the neon_superuser role. + * same as the {privileged_role_name} role. */ if (event == FHET_START && !neon_event_triggers - && is_neon_superuser()) + && is_privileged_role()) { - Oid neon_superuser_oid = get_role_oid("neon_superuser", false); + Oid weak_superuser_oid = get_role_oid(privileged_role_name, false); /* Find the Function Attributes (owner Oid, security definer) */ const char *fun_owner_name = NULL; @@ -937,8 +940,8 @@ neon_fmgr_hook(FmgrHookEventType event, FmgrInfo *flinfo, Datum *private) LookupFuncOwnerSecDef(flinfo->fn_oid, &fun_owner, &fun_is_secdef); fun_owner_name = GetUserNameFromId(fun_owner, false); - if (RoleIsNeonSuperuser(fun_owner_name) - || has_privs_of_role(fun_owner, neon_superuser_oid)) + if (IsPrivilegedRole(fun_owner_name) + || has_privs_of_role(fun_owner, weak_superuser_oid)) { elog(WARNING, "Skipping Event Trigger: neon.event_triggers is false"); @@ -1149,13 +1152,13 @@ ProcessCreateEventTrigger( } /* - * Allow neon_superuser to create Event Trigger, while keeping the + * Allow {privileged_role_name} to create Event Trigger, while keeping the * ownership of the object. * * For that we give superuser membership to the role for the execution of * the command. */ - if (IsTransactionState() && is_neon_superuser()) + if (IsTransactionState() && is_privileged_role()) { /* Find the Event Trigger function Oid */ Oid func_oid = LookupFuncName(stmt->funcname, 0, NULL, false); @@ -1232,7 +1235,7 @@ ProcessCreateEventTrigger( * * That way [ ALTER | DROP ] EVENT TRIGGER commands just work. */ - if (IsTransactionState() && is_neon_superuser()) + if (IsTransactionState() && is_privileged_role()) { if (!current_user_is_super) { @@ -1352,19 +1355,17 @@ NeonProcessUtility( } /* - * Only neon_superuser is granted privilege to edit neon.event_triggers GUC. + * Only {privileged_role_name} is granted privilege to edit neon.event_triggers GUC. 
*/ static void neon_event_triggers_assign_hook(bool newval, void *extra) { - /* MyDatabaseId == InvalidOid || !OidIsValid(GetUserId()) */ - - if (IsTransactionState() && !is_neon_superuser()) + if (IsTransactionState() && !is_privileged_role()) { ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("permission denied to set neon.event_triggers"), - errdetail("Only \"neon_superuser\" is allowed to set the GUC"))); + errdetail("Only \"%s\" is allowed to set the GUC", privileged_role_name))); } } diff --git a/pgxn/neon/neon_lwlsncache.c b/pgxn/neon/neon_lwlsncache.c index a8cfa0f825..5887c02c36 100644 --- a/pgxn/neon/neon_lwlsncache.c +++ b/pgxn/neon/neon_lwlsncache.c @@ -1,5 +1,6 @@ #include "postgres.h" +#include "neon.h" #include "neon_lwlsncache.h" #include "miscadmin.h" @@ -81,14 +82,6 @@ static set_max_lwlsn_hook_type prev_set_max_lwlsn_hook = NULL; static set_lwlsn_relation_hook_type prev_set_lwlsn_relation_hook = NULL; static set_lwlsn_db_hook_type prev_set_lwlsn_db_hook = NULL; -static shmem_startup_hook_type prev_shmem_startup_hook; - -#if PG_VERSION_NUM >= 150000 -static shmem_request_hook_type prev_shmem_request_hook; -#endif - -static void shmemrequest(void); -static void shmeminit(void); static void neon_set_max_lwlsn(XLogRecPtr lsn); void @@ -99,16 +92,6 @@ init_lwlsncache(void) lwlc_register_gucs(); - prev_shmem_startup_hook = shmem_startup_hook; - shmem_startup_hook = shmeminit; - - #if PG_VERSION_NUM >= 150000 - prev_shmem_request_hook = shmem_request_hook; - shmem_request_hook = shmemrequest; - #else - shmemrequest(); - #endif - prev_set_lwlsn_block_range_hook = set_lwlsn_block_range_hook; set_lwlsn_block_range_hook = neon_set_lwlsn_block_range; prev_set_lwlsn_block_v_hook = set_lwlsn_block_v_hook; @@ -124,20 +107,19 @@ init_lwlsncache(void) } -static void shmemrequest(void) { +void +LwLsnCacheShmemRequest(void) +{ Size requested_size = sizeof(LwLsnCacheCtl); - + requested_size += hash_estimate_size(lwlsn_cache_size, sizeof(LastWrittenLsnCacheEntry)); RequestAddinShmemSpace(requested_size); - - #if PG_VERSION_NUM >= 150000 - if (prev_shmem_request_hook) - prev_shmem_request_hook(); - #endif } -static void shmeminit(void) { +void +LwLsnCacheShmemInit(void) +{ static HASHCTL info; bool found; if (lwlsn_cache_size > 0) @@ -157,9 +139,6 @@ static void shmeminit(void) { } dlist_init(&LwLsnCache->lastWrittenLsnLRU); LwLsnCache->maxLastWrittenLsn = GetRedoRecPtr(); - if (prev_shmem_startup_hook) { - prev_shmem_startup_hook(); - } } /* diff --git a/pgxn/neon/neon_perf_counters.c b/pgxn/neon/neon_perf_counters.c index d0a3d15108..dd576e4e73 100644 --- a/pgxn/neon/neon_perf_counters.c +++ b/pgxn/neon/neon_perf_counters.c @@ -17,22 +17,32 @@ #include "storage/shmem.h" #include "utils/builtins.h" +#include "neon.h" #include "neon_perf_counters.h" #include "neon_pgversioncompat.h" neon_per_backend_counters *neon_per_backend_counters_shared; -Size -NeonPerfCountersShmemSize(void) +void +NeonPerfCountersShmemRequest(void) { - Size size = 0; - - size = add_size(size, mul_size(NUM_NEON_PERF_COUNTER_SLOTS, - sizeof(neon_per_backend_counters))); - - return size; + Size size; +#if PG_MAJORVERSION_NUM < 15 + /* Hack: in PG14 MaxBackends is not initialized at the time of calling NeonPerfCountersShmemRequest function. 
+ * Do it ourselves and then undo to prevent assertion failure + */ + Assert(MaxBackends == 0); /* not initialized yet */ + InitializeMaxBackends(); + size = mul_size(NUM_NEON_PERF_COUNTER_SLOTS, sizeof(neon_per_backend_counters)); + MaxBackends = 0; +#else + size = mul_size(NUM_NEON_PERF_COUNTER_SLOTS, sizeof(neon_per_backend_counters)); +#endif + RequestAddinShmemSpace(size); } + + void NeonPerfCountersShmemInit(void) { diff --git a/pgxn/neon/neon_pgversioncompat.h b/pgxn/neon/neon_pgversioncompat.h index c7574ef0f9..3ab8d3e5f5 100644 --- a/pgxn/neon/neon_pgversioncompat.h +++ b/pgxn/neon/neon_pgversioncompat.h @@ -72,22 +72,21 @@ InitBufferTag(BufferTag *tag, const RelFileNode *rnode, (tag).rnode = (rinfo); \ } while (false) -#define BufTagGetNRelFileInfo(tag) tag.rnode +#define BufTagGetNRelFileInfo(tag) (tag).rnode #define BufTagGetRelNumber(tagp) ((tagp)->rnode.relNode) -#define BufTagInit(tag, relNumber, forknum, blkno, spcOid, dbOid) \ +#define BufTagInit(tag, rel_number, fork_number, block_number, spc_oid, db_oid) \ do { \ - RelFileNode rnode = { .spcNode = spcOid, .dbNode = dbOid, .relNode = relNumber}; \ - (tag).forkNum = forknum; \ - (tag).blockNum = blkno; \ - (tag).rnode = rnode; \ + RelFileNode rnode = { .spcNode = (spc_oid), .dbNode = (db_oid), .relNode = (rel_number)}; \ + (tag).forkNum = (fork_number); \ + (tag).blockNum = (block_number); \ + (tag).rnode = rnode; \ } while (false) #define InvalidRelFileNumber InvalidOid -#define SMgrRelGetRelInfo(reln) \ - (reln->smgr_rnode.node) +#define SMgrRelGetRelInfo(reln) ((reln)->smgr_rnode.node) #define DropRelationAllLocalBuffers DropRelFileNodeAllLocalBuffers @@ -133,17 +132,16 @@ InitBufferTag(BufferTag *tag, const RelFileNode *rnode, .relNumber = (tag).relNumber, \ }) -#define BufTagInit(tag, relNumber, forknum, blkno, spcOid, dbOid) \ +#define BufTagInit(tag, rel_number, fork_number, block_number, spc_oid, db_oid) \ do { \ - (tag).forkNum = forknum; \ - (tag).blockNum = blkno; \ - (tag).spcOid = spcOid; \ - (tag).dbOid = dbOid; \ - (tag).relNumber = relNumber; \ + (tag).forkNum = (fork_number); \ + (tag).blockNum = (block_number); \ + (tag).spcOid = (spc_oid); \ + (tag).dbOid = (db_oid); \ + (tag).relNumber = (rel_number); \ } while (false) -#define SMgrRelGetRelInfo(reln) \ - ((reln)->smgr_rlocator) +#define SMgrRelGetRelInfo(reln) ((reln)->smgr_rlocator) #define DropRelationAllLocalBuffers DropRelationAllLocalBuffers #endif diff --git a/pgxn/neon/relsize_cache.c b/pgxn/neon/relsize_cache.c index 60ca1675d9..bf7961574a 100644 --- a/pgxn/neon/relsize_cache.c +++ b/pgxn/neon/relsize_cache.c @@ -10,6 +10,7 @@ */ #include "postgres.h" +#include "neon.h" #include "neon_pgversioncompat.h" #include "pagestore_client.h" @@ -49,32 +50,23 @@ typedef struct * algorithm */ } RelSizeHashControl; -static HTAB *relsize_hash; -static LWLockId relsize_lock; -static int relsize_hash_size; -static RelSizeHashControl* relsize_ctl; -static shmem_startup_hook_type prev_shmem_startup_hook = NULL; -#if PG_VERSION_NUM >= 150000 -static shmem_request_hook_type prev_shmem_request_hook = NULL; -static void relsize_shmem_request(void); -#endif - /* * Size of a cache entry is 36 bytes. So this default will take about 2.3 MB, * which seems reasonable. 
*/ #define DEFAULT_RELSIZE_HASH_SIZE (64 * 1024) -static void -neon_smgr_shmem_startup(void) +static HTAB *relsize_hash; +static LWLockId relsize_lock; +static int relsize_hash_size = DEFAULT_RELSIZE_HASH_SIZE; +static RelSizeHashControl* relsize_ctl; + +void +RelsizeCacheShmemInit(void) { static HASHCTL info; bool found; - if (prev_shmem_startup_hook) - prev_shmem_startup_hook(); - - LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); relsize_ctl = (RelSizeHashControl *) ShmemInitStruct("relsize_hash", sizeof(RelSizeHashControl), &found); if (!found) { @@ -85,7 +77,6 @@ neon_smgr_shmem_startup(void) relsize_hash_size, relsize_hash_size, &info, HASH_ELEM | HASH_BLOBS); - LWLockRelease(AddinShmemInitLock); relsize_ctl->size = 0; relsize_ctl->hits = 0; relsize_ctl->misses = 0; @@ -242,34 +233,15 @@ relsize_hash_init(void) PGC_POSTMASTER, 0, NULL, NULL, NULL); - - if (relsize_hash_size > 0) - { -#if PG_VERSION_NUM >= 150000 - prev_shmem_request_hook = shmem_request_hook; - shmem_request_hook = relsize_shmem_request; -#else - RequestAddinShmemSpace(hash_estimate_size(relsize_hash_size, sizeof(RelSizeEntry))); - RequestNamedLWLockTranche("neon_relsize", 1); -#endif - - prev_shmem_startup_hook = shmem_startup_hook; - shmem_startup_hook = neon_smgr_shmem_startup; - } } -#if PG_VERSION_NUM >= 150000 /* * shmem_request hook: request additional shared resources. We'll allocate or * attach to the shared resources in neon_smgr_shmem_startup(). */ -static void -relsize_shmem_request(void) +void +RelsizeCacheShmemRequest(void) { - if (prev_shmem_request_hook) - prev_shmem_request_hook(); - RequestAddinShmemSpace(sizeof(RelSizeHashControl) + hash_estimate_size(relsize_hash_size, sizeof(RelSizeEntry))); RequestNamedLWLockTranche("neon_relsize", 1); } -#endif diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index e3a4022664..19d23925a5 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -377,6 +377,16 @@ typedef struct PageserverFeedback } PageserverFeedback; /* BEGIN_HADRON */ +/** + * WAL proposer is the only backend that will update `sent_bytes` and `last_recorded_time_us`. + * Once the `sent_bytes` reaches the limit, it puts backpressure on PG backends. + * + * A PG backend checks `should_limit` to see if it should hit backpressure. + * - If yes, it also checks the `last_recorded_time_us` to see + * if it's time to push more WALs. This is because the WAL proposer + * only resets `should_limit` to 0 after it is notified about new WALs + * which might take a while. + */ typedef struct WalRateLimiter { /* If the value is 1, PG backends will hit backpressure. */ @@ -384,7 +394,7 @@ typedef struct WalRateLimiter /* The number of bytes sent in the current second. */ uint64 sent_bytes; /* The last recorded time in microsecond. 
*/ - TimestampTz last_recorded_time_us; + pg_atomic_uint64 last_recorded_time_us; } WalRateLimiter; /* END_HADRON */ diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index aaf8f43eeb..d43d372c2e 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -83,10 +83,8 @@ static XLogRecPtr standby_flush_lsn = InvalidXLogRecPtr; static XLogRecPtr standby_apply_lsn = InvalidXLogRecPtr; static HotStandbyFeedback agg_hs_feedback; -static void nwp_shmem_startup_hook(void); static void nwp_register_gucs(void); static void assign_neon_safekeepers(const char *newval, void *extra); -static void nwp_prepare_shmem(void); static uint64 backpressure_lag_impl(void); static uint64 startup_backpressure_wrap(void); static bool backpressure_throttling_impl(void); @@ -99,11 +97,6 @@ static TimestampTz walprop_pg_get_current_timestamp(WalProposer *wp); static void walprop_pg_load_libpqwalreceiver(void); static process_interrupts_callback_t PrevProcessInterruptsCallback = NULL; -static shmem_startup_hook_type prev_shmem_startup_hook_type; -#if PG_VERSION_NUM >= 150000 -static shmem_request_hook_type prev_shmem_request_hook = NULL; -static void walproposer_shmem_request(void); -#endif static void WalproposerShmemInit_SyncSafekeeper(void); @@ -117,6 +110,9 @@ static void rm_safekeeper_event_set(Safekeeper *to_remove, bool is_sk); static void CheckGracefulShutdown(WalProposer *wp); +// HADRON +shardno_t get_num_shards(void); + static void init_walprop_config(bool syncSafekeepers) { @@ -193,8 +189,6 @@ pg_init_walproposer(void) nwp_register_gucs(); - nwp_prepare_shmem(); - delay_backend_us = &startup_backpressure_wrap; PrevProcessInterruptsCallback = ProcessInterruptsCallback; ProcessInterruptsCallback = backpressure_throttling_impl; @@ -409,6 +403,14 @@ static uint64 backpressure_lag_impl(void) { struct WalproposerShmemState* state = NULL; + + /* BEGIN_HADRON */ + if(max_cluster_size < 0){ + // if max cluster size is not set, then we don't apply backpressure because we're reconfiguring PG + return 0; + } + /* END_HADRON */ + if (max_replication_apply_lag > 0 || max_replication_flush_lag > 0 || max_replication_write_lag > 0) { XLogRecPtr writePtr; @@ -449,8 +451,20 @@ backpressure_lag_impl(void) } state = GetWalpropShmemState(); - if (state != NULL && pg_atomic_read_u32(&state->wal_rate_limiter.should_limit) == 1) + if (state != NULL && !!pg_atomic_read_u32(&state->wal_rate_limiter.should_limit)) { + TimestampTz now = GetCurrentTimestamp(); + struct WalRateLimiter *limiter = &state->wal_rate_limiter; + uint64 last_recorded_time = pg_atomic_read_u64(&limiter->last_recorded_time_us); + if (now - last_recorded_time > USECS_PER_SEC) + { + /* + * The backend has past 1 second since the last recorded time and it's time to push more WALs. + * If the backends are pushing WALs too fast, the wal proposer will rate limit them again. 
+ */ + uint32 expected = true; + pg_atomic_compare_exchange_u32(&state->wal_rate_limiter.should_limit, &expected, false); + } return 1; } /* END_HADRON */ @@ -482,12 +496,11 @@ WalproposerShmemSize(void) return sizeof(WalproposerShmemState); } -static bool +void WalproposerShmemInit(void) { bool found; - LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); walprop_shared = ShmemInitStruct("Walproposer shared state", sizeof(WalproposerShmemState), &found); @@ -502,11 +515,9 @@ WalproposerShmemInit(void) pg_atomic_init_u64(&walprop_shared->currentClusterSize, 0); /* BEGIN_HADRON */ pg_atomic_init_u32(&walprop_shared->wal_rate_limiter.should_limit, 0); + pg_atomic_init_u64(&walprop_shared->wal_rate_limiter.last_recorded_time_us, 0); /* END_HADRON */ } - LWLockRelease(AddinShmemInitLock); - - return found; } static void @@ -520,6 +531,7 @@ WalproposerShmemInit_SyncSafekeeper(void) pg_atomic_init_u64(&walprop_shared->backpressureThrottlingTime, 0); /* BEGIN_HADRON */ pg_atomic_init_u32(&walprop_shared->wal_rate_limiter.should_limit, 0); + pg_atomic_init_u64(&walprop_shared->wal_rate_limiter.last_recorded_time_us, 0); /* END_HADRON */ } @@ -609,42 +621,15 @@ walprop_register_bgworker(void) /* shmem handling */ -static void -nwp_prepare_shmem(void) -{ -#if PG_VERSION_NUM >= 150000 - prev_shmem_request_hook = shmem_request_hook; - shmem_request_hook = walproposer_shmem_request; -#else - RequestAddinShmemSpace(WalproposerShmemSize()); -#endif - prev_shmem_startup_hook_type = shmem_startup_hook; - shmem_startup_hook = nwp_shmem_startup_hook; -} - -#if PG_VERSION_NUM >= 150000 /* * shmem_request hook: request additional shared resources. We'll allocate or - * attach to the shared resources in nwp_shmem_startup_hook(). + * attach to the shared resources in WalproposerShmemInit(). */ -static void -walproposer_shmem_request(void) +void +WalproposerShmemRequest(void) { - if (prev_shmem_request_hook) - prev_shmem_request_hook(); - RequestAddinShmemSpace(WalproposerShmemSize()); } -#endif - -static void -nwp_shmem_startup_hook(void) -{ - if (prev_shmem_startup_hook_type) - prev_shmem_startup_hook_type(); - - WalproposerShmemInit(); -} WalproposerShmemState * GetWalpropShmemState(void) @@ -664,18 +649,19 @@ walprop_pg_get_shmem_state(WalProposer *wp) * Record new ps_feedback in the array with shards and update min_feedback. */ static PageserverFeedback -record_pageserver_feedback(PageserverFeedback *ps_feedback) +record_pageserver_feedback(PageserverFeedback *ps_feedback, shardno_t num_shards) { PageserverFeedback min_feedback; Assert(ps_feedback->present); Assert(ps_feedback->shard_number < MAX_SHARDS); + Assert(ps_feedback->shard_number < num_shards); SpinLockAcquire(&walprop_shared->mutex); - /* Update the number of shards */ - if (ps_feedback->shard_number + 1 > walprop_shared->num_shards) - walprop_shared->num_shards = ps_feedback->shard_number + 1; + // Hadron: Update the num_shards from the source-of-truth (shard map) lazily when we receive + // a new pageserver feedback. 
+ walprop_shared->num_shards = Max(walprop_shared->num_shards, num_shards); /* Update the feedback */ memcpy(&walprop_shared->shard_ps_feedback[ps_feedback->shard_number], ps_feedback, sizeof(PageserverFeedback)); @@ -1551,18 +1537,18 @@ XLogBroadcastWalProposer(WalProposer *wp) { uint64 max_wal_bytes = (uint64) databricks_max_wal_mb_per_second * 1024 * 1024; struct WalRateLimiter *limiter = &state->wal_rate_limiter; - - if (now - limiter->last_recorded_time_us > USECS_PER_SEC) + uint64 last_recorded_time = pg_atomic_read_u64(&limiter->last_recorded_time_us); + if (now - last_recorded_time > USECS_PER_SEC) { /* Reset the rate limiter */ - limiter->last_recorded_time_us = now; limiter->sent_bytes = 0; - pg_atomic_exchange_u32(&limiter->should_limit, 0); + pg_atomic_write_u64(&limiter->last_recorded_time_us, now); + pg_atomic_write_u32(&limiter->should_limit, false); } limiter->sent_bytes += (endptr - startptr); if (limiter->sent_bytes > max_wal_bytes) { - pg_atomic_exchange_u32(&limiter->should_limit, 1); + pg_atomic_write_u32(&limiter->should_limit, true); } } /* END_HADRON */ @@ -2041,19 +2027,43 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp, Safekeeper *sk) if (wp->config->syncSafekeepers) return; + /* handle fresh ps_feedback */ if (sk->appendResponse.ps_feedback.present) { - PageserverFeedback min_feedback = record_pageserver_feedback(&sk->appendResponse.ps_feedback); + shardno_t num_shards = get_num_shards(); - /* Only one main shard sends non-zero currentClusterSize */ - if (sk->appendResponse.ps_feedback.currentClusterSize > 0) - SetNeonCurrentClusterSize(sk->appendResponse.ps_feedback.currentClusterSize); - - if (min_feedback.disk_consistent_lsn != standby_apply_lsn) + // During shard split, we receive ps_feedback from child shards before + // the split commits and our shard map GUC has been updated. We must + // filter out such feedback here because record_pageserver_feedback() + // doesn't do it. + // + // NB: what we would actually want to happen is that we only receive + // ps_feedback from the parent shards when the split is committed, then + // apply the split to our set of tracked feedback and from here on only + // receive ps_feedback from child shards. This filter condition doesn't + // do that: if we split from N parent to 2N child shards, the first N + // child shards' feedback messages will pass this condition, even before + // the split is committed. That's a bit sloppy, but OK for now. + if (sk->appendResponse.ps_feedback.shard_number < num_shards) { - standby_apply_lsn = min_feedback.disk_consistent_lsn; - needToAdvanceSlot = true; + PageserverFeedback min_feedback = record_pageserver_feedback(&sk->appendResponse.ps_feedback, num_shards); + + /* Only one main shard sends non-zero currentClusterSize */ + if (sk->appendResponse.ps_feedback.currentClusterSize > 0) + SetNeonCurrentClusterSize(sk->appendResponse.ps_feedback.currentClusterSize); + + if (min_feedback.disk_consistent_lsn != standby_apply_lsn) + { + standby_apply_lsn = min_feedback.disk_consistent_lsn; + needToAdvanceSlot = true; + } + } + else + { + // HADRON + elog(DEBUG2, "Ignoring pageserver feedback for unknown shard %d (current shard number %d)", + sk->appendResponse.ps_feedback.shard_number, num_shards); } } diff --git a/poetry.lock b/poetry.lock index b2072bf1bc..a920833fbf 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand. 
+# This file is automatically @generated by Poetry 2.1.3 and should not be changed by hand. [[package]] name = "aiohappyeyeballs" @@ -3068,6 +3068,21 @@ urllib3 = ">=1.21.1,<3" socks = ["PySocks (>=1.5.6,!=1.5.7)"] use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] +[[package]] +name = "requests-unixsocket" +version = "0.4.1" +description = "Use requests to talk HTTP via a UNIX domain socket" +optional = false +python-versions = ">=3.9" +groups = ["main"] +files = [ + {file = "requests_unixsocket-0.4.1-py3-none-any.whl", hash = "sha256:60c4942e9dbecc2f64d611039fb1dfc25da382083c6434ac0316dca3ff908f4d"}, + {file = "requests_unixsocket-0.4.1.tar.gz", hash = "sha256:b2596158c356ecee68d27ba469a52211230ac6fb0cde8b66afb19f0ed47a1995"}, +] + +[package.dependencies] +requests = ">=1.1" + [[package]] name = "responses" version = "0.25.3" @@ -3844,4 +3859,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.1" python-versions = "^3.11" -content-hash = "6a1e8ba06b8194bf28d87fd5e184e2ddc2b4a19dffcbe3953b26da3d55c9212f" +content-hash = "b08aba407631b0341d2ef8bf9acffd733bfc7d32b12d344717ab4c7fef697625" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 82fe6818e3..3c3f93c8e3 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -7,6 +7,7 @@ license.workspace = true [features] default = [] testing = ["dep:tokio-postgres"] +rest_broker = ["dep:subzero-core", "dep:ouroboros"] [dependencies] ahash.workspace = true @@ -65,6 +66,7 @@ postgres-client = { package = "tokio-postgres2", path = "../libs/proxy/tokio-pos postgres-protocol = { package = "postgres-protocol2", path = "../libs/proxy/postgres-protocol2" } pq_proto.workspace = true rand.workspace = true +rand_core.workspace = true regex.workspace = true remote_storage = { version = "0.1", path = "../libs/remote_storage/" } reqwest = { workspace = true, features = ["rustls-tls-native-roots"] } @@ -105,6 +107,11 @@ uuid.workspace = true x509-cert.workspace = true redis.workspace = true zerocopy.workspace = true +# uncomment this to use the real subzero-core crate +# subzero-core = { git = "https://github.com/neondatabase/subzero", rev = "396264617e78e8be428682f87469bb25429af88a", features = ["postgresql"], optional = true } +# this is a stub for the subzero-core crate +subzero-core = { path = "./subzero_core", features = ["postgresql"], optional = true} +ouroboros = { version = "0.18", optional = true } # jwt stuff jose-jwa = "0.1.2" @@ -127,6 +134,6 @@ pbkdf2 = { workspace = true, features = ["simple", "std"] } rcgen.workspace = true rstest.workspace = true walkdir.workspace = true -rand_distr = "0.4" +rand_distr = "0.5" tokio-postgres.workspace = true tracing-test = "0.2" diff --git a/proxy/README.md b/proxy/README.md index ff48f9f323..ce957b90af 100644 --- a/proxy/README.md +++ b/proxy/README.md @@ -178,16 +178,24 @@ Create a configuration file called `local_proxy.json` in the root of the repo (u Start the local proxy: ```sh -cargo run --bin local_proxy -- \ - --disable_pg_session_jwt true \ +cargo run --bin local_proxy --features testing -- \ + --disable-pg-session-jwt \ --http 0.0.0.0:7432 ``` -Start the auth broker: +Start the auth/rest broker: + +Note: to enable the rest broker you need to replace the stub subzero-core crate with the real one. 
+ ```sh -LOGFMT=text OTEL_SDK_DISABLED=true cargo run --bin proxy --features testing -- \ +cargo add -p proxy subzero-core --git https://github.com/neondatabase/subzero --rev 396264617e78e8be428682f87469bb25429af88a +``` + +```sh +LOGFMT=text OTEL_SDK_DISABLED=true cargo run --bin proxy --features testing,rest_broker -- \ -c server.crt -k server.key \ --is-auth-broker true \ + --is-rest-broker true \ --wss 0.0.0.0:8080 \ --http 0.0.0.0:7002 \ --auth-backend local @@ -205,3 +213,9 @@ curl -k "https://foo.local.neon.build:8080/sql" \ -H "neon-connection-string: postgresql://authenticator@foo.local.neon.build/database" \ -d '{"query":"select 1","params":[]}' ``` + +Make a rest request against the auth broker (rest broker): +```sh +curl -k "https://foo.local.neon.build:8080/database/rest/v1/items?select=id,name&id=eq.1" \ +-H "Authorization: Bearer $NEON_JWT" +``` diff --git a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs index f561df9202..b06ed3a0ae 100644 --- a/proxy/src/auth/backend/console_redirect.rs +++ b/proxy/src/auth/backend/console_redirect.rs @@ -180,8 +180,6 @@ async fn authenticate( return Err(auth::AuthError::NetworkNotAllowed); } - client.write_message(BeMessage::NoticeResponse("Connecting to database.")); - // Backwards compatibility. pg_sni_proxy uses "--" in domain names // while direct connections do not. Once we migrate to pg_sni_proxy // everywhere, we can remove this. diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index a716890a00..6eba869870 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -803,7 +803,7 @@ mod tests { use http_body_util::Full; use hyper::service::service_fn; use hyper_util::rt::TokioIo; - use rand::rngs::OsRng; + use rand_core::OsRng; use rsa::pkcs8::DecodePrivateKey; use serde::Serialize; use serde_json::json; diff --git a/proxy/src/binary/local_proxy.rs b/proxy/src/binary/local_proxy.rs index 401203d48c..7b9012dc69 100644 --- a/proxy/src/binary/local_proxy.rs +++ b/proxy/src/binary/local_proxy.rs @@ -1,3 +1,4 @@ +use std::env; use std::net::SocketAddr; use std::pin::pin; use std::sync::Arc; @@ -20,6 +21,8 @@ use crate::auth::backend::jwt::JwkCache; use crate::auth::backend::local::LocalBackend; use crate::auth::{self}; use crate::cancellation::CancellationHandler; +#[cfg(feature = "rest_broker")] +use crate::config::RestConfig; use crate::config::{ self, AuthenticationConfig, ComputeConfig, HttpConfig, ProxyConfig, RetryConfig, refresh_config_loop, @@ -262,6 +265,14 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig timeout: Duration::from_secs(2), }; + let greetings = env::var_os("NEON_MOTD").map_or(String::new(), |s| match s.into_string() { + Ok(s) => s, + Err(_) => { + debug!("NEON_MOTD environment variable is not valid UTF-8"); + String::new() + } + }); + Ok(Box::leak(Box::new(ProxyConfig { tls_config: ArcSwapOption::from(None), metric_collection: None, @@ -276,11 +287,19 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig accept_jwts: true, console_redirect_confirmation_timeout: Duration::ZERO, }, + #[cfg(feature = "rest_broker")] + rest_config: RestConfig { + is_rest_broker: false, + db_schema_cache: None, + max_schema_size: 0, + hostname_prefix: String::new(), + }, proxy_protocol_v2: config::ProxyProtocolV2::Rejected, handshake_timeout: Duration::from_secs(10), wake_compute_retry_config: RetryConfig::parse(RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)?, connect_compute_locks, 
connect_to_compute: compute_config, + greetings, #[cfg(feature = "testing")] disable_pg_session_jwt: args.disable_pg_session_jwt, }))) diff --git a/proxy/src/binary/pg_sni_router.rs b/proxy/src/binary/pg_sni_router.rs index 4ac8b6a995..f3782312dc 100644 --- a/proxy/src/binary/pg_sni_router.rs +++ b/proxy/src/binary/pg_sni_router.rs @@ -76,7 +76,7 @@ fn cli() -> clap::Command { } pub async fn run() -> anyhow::Result<()> { - let _logging_guard = crate::logging::init().await?; + let _logging_guard = crate::logging::init()?; let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook(); let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); diff --git a/proxy/src/binary/proxy.rs b/proxy/src/binary/proxy.rs index 16a7dc7b67..4148f4bc62 100644 --- a/proxy/src/binary/proxy.rs +++ b/proxy/src/binary/proxy.rs @@ -1,4 +1,3 @@ -#[cfg(any(test, feature = "testing"))] use std::env; use std::net::SocketAddr; use std::path::PathBuf; @@ -14,14 +13,14 @@ use arc_swap::ArcSwapOption; use camino::Utf8PathBuf; use futures::future::Either; use itertools::{Itertools, Position}; -use rand::{Rng, thread_rng}; +use rand::Rng; use remote_storage::RemoteStorageConfig; use tokio::net::TcpListener; #[cfg(any(test, feature = "testing"))] use tokio::sync::Notify; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; -use tracing::{error, info, warn}; +use tracing::{debug, error, info, warn}; use utils::sentry_init::init_sentry; use utils::{project_build_tag, project_git_version}; @@ -31,6 +30,8 @@ use crate::auth::backend::local::LocalBackend; use crate::auth::backend::{ConsoleRedirectBackend, MaybeOwned}; use crate::batch::BatchQueue; use crate::cancellation::{CancellationHandler, CancellationProcessor}; +#[cfg(feature = "rest_broker")] +use crate::config::RestConfig; #[cfg(any(test, feature = "testing"))] use crate::config::refresh_config_loop; use crate::config::{ @@ -47,6 +48,8 @@ use crate::redis::{elasticache, notifications}; use crate::scram::threadpool::ThreadPool; use crate::serverless::GlobalConnPoolOptions; use crate::serverless::cancel_set::CancelSet; +#[cfg(feature = "rest_broker")] +use crate::serverless::rest::DbSchemaCache; use crate::tls::client_config::compute_client_config_with_root_certs; #[cfg(any(test, feature = "testing"))] use crate::url::ApiUrl; @@ -246,11 +249,23 @@ struct ProxyCliArgs { /// if this is not local proxy, this toggles whether we accept Postgres REST requests #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] + #[cfg(feature = "rest_broker")] is_rest_broker: bool, /// cache for `db_schema_cache` introspection (use `size=0` to disable) #[clap(long, default_value = "size=1000,ttl=1h")] + #[cfg(feature = "rest_broker")] db_schema_cache: String, + + /// Maximum size allowed for schema in bytes + #[clap(long, default_value_t = 5 * 1024 * 1024)] // 5MB + #[cfg(feature = "rest_broker")] + max_schema_size: usize, + + /// Hostname prefix to strip from request hostname to get database hostname + #[clap(long, default_value = "apirest.")] + #[cfg(feature = "rest_broker")] + hostname_prefix: String, } #[derive(clap::Args, Clone, Copy, Debug)] @@ -319,7 +334,7 @@ struct PgSniRouterArgs { } pub async fn run() -> anyhow::Result<()> { - let _logging_guard = crate::logging::init().await?; + let _logging_guard = crate::logging::init()?; let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook(); let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), 
&[]); @@ -517,6 +532,17 @@ pub async fn run() -> anyhow::Result<()> { )); maintenance_tasks.spawn(control_plane::mgmt::task_main(mgmt_listener)); + // add a task to flush the db_schema cache every 10 minutes + #[cfg(feature = "rest_broker")] + if let Some(db_schema_cache) = &config.rest_config.db_schema_cache { + maintenance_tasks.spawn(async move { + loop { + tokio::time::sleep(Duration::from_secs(600)).await; + db_schema_cache.flush(); + } + }); + } + if let Some(metrics_config) = &config.metric_collection { // TODO: Add gc regardles of the metric collection being enabled. maintenance_tasks.spawn(usage_metrics::task_main(metrics_config)); @@ -547,7 +573,7 @@ pub async fn run() -> anyhow::Result<()> { attempt.into_inner() ); } - let jitter = thread_rng().gen_range(0..100); + let jitter = rand::rng().random_range(0..100); tokio::time::sleep(Duration::from_millis(1000 + jitter)).await; } } @@ -679,6 +705,49 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { timeout: Duration::from_secs(2), }; + #[cfg(feature = "rest_broker")] + let rest_config = { + let db_schema_cache_config: CacheOptions = args.db_schema_cache.parse()?; + info!("Using DbSchemaCache with options={db_schema_cache_config:?}"); + + let db_schema_cache = if args.is_rest_broker { + Some(DbSchemaCache::new( + "db_schema_cache", + db_schema_cache_config.size, + db_schema_cache_config.ttl, + true, + )) + } else { + None + }; + + RestConfig { + is_rest_broker: args.is_rest_broker, + db_schema_cache, + max_schema_size: args.max_schema_size, + hostname_prefix: args.hostname_prefix.clone(), + } + }; + + let mut greetings = env::var_os("NEON_MOTD").map_or(String::new(), |s| match s.into_string() { + Ok(s) => s, + Err(_) => { + debug!("NEON_MOTD environment variable is not valid UTF-8"); + String::new() + } + }); + + match &args.auth_backend { + AuthBackendType::ControlPlane => {} + #[cfg(any(test, feature = "testing"))] + AuthBackendType::Postgres => {} + #[cfg(any(test, feature = "testing"))] + AuthBackendType::Local => {} + AuthBackendType::ConsoleRedirect => { + greetings = "Connected to database".to_string(); + } + } + let config = ProxyConfig { tls_config, metric_collection, @@ -689,8 +758,11 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { wake_compute_retry_config: config::RetryConfig::parse(&args.wake_compute_retry)?, connect_compute_locks, connect_to_compute: compute_config, + greetings, #[cfg(feature = "testing")] disable_pg_session_jwt: false, + #[cfg(feature = "rest_broker")] + rest_config, }; let config = Box::leak(Box::new(config)); diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs index d37c107323..a589dd175b 100644 --- a/proxy/src/cache/project_info.rs +++ b/proxy/src/cache/project_info.rs @@ -1,17 +1,16 @@ use std::collections::{HashMap, HashSet, hash_map}; use std::convert::Infallible; -use std::sync::atomic::AtomicU64; use std::time::Duration; use async_trait::async_trait; use clashmap::ClashMap; use clashmap::mapref::one::Ref; -use rand::{Rng, thread_rng}; -use tokio::sync::Mutex; +use rand::Rng; use tokio::time::Instant; use tracing::{debug, info}; use crate::config::ProjectInfoCacheOptions; +use crate::control_plane::messages::{ControlPlaneErrorMessage, Reason}; use crate::control_plane::{EndpointAccessControl, RoleAccessControl}; use crate::intern::{AccountIdInt, EndpointIdInt, ProjectIdInt, RoleNameInt}; use crate::types::{EndpointId, RoleName}; @@ -22,52 +21,53 @@ pub(crate) trait ProjectInfoCache { fn 
invalidate_endpoint_access_for_project(&self, project_id: ProjectIdInt); fn invalidate_endpoint_access_for_org(&self, account_id: AccountIdInt); fn invalidate_role_secret_for_project(&self, project_id: ProjectIdInt, role_name: RoleNameInt); - async fn decrement_active_listeners(&self); - async fn increment_active_listeners(&self); } struct Entry { - created_at: Instant, + expires_at: Instant, value: T, } impl Entry { - pub(crate) fn new(value: T) -> Self { + pub(crate) fn new(value: T, ttl: Duration) -> Self { Self { - created_at: Instant::now(), + expires_at: Instant::now() + ttl, value, } } - pub(crate) fn get(&self, valid_since: Instant) -> Option<&T> { - (valid_since < self.created_at).then_some(&self.value) + pub(crate) fn get(&self) -> Option<&T> { + (!self.is_expired()).then_some(&self.value) } -} -impl From for Entry { - fn from(value: T) -> Self { - Self::new(value) + fn is_expired(&self) -> bool { + self.expires_at <= Instant::now() } } struct EndpointInfo { - role_controls: HashMap>, - controls: Option>, + role_controls: HashMap>>, + controls: Option>>, } +type ControlPlaneResult = Result>; + impl EndpointInfo { - pub(crate) fn get_role_secret( + pub(crate) fn get_role_secret_with_ttl( &self, role_name: RoleNameInt, - valid_since: Instant, - ) -> Option { - let controls = self.role_controls.get(&role_name)?; - controls.get(valid_since).cloned() + ) -> Option<(ControlPlaneResult, Duration)> { + let entry = self.role_controls.get(&role_name)?; + let ttl = entry.expires_at - Instant::now(); + Some((entry.get()?.clone(), ttl)) } - pub(crate) fn get_controls(&self, valid_since: Instant) -> Option { - let controls = self.controls.as_ref()?; - controls.get(valid_since).cloned() + pub(crate) fn get_controls_with_ttl( + &self, + ) -> Option<(ControlPlaneResult, Duration)> { + let entry = self.controls.as_ref()?; + let ttl = entry.expires_at - Instant::now(); + Some((entry.get()?.clone(), ttl)) } pub(crate) fn invalidate_endpoint(&mut self) { @@ -92,11 +92,8 @@ pub struct ProjectInfoCacheImpl { project2ep: ClashMap>, // FIXME(stefan): we need a way to GC the account2ep map. 
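Editor's illustration (not part of the patch): a minimal, self-contained sketch of the expiry-instant pattern the `project_info` cache entries switch to in the hunks above, where each entry carries an absolute `expires_at` and readers derive the remaining TTL from it instead of comparing a `created_at` against a moving `valid_since` cutoff.

```rust
use std::time::{Duration, Instant};

// Hypothetical standalone type mirroring the shape of the patched cache entries.
struct Entry<T> {
    expires_at: Instant,
    value: T,
}

impl<T> Entry<T> {
    fn new(value: T, ttl: Duration) -> Self {
        Self { expires_at: Instant::now() + ttl, value }
    }

    // Returns the value plus its remaining TTL, or None once expired, roughly
    // what the patched get_role_secret_with_ttl/get_controls_with_ttl expose.
    fn get_with_ttl(&self) -> Option<(&T, Duration)> {
        let now = Instant::now();
        (now < self.expires_at).then(|| (&self.value, self.expires_at - now))
    }
}

fn main() {
    let entry = Entry::new("scram-secret", Duration::from_secs(1));
    if let Some((value, ttl)) = entry.get_with_ttl() {
        println!("value={value}, remaining ttl={ttl:?}");
    }
}
```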
account2ep: ClashMap>, - config: ProjectInfoCacheOptions, - start_time: Instant, - ttl_disabled_since_us: AtomicU64, - active_listeners_lock: Mutex, + config: ProjectInfoCacheOptions, } #[async_trait] @@ -152,29 +149,6 @@ impl ProjectInfoCache for ProjectInfoCacheImpl { } } } - - async fn decrement_active_listeners(&self) { - let mut listeners_guard = self.active_listeners_lock.lock().await; - if *listeners_guard == 0 { - tracing::error!("active_listeners count is already 0, something is broken"); - return; - } - *listeners_guard -= 1; - if *listeners_guard == 0 { - self.ttl_disabled_since_us - .store(u64::MAX, std::sync::atomic::Ordering::SeqCst); - } - } - - async fn increment_active_listeners(&self) { - let mut listeners_guard = self.active_listeners_lock.lock().await; - *listeners_guard += 1; - if *listeners_guard == 1 { - let new_ttl = (self.start_time.elapsed() + self.config.ttl).as_micros() as u64; - self.ttl_disabled_since_us - .store(new_ttl, std::sync::atomic::Ordering::SeqCst); - } - } } impl ProjectInfoCacheImpl { @@ -184,9 +158,6 @@ impl ProjectInfoCacheImpl { project2ep: ClashMap::new(), account2ep: ClashMap::new(), config, - ttl_disabled_since_us: AtomicU64::new(u64::MAX), - start_time: Instant::now(), - active_listeners_lock: Mutex::new(0), } } @@ -198,30 +169,28 @@ impl ProjectInfoCacheImpl { self.cache.get(&endpoint_id) } - pub(crate) fn get_role_secret( + pub(crate) fn get_role_secret_with_ttl( &self, endpoint_id: &EndpointId, role_name: &RoleName, - ) -> Option { - let valid_since = self.get_cache_times(); + ) -> Option<(ControlPlaneResult, Duration)> { let role_name = RoleNameInt::get(role_name)?; let endpoint_info = self.get_endpoint_cache(endpoint_id)?; - endpoint_info.get_role_secret(role_name, valid_since) + endpoint_info.get_role_secret_with_ttl(role_name) } - pub(crate) fn get_endpoint_access( + pub(crate) fn get_endpoint_access_with_ttl( &self, endpoint_id: &EndpointId, - ) -> Option { - let valid_since = self.get_cache_times(); + ) -> Option<(ControlPlaneResult, Duration)> { let endpoint_info = self.get_endpoint_cache(endpoint_id)?; - endpoint_info.get_controls(valid_since) + endpoint_info.get_controls_with_ttl() } pub(crate) fn insert_endpoint_access( &self, account_id: Option, - project_id: ProjectIdInt, + project_id: Option, endpoint_id: EndpointIdInt, role_name: RoleNameInt, controls: EndpointAccessControl, @@ -230,26 +199,89 @@ impl ProjectInfoCacheImpl { if let Some(account_id) = account_id { self.insert_account2endpoint(account_id, endpoint_id); } - self.insert_project2endpoint(project_id, endpoint_id); + if let Some(project_id) = project_id { + self.insert_project2endpoint(project_id, endpoint_id); + } if self.cache.len() >= self.config.size { // If there are too many entries, wait until the next gc cycle. 
return; } - let controls = Entry::from(controls); - let role_controls = Entry::from(role_controls); + debug!( + key = &*endpoint_id, + "created a cache entry for endpoint access" + ); + + let controls = Some(Entry::new(Ok(controls), self.config.ttl)); + let role_controls = Entry::new(Ok(role_controls), self.config.ttl); match self.cache.entry(endpoint_id) { clashmap::Entry::Vacant(e) => { e.insert(EndpointInfo { role_controls: HashMap::from_iter([(role_name, role_controls)]), - controls: Some(controls), + controls, }); } clashmap::Entry::Occupied(mut e) => { let ep = e.get_mut(); - ep.controls = Some(controls); + ep.controls = controls; + if ep.role_controls.len() < self.config.max_roles { + ep.role_controls.insert(role_name, role_controls); + } + } + } + } + + pub(crate) fn insert_endpoint_access_err( + &self, + endpoint_id: EndpointIdInt, + role_name: RoleNameInt, + msg: Box, + ttl: Option, + ) { + if self.cache.len() >= self.config.size { + // If there are too many entries, wait until the next gc cycle. + return; + } + + debug!( + key = &*endpoint_id, + "created a cache entry for an endpoint access error" + ); + + let ttl = ttl.unwrap_or(self.config.ttl); + + let controls = if msg.get_reason() == Reason::RoleProtected { + // RoleProtected is the only role-specific error that control plane can give us. + // If a given role name does not exist, it still returns a successful response, + // just with an empty secret. + None + } else { + // We can cache all the other errors in EndpointInfo.controls, + // because they don't depend on what role name we pass to control plane. + Some(Entry::new(Err(msg.clone()), ttl)) + }; + + let role_controls = Entry::new(Err(msg), ttl); + + match self.cache.entry(endpoint_id) { + clashmap::Entry::Vacant(e) => { + e.insert(EndpointInfo { + role_controls: HashMap::from_iter([(role_name, role_controls)]), + controls, + }); + } + clashmap::Entry::Occupied(mut e) => { + let ep = e.get_mut(); + if let Some(entry) = &ep.controls + && !entry.is_expired() + && entry.value.is_ok() + { + // If we have cached non-expired, non-error controls, keep them. + } else { + ep.controls = controls; + } if ep.role_controls.len() < self.config.max_roles { ep.role_controls.insert(role_name, role_controls); } @@ -275,27 +307,6 @@ impl ProjectInfoCacheImpl { } } - fn ignore_ttl_since(&self) -> Option { - let ttl_disabled_since_us = self - .ttl_disabled_since_us - .load(std::sync::atomic::Ordering::Relaxed); - - if ttl_disabled_since_us == u64::MAX { - return None; - } - - Some(self.start_time + Duration::from_micros(ttl_disabled_since_us)) - } - - fn get_cache_times(&self) -> Instant { - let mut valid_since = Instant::now() - self.config.ttl; - if let Some(ignore_ttl_since) = self.ignore_ttl_since() { - // We are fine if entry is not older than ttl or was added before we are getting notifications. - valid_since = valid_since.min(ignore_ttl_since); - } - valid_since - } - pub fn maybe_invalidate_role_secret(&self, endpoint_id: &EndpointId, role_name: &RoleName) { let Some(endpoint_id) = EndpointIdInt::get(endpoint_id) else { return; @@ -313,16 +324,7 @@ impl ProjectInfoCacheImpl { return; }; - let created_at = role_controls.get().created_at; - let expire = match self.ignore_ttl_since() { - // if ignoring TTL, we should still try and roll the password if it's old - // and we the client gave an incorrect password. There could be some lag on the redis channel. 
- Some(_) => created_at + self.config.ttl < Instant::now(), - // edge case: redis is down, let's be generous and invalidate the cache immediately. - None => true, - }; - - if expire { + if role_controls.get().is_expired() { role_controls.remove(); } } @@ -341,7 +343,7 @@ impl ProjectInfoCacheImpl { } fn gc(&self) { - let shard = thread_rng().gen_range(0..self.project2ep.shards().len()); + let shard = rand::rng().random_range(0..self.project2ep.shards().len()); debug!(shard, "project_info_cache: performing epoch reclamation"); // acquire a random shard lock @@ -361,13 +363,11 @@ impl ProjectInfoCacheImpl { #[cfg(test)] mod tests { - use std::sync::Arc; - use super::*; - use crate::control_plane::messages::EndpointRateLimitConfig; + use crate::control_plane::messages::{Details, EndpointRateLimitConfig, ErrorInfo, Status}; use crate::control_plane::{AccessBlockerFlags, AuthSecret}; use crate::scram::ServerSecret; - use crate::types::ProjectId; + use std::sync::Arc; #[tokio::test] async fn test_project_info_cache_settings() { @@ -378,9 +378,9 @@ mod tests { ttl: Duration::from_secs(1), gc_interval: Duration::from_secs(600), }); - let project_id: ProjectId = "project".into(); + let project_id: Option = Some(ProjectIdInt::from(&"project".into())); let endpoint_id: EndpointId = "endpoint".into(); - let account_id: Option = None; + let account_id = None; let user1: RoleName = "user1".into(); let user2: RoleName = "user2".into(); @@ -393,7 +393,7 @@ mod tests { cache.insert_endpoint_access( account_id, - (&project_id).into(), + project_id, (&endpoint_id).into(), (&user1).into(), EndpointAccessControl { @@ -409,7 +409,7 @@ mod tests { cache.insert_endpoint_access( account_id, - (&project_id).into(), + project_id, (&endpoint_id).into(), (&user2).into(), EndpointAccessControl { @@ -423,11 +423,17 @@ mod tests { }, ); - let cached = cache.get_role_secret(&endpoint_id, &user1).unwrap(); - assert_eq!(cached.secret, secret1); + let (cached, ttl) = cache + .get_role_secret_with_ttl(&endpoint_id, &user1) + .unwrap(); + assert_eq!(cached.unwrap().secret, secret1); + assert_eq!(ttl, cache.config.ttl); - let cached = cache.get_role_secret(&endpoint_id, &user2).unwrap(); - assert_eq!(cached.secret, secret2); + let (cached, ttl) = cache + .get_role_secret_with_ttl(&endpoint_id, &user2) + .unwrap(); + assert_eq!(cached.unwrap().secret, secret2); + assert_eq!(ttl, cache.config.ttl); // Shouldn't add more than 2 roles. 
let user3: RoleName = "user3".into(); @@ -435,7 +441,7 @@ mod tests { cache.insert_endpoint_access( account_id, - (&project_id).into(), + project_id, (&endpoint_id).into(), (&user3).into(), EndpointAccessControl { @@ -449,17 +455,144 @@ mod tests { }, ); - assert!(cache.get_role_secret(&endpoint_id, &user3).is_none()); + assert!( + cache + .get_role_secret_with_ttl(&endpoint_id, &user3) + .is_none() + ); - let cached = cache.get_endpoint_access(&endpoint_id).unwrap(); + let cached = cache + .get_endpoint_access_with_ttl(&endpoint_id) + .unwrap() + .0 + .unwrap(); assert_eq!(cached.allowed_ips, allowed_ips); tokio::time::advance(Duration::from_secs(2)).await; - let cached = cache.get_role_secret(&endpoint_id, &user1); + let cached = cache.get_role_secret_with_ttl(&endpoint_id, &user1); assert!(cached.is_none()); - let cached = cache.get_role_secret(&endpoint_id, &user2); + let cached = cache.get_role_secret_with_ttl(&endpoint_id, &user2); assert!(cached.is_none()); - let cached = cache.get_endpoint_access(&endpoint_id); + let cached = cache.get_endpoint_access_with_ttl(&endpoint_id); assert!(cached.is_none()); } + + #[tokio::test] + async fn test_caching_project_info_errors() { + let cache = ProjectInfoCacheImpl::new(ProjectInfoCacheOptions { + size: 10, + max_roles: 10, + ttl: Duration::from_secs(1), + gc_interval: Duration::from_secs(600), + }); + let project_id = Some(ProjectIdInt::from(&"project".into())); + let endpoint_id: EndpointId = "endpoint".into(); + let account_id = None; + + let user1: RoleName = "user1".into(); + let user2: RoleName = "user2".into(); + let secret = Some(AuthSecret::Scram(ServerSecret::mock([1; 32]))); + + let role_msg = Box::new(ControlPlaneErrorMessage { + error: "role is protected and cannot be used for password-based authentication" + .to_owned() + .into_boxed_str(), + http_status_code: http::StatusCode::NOT_FOUND, + status: Some(Status { + code: "PERMISSION_DENIED".to_owned().into_boxed_str(), + message: "role is protected and cannot be used for password-based authentication" + .to_owned() + .into_boxed_str(), + details: Details { + error_info: Some(ErrorInfo { + reason: Reason::RoleProtected, + }), + retry_info: None, + user_facing_message: None, + }, + }), + }); + + let generic_msg = Box::new(ControlPlaneErrorMessage { + error: "oh noes".to_owned().into_boxed_str(), + http_status_code: http::StatusCode::NOT_FOUND, + status: None, + }); + + let get_role_secret = |endpoint_id, role_name| { + cache + .get_role_secret_with_ttl(endpoint_id, role_name) + .unwrap() + .0 + }; + let get_endpoint_access = + |endpoint_id| cache.get_endpoint_access_with_ttl(endpoint_id).unwrap().0; + + // stores role-specific errors only for get_role_secret + cache.insert_endpoint_access_err( + (&endpoint_id).into(), + (&user1).into(), + role_msg.clone(), + None, + ); + assert_eq!( + get_role_secret(&endpoint_id, &user1).unwrap_err().error, + role_msg.error + ); + assert!(cache.get_endpoint_access_with_ttl(&endpoint_id).is_none()); + + // stores non-role specific errors for both get_role_secret and get_endpoint_access + cache.insert_endpoint_access_err( + (&endpoint_id).into(), + (&user1).into(), + generic_msg.clone(), + None, + ); + assert_eq!( + get_role_secret(&endpoint_id, &user1).unwrap_err().error, + generic_msg.error + ); + assert_eq!( + get_endpoint_access(&endpoint_id).unwrap_err().error, + generic_msg.error + ); + + // error isn't returned for other roles in the same endpoint + assert!( + cache + .get_role_secret_with_ttl(&endpoint_id, &user2) + .is_none() + ); + + // 
success for a role does not overwrite errors for other roles + cache.insert_endpoint_access( + account_id, + project_id, + (&endpoint_id).into(), + (&user2).into(), + EndpointAccessControl { + allowed_ips: Arc::new(vec![]), + allowed_vpce: Arc::new(vec![]), + flags: AccessBlockerFlags::default(), + rate_limits: EndpointRateLimitConfig::default(), + }, + RoleAccessControl { + secret: secret.clone(), + }, + ); + assert!(get_role_secret(&endpoint_id, &user1).is_err()); + assert!(get_role_secret(&endpoint_id, &user2).is_ok()); + // ...but does clear the access control error + assert!(get_endpoint_access(&endpoint_id).is_ok()); + + // storing an error does not overwrite successful access control response + cache.insert_endpoint_access_err( + (&endpoint_id).into(), + (&user2).into(), + generic_msg.clone(), + None, + ); + assert!(get_role_secret(&endpoint_id, &user2).is_err()); + assert!(get_endpoint_access(&endpoint_id).is_ok()); + } } diff --git a/proxy/src/cache/timed_lru.rs b/proxy/src/cache/timed_lru.rs index e87cf53ab9..0a7fb40b0c 100644 --- a/proxy/src/cache/timed_lru.rs +++ b/proxy/src/cache/timed_lru.rs @@ -204,6 +204,11 @@ impl TimedLru { self.insert_raw_ttl(key, value, ttl, false); } + #[cfg(feature = "rest_broker")] + pub(crate) fn insert(&self, key: K, value: V) { + self.insert_raw_ttl(key, value, self.ttl, self.update_ttl_on_retrieval); + } + pub(crate) fn insert_unit(&self, key: K, value: V) -> (Option, Cached<&Self, ()>) { let (_, old) = self.insert_raw(key.clone(), value); @@ -214,6 +219,29 @@ impl TimedLru { (old, cached) } + + #[cfg(feature = "rest_broker")] + pub(crate) fn flush(&self) { + let now = Instant::now(); + let mut cache = self.cache.lock(); + + // Collect keys of expired entries first + let expired_keys: Vec<_> = cache + .iter() + .filter_map(|(key, entry)| { + if entry.expires_at <= now { + Some(key.clone()) + } else { + None + } + }) + .collect(); + + // Remove expired entries + for key in expired_keys { + cache.remove(&key); + } + } } impl TimedLru { diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index 77062d3bb4..f25121331f 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -32,8 +32,11 @@ use crate::util::run_until; type IpSubnetKey = IpNet; -const CANCEL_KEY_TTL: Duration = Duration::from_secs(600); -const CANCEL_KEY_REFRESH: Duration = Duration::from_secs(570); +/// Initial period and TTL is shorter to clear keys of short-lived connections faster. +const CANCEL_KEY_INITIAL_PERIOD: Duration = Duration::from_secs(60); +const CANCEL_KEY_REFRESH_PERIOD: Duration = Duration::from_secs(10 * 60); +/// `CANCEL_KEY_TTL_SLACK` is added to the periods to determine the actual TTL. +const CANCEL_KEY_TTL_SLACK: Duration = Duration::from_secs(30); // Message types for sending through mpsc channel pub enum CancelKeyOp { @@ -54,6 +57,24 @@ pub enum CancelKeyOp { }, } +impl CancelKeyOp { + const fn redis_msg_kind(&self) -> RedisMsgKind { + match self { + CancelKeyOp::Store { .. } => RedisMsgKind::Set, + CancelKeyOp::Refresh { .. } => RedisMsgKind::Expire, + CancelKeyOp::Get { .. } => RedisMsgKind::Get, + CancelKeyOp::GetOld { .. 
} => RedisMsgKind::HGet, + } + } + + fn cancel_channel_metric_guard(&self) -> CancelChannelSizeGuard<'static> { + Metrics::get() + .proxy + .cancel_channel_size + .guard(self.redis_msg_kind()) + } +} + #[derive(thiserror::Error, Debug, Clone)] pub enum PipelineError { #[error("could not send cmd to redis: {0}")] @@ -483,50 +504,49 @@ impl Session { let mut cancel = pin!(cancel); enum State { - Set, + Init, Refresh, } - let mut state = State::Set; + let mut state = State::Init; loop { - let guard_op = match state { - State::Set => { - let guard = Metrics::get() - .proxy - .cancel_channel_size - .guard(RedisMsgKind::Set); - let op = CancelKeyOp::Store { - key: self.key, - value: closure_json.clone(), - expire: CANCEL_KEY_TTL, - }; + let (op, mut wait_interval) = match state { + State::Init => { tracing::debug!( src=%self.key, dest=?cancel_closure.cancel_token, "registering cancellation key" ); - (guard, op) + ( + CancelKeyOp::Store { + key: self.key, + value: closure_json.clone(), + expire: CANCEL_KEY_INITIAL_PERIOD + CANCEL_KEY_TTL_SLACK, + }, + CANCEL_KEY_INITIAL_PERIOD, + ) } State::Refresh => { - let guard = Metrics::get() - .proxy - .cancel_channel_size - .guard(RedisMsgKind::Expire); - let op = CancelKeyOp::Refresh { - key: self.key, - expire: CANCEL_KEY_TTL, - }; tracing::debug!( src=%self.key, dest=?cancel_closure.cancel_token, "refreshing cancellation key" ); - (guard, op) + ( + CancelKeyOp::Refresh { + key: self.key, + expire: CANCEL_KEY_REFRESH_PERIOD + CANCEL_KEY_TTL_SLACK, + }, + CANCEL_KEY_REFRESH_PERIOD, + ) } }; - match tx.call(guard_op, cancel.as_mut()).await { + match tx + .call((op.cancel_channel_metric_guard(), op), cancel.as_mut()) + .await + { // SET returns OK Ok(Value::Okay) => { tracing::debug!( @@ -549,23 +569,23 @@ impl Session { Ok(_) => { // Any other response likely means the key expired. tracing::warn!(src=%self.key, "refreshing cancellation key failed"); - // Re-enter the SET loop to repush full data. - state = State::Set; + // Re-enter the SET loop quickly to repush full data. + state = State::Init; + wait_interval = Duration::ZERO; } // retry immediately. Err(BatchQueueError::Result(error)) => { tracing::warn!(?error, "error refreshing cancellation key"); // Small delay to prevent busy loop with high cpu and logging. - tokio::time::sleep(Duration::from_millis(10)).await; - continue; + wait_interval = Duration::from_millis(10); } Err(BatchQueueError::Cancelled(Err(_cancelled))) => break, } // wait before continuing. break immediately if cancelled. 
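As a worked example (editor's sketch, not part of the patch) of the schedule the constants above encode: the first `SET` uses a short TTL so keys of short-lived connections disappear from Redis quickly, later `EXPIRE`s stretch the TTL back out, and the 30s slack keeps the key alive between refresh ticks.

```rust
use std::time::Duration;

// Same values as in the diff; the function below is only an illustration.
const CANCEL_KEY_INITIAL_PERIOD: Duration = Duration::from_secs(60);
const CANCEL_KEY_REFRESH_PERIOD: Duration = Duration::from_secs(10 * 60);
const CANCEL_KEY_TTL_SLACK: Duration = Duration::from_secs(30);

fn main() {
    // First SET: the key lives 90s, and the session refreshes it after 60s.
    let initial_ttl = CANCEL_KEY_INITIAL_PERIOD + CANCEL_KEY_TTL_SLACK;
    // Later EXPIREs: the key lives 630s, refreshed every 600s.
    let refresh_ttl = CANCEL_KEY_REFRESH_PERIOD + CANCEL_KEY_TTL_SLACK;
    println!("initial ttl = {initial_ttl:?}, refresh ttl = {refresh_ttl:?}");
    // A connection that closes within its first minute now falls out of Redis
    // after at most 90s instead of the previous flat 600s TTL.
}
```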
- if run_until(tokio::time::sleep(CANCEL_KEY_REFRESH), cancel.as_mut()) + if run_until(tokio::time::sleep(wait_interval), cancel.as_mut()) .await .is_err() { diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 6157dc8a6a..16b1dff5f4 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -22,6 +22,8 @@ use crate::rate_limiter::{RateLimitAlgorithm, RateLimiterConfig}; use crate::scram::threadpool::ThreadPool; use crate::serverless::GlobalConnPoolOptions; use crate::serverless::cancel_set::CancelSet; +#[cfg(feature = "rest_broker")] +use crate::serverless::rest::DbSchemaCache; pub use crate::tls::server_config::{TlsConfig, configure_tls}; use crate::types::{Host, RoleName}; @@ -30,11 +32,14 @@ pub struct ProxyConfig { pub metric_collection: Option, pub http_config: HttpConfig, pub authentication_config: AuthenticationConfig, + #[cfg(feature = "rest_broker")] + pub rest_config: RestConfig, pub proxy_protocol_v2: ProxyProtocolV2, pub handshake_timeout: Duration, pub wake_compute_retry_config: RetryConfig, pub connect_compute_locks: ApiLocks, pub connect_to_compute: ComputeConfig, + pub greetings: String, // Greeting message sent to the client after connection establishment and contains session_id. #[cfg(feature = "testing")] pub disable_pg_session_jwt: bool, } @@ -80,6 +85,14 @@ pub struct AuthenticationConfig { pub console_redirect_confirmation_timeout: tokio::time::Duration, } +#[cfg(feature = "rest_broker")] +pub struct RestConfig { + pub is_rest_broker: bool, + pub db_schema_cache: Option, + pub max_schema_size: usize, + pub hostname_prefix: String, +} + #[derive(Debug)] pub struct MetricBackupCollectionConfig { pub remote_storage_config: Option, diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index 041a56e032..014317d823 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -233,7 +233,13 @@ pub(crate) async fn handle_client( let session = cancellation_handler.get_key(); - finish_client_init(&pg_settings, *session.key(), &mut stream); + finish_client_init( + ctx, + &pg_settings, + *session.key(), + &mut stream, + &config.greetings, + ); let stream = stream.flush_and_into_inner().await?; let session_id = ctx.session_id(); diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index 4d8df19476..715b818b98 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -523,29 +523,29 @@ mod tests { fn generate_request_data(rng: &mut impl Rng) -> RequestData { RequestData { - session_id: uuid::Builder::from_random_bytes(rng.r#gen()).into_uuid(), - peer_addr: Ipv4Addr::from(rng.r#gen::<[u8; 4]>()).to_string(), + session_id: uuid::Builder::from_random_bytes(rng.random()).into_uuid(), + peer_addr: Ipv4Addr::from(rng.random::<[u8; 4]>()).to_string(), timestamp: chrono::DateTime::from_timestamp_millis( - rng.gen_range(1703862754..1803862754), + rng.random_range(1703862754..1803862754), ) .unwrap() .naive_utc(), application_name: Some("test".to_owned()), user_agent: Some("test-user-agent".to_owned()), - username: Some(hex::encode(rng.r#gen::<[u8; 4]>())), - endpoint_id: Some(hex::encode(rng.r#gen::<[u8; 16]>())), - database: Some(hex::encode(rng.r#gen::<[u8; 16]>())), - project: Some(hex::encode(rng.r#gen::<[u8; 16]>())), - branch: Some(hex::encode(rng.r#gen::<[u8; 16]>())), + username: Some(hex::encode(rng.random::<[u8; 4]>())), + endpoint_id: Some(hex::encode(rng.random::<[u8; 16]>())), + database: Some(hex::encode(rng.random::<[u8; 16]>())), + project: 
Some(hex::encode(rng.random::<[u8; 16]>())), + branch: Some(hex::encode(rng.random::<[u8; 16]>())), pg_options: None, auth_method: None, jwt_issuer: None, - protocol: ["tcp", "ws", "http"][rng.gen_range(0..3)], + protocol: ["tcp", "ws", "http"][rng.random_range(0..3)], region: String::new(), error: None, - success: rng.r#gen(), + success: rng.random(), cold_start_info: "no", - duration_us: rng.gen_range(0..30_000_000), + duration_us: rng.random_range(0..30_000_000), disconnect_timestamp: None, } } @@ -622,15 +622,15 @@ mod tests { assert_eq!( file_stats, [ - (1313953, 3, 6000), - (1313942, 3, 6000), - (1314001, 3, 6000), - (1313958, 3, 6000), - (1314094, 3, 6000), - (1313931, 3, 6000), - (1313725, 3, 6000), - (1313960, 3, 6000), - (438318, 1, 2000) + (1313878, 3, 6000), + (1313891, 3, 6000), + (1314058, 3, 6000), + (1313914, 3, 6000), + (1313760, 3, 6000), + (1314084, 3, 6000), + (1313965, 3, 6000), + (1313911, 3, 6000), + (438290, 1, 2000) ] ); @@ -662,11 +662,11 @@ mod tests { assert_eq!( file_stats, [ - (1205810, 5, 10000), - (1205534, 5, 10000), - (1205835, 5, 10000), - (1205820, 5, 10000), - (1206074, 5, 10000) + (1206039, 5, 10000), + (1205798, 5, 10000), + (1205776, 5, 10000), + (1206051, 5, 10000), + (1205746, 5, 10000) ] ); @@ -691,15 +691,15 @@ mod tests { assert_eq!( file_stats, [ - (1313953, 3, 6000), - (1313942, 3, 6000), - (1314001, 3, 6000), - (1313958, 3, 6000), - (1314094, 3, 6000), - (1313931, 3, 6000), - (1313725, 3, 6000), - (1313960, 3, 6000), - (438318, 1, 2000) + (1313878, 3, 6000), + (1313891, 3, 6000), + (1314058, 3, 6000), + (1313914, 3, 6000), + (1313760, 3, 6000), + (1314084, 3, 6000), + (1313965, 3, 6000), + (1313911, 3, 6000), + (438290, 1, 2000) ] ); @@ -736,7 +736,7 @@ mod tests { // files are smaller than the size threshold, but they took too long to fill so were flushed early assert_eq!( file_stats, - [(658584, 2, 3001), (658298, 2, 3000), (658094, 2, 2999)] + [(658552, 2, 3001), (658265, 2, 3000), (658061, 2, 2999)] ); tmpdir.close().unwrap(); diff --git a/proxy/src/control_plane/client/cplane_proxy_v1.rs b/proxy/src/control_plane/client/cplane_proxy_v1.rs index bb785b8b0c..8a0403c0b0 100644 --- a/proxy/src/control_plane/client/cplane_proxy_v1.rs +++ b/proxy/src/control_plane/client/cplane_proxy_v1.rs @@ -68,6 +68,66 @@ impl NeonControlPlaneClient { self.endpoint.url().as_str() } + async fn get_and_cache_auth_info( + &self, + ctx: &RequestContext, + endpoint: &EndpointId, + role: &RoleName, + cache_key: &EndpointId, + extract: impl FnOnce(&EndpointAccessControl, &RoleAccessControl) -> T, + ) -> Result { + match self.do_get_auth_req(ctx, endpoint, role).await { + Ok(auth_info) => { + let control = EndpointAccessControl { + allowed_ips: Arc::new(auth_info.allowed_ips), + allowed_vpce: Arc::new(auth_info.allowed_vpc_endpoint_ids), + flags: auth_info.access_blocker_flags, + rate_limits: auth_info.rate_limits, + }; + let role_control = RoleAccessControl { + secret: auth_info.secret, + }; + let res = extract(&control, &role_control); + + self.caches.project_info.insert_endpoint_access( + auth_info.account_id, + auth_info.project_id, + cache_key.into(), + role.into(), + control, + role_control, + ); + + if let Some(project_id) = auth_info.project_id { + ctx.set_project_id(project_id); + } + + Ok(res) + } + Err(err) => match err { + GetAuthInfoError::ApiError(ControlPlaneError::Message(ref msg)) => { + let retry_info = msg.status.as_ref().and_then(|s| s.details.retry_info); + + // If we can retry this error, do not cache it, + // unless we were given a retry 
delay. + if msg.could_retry() && retry_info.is_none() { + return Err(err); + } + + self.caches.project_info.insert_endpoint_access_err( + cache_key.into(), + role.into(), + msg.clone(), + retry_info.map(|r| Duration::from_millis(r.retry_delay_ms)), + ); + + Err(err) + } + err => Err(err), + }, + } + } + async fn do_get_auth_req( &self, ctx: &RequestContext, @@ -284,43 +344,34 @@ impl super::ControlPlaneApi for NeonControlPlaneClient { ctx: &RequestContext, endpoint: &EndpointId, role: &RoleName, - ) -> Result { - let normalized_ep = &endpoint.normalize(); - if let Some(secret) = self + ) -> Result { + let key = endpoint.normalize(); + + if let Some((role_control, ttl)) = self .caches .project_info - .get_role_secret(normalized_ep, role) + .get_role_secret_with_ttl(&key, role) { - return Ok(secret); + return match role_control { + Err(mut msg) => { + info!(key = &*key, "found cached get_role_access_control error"); + + // if retry_delay_ms is set change it to the remaining TTL + replace_retry_delay_ms(&mut msg, |_| ttl.as_millis() as u64); + + Err(GetAuthInfoError::ApiError(ControlPlaneError::Message(msg))) + } + Ok(role_control) => { + debug!(key = &*key, "found cached role access control"); + Ok(role_control) + } + }; } - let auth_info = self.do_get_auth_req(ctx, endpoint, role).await?; - - let control = EndpointAccessControl { - allowed_ips: Arc::new(auth_info.allowed_ips), - allowed_vpce: Arc::new(auth_info.allowed_vpc_endpoint_ids), - flags: auth_info.access_blocker_flags, - rate_limits: auth_info.rate_limits, - }; - let role_control = RoleAccessControl { - secret: auth_info.secret, - }; - - if let Some(project_id) = auth_info.project_id { - let normalized_ep_int = normalized_ep.into(); - - self.caches.project_info.insert_endpoint_access( - auth_info.account_id, - project_id, - normalized_ep_int, - role.into(), - control, - role_control.clone(), - ); - ctx.set_project_id(project_id); - } - - Ok(role_control) + self.get_and_cache_auth_info(ctx, endpoint, role, &key, |_, role_control| { + role_control.clone() + }) + .await } #[tracing::instrument(skip_all)] @@ -330,38 +381,30 @@ impl super::ControlPlaneApi for NeonControlPlaneClient { endpoint: &EndpointId, role: &RoleName, ) -> Result { - let normalized_ep = &endpoint.normalize(); - if let Some(control) = self.caches.project_info.get_endpoint_access(normalized_ep) { - return Ok(control); + let key = endpoint.normalize(); + + if let Some((control, ttl)) = self.caches.project_info.get_endpoint_access_with_ttl(&key) { + return match control { + Err(mut msg) => { + info!( + key = &*key, + "found cached get_endpoint_access_control error" + ); + + // if retry_delay_ms is set change it to the remaining TTL + replace_retry_delay_ms(&mut msg, |_| ttl.as_millis() as u64); + + Err(GetAuthInfoError::ApiError(ControlPlaneError::Message(msg))) + } + Ok(control) => { + debug!(key = &*key, "found cached endpoint access control"); + Ok(control) + } + }; } - let auth_info = self.do_get_auth_req(ctx, endpoint, role).await?; - - let control = EndpointAccessControl { - allowed_ips: Arc::new(auth_info.allowed_ips), - allowed_vpce: Arc::new(auth_info.allowed_vpc_endpoint_ids), - flags: auth_info.access_blocker_flags, - rate_limits: auth_info.rate_limits, - }; - let role_control = RoleAccessControl { - secret: auth_info.secret, - }; - - if let Some(project_id) = auth_info.project_id { - let normalized_ep_int = normalized_ep.into(); - - self.caches.project_info.insert_endpoint_access( - auth_info.account_id, - project_id, - normalized_ep_int, - role.into(), - 
control.clone(), - role_control, - ); - ctx.set_project_id(project_id); - } - - Ok(control) + self.get_and_cache_auth_info(ctx, endpoint, role, &key, |control, _| control.clone()) + .await } #[tracing::instrument(skip_all)] @@ -390,13 +433,9 @@ impl super::ControlPlaneApi for NeonControlPlaneClient { info!(key = &*key, "found cached wake_compute error"); // if retry_delay_ms is set, reduce it by the amount of time it spent in cache - if let Some(status) = &mut msg.status { - if let Some(retry_info) = &mut status.details.retry_info { - retry_info.retry_delay_ms = retry_info - .retry_delay_ms - .saturating_sub(created_at.elapsed().as_millis() as u64) - } - } + replace_retry_delay_ms(&mut msg, |delay| { + delay.saturating_sub(created_at.elapsed().as_millis() as u64) + }); Err(WakeComputeError::ControlPlane(ControlPlaneError::Message( msg, @@ -478,6 +517,14 @@ impl super::ControlPlaneApi for NeonControlPlaneClient { } } +fn replace_retry_delay_ms(msg: &mut ControlPlaneErrorMessage, f: impl FnOnce(u64) -> u64) { + if let Some(status) = &mut msg.status + && let Some(retry_info) = &mut status.details.retry_info + { + retry_info.retry_delay_ms = f(retry_info.retry_delay_ms); + } +} + /// Parse http response body, taking status code into account. fn parse_body serde::Deserialize<'a>>( status: StatusCode, diff --git a/proxy/src/control_plane/errors.rs b/proxy/src/control_plane/errors.rs index 12843e48c7..1e43010957 100644 --- a/proxy/src/control_plane/errors.rs +++ b/proxy/src/control_plane/errors.rs @@ -52,7 +52,7 @@ impl ReportableError for ControlPlaneError { | Reason::EndpointNotFound | Reason::EndpointDisabled | Reason::BranchNotFound - | Reason::InvalidEphemeralEndpointOptions => ErrorKind::User, + | Reason::WrongLsnOrTimestamp => ErrorKind::User, Reason::RateLimitExceeded => ErrorKind::ServiceRateLimit, diff --git a/proxy/src/control_plane/messages.rs b/proxy/src/control_plane/messages.rs index cf193ed268..d44d7efcc3 100644 --- a/proxy/src/control_plane/messages.rs +++ b/proxy/src/control_plane/messages.rs @@ -107,7 +107,7 @@ pub(crate) struct ErrorInfo { // Schema could also have `metadata` field, but it's not structured. Skip it for now. } -#[derive(Clone, Copy, Debug, Deserialize, Default)] +#[derive(Clone, Copy, Debug, Deserialize, Default, PartialEq, Eq)] pub(crate) enum Reason { /// RoleProtected indicates that the role is protected and the attempted operation is not permitted on protected roles. #[serde(rename = "ROLE_PROTECTED")] @@ -133,9 +133,9 @@ pub(crate) enum Reason { /// or that the subject doesn't have enough permissions to access the requested branch. #[serde(rename = "BRANCH_NOT_FOUND")] BranchNotFound, - /// InvalidEphemeralEndpointOptions indicates that the specified LSN or timestamp are wrong. - #[serde(rename = "INVALID_EPHEMERAL_OPTIONS")] - InvalidEphemeralEndpointOptions, + /// WrongLsnOrTimestamp indicates that the specified LSN or timestamp are wrong. + #[serde(rename = "WRONG_LSN_OR_TIMESTAMP")] + WrongLsnOrTimestamp, /// RateLimitExceeded indicates that the rate limit for the operation has been exceeded. 
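For reference, the rename above changes both the Rust identifier and the wire value the control plane is expected to send. A hedged, standalone sketch (not the real `Reason` enum, which has many more variants) of how the new `WRONG_LSN_OR_TIMESTAMP` string maps onto the variant:

```rust
use serde::Deserialize;

#[derive(Debug, Deserialize, PartialEq, Eq)]
enum Reason {
    #[serde(rename = "WRONG_LSN_OR_TIMESTAMP")]
    WrongLsnOrTimestamp,
}

fn main() {
    let r: Reason = serde_json::from_str("\"WRONG_LSN_OR_TIMESTAMP\"").unwrap();
    assert_eq!(r, Reason::WrongLsnOrTimestamp);
}
```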
#[serde(rename = "RATE_LIMIT_EXCEEDED")] RateLimitExceeded, @@ -205,7 +205,7 @@ impl Reason { | Reason::EndpointNotFound | Reason::EndpointDisabled | Reason::BranchNotFound - | Reason::InvalidEphemeralEndpointOptions => false, + | Reason::WrongLsnOrTimestamp => false, // we were asked to go away Reason::RateLimitExceeded | Reason::NonDefaultBranchComputeTimeExceeded @@ -257,19 +257,19 @@ pub(crate) struct GetEndpointAccessControl { pub(crate) rate_limits: EndpointRateLimitConfig, } -#[derive(Copy, Clone, Deserialize, Default)] +#[derive(Copy, Clone, Deserialize, Default, Debug)] pub struct EndpointRateLimitConfig { pub connection_attempts: ConnectionAttemptsLimit, } -#[derive(Copy, Clone, Deserialize, Default)] +#[derive(Copy, Clone, Deserialize, Default, Debug)] pub struct ConnectionAttemptsLimit { pub tcp: Option, pub ws: Option, pub http: Option, } -#[derive(Copy, Clone, Deserialize)] +#[derive(Copy, Clone, Deserialize, Debug)] pub struct LeakyBucketSetting { pub rps: f64, pub burst: f64, diff --git a/proxy/src/control_plane/mod.rs b/proxy/src/control_plane/mod.rs index a8c59dad0c..9bbd3f4fb7 100644 --- a/proxy/src/control_plane/mod.rs +++ b/proxy/src/control_plane/mod.rs @@ -82,7 +82,7 @@ impl NodeInfo { } } -#[derive(Copy, Clone, Default)] +#[derive(Copy, Clone, Default, Debug)] pub(crate) struct AccessBlockerFlags { pub public_access_blocked: bool, pub vpc_access_blocked: bool, @@ -92,12 +92,12 @@ pub(crate) type NodeInfoCache = TimedLru>>; pub(crate) type CachedNodeInfo = Cached<&'static NodeInfoCache, NodeInfo>; -#[derive(Clone)] +#[derive(Clone, Debug)] pub struct RoleAccessControl { pub secret: Option, } -#[derive(Clone)] +#[derive(Clone, Debug)] pub struct EndpointAccessControl { pub allowed_ips: Arc>, pub allowed_vpce: Arc>, diff --git a/proxy/src/intern.rs b/proxy/src/intern.rs index d7e39ebaf4..825f2d1049 100644 --- a/proxy/src/intern.rs +++ b/proxy/src/intern.rs @@ -247,7 +247,7 @@ mod tests { use rand::{Rng, SeedableRng}; use rand_distr::Zipf; - let endpoint_dist = Zipf::new(500000, 0.8).unwrap(); + let endpoint_dist = Zipf::new(500000.0, 0.8).unwrap(); let endpoints = StdRng::seed_from_u64(272488357).sample_iter(endpoint_dist); let interner = MyId::get_interner(); diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs index e608300bd2..0abb500608 100644 --- a/proxy/src/logging.rs +++ b/proxy/src/logging.rs @@ -1,12 +1,10 @@ use std::cell::RefCell; use std::collections::HashMap; use std::sync::Arc; -use std::sync::atomic::{AtomicU32, Ordering}; use std::{env, io}; use chrono::{DateTime, Utc}; use opentelemetry::trace::TraceContextExt; -use serde::ser::{SerializeMap, Serializer}; use tracing::subscriber::Interest; use tracing::{Event, Metadata, Span, Subscriber, callsite, span}; use tracing_opentelemetry::OpenTelemetrySpanExt; @@ -16,7 +14,9 @@ use tracing_subscriber::fmt::time::SystemTime; use tracing_subscriber::fmt::{FormatEvent, FormatFields}; use tracing_subscriber::layer::{Context, Layer}; use tracing_subscriber::prelude::*; -use tracing_subscriber::registry::{LookupSpan, SpanRef}; +use tracing_subscriber::registry::LookupSpan; + +use crate::metrics::Metrics; /// Initialize logging and OpenTelemetry tracing and exporter. /// @@ -26,7 +26,7 @@ use tracing_subscriber::registry::{LookupSpan, SpanRef}; /// configuration from environment variables. For example, to change the /// destination, set `OTEL_EXPORTER_OTLP_ENDPOINT=http://jaeger:4318`. 
/// See -pub async fn init() -> anyhow::Result { +pub fn init() -> anyhow::Result { let logfmt = LogFormat::from_env()?; let env_filter = EnvFilter::builder() @@ -43,8 +43,8 @@ pub async fn init() -> anyhow::Result { .expect("this should be a valid filter directive"), ); - let otlp_layer = - tracing_utils::init_tracing("proxy", tracing_utils::ExportConfig::default()).await; + let provider = tracing_utils::init_tracing("proxy", tracing_utils::ExportConfig::default()); + let otlp_layer = provider.as_ref().map(tracing_utils::layer); let json_log_layer = if logfmt == LogFormat::Json { Some(JsonLoggingLayer::new( @@ -76,7 +76,7 @@ pub async fn init() -> anyhow::Result { .with(text_log_layer) .try_init()?; - Ok(LoggingGuard) + Ok(LoggingGuard(provider)) } /// Initialize logging for local_proxy with log prefix and no opentelemetry. @@ -97,7 +97,7 @@ pub fn init_local_proxy() -> anyhow::Result { .with(fmt_layer) .try_init()?; - Ok(LoggingGuard) + Ok(LoggingGuard(None)) } pub struct LocalProxyFormatter(Format); @@ -118,14 +118,16 @@ where } } -pub struct LoggingGuard; +pub struct LoggingGuard(Option); impl Drop for LoggingGuard { fn drop(&mut self) { - // Shutdown trace pipeline gracefully, so that it has a chance to send any - // pending traces before we exit. - tracing::info!("shutting down the tracing machinery"); - tracing_utils::shutdown_tracing(); + if let Some(p) = &self.0 { + // Shutdown trace pipeline gracefully, so that it has a chance to send any + // pending traces before we exit. + tracing::info!("shutting down the tracing machinery"); + drop(p.shutdown()); + } } } @@ -210,6 +212,9 @@ struct JsonLoggingLayer { /// tracks which fields of each **event** are duplicates skipped_field_indices: CallsiteMap, + /// tracks callsite names to an ID. + callsite_name_ids: papaya::HashMap<&'static str, u32, ahash::RandomState>, + span_info: CallsiteMap, /// Fields we want to keep track of in a separate json object. @@ -222,6 +227,7 @@ impl JsonLoggingLayer { clock, skipped_field_indices: CallsiteMap::default(), span_info: CallsiteMap::default(), + callsite_name_ids: papaya::HashMap::default(), writer, extract_fields, } @@ -232,7 +238,7 @@ impl JsonLoggingLayer { self.span_info .pin() .get_or_insert_with(metadata.callsite(), || { - CallsiteSpanInfo::new(metadata, self.extract_fields) + CallsiteSpanInfo::new(&self.callsite_name_ids, metadata, self.extract_fields) }) .clone() } @@ -249,7 +255,7 @@ where // early, before OTel machinery, and add as event extension. let now = self.clock.now(); - let res: io::Result<()> = EVENT_FORMATTER.with(|f| { + EVENT_FORMATTER.with(|f| { let mut borrow = f.try_borrow_mut(); let formatter = match borrow.as_deref_mut() { Ok(formatter) => formatter, @@ -259,31 +265,19 @@ where Err(_) => &mut EventFormatter::new(), }; - formatter.reset(); formatter.format( now, event, &ctx, &self.skipped_field_indices, self.extract_fields, - )?; - self.writer.make_writer().write_all(formatter.buffer()) - }); + ); - // In case logging fails we generate a simpler JSON object. 
- if let Err(err) = res - && let Ok(mut line) = serde_json::to_vec(&serde_json::json!( { - "timestamp": now.to_rfc3339_opts(chrono::SecondsFormat::Micros, true), - "level": "ERROR", - "message": format_args!("cannot log event: {err:?}"), - "fields": { - "event": format_args!("{event:?}"), - }, - })) - { - line.push(b'\n'); - self.writer.make_writer().write_all(&line).ok(); - } + let mut writer = self.writer.make_writer(); + if writer.write_all(formatter.buffer()).is_err() { + Metrics::get().proxy.logging_errors_count.inc(); + } + }); } /// Registers a SpanFields instance as span extension. @@ -356,10 +350,11 @@ struct CallsiteSpanInfo { } impl CallsiteSpanInfo { - fn new(metadata: &'static Metadata<'static>, extract_fields: &[&'static str]) -> Self { - // Start at 1 to reserve 0 for default. - static COUNTER: AtomicU32 = AtomicU32::new(1); - + fn new( + callsite_name_ids: &papaya::HashMap<&'static str, u32, ahash::RandomState>, + metadata: &'static Metadata<'static>, + extract_fields: &[&'static str], + ) -> Self { let names: Vec<&'static str> = metadata.fields().iter().map(|f| f.name()).collect(); // get all the indices of span fields we want to focus @@ -372,8 +367,18 @@ impl CallsiteSpanInfo { // normalized_name is unique for each callsite, but it is not // unified across separate proxy instances. // todo: can we do better here? - let cid = COUNTER.fetch_add(1, Ordering::Relaxed); - let normalized_name = format!("{}#{cid}", metadata.name()).into(); + let cid = *callsite_name_ids + .pin() + .update_or_insert(metadata.name(), |&cid| cid + 1, 0); + + // we hope that most span names are unique, in which case this will always be 0 + let normalized_name = if cid == 0 { + metadata.name().into() + } else { + // if the span name is not unique, add the numeric ID to span name to distinguish it. + // sadly this is non-determinstic, across restarts but we should fix it by disambiguating re-used span names instead. + format!("{}#{cid}", metadata.name()).into() + }; Self { extract, @@ -382,9 +387,24 @@ impl CallsiteSpanInfo { } } +#[derive(Clone)] +struct RawValue(Box<[u8]>); + +impl RawValue { + fn new(v: impl json::ValueEncoder) -> Self { + Self(json::value_to_vec!(|val| v.encode(val)).into_boxed_slice()) + } +} + +impl json::ValueEncoder for &RawValue { + fn encode(self, v: json::ValueSer<'_>) { + v.write_raw_json(&self.0); + } +} + /// Stores span field values recorded during the spans lifetime. struct SpanFields { - values: [serde_json::Value; MAX_TRACING_FIELDS], + values: [Option; MAX_TRACING_FIELDS], /// cached span info so we can avoid extra hashmap lookups in the hot path. 
span_info: CallsiteSpanInfo, @@ -394,7 +414,7 @@ impl SpanFields { fn new(span_info: CallsiteSpanInfo) -> Self { Self { span_info, - values: [const { serde_json::Value::Null }; MAX_TRACING_FIELDS], + values: [const { None }; MAX_TRACING_FIELDS], } } } @@ -402,55 +422,55 @@ impl SpanFields { impl tracing::field::Visit for SpanFields { #[inline] fn record_f64(&mut self, field: &tracing::field::Field, value: f64) { - self.values[field.index()] = serde_json::Value::from(value); + self.values[field.index()] = Some(RawValue::new(value)); } #[inline] fn record_i64(&mut self, field: &tracing::field::Field, value: i64) { - self.values[field.index()] = serde_json::Value::from(value); + self.values[field.index()] = Some(RawValue::new(value)); } #[inline] fn record_u64(&mut self, field: &tracing::field::Field, value: u64) { - self.values[field.index()] = serde_json::Value::from(value); + self.values[field.index()] = Some(RawValue::new(value)); } #[inline] fn record_i128(&mut self, field: &tracing::field::Field, value: i128) { if let Ok(value) = i64::try_from(value) { - self.values[field.index()] = serde_json::Value::from(value); + self.values[field.index()] = Some(RawValue::new(value)); } else { - self.values[field.index()] = serde_json::Value::from(format!("{value}")); + self.values[field.index()] = Some(RawValue::new(format_args!("{value}"))); } } #[inline] fn record_u128(&mut self, field: &tracing::field::Field, value: u128) { if let Ok(value) = u64::try_from(value) { - self.values[field.index()] = serde_json::Value::from(value); + self.values[field.index()] = Some(RawValue::new(value)); } else { - self.values[field.index()] = serde_json::Value::from(format!("{value}")); + self.values[field.index()] = Some(RawValue::new(format_args!("{value}"))); } } #[inline] fn record_bool(&mut self, field: &tracing::field::Field, value: bool) { - self.values[field.index()] = serde_json::Value::from(value); + self.values[field.index()] = Some(RawValue::new(value)); } #[inline] fn record_bytes(&mut self, field: &tracing::field::Field, value: &[u8]) { - self.values[field.index()] = serde_json::Value::from(value); + self.values[field.index()] = Some(RawValue::new(value)); } #[inline] fn record_str(&mut self, field: &tracing::field::Field, value: &str) { - self.values[field.index()] = serde_json::Value::from(value); + self.values[field.index()] = Some(RawValue::new(value)); } #[inline] fn record_debug(&mut self, field: &tracing::field::Field, value: &dyn std::fmt::Debug) { - self.values[field.index()] = serde_json::Value::from(format!("{value:?}")); + self.values[field.index()] = Some(RawValue::new(format_args!("{value:?}"))); } #[inline] @@ -459,7 +479,7 @@ impl tracing::field::Visit for SpanFields { field: &tracing::field::Field, value: &(dyn std::error::Error + 'static), ) { - self.values[field.index()] = serde_json::Value::from(format!("{value}")); + self.values[field.index()] = Some(RawValue::new(format_args!("{value}"))); } } @@ -508,11 +528,6 @@ impl EventFormatter { &self.logline_buffer } - #[inline] - fn reset(&mut self) { - self.logline_buffer.clear(); - } - fn format( &mut self, now: DateTime, @@ -520,8 +535,7 @@ impl EventFormatter { ctx: &Context<'_, S>, skipped_field_indices: &CallsiteMap, extract_fields: &'static [&'static str], - ) -> io::Result<()> - where + ) where S: Subscriber + for<'a> LookupSpan<'a>, { let timestamp = now.to_rfc3339_opts(chrono::SecondsFormat::Micros, true); @@ -536,78 +550,99 @@ impl EventFormatter { .copied() .unwrap_or_default(); - let mut serialize = || { - let mut 
serializer = serde_json::Serializer::new(&mut self.logline_buffer); - - let mut serializer = serializer.serialize_map(None)?; - + self.logline_buffer.clear(); + let serializer = json::ValueSer::new(&mut self.logline_buffer); + json::value_as_object!(|serializer| { // Timestamp comes first, so raw lines can be sorted by timestamp. - serializer.serialize_entry("timestamp", ×tamp)?; + serializer.entry("timestamp", &*timestamp); // Level next. - serializer.serialize_entry("level", &meta.level().as_str())?; + serializer.entry("level", meta.level().as_str()); // Message next. - serializer.serialize_key("message")?; let mut message_extractor = - MessageFieldExtractor::new(serializer, skipped_field_indices); + MessageFieldExtractor::new(serializer.key("message"), skipped_field_indices); event.record(&mut message_extractor); - let mut serializer = message_extractor.into_serializer()?; + message_extractor.finish(); // Direct message fields. - let mut fields_present = FieldsPresent(false, skipped_field_indices); - event.record(&mut fields_present); - if fields_present.0 { - serializer.serialize_entry( - "fields", - &SerializableEventFields(event, skipped_field_indices), - )?; + { + let mut message_skipper = MessageFieldSkipper::new( + serializer.key("fields").object(), + skipped_field_indices, + ); + event.record(&mut message_skipper); + + // rollback if no fields are present. + if message_skipper.present { + message_skipper.serializer.finish(); + } } - let spans = SerializableSpans { - // collect all spans from parent to root. - spans: ctx + let mut extracted = ExtractedSpanFields::new(extract_fields); + + let spans = serializer.key("spans"); + json::value_as_object!(|spans| { + let parent_spans = ctx .event_span(event) - .map_or(vec![], |parent| parent.scope().collect()), - extracted: ExtractedSpanFields::new(extract_fields), - }; - serializer.serialize_entry("spans", &spans)?; + .map_or(vec![], |parent| parent.scope().collect()); + + for span in parent_spans.iter().rev() { + let ext = span.extensions(); + + // all spans should have this extension. + let Some(fields) = ext.get() else { continue }; + + extracted.layer_span(fields); + + let SpanFields { values, span_info } = fields; + + let span_fields = spans.key(&*span_info.normalized_name); + json::value_as_object!(|span_fields| { + for (field, value) in std::iter::zip(span.metadata().fields(), values) { + if let Some(value) = value { + span_fields.entry(field.name(), value); + } + } + }); + } + }); // TODO: thread-local cache? let pid = std::process::id(); // Skip adding pid 1 to reduce noise for services running in containers. if pid != 1 { - serializer.serialize_entry("process_id", &pid)?; + serializer.entry("process_id", pid); } - THREAD_ID.with(|tid| serializer.serialize_entry("thread_id", tid))?; + THREAD_ID.with(|tid| serializer.entry("thread_id", tid)); // TODO: tls cache? name could change if let Some(thread_name) = std::thread::current().name() && !thread_name.is_empty() && thread_name != "tokio-runtime-worker" { - serializer.serialize_entry("thread_name", thread_name)?; + serializer.entry("thread_name", thread_name); } if let Some(task_id) = tokio::task::try_id() { - serializer.serialize_entry("task_id", &format_args!("{task_id}"))?; + serializer.entry("task_id", format_args!("{task_id}")); } - serializer.serialize_entry("target", meta.target())?; + serializer.entry("target", meta.target()); // Skip adding module if it's the same as target. 
if let Some(module) = meta.module_path() && module != meta.target() { - serializer.serialize_entry("module", module)?; + serializer.entry("module", module); } if let Some(file) = meta.file() { if let Some(line) = meta.line() { - serializer.serialize_entry("src", &format_args!("{file}:{line}"))?; + serializer.entry("src", format_args!("{file}:{line}")); } else { - serializer.serialize_entry("src", file)?; + serializer.entry("src", file); } } @@ -616,124 +651,104 @@ impl EventFormatter { let otel_spanref = otel_context.span(); let span_context = otel_spanref.span_context(); if span_context.is_valid() { - serializer.serialize_entry( - "trace_id", - &format_args!("{}", span_context.trace_id()), - )?; + serializer.entry("trace_id", format_args!("{}", span_context.trace_id())); } } - if spans.extracted.has_values() { + if extracted.has_values() { // TODO: add fields from event, too? - serializer.serialize_entry("extract", &spans.extracted)?; + let extract = serializer.key("extract"); + json::value_as_object!(|extract| { + for (key, value) in std::iter::zip(extracted.names, extracted.values) { + if let Some(value) = value { + extract.entry(*key, &value); + } + } + }); } + }); - serializer.end() - }; - - serialize().map_err(io::Error::other)?; self.logline_buffer.push(b'\n'); - Ok(()) } } /// Extracts the message field that's mixed will other fields. -struct MessageFieldExtractor { - serializer: S, +struct MessageFieldExtractor<'buf> { + serializer: Option>, skipped_field_indices: SkippedFieldIndices, - state: Option>, } -impl MessageFieldExtractor { +impl<'buf> MessageFieldExtractor<'buf> { #[inline] - fn new(serializer: S, skipped_field_indices: SkippedFieldIndices) -> Self { + fn new(serializer: json::ValueSer<'buf>, skipped_field_indices: SkippedFieldIndices) -> Self { Self { - serializer, + serializer: Some(serializer), skipped_field_indices, - state: None, } } #[inline] - fn into_serializer(mut self) -> Result { - match self.state { - Some(Ok(())) => {} - Some(Err(err)) => return Err(err), - None => self.serializer.serialize_value("")?, + fn finish(self) { + if let Some(ser) = self.serializer { + ser.value(""); } - Ok(self.serializer) } #[inline] - fn accept_field(&self, field: &tracing::field::Field) -> bool { - self.state.is_none() - && field.name() == MESSAGE_FIELD + fn record_field(&mut self, field: &tracing::field::Field, v: impl json::ValueEncoder) { + if field.name() == MESSAGE_FIELD && !self.skipped_field_indices.contains(field.index()) + && let Some(ser) = self.serializer.take() + { + ser.value(v); + } } } -impl tracing::field::Visit for MessageFieldExtractor { +impl tracing::field::Visit for MessageFieldExtractor<'_> { #[inline] fn record_f64(&mut self, field: &tracing::field::Field, value: f64) { - if self.accept_field(field) { - self.state = Some(self.serializer.serialize_value(&value)); - } + self.record_field(field, value); } #[inline] fn record_i64(&mut self, field: &tracing::field::Field, value: i64) { - if self.accept_field(field) { - self.state = Some(self.serializer.serialize_value(&value)); - } + self.record_field(field, value); } #[inline] fn record_u64(&mut self, field: &tracing::field::Field, value: u64) { - if self.accept_field(field) { - self.state = Some(self.serializer.serialize_value(&value)); - } + self.record_field(field, value); } #[inline] fn record_i128(&mut self, field: &tracing::field::Field, value: i128) { - if self.accept_field(field) { - self.state = Some(self.serializer.serialize_value(&value)); - } + self.record_field(field, value); } #[inline] fn 
record_u128(&mut self, field: &tracing::field::Field, value: u128) { - if self.accept_field(field) { - self.state = Some(self.serializer.serialize_value(&value)); - } + self.record_field(field, value); } #[inline] fn record_bool(&mut self, field: &tracing::field::Field, value: bool) { - if self.accept_field(field) { - self.state = Some(self.serializer.serialize_value(&value)); - } + self.record_field(field, value); } #[inline] fn record_bytes(&mut self, field: &tracing::field::Field, value: &[u8]) { - if self.accept_field(field) { - self.state = Some(self.serializer.serialize_value(&format_args!("{value:x?}"))); - } + self.record_field(field, format_args!("{value:x?}")); } #[inline] fn record_str(&mut self, field: &tracing::field::Field, value: &str) { - if self.accept_field(field) { - self.state = Some(self.serializer.serialize_value(&value)); - } + self.record_field(field, value); } #[inline] fn record_debug(&mut self, field: &tracing::field::Field, value: &dyn std::fmt::Debug) { - if self.accept_field(field) { - self.state = Some(self.serializer.serialize_value(&format_args!("{value:?}"))); - } + self.record_field(field, format_args!("{value:?}")); } #[inline] @@ -742,147 +757,83 @@ impl tracing::field::Visit for MessageFieldExtracto field: &tracing::field::Field, value: &(dyn std::error::Error + 'static), ) { - if self.accept_field(field) { - self.state = Some(self.serializer.serialize_value(&format_args!("{value}"))); - } - } -} - -/// Checks if there's any fields and field values present. If not, the JSON subobject -/// can be skipped. -// This is entirely optional and only cosmetic, though maybe helps a -// bit during log parsing in dashboards when there's no field with empty object. -struct FieldsPresent(pub bool, SkippedFieldIndices); - -// Even though some methods have an overhead (error, bytes) it is assumed the -// compiler won't include this since we ignore the value entirely. -impl tracing::field::Visit for FieldsPresent { - #[inline] - fn record_debug(&mut self, field: &tracing::field::Field, _: &dyn std::fmt::Debug) { - if !self.1.contains(field.index()) - && field.name() != MESSAGE_FIELD - && !field.name().starts_with("log.") - { - self.0 |= true; - } - } -} - -/// Serializes the fields directly supplied with a log event. -struct SerializableEventFields<'a, 'event>(&'a tracing::Event<'event>, SkippedFieldIndices); - -impl serde::ser::Serialize for SerializableEventFields<'_, '_> { - fn serialize(&self, serializer: S) -> Result - where - S: Serializer, - { - use serde::ser::SerializeMap; - let serializer = serializer.serialize_map(None)?; - let mut message_skipper = MessageFieldSkipper::new(serializer, self.1); - self.0.record(&mut message_skipper); - let serializer = message_skipper.into_serializer()?; - serializer.end() + self.record_field(field, format_args!("{value}")); } } /// A tracing field visitor that skips the message field. 
-struct MessageFieldSkipper { - serializer: S, +struct MessageFieldSkipper<'buf> { + serializer: json::ObjectSer<'buf>, skipped_field_indices: SkippedFieldIndices, - state: Result<(), S::Error>, + present: bool, } -impl MessageFieldSkipper { +impl<'buf> MessageFieldSkipper<'buf> { #[inline] - fn new(serializer: S, skipped_field_indices: SkippedFieldIndices) -> Self { + fn new(serializer: json::ObjectSer<'buf>, skipped_field_indices: SkippedFieldIndices) -> Self { Self { serializer, skipped_field_indices, - state: Ok(()), + present: false, } } #[inline] - fn accept_field(&self, field: &tracing::field::Field) -> bool { - self.state.is_ok() - && field.name() != MESSAGE_FIELD + fn record_field(&mut self, field: &tracing::field::Field, v: impl json::ValueEncoder) { + if field.name() != MESSAGE_FIELD && !field.name().starts_with("log.") && !self.skipped_field_indices.contains(field.index()) - } - - #[inline] - fn into_serializer(self) -> Result { - self.state?; - Ok(self.serializer) + { + self.serializer.entry(field.name(), v); + self.present |= true; + } } } -impl tracing::field::Visit for MessageFieldSkipper { +impl tracing::field::Visit for MessageFieldSkipper<'_> { #[inline] fn record_f64(&mut self, field: &tracing::field::Field, value: f64) { - if self.accept_field(field) { - self.state = self.serializer.serialize_entry(field.name(), &value); - } + self.record_field(field, value); } #[inline] fn record_i64(&mut self, field: &tracing::field::Field, value: i64) { - if self.accept_field(field) { - self.state = self.serializer.serialize_entry(field.name(), &value); - } + self.record_field(field, value); } #[inline] fn record_u64(&mut self, field: &tracing::field::Field, value: u64) { - if self.accept_field(field) { - self.state = self.serializer.serialize_entry(field.name(), &value); - } + self.record_field(field, value); } #[inline] fn record_i128(&mut self, field: &tracing::field::Field, value: i128) { - if self.accept_field(field) { - self.state = self.serializer.serialize_entry(field.name(), &value); - } + self.record_field(field, value); } #[inline] fn record_u128(&mut self, field: &tracing::field::Field, value: u128) { - if self.accept_field(field) { - self.state = self.serializer.serialize_entry(field.name(), &value); - } + self.record_field(field, value); } #[inline] fn record_bool(&mut self, field: &tracing::field::Field, value: bool) { - if self.accept_field(field) { - self.state = self.serializer.serialize_entry(field.name(), &value); - } + self.record_field(field, value); } #[inline] fn record_bytes(&mut self, field: &tracing::field::Field, value: &[u8]) { - if self.accept_field(field) { - self.state = self - .serializer - .serialize_entry(field.name(), &format_args!("{value:x?}")); - } + self.record_field(field, format_args!("{value:x?}")); } #[inline] fn record_str(&mut self, field: &tracing::field::Field, value: &str) { - if self.accept_field(field) { - self.state = self.serializer.serialize_entry(field.name(), &value); - } + self.record_field(field, value); } #[inline] fn record_debug(&mut self, field: &tracing::field::Field, value: &dyn std::fmt::Debug) { - if self.accept_field(field) { - self.state = self - .serializer - .serialize_entry(field.name(), &format_args!("{value:?}")); - } + self.record_field(field, format_args!("{value:?}")); } #[inline] @@ -891,131 +842,40 @@ impl tracing::field::Visit for MessageFieldSkipper< field: &tracing::field::Field, value: &(dyn std::error::Error + 'static), ) { - if self.accept_field(field) { - self.state = 
self.serializer.serialize_value(&format_args!("{value}")); - } - } -} - -/// Serializes the span stack from root to leaf (parent of event) as object -/// with the span names as keys. To prevent collision we append a numberic value -/// to the name. Also, collects any span fields we're interested in. Last one -/// wins. -struct SerializableSpans<'ctx, S> -where - S: for<'lookup> LookupSpan<'lookup>, -{ - spans: Vec>, - extracted: ExtractedSpanFields, -} - -impl serde::ser::Serialize for SerializableSpans<'_, S> -where - S: for<'lookup> LookupSpan<'lookup>, -{ - fn serialize(&self, serializer: Ser) -> Result - where - Ser: serde::ser::Serializer, - { - let mut serializer = serializer.serialize_map(None)?; - - for span in self.spans.iter().rev() { - let ext = span.extensions(); - - // all spans should have this extension. - let Some(fields) = ext.get() else { continue }; - - self.extracted.layer_span(fields); - - let SpanFields { values, span_info } = fields; - serializer.serialize_entry( - &*span_info.normalized_name, - &SerializableSpanFields { - fields: span.metadata().fields(), - values, - }, - )?; - } - - serializer.end() - } -} - -/// Serializes the span fields as object. -struct SerializableSpanFields<'span> { - fields: &'span tracing::field::FieldSet, - values: &'span [serde_json::Value; MAX_TRACING_FIELDS], -} - -impl serde::ser::Serialize for SerializableSpanFields<'_> { - fn serialize(&self, serializer: S) -> Result - where - S: serde::ser::Serializer, - { - let mut serializer = serializer.serialize_map(None)?; - - for (field, value) in std::iter::zip(self.fields, self.values) { - if value.is_null() { - continue; - } - serializer.serialize_entry(field.name(), value)?; - } - - serializer.end() + self.record_field(field, format_args!("{value}")); } } struct ExtractedSpanFields { names: &'static [&'static str], - values: RefCell>, + values: Vec>, } impl ExtractedSpanFields { fn new(names: &'static [&'static str]) -> Self { ExtractedSpanFields { names, - values: RefCell::new(vec![serde_json::Value::Null; names.len()]), + values: vec![None; names.len()], } } - fn layer_span(&self, fields: &SpanFields) { - let mut v = self.values.borrow_mut(); + fn layer_span(&mut self, fields: &SpanFields) { let SpanFields { values, span_info } = fields; // extract the fields for (i, &j) in span_info.extract.iter().enumerate() { - let Some(value) = values.get(j) else { continue }; + let Some(Some(value)) = values.get(j) else { + continue; + }; - if !value.is_null() { - // TODO: replace clone with reference, if possible. - v[i] = value.clone(); - } + // TODO: replace clone with reference, if possible. 
+ self.values[i] = Some(value.clone()); } } #[inline] fn has_values(&self) -> bool { - self.values.borrow().iter().any(|v| !v.is_null()) - } -} - -impl serde::ser::Serialize for ExtractedSpanFields { - fn serialize(&self, serializer: S) -> Result - where - S: serde::ser::Serializer, - { - let mut serializer = serializer.serialize_map(None)?; - - let values = self.values.borrow(); - for (key, value) in std::iter::zip(self.names, &*values) { - if value.is_null() { - continue; - } - - serializer.serialize_entry(key, value)?; - } - - serializer.end() + self.values.iter().any(|v| v.is_some()) } } @@ -1070,6 +930,7 @@ mod tests { clock: clock.clone(), skipped_field_indices: papaya::HashMap::default(), span_info: papaya::HashMap::default(), + callsite_name_ids: papaya::HashMap::default(), writer: buffer.clone(), extract_fields: &["x"], }; @@ -1078,14 +939,16 @@ mod tests { tracing::subscriber::with_default(registry, || { info_span!("some_span", x = 24).in_scope(|| { - info_span!("some_span", x = 40, x = 41, x = 42).in_scope(|| { - tracing::error!( - a = 1, - a = 2, - a = 3, - message = "explicit message field", - "implicit message field" - ); + info_span!("some_other_span", y = 30).in_scope(|| { + info_span!("some_span", x = 40, x = 41, x = 42).in_scope(|| { + tracing::error!( + a = 1, + a = 2, + a = 3, + message = "explicit message field", + "implicit message field" + ); + }); }); }); }); @@ -1104,12 +967,15 @@ mod tests { "a": 3, }, "spans": { - "some_span#1":{ + "some_span":{ "x": 24, }, - "some_span#2": { + "some_other_span": { + "y": 30, + }, + "some_span#1": { "x": 42, - } + }, }, "extract": { "x": 42, diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index bf4d5a11eb..7524133093 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -112,6 +112,9 @@ pub struct ProxyMetrics { /// Number of bytes sent/received between all clients and backends. pub io_bytes: CounterVec>, + /// Number of IO errors while logging. + pub logging_errors_count: Counter, + /// Number of errors by a given classification. 
pub errors_total: CounterVec>, @@ -382,10 +385,10 @@ pub enum RedisMsgKind { #[derive(Default, Clone)] pub struct LatencyAccumulated { - cplane: time::Duration, - client: time::Duration, - compute: time::Duration, - retry: time::Duration, + pub cplane: time::Duration, + pub client: time::Duration, + pub compute: time::Duration, + pub retry: time::Duration, } impl std::fmt::Display for LatencyAccumulated { diff --git a/proxy/src/pqproto.rs b/proxy/src/pqproto.rs index ad99eecda5..680a23c435 100644 --- a/proxy/src/pqproto.rs +++ b/proxy/src/pqproto.rs @@ -7,7 +7,7 @@ use std::io::{self, Cursor}; use bytes::{Buf, BufMut}; use itertools::Itertools; -use rand::distributions::{Distribution, Standard}; +use rand::distr::{Distribution, StandardUniform}; use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt}; use zerocopy::{FromBytes, Immutable, IntoBytes, big_endian}; @@ -458,9 +458,9 @@ impl fmt::Display for CancelKeyData { .finish() } } -impl Distribution for Standard { +impl Distribution for StandardUniform { fn sample(&self, rng: &mut R) -> CancelKeyData { - id_to_cancel_key(rng.r#gen()) + id_to_cancel_key(rng.random()) } } diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index 9f642f52ab..ce9774e3eb 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -110,7 +110,7 @@ where debug!(error = ?err, COULD_NOT_CONNECT); let node_info = if !node_info.cached() || !err.should_retry_wake_compute() { - // If we just recieved this from cplane and didn't get it from cache, we shouldn't retry. + // If we just received this from cplane and not from the cache, we shouldn't retry. // Do not need to retrieve a new node_info, just return the old one. if !should_retry(&err, num_retries, compute.retry) { Metrics::get().proxy.retries_metric.observe( diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 02651109e0..8b7c4ff55d 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -145,7 +145,7 @@ pub(crate) async fn handle_client( let session = cancellation_handler.get_key(); - finish_client_init(&pg_settings, *session.key(), client); + finish_client_init(ctx, &pg_settings, *session.key(), client, &config.greetings); let session_id = ctx.session_id(); let (cancel_on_shutdown, cancel) = oneshot::channel(); @@ -165,9 +165,11 @@ pub(crate) async fn handle_client( /// Finish client connection initialization: confirm auth success, send params, etc. pub(crate) fn finish_client_init( + ctx: &RequestContext, settings: &compute::PostgresSettings, cancel_key_data: CancelKeyData, client: &mut PqStream, + greetings: &String, ) { // Forward all deferred notices to the client. for notice in &settings.delayed_notice { @@ -176,6 +178,12 @@ pub(crate) fn finish_client_init( }); } + // Expose session_id to clients if we have a greeting message. + if !greetings.is_empty() { + let session_msg = format!("{}, session_id: {}", greetings, ctx.session_id()); + client.write_message(BeMessage::NoticeResponse(session_msg.as_str())); + } + // Forward all postgres connection params to the client. 
for (name, value) in &settings.params { client.write_message(BeMessage::ParameterStatus { @@ -184,6 +192,36 @@ pub(crate) fn finish_client_init( }); } + // Forward recorded latencies for probing requests + if let Some(testodrome_id) = ctx.get_testodrome_id() { + client.write_message(BeMessage::ParameterStatus { + name: "neon.testodrome_id".as_bytes(), + value: testodrome_id.as_bytes(), + }); + + let latency_measured = ctx.get_proxy_latency(); + + client.write_message(BeMessage::ParameterStatus { + name: "neon.cplane_latency".as_bytes(), + value: latency_measured.cplane.as_micros().to_string().as_bytes(), + }); + + client.write_message(BeMessage::ParameterStatus { + name: "neon.client_latency".as_bytes(), + value: latency_measured.client.as_micros().to_string().as_bytes(), + }); + + client.write_message(BeMessage::ParameterStatus { + name: "neon.compute_latency".as_bytes(), + value: latency_measured.compute.as_micros().to_string().as_bytes(), + }); + + client.write_message(BeMessage::ParameterStatus { + name: "neon.retry_latency".as_bytes(), + value: latency_measured.retry.as_micros().to_string().as_bytes(), + }); + } + client.write_message(BeMessage::BackendKeyData(cancel_key_data)); client.write_message(BeMessage::ReadyForQuery); } diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index dd89b05426..f8bff450e1 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -338,8 +338,8 @@ async fn scram_auth_mock() -> anyhow::Result<()> { let proxy = tokio::spawn(dummy_proxy(client, Some(server_config), Scram::mock())); use rand::Rng; - use rand::distributions::Alphanumeric; - let password: String = rand::thread_rng() + use rand::distr::Alphanumeric; + let password: String = rand::rng() .sample_iter(&Alphanumeric) .take(rand::random::() as usize) .map(char::from) diff --git a/proxy/src/rate_limiter/leaky_bucket.rs b/proxy/src/rate_limiter/leaky_bucket.rs index 12b4bda0c0..9de82e922c 100644 --- a/proxy/src/rate_limiter/leaky_bucket.rs +++ b/proxy/src/rate_limiter/leaky_bucket.rs @@ -3,7 +3,7 @@ use std::sync::atomic::{AtomicUsize, Ordering}; use ahash::RandomState; use clashmap::ClashMap; -use rand::{Rng, thread_rng}; +use rand::Rng; use tokio::time::Instant; use tracing::info; use utils::leaky_bucket::LeakyBucketState; @@ -61,7 +61,7 @@ impl LeakyBucketRateLimiter { self.map.len() ); let n = self.map.shards().len(); - let shard = thread_rng().gen_range(0..n); + let shard = rand::rng().random_range(0..n); self.map.shards()[shard] .write() .retain(|(_, value)| !value.bucket_is_empty(now)); diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index fd1b2af023..2b3d745a0e 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -147,7 +147,7 @@ impl RateBucketInfo { impl BucketRateLimiter { pub fn new(info: impl Into>) -> Self { - Self::new_with_rand_and_hasher(info, StdRng::from_entropy(), RandomState::new()) + Self::new_with_rand_and_hasher(info, StdRng::from_os_rng(), RandomState::new()) } } @@ -216,7 +216,7 @@ impl BucketRateLimiter { let n = self.map.shards().len(); // this lock is ok as the periodic cycle of do_gc makes this very unlikely to collide // (impossible, infact, unless we have 2048 threads) - let shard = self.rand.lock_propagate_poison().gen_range(0..n); + let shard = self.rand.lock_propagate_poison().random_range(0..n); self.map.shards()[shard].write().clear(); } } diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index 973a4c5b02..88d5550fff 
100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -10,6 +10,7 @@ use super::connection_with_credentials_provider::ConnectionWithCredentialsProvid use crate::cache::project_info::ProjectInfoCache; use crate::intern::{AccountIdInt, EndpointIdInt, ProjectIdInt, RoleNameInt}; use crate::metrics::{Metrics, RedisErrors, RedisEventsCount}; +use crate::util::deserialize_json_string; const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates"; const RECONNECT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(20); @@ -121,15 +122,6 @@ struct InvalidateRole { role_name: RoleNameInt, } -fn deserialize_json_string<'de, D, T>(deserializer: D) -> Result -where - T: for<'de2> serde::Deserialize<'de2>, - D: serde::Deserializer<'de>, -{ - let s = String::deserialize(deserializer)?; - serde_json::from_str(&s).map_err(::custom) -} - // https://github.com/serde-rs/serde/issues/1714 fn deserialize_unknown_topic<'de, D>(deserializer: D) -> Result<(), D::Error> where @@ -265,10 +257,7 @@ async fn handle_messages( return Ok(()); } let mut conn = match try_connect(&redis).await { - Ok(conn) => { - handler.cache.increment_active_listeners().await; - conn - } + Ok(conn) => conn, Err(e) => { tracing::error!( "failed to connect to redis: {e}, will try to reconnect in {RECONNECT_TIMEOUT:#?}" @@ -287,11 +276,9 @@ async fn handle_messages( } } if cancellation_token.is_cancelled() { - handler.cache.decrement_active_listeners().await; return Ok(()); } } - handler.cache.decrement_active_listeners().await; } } diff --git a/proxy/src/scram/countmin.rs b/proxy/src/scram/countmin.rs index 9d56c465ec..d64895f8f5 100644 --- a/proxy/src/scram/countmin.rs +++ b/proxy/src/scram/countmin.rs @@ -86,11 +86,11 @@ mod tests { for _ in 0..n { // number to insert at once - let n = rng.gen_range(1..4096); + let n = rng.random_range(1..4096); // number of insert operations - let m = rng.gen_range(1..100); + let m = rng.random_range(1..100); - let id = uuid::Builder::from_random_bytes(rng.r#gen()).into_uuid(); + let id = uuid::Builder::from_random_bytes(rng.random()).into_uuid(); ids.push((id, n, m)); // N = sum(actual) @@ -140,8 +140,8 @@ mod tests { // probably numbers are too small to truly represent the probabilities. assert_eq!(eval_precision(100, 4096.0, 0.90), 100); assert_eq!(eval_precision(1000, 4096.0, 0.90), 1000); - assert_eq!(eval_precision(100, 4096.0, 0.1), 96); - assert_eq!(eval_precision(1000, 4096.0, 0.1), 988); + assert_eq!(eval_precision(100, 4096.0, 0.1), 100); + assert_eq!(eval_precision(1000, 4096.0, 0.1), 978); } // returns memory usage in bytes, and the time complexity per insert. diff --git a/proxy/src/scram/threadpool.rs b/proxy/src/scram/threadpool.rs index 1aa402227f..ea2e29ede9 100644 --- a/proxy/src/scram/threadpool.rs +++ b/proxy/src/scram/threadpool.rs @@ -51,7 +51,7 @@ impl ThreadPool { *state = Some(ThreadRt { pool: pool.clone(), id: ThreadPoolWorkerId(worker_id.fetch_add(1, Ordering::Relaxed)), - rng: SmallRng::from_entropy(), + rng: SmallRng::from_os_rng(), // used to determine whether we should temporarily skip tasks for fairness. // 99% of estimates will overcount by no more than 4096 samples countmin: CountMinSketch::with_params( @@ -120,7 +120,7 @@ impl ThreadRt { // in which case the SKETCH_RESET_INTERVAL represents 1 second. Thus, the rates above // are in requests per second. 
let probability = P.ln() / (P + rate as f64).ln(); - self.rng.gen_bool(probability) + self.rng.random_bool(probability) } } diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index daa6429039..59e4b09bc9 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -8,7 +8,7 @@ use ed25519_dalek::SigningKey; use hyper_util::rt::{TokioExecutor, TokioIo, TokioTimer}; use jose_jwk::jose_b64; use postgres_client::config::SslMode; -use rand::rngs::OsRng; +use rand_core::OsRng; use rustls::pki_types::{DnsName, ServerName}; use tokio::net::{TcpStream, lookup_host}; use tokio_rustls::TlsConnector; diff --git a/proxy/src/serverless/cancel_set.rs b/proxy/src/serverless/cancel_set.rs index ba8945afc5..142dc3b3d5 100644 --- a/proxy/src/serverless/cancel_set.rs +++ b/proxy/src/serverless/cancel_set.rs @@ -6,7 +6,7 @@ use std::time::Duration; use indexmap::IndexMap; use parking_lot::Mutex; -use rand::{Rng, thread_rng}; +use rand::distr::uniform::{UniformSampler, UniformUsize}; use rustc_hash::FxHasher; use tokio::time::Instant; use tokio_util::sync::CancellationToken; @@ -39,8 +39,9 @@ impl CancelSet { } pub(crate) fn take(&self) -> Option { + let dist = UniformUsize::new_inclusive(0, usize::MAX).expect("valid bounds"); for _ in 0..4 { - if let Some(token) = self.take_raw(thread_rng().r#gen()) { + if let Some(token) = self.take_raw(dist.sample(&mut rand::rng())) { return Some(token); } tracing::trace!("failed to get cancel token"); @@ -48,7 +49,7 @@ impl CancelSet { None } - pub(crate) fn take_raw(&self, rng: usize) -> Option { + fn take_raw(&self, rng: usize) -> Option { NonZeroUsize::new(self.shards.len()) .and_then(|len| self.shards[rng % len].lock().take(rng / len)) } diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 672e59f81f..015c46f787 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -3,15 +3,14 @@ use std::pin::pin; use std::sync::{Arc, Weak}; use std::task::{Poll, ready}; -use futures::Future; use futures::future::poll_fn; -use postgres_client::AsyncMessage; +use futures::{Future, FutureExt}; use postgres_client::tls::MakeTlsConnect; use smallvec::SmallVec; use tokio::net::TcpStream; use tokio::time::Instant; use tokio_util::sync::CancellationToken; -use tracing::{Instrument, error, info, info_span, warn}; +use tracing::{error, info, info_span}; #[cfg(test)] use { super::conn_pool_lib::GlobalConnPoolOptions, @@ -85,16 +84,17 @@ pub(crate) fn poll_client( let cancel = CancellationToken::new(); let cancelled = cancel.clone().cancelled_owned(); - tokio::spawn( - async move { + tokio::spawn(async move { let _conn_gauge = conn_gauge; let mut idle_timeout = pin!(tokio::time::sleep(idle)); let mut cancelled = pin!(cancelled); poll_fn(move |cx| { + let _instrument = span.enter(); + if cancelled.as_mut().poll(cx).is_ready() { info!("connection dropped"); - return Poll::Ready(()) + return Poll::Ready(()); } match rx.has_changed() { @@ -105,7 +105,7 @@ pub(crate) fn poll_client( } Err(_) => { info!("connection dropped"); - return Poll::Ready(()) + return Poll::Ready(()); } _ => {} } @@ -123,41 +123,22 @@ pub(crate) fn poll_client( } } - loop { - let message = ready!(connection.poll_message(cx)); - - match message { - Some(Ok(AsyncMessage::Notice(notice))) => { - info!(%session_id, "notice: {}", notice); - } - Some(Ok(AsyncMessage::Notification(notif))) => { - warn!(%session_id, pid = notif.process_id(), channel = notif.channel(), "notification received"); - } - Some(Ok(_)) 
=> { - warn!(%session_id, "unknown message"); - } - Some(Err(e)) => { - error!(%session_id, "connection error: {}", e); - break - } - None => { - info!("connection closed"); - break - } - } + match ready!(connection.poll_unpin(cx)) { + Err(e) => error!(%session_id, "connection error: {}", e), + Ok(()) => info!("connection closed"), } // remove from connection pool if let Some(pool) = pool.clone().upgrade() - && pool.write().remove_client(db_user.clone(), conn_id) { - info!("closed connection removed"); - } + && pool.write().remove_client(db_user.clone(), conn_id) + { + info!("closed connection removed"); + } Poll::Ready(()) - }).await; - - } - .instrument(span)); + }) + .await; + }); let inner = ClientInnerCommon { inner: client, aux, diff --git a/proxy/src/serverless/conn_pool_lib.rs b/proxy/src/serverless/conn_pool_lib.rs index 42a3ea17a2..ed5cc0ea03 100644 --- a/proxy/src/serverless/conn_pool_lib.rs +++ b/proxy/src/serverless/conn_pool_lib.rs @@ -428,7 +428,7 @@ where loop { interval.tick().await; - let shard = rng.gen_range(0..self.global_pool.shards().len()); + let shard = rng.random_range(0..self.global_pool.shards().len()); self.gc(shard); } } diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index e4cbd02bfe..f63d84d66b 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ b/proxy/src/serverless/local_conn_pool.rs @@ -19,18 +19,17 @@ use std::time::Duration; use base64::Engine as _; use base64::prelude::BASE64_URL_SAFE_NO_PAD; use ed25519_dalek::{Signature, Signer, SigningKey}; -use futures::Future; use futures::future::poll_fn; +use futures::{Future, FutureExt}; use indexmap::IndexMap; use jose_jwk::jose_b64::base64ct::{Base64UrlUnpadded, Encoding}; use parking_lot::RwLock; -use postgres_client::AsyncMessage; use postgres_client::tls::NoTlsStream; use serde_json::value::RawValue; use tokio::net::TcpStream; use tokio::time::Instant; use tokio_util::sync::CancellationToken; -use tracing::{Instrument, debug, error, info, info_span, warn}; +use tracing::{debug, error, info, info_span}; use super::backend::HttpConnError; use super::conn_pool_lib::{ @@ -186,16 +185,17 @@ pub(crate) fn poll_client( let cancel = CancellationToken::new(); let cancelled = cancel.clone().cancelled_owned(); - tokio::spawn( - async move { + tokio::spawn(async move { let _conn_gauge = conn_gauge; let mut idle_timeout = pin!(tokio::time::sleep(idle)); let mut cancelled = pin!(cancelled); poll_fn(move |cx| { + let _instrument = span.enter(); + if cancelled.as_mut().poll(cx).is_ready() { info!("connection dropped"); - return Poll::Ready(()) + return Poll::Ready(()); } match rx.has_changed() { @@ -206,7 +206,7 @@ pub(crate) fn poll_client( } Err(_) => { info!("connection dropped"); - return Poll::Ready(()) + return Poll::Ready(()); } _ => {} } @@ -218,47 +218,35 @@ pub(crate) fn poll_client( if let Some(pool) = pool.clone().upgrade() { // remove client from pool - should close the connection if it's idle. 
// does nothing if the client is currently checked-out and in-use - if pool.global_pool.write().remove_client(db_user.clone(), conn_id) { + if pool + .global_pool + .write() + .remove_client(db_user.clone(), conn_id) + { info!("idle connection removed"); } } } - loop { - let message = ready!(connection.poll_message(cx)); - - match message { - Some(Ok(AsyncMessage::Notice(notice))) => { - info!(%session_id, "notice: {}", notice); - } - Some(Ok(AsyncMessage::Notification(notif))) => { - warn!(%session_id, pid = notif.process_id(), channel = notif.channel(), "notification received"); - } - Some(Ok(_)) => { - warn!(%session_id, "unknown message"); - } - Some(Err(e)) => { - error!(%session_id, "connection error: {}", e); - break - } - None => { - info!("connection closed"); - break - } - } + match ready!(connection.poll_unpin(cx)) { + Err(e) => error!(%session_id, "connection error: {}", e), + Ok(()) => info!("connection closed"), } // remove from connection pool if let Some(pool) = pool.clone().upgrade() - && pool.global_pool.write().remove_client(db_user.clone(), conn_id) { - info!("closed connection removed"); - } + && pool + .global_pool + .write() + .remove_client(db_user.clone(), conn_id) + { + info!("closed connection removed"); + } Poll::Ready(()) - }).await; - - } - .instrument(span)); + }) + .await; + }); let inner = ClientInnerCommon { inner: client, diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index 5b7289c53d..13f9ee2782 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -11,6 +11,8 @@ mod http_conn_pool; mod http_util; mod json; mod local_conn_pool; +#[cfg(feature = "rest_broker")] +pub mod rest; mod sql_over_http; mod websocket; @@ -75,7 +77,7 @@ pub async fn task_main( { let conn_pool = Arc::clone(&conn_pool); tokio::spawn(async move { - conn_pool.gc_worker(StdRng::from_entropy()).await; + conn_pool.gc_worker(StdRng::from_os_rng()).await; }); } @@ -95,7 +97,7 @@ pub async fn task_main( { let http_conn_pool = Arc::clone(&http_conn_pool); tokio::spawn(async move { - http_conn_pool.gc_worker(StdRng::from_entropy()).await; + http_conn_pool.gc_worker(StdRng::from_os_rng()).await; }); } @@ -487,6 +489,42 @@ async fn request_handler( .body(Empty::new().map_err(|x| match x {}).boxed()) .map_err(|e| ApiError::InternalServerError(e.into())) } else { - json_response(StatusCode::BAD_REQUEST, "query is not supported") + #[cfg(feature = "rest_broker")] + { + if config.rest_config.is_rest_broker + // we are testing for the path to be /database_name/rest/... 
+ && request + .uri() + .path() + .split('/') + .nth(2) + .is_some_and(|part| part.starts_with("rest")) + { + let ctx = + RequestContext::new(session_id, conn_info, crate::metrics::Protocol::Http); + let span = ctx.span(); + + let testodrome_id = request + .headers() + .get("X-Neon-Query-ID") + .and_then(|value| value.to_str().ok()) + .map(|s| s.to_string()); + + if let Some(query_id) = testodrome_id { + info!(parent: &span, "testodrome query ID: {query_id}"); + ctx.set_testodrome_id(query_id.into()); + } + + rest::handle(config, ctx, request, backend, http_cancellation_token) + .instrument(span) + .await + } else { + json_response(StatusCode::BAD_REQUEST, "query is not supported") + } + } + #[cfg(not(feature = "rest_broker"))] + { + json_response(StatusCode::BAD_REQUEST, "query is not supported") + } } } diff --git a/proxy/src/serverless/rest.rs b/proxy/src/serverless/rest.rs new file mode 100644 index 0000000000..173c2629f7 --- /dev/null +++ b/proxy/src/serverless/rest.rs @@ -0,0 +1,1165 @@ +use std::borrow::Cow; +use std::collections::HashMap; +use std::sync::Arc; + +use bytes::Bytes; +use http::Method; +use http::header::{AUTHORIZATION, CONTENT_TYPE, HOST}; +use http_body_util::combinators::BoxBody; +use http_body_util::{BodyExt, Full}; +use http_utils::error::ApiError; +use hyper::body::Incoming; +use hyper::http::{HeaderName, HeaderValue}; +use hyper::{Request, Response, StatusCode}; +use indexmap::IndexMap; +use ouroboros::self_referencing; +use serde::de::DeserializeOwned; +use serde::{Deserialize, Deserializer}; +use serde_json::Value as JsonValue; +use serde_json::value::RawValue; +use subzero_core::api::ContentType::{ApplicationJSON, Other, SingularJSON, TextCSV}; +use subzero_core::api::QueryNode::{Delete, FunctionCall, Insert, Update}; +use subzero_core::api::Resolution::{IgnoreDuplicates, MergeDuplicates}; +use subzero_core::api::{ApiResponse, ListVal, Payload, Preferences, Representation, SingleVal}; +use subzero_core::config::{db_allowed_select_functions, db_schemas, role_claim_key}; +use subzero_core::dynamic_statement::{JoinIterator, param, sql}; +use subzero_core::error::Error::{ + self as SubzeroCoreError, ContentTypeError, GucHeadersError, GucStatusError, InternalError, + JsonDeserialize, JwtTokenInvalid, NotFound, +}; +use subzero_core::error::pg_error_to_status_code; +use subzero_core::formatter::Param::{LV, PL, SV, Str, StrOwned}; +use subzero_core::formatter::postgresql::{fmt_main_query, generate}; +use subzero_core::formatter::{Param, Snippet, SqlParam}; +use subzero_core::parser::postgrest::parse; +use subzero_core::permissions::{check_safe_functions, replace_select_star}; +use subzero_core::schema::{ + DbSchema, POSTGRESQL_INTROSPECTION_SQL, get_postgresql_configuration_query, +}; +use subzero_core::{content_range_header, content_range_status}; +use tokio_util::sync::CancellationToken; +use tracing::{error, info}; +use typed_json::json; +use url::form_urlencoded; + +use super::backend::{HttpConnError, LocalProxyConnError, PoolingBackend}; +use super::conn_pool::AuthData; +use super::conn_pool_lib::ConnInfo; +use super::error::{ConnInfoError, Credentials, HttpCodeError, ReadPayloadError}; +use super::http_conn_pool::{self, Send}; +use super::http_util::{ + ALLOW_POOL, CONN_STRING, NEON_REQUEST_ID, RAW_TEXT_OUTPUT, TXN_ISOLATION_LEVEL, TXN_READ_ONLY, + get_conn_info, json_response, uuid_to_header_value, +}; +use super::json::JsonConversionError; +use crate::auth::backend::ComputeCredentialKeys; +use crate::cache::{Cached, TimedLru}; +use 
crate::config::ProxyConfig; +use crate::context::RequestContext; +use crate::error::{ErrorKind, ReportableError, UserFacingError}; +use crate::http::read_body_with_limit; +use crate::metrics::Metrics; +use crate::serverless::sql_over_http::HEADER_VALUE_TRUE; +use crate::types::EndpointCacheKey; +use crate::util::deserialize_json_string; + +static EMPTY_JSON_SCHEMA: &str = r#"{"schemas":[]}"#; +const INTROSPECTION_SQL: &str = POSTGRESQL_INTROSPECTION_SQL; + +// A wrapper around the DbSchema that allows for self-referencing +#[self_referencing] +pub struct DbSchemaOwned { + schema_string: String, + #[covariant] + #[borrows(schema_string)] + schema: DbSchema<'this>, +} + +impl<'de> Deserialize<'de> for DbSchemaOwned { + fn deserialize<D>(deserializer: D) -> Result<Self, D::Error> + where + D: Deserializer<'de>, + { + let s = String::deserialize(deserializer)?; + DbSchemaOwned::try_new(s, |s| serde_json::from_str(s)) + .map_err(<D::Error as serde::de::Error>::custom) + } +} + +fn split_comma_separated(s: &str) -> Vec<String> { + s.split(',').map(|s| s.trim().to_string()).collect() +} + +fn deserialize_comma_separated<'de, D>(deserializer: D) -> Result<Vec<String>, D::Error> +where + D: Deserializer<'de>, +{ + let s = String::deserialize(deserializer)?; + Ok(split_comma_separated(&s)) +} + +fn deserialize_comma_separated_option<'de, D>( + deserializer: D, +) -> Result<Option<Vec<String>>, D::Error> +where + D: Deserializer<'de>, +{ + let opt = Option::<String>::deserialize(deserializer)?; + if let Some(s) = &opt { + let trimmed = s.trim(); + if trimmed.is_empty() { + return Ok(None); + } + return Ok(Some(split_comma_separated(trimmed))); + } + Ok(None) +} + +// The ApiConfig is the configuration for the API per endpoint +// The configuration is read from the database and cached in the DbSchemaCache +#[derive(Deserialize, Debug)] +pub struct ApiConfig { + #[serde( + default = "db_schemas", + deserialize_with = "deserialize_comma_separated" + )] + pub db_schemas: Vec<String>, + pub db_anon_role: Option<String>, + pub db_max_rows: Option<String>, + #[serde(default = "db_allowed_select_functions")] + pub db_allowed_select_functions: Vec<String>, + // #[serde(deserialize_with = "to_tuple", default)] + // pub db_pre_request: Option<(String, String)>, + #[allow(dead_code)] + #[serde(default = "role_claim_key")] + pub role_claim_key: String, + #[serde(default, deserialize_with = "deserialize_comma_separated_option")] + pub db_extra_search_path: Option<Vec<String>>, +} + +// The DbSchemaCache is a cache of the ApiConfig and DbSchemaOwned for each endpoint +pub(crate) type DbSchemaCache = TimedLru<EndpointCacheKey, Arc<(ApiConfig, DbSchemaOwned)>>; +impl DbSchemaCache { + pub async fn get_cached_or_remote( + &self, + endpoint_id: &EndpointCacheKey, + auth_header: &HeaderValue, + connection_string: &str, + client: &mut http_conn_pool::Client<Send>, + ctx: &RequestContext, + config: &'static ProxyConfig, + ) -> Result<Arc<(ApiConfig, DbSchemaOwned)>, RestError> { + match self.get_with_created_at(endpoint_id) { + Some(Cached { value: (v, _), ..
}) => Ok(v), + None => { + info!("db_schema cache miss for endpoint: {:?}", endpoint_id); + let remote_value = self + .get_remote(auth_header, connection_string, client, ctx, config) + .await; + let (api_config, schema_owned) = match remote_value { + Ok((api_config, schema_owned)) => (api_config, schema_owned), + Err(e @ RestError::SchemaTooLarge) => { + // for the case where the schema is too large, we cache an empty dummy value + // all the other requests will fail without triggering the introspection query + let schema_owned = serde_json::from_str::(EMPTY_JSON_SCHEMA) + .map_err(|e| JsonDeserialize { source: e })?; + + let api_config = ApiConfig { + db_schemas: vec![], + db_anon_role: None, + db_max_rows: None, + db_allowed_select_functions: vec![], + role_claim_key: String::new(), + db_extra_search_path: None, + }; + let value = Arc::new((api_config, schema_owned)); + self.insert(endpoint_id.clone(), value); + return Err(e); + } + Err(e) => { + return Err(e); + } + }; + let value = Arc::new((api_config, schema_owned)); + self.insert(endpoint_id.clone(), value.clone()); + Ok(value) + } + } + } + pub async fn get_remote( + &self, + auth_header: &HeaderValue, + connection_string: &str, + client: &mut http_conn_pool::Client, + ctx: &RequestContext, + config: &'static ProxyConfig, + ) -> Result<(ApiConfig, DbSchemaOwned), RestError> { + #[derive(Deserialize)] + struct SingleRow { + rows: [Row; 1], + } + + #[derive(Deserialize)] + struct ConfigRow { + #[serde(deserialize_with = "deserialize_json_string")] + config: ApiConfig, + } + + #[derive(Deserialize)] + struct SchemaRow { + json_schema: DbSchemaOwned, + } + + let headers = vec![ + (&NEON_REQUEST_ID, uuid_to_header_value(ctx.session_id())), + ( + &CONN_STRING, + HeaderValue::from_str(connection_string).expect( + "connection string came from a header, so it must be a valid headervalue", + ), + ), + (&AUTHORIZATION, auth_header.clone()), + (&RAW_TEXT_OUTPUT, HEADER_VALUE_TRUE), + ]; + + let query = get_postgresql_configuration_query(Some("pgrst.pre_config")); + let SingleRow { + rows: [ConfigRow { config: api_config }], + } = make_local_proxy_request( + client, + headers.iter().cloned(), + QueryData { + query: Cow::Owned(query), + params: vec![], + }, + config.rest_config.max_schema_size, + ) + .await + .map_err(|e| match e { + RestError::ReadPayload(ReadPayloadError::BodyTooLarge { .. }) => { + RestError::SchemaTooLarge + } + e => e, + })?; + + // now that we have the api_config let's run the second INTROSPECTION_SQL query + let SingleRow { + rows: [SchemaRow { json_schema }], + } = make_local_proxy_request( + client, + headers, + QueryData { + query: INTROSPECTION_SQL.into(), + params: vec![ + serde_json::to_value(&api_config.db_schemas) + .expect("Vec is always valid to encode as JSON"), + JsonValue::Bool(false), // include_roles_with_login + JsonValue::Bool(false), // use_internal_permissions + ], + }, + config.rest_config.max_schema_size, + ) + .await + .map_err(|e| match e { + RestError::ReadPayload(ReadPayloadError::BodyTooLarge { .. 
}) => { + RestError::SchemaTooLarge + } + e => e, + })?; + + Ok((api_config, json_schema)) + } +} + +// A type to represent a PostgreSQL error +// we use our own type (instead of postgres_client::Error) because we get the error from the json response +#[derive(Debug, thiserror::Error, Deserialize)] +pub(crate) struct PostgresError { + pub code: String, + pub message: String, + pub detail: Option<String>, + pub hint: Option<String>, +} +impl HttpCodeError for PostgresError { + fn get_http_status_code(&self) -> StatusCode { + let status = pg_error_to_status_code(&self.code, true); + StatusCode::from_u16(status).unwrap_or(StatusCode::INTERNAL_SERVER_ERROR) + } +} +impl ReportableError for PostgresError { + fn get_error_kind(&self) -> ErrorKind { + ErrorKind::User + } +} +impl UserFacingError for PostgresError { + fn to_string_client(&self) -> String { + if self.code.starts_with("PT") { + "Postgres error".to_string() + } else { + self.message.clone() + } + } +} +impl std::fmt::Display for PostgresError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.message) + } +} + +// A type to represent errors that can occur in the rest broker +#[derive(Debug, thiserror::Error)] +pub(crate) enum RestError { + #[error(transparent)] + ReadPayload(#[from] ReadPayloadError), + #[error(transparent)] + ConnectCompute(#[from] HttpConnError), + #[error(transparent)] + ConnInfo(#[from] ConnInfoError), + #[error(transparent)] + Postgres(#[from] PostgresError), + #[error(transparent)] + JsonConversion(#[from] JsonConversionError), + #[error(transparent)] + SubzeroCore(#[from] SubzeroCoreError), + #[error("schema is too large")] + SchemaTooLarge, +} +impl ReportableError for RestError { + fn get_error_kind(&self) -> ErrorKind { + match self { + RestError::ReadPayload(e) => e.get_error_kind(), + RestError::ConnectCompute(e) => e.get_error_kind(), + RestError::ConnInfo(e) => e.get_error_kind(), + RestError::Postgres(_) => ErrorKind::Postgres, + RestError::JsonConversion(_) => ErrorKind::Postgres, + RestError::SubzeroCore(_) => ErrorKind::User, + RestError::SchemaTooLarge => ErrorKind::User, + } + } +} +impl UserFacingError for RestError { + fn to_string_client(&self) -> String { + match self { + RestError::ReadPayload(p) => p.to_string(), + RestError::ConnectCompute(c) => c.to_string_client(), + RestError::ConnInfo(c) => c.to_string_client(), + RestError::SchemaTooLarge => self.to_string(), + RestError::Postgres(p) => p.to_string_client(), + RestError::JsonConversion(_) => "could not parse postgres response".to_string(), + RestError::SubzeroCore(s) => { + // TODO: this is a hack to get the message from the json body + let json = s.json_body(); + let default_message = "Unknown error".to_string(); + + json.get("message") + .map_or(default_message.clone(), |m| match m { + JsonValue::String(s) => s.clone(), + _ => default_message, + }) + } + } + } +} +impl HttpCodeError for RestError { + fn get_http_status_code(&self) -> StatusCode { + match self { + RestError::ReadPayload(e) => e.get_http_status_code(), + RestError::ConnectCompute(h) => match h.get_error_kind() { + ErrorKind::User => StatusCode::BAD_REQUEST, + _ => StatusCode::INTERNAL_SERVER_ERROR, + }, + RestError::ConnInfo(_) => StatusCode::BAD_REQUEST, + RestError::Postgres(e) => e.get_http_status_code(), + RestError::JsonConversion(_) => StatusCode::INTERNAL_SERVER_ERROR, + RestError::SchemaTooLarge => StatusCode::INTERNAL_SERVER_ERROR, + RestError::SubzeroCore(e) => { + let status = e.status_code();
StatusCode::from_u16(status).unwrap_or(StatusCode::INTERNAL_SERVER_ERROR) + } + } + } +} + +// Helper functions for the rest broker + +fn fmt_env_query<'a>(env: &'a HashMap<&'a str, &'a str>) -> Snippet<'a> { + "select " + + if env.is_empty() { + sql("null") + } else { + env.iter() + .map(|(k, v)| { + "set_config(" + param(k as &SqlParam) + ", " + param(v as &SqlParam) + ", true)" + }) + .join(",") + } +} + +// TODO: see about removing the need for cloning the values (inner things are &Cow already) +fn to_sql_param(p: &Param) -> JsonValue { + match p { + SV(SingleVal(v, ..)) => JsonValue::String(v.to_string()), + Str(v) => JsonValue::String((*v).to_string()), + StrOwned(v) => JsonValue::String((*v).clone()), + PL(Payload(v, ..)) => JsonValue::String(v.clone().into_owned()), + LV(ListVal(v, ..)) => { + if v.is_empty() { + JsonValue::String(r"{}".to_string()) + } else { + JsonValue::String(format!( + "{{\"{}\"}}", + v.iter() + .map(|e| e.replace('\\', "\\\\").replace('\"', "\\\"")) + .collect::<Vec<_>>() + .join("\",\"") + )) + } + } + } +} + +#[derive(serde::Serialize)] +struct QueryData<'a> { + query: Cow<'a, str>, + params: Vec<JsonValue>, +} + +#[derive(serde::Serialize)] +struct BatchQueryData<'a> { + queries: Vec<QueryData<'a>>, +} + +async fn make_local_proxy_request<S: DeserializeOwned>( + client: &mut http_conn_pool::Client<Send>, + headers: impl IntoIterator<Item = (&'static HeaderName, HeaderValue)>, + body: QueryData<'_>, + max_len: usize, +) -> Result<S, RestError> { + let body_string = serde_json::to_string(&body) + .map_err(|e| RestError::JsonConversion(JsonConversionError::ParseJsonError(e)))?; + + let response = make_raw_local_proxy_request(client, headers, body_string).await?; + + let response_status = response.status(); + + if response_status != StatusCode::OK { + return Err(RestError::SubzeroCore(InternalError { + message: "Failed to get endpoint schema".to_string(), + })); + } + + // Capture the response body + let response_body = crate::http::read_body_with_limit(response.into_body(), max_len) + .await + .map_err(ReadPayloadError::from)?; + + // Parse the JSON response + let response_json: S = serde_json::from_slice(&response_body) + .map_err(|e| RestError::SubzeroCore(JsonDeserialize { source: e }))?; + + Ok(response_json) +} + +async fn make_raw_local_proxy_request( + client: &mut http_conn_pool::Client<Send>, + headers: impl IntoIterator<Item = (&'static HeaderName, HeaderValue)>, + body: String, +) -> Result<Response<Incoming>, RestError> { + let local_proxy_uri = ::http::Uri::from_static("http://proxy.local/sql"); + let mut req = Request::builder().method(Method::POST).uri(local_proxy_uri); + let req_headers = req.headers_mut().expect("failed to get headers"); + // Add all provided headers to the request + for (header_name, header_value) in headers { + req_headers.insert(header_name, header_value.clone()); + } + + let body_boxed = Full::new(Bytes::from(body)) + .map_err(|never| match never {}) // Convert Infallible to hyper::Error + .boxed(); + + let req = req.body(body_boxed).map_err(|_| { + RestError::SubzeroCore(InternalError { + message: "Failed to build request".to_string(), + }) + })?; + + // Send the request to the local proxy + client + .inner + .inner + .send_request(req) + .await + .map_err(LocalProxyConnError::from) + .map_err(HttpConnError::from) + .map_err(RestError::from) +} + +pub(crate) async fn handle( + config: &'static ProxyConfig, + ctx: RequestContext, + request: Request<Incoming>, + backend: Arc<PoolingBackend>, + cancel: CancellationToken, +) -> Result<Response<BoxBody<Bytes, hyper::Error>>, ApiError> { + let result = handle_inner(cancel, config, &ctx, request, backend).await; + + let mut response = match result { + Ok(r) => { + ctx.set_success(); + + // Handling the error response
from local proxy here + if r.status().is_server_error() { + let status = r.status(); + + let body_bytes = r + .collect() + .await + .map_err(|e| { + ApiError::InternalServerError(anyhow::Error::msg(format!( + "could not collect http body: {e}" + ))) + })? + .to_bytes(); + + if let Ok(mut json_map) = + serde_json::from_slice::>(&body_bytes) + { + let message = json_map.get("message"); + if let Some(message) = message { + let msg: String = match serde_json::from_str(message.get()) { + Ok(msg) => msg, + Err(_) => { + "Unable to parse the response message from server".to_string() + } + }; + + error!("Error response from local_proxy: {status} {msg}"); + + json_map.retain(|key, _| !key.starts_with("neon:")); // remove all the neon-related keys + + let resp_json = serde_json::to_string(&json_map) + .unwrap_or("failed to serialize the response message".to_string()); + + return json_response(status, resp_json); + } + } + + error!("Unable to parse the response message from local_proxy"); + return json_response( + status, + json!({ "message": "Unable to parse the response message from server".to_string() }), + ); + } + r + } + Err(e @ RestError::SubzeroCore(_)) => { + let error_kind = e.get_error_kind(); + ctx.set_error_kind(error_kind); + + tracing::info!( + kind=error_kind.to_metric_label(), + error=%e, + msg="subzero core error", + "forwarding error to user" + ); + + let RestError::SubzeroCore(subzero_err) = e else { + panic!("expected subzero core error") + }; + + let json_body = subzero_err.json_body(); + let status_code = StatusCode::from_u16(subzero_err.status_code()) + .unwrap_or(StatusCode::INTERNAL_SERVER_ERROR); + + json_response(status_code, json_body)? + } + Err(e) => { + let error_kind = e.get_error_kind(); + ctx.set_error_kind(error_kind); + + let message = e.to_string_client(); + let status_code = e.get_http_status_code(); + + tracing::info!( + kind=error_kind.to_metric_label(), + error=%e, + msg=message, + "forwarding error to user" + ); + + let (code, detail, hint) = match e { + RestError::Postgres(e) => ( + if e.code.starts_with("PT") { + None + } else { + Some(e.code) + }, + e.detail, + e.hint, + ), + _ => (None, None, None), + }; + + json_response( + status_code, + json!({ + "message": message, + "code": code, + "detail": detail, + "hint": hint, + }), + )? + } + }; + + response + .headers_mut() + .insert("Access-Control-Allow-Origin", HeaderValue::from_static("*")); + Ok(response) +} + +async fn handle_inner( + _cancel: CancellationToken, + config: &'static ProxyConfig, + ctx: &RequestContext, + request: Request, + backend: Arc, +) -> Result>, RestError> { + let _requeset_gauge = Metrics::get() + .proxy + .connection_requests + .guard(ctx.protocol()); + info!( + protocol = %ctx.protocol(), + "handling interactive connection from client" + ); + + // Read host from Host, then URI host as fallback + // TODO: will this be a problem if behind a load balancer? + // TODO: can we use the x-forwarded-host header? + let host = request + .headers() + .get(HOST) + .and_then(|v| v.to_str().ok()) + .unwrap_or_else(|| request.uri().host().unwrap_or("")); + + // a valid path is /database/rest/v1/... so splitting should be ["", "database", "rest", "v1", ...] 
+ let database_name = request + .uri() + .path() + .split('/') + .nth(1) + .ok_or(RestError::SubzeroCore(NotFound { + target: request.uri().path().to_string(), + }))?; + + // we always use the authenticator role to connect to the database + let authenticator_role = "authenticator"; + + // Strip the hostname prefix from the host to get the database hostname + let database_host = host.replace(&config.rest_config.hostname_prefix, ""); + + let connection_string = + format!("postgresql://{authenticator_role}@{database_host}/{database_name}"); + + let conn_info = get_conn_info( + &config.authentication_config, + ctx, + Some(&connection_string), + request.headers(), + )?; + info!( + user = conn_info.conn_info.user_info.user.as_str(), + "credentials" + ); + + match conn_info.auth { + AuthData::Jwt(jwt) => { + let api_prefix = format!("/{database_name}/rest/v1/"); + handle_rest_inner( + config, + ctx, + &api_prefix, + request, + &connection_string, + conn_info.conn_info, + jwt, + backend, + ) + .await + } + AuthData::Password(_) => Err(RestError::ConnInfo(ConnInfoError::MissingCredentials( + Credentials::BearerJwt, + ))), + } +} + +#[allow(clippy::too_many_arguments)] +async fn handle_rest_inner( + config: &'static ProxyConfig, + ctx: &RequestContext, + api_prefix: &str, + request: Request, + connection_string: &str, + conn_info: ConnInfo, + jwt: String, + backend: Arc, +) -> Result>, RestError> { + // validate the jwt token + let jwt_parsed = backend + .authenticate_with_jwt(ctx, &conn_info.user_info, jwt) + .await + .map_err(HttpConnError::from)?; + + let db_schema_cache = + config + .rest_config + .db_schema_cache + .as_ref() + .ok_or(RestError::SubzeroCore(InternalError { + message: "DB schema cache is not configured".to_string(), + }))?; + + let endpoint_cache_key = conn_info + .endpoint_cache_key() + .ok_or(RestError::SubzeroCore(InternalError { + message: "Failed to get endpoint cache key".to_string(), + }))?; + + let mut client = backend.connect_to_local_proxy(ctx, conn_info).await?; + + let (parts, originial_body) = request.into_parts(); + + let auth_header = parts + .headers + .get(AUTHORIZATION) + .ok_or(RestError::SubzeroCore(InternalError { + message: "Authorization header is required".to_string(), + }))?; + + let entry = db_schema_cache + .get_cached_or_remote( + &endpoint_cache_key, + auth_header, + connection_string, + &mut client, + ctx, + config, + ) + .await?; + let (api_config, db_schema_owned) = entry.as_ref(); + let db_schema = db_schema_owned.borrow_schema(); + + let db_schemas = &api_config.db_schemas; // list of schemas available for the api + let db_extra_search_path = &api_config.db_extra_search_path; + // TODO: use this when we get a replacement for jsonpath_lib + // let role_claim_key = &api_config.role_claim_key; + // let role_claim_path = format!("${role_claim_key}"); + let db_anon_role = &api_config.db_anon_role; + let max_rows = api_config.db_max_rows.as_deref(); + let db_allowed_select_functions = api_config + .db_allowed_select_functions + .iter() + .map(|s| s.as_str()) + .collect::>(); + + // extract the jwt claims (we'll need them later to set the role and env) + let jwt_claims = match jwt_parsed.keys { + ComputeCredentialKeys::JwtPayload(payload_bytes) => { + // `payload_bytes` contains the raw JWT payload as Vec + // You can deserialize it back to JSON or parse specific claims + let payload: serde_json::Value = serde_json::from_slice(&payload_bytes) + .map_err(|e| RestError::SubzeroCore(JsonDeserialize { source: e }))?; + Some(payload) + } + 
ComputeCredentialKeys::AuthKeys(_) => None, + }; + + // read the role from the jwt claims (and set it to the "anon" role if not present) + let (role, authenticated) = match &jwt_claims { + Some(claims) => match claims.get("role") { + Some(JsonValue::String(r)) => (Some(r), true), + _ => (db_anon_role.as_ref(), true), + }, + None => (db_anon_role.as_ref(), false), + }; + + // do not allow unauthenticated requests when there is no anonymous role setup + if let (None, false) = (role, authenticated) { + return Err(RestError::SubzeroCore(JwtTokenInvalid { + message: "unauthenticated requests not allowed".to_string(), + })); + } + + // start deconstructing the request because subzero core mostly works with &str + let method = parts.method; + let method_str = method.as_str(); + let path = parts.uri.path_and_query().map_or("/", |pq| pq.as_str()); + + // this is actually the table name (or rpc/function_name) + // TODO: rename this to something more descriptive + let root = match parts.uri.path().strip_prefix(api_prefix) { + Some(p) => Ok(p), + None => Err(RestError::SubzeroCore(NotFound { + target: parts.uri.path().to_string(), + })), + }?; + + // pick the current schema from the headers (or the first one from config) + let schema_name = &DbSchema::pick_current_schema(db_schemas, method_str, &parts.headers)?; + + // add the content-profile header to the response + let mut response_headers = vec![]; + if db_schemas.len() > 1 { + response_headers.push(("Content-Profile".to_string(), schema_name.clone())); + } + + // parse the query string into a Vec<(&str, &str)> + let query = match parts.uri.query() { + Some(q) => form_urlencoded::parse(q.as_bytes()).collect(), + None => vec![], + }; + let get: Vec<(&str, &str)> = query.iter().map(|(k, v)| (&**k, &**v)).collect(); + + // convert the headers map to a HashMap<&str, &str> + let headers: HashMap<&str, &str> = parts + .headers + .iter() + .map(|(k, v)| (k.as_str(), v.to_str().unwrap_or("__BAD_HEADER__"))) + .collect(); + + let cookies = HashMap::new(); // TODO: add cookies + + // Read the request body (skip for GET requests) + let body_as_string: Option = if method == Method::GET { + None + } else { + let body_bytes = + read_body_with_limit(originial_body, config.http_config.max_request_size_bytes) + .await + .map_err(ReadPayloadError::from)?; + if body_bytes.is_empty() { + None + } else { + Some(String::from_utf8_lossy(&body_bytes).into_owned()) + } + }; + + // parse the request into an ApiRequest struct + let mut api_request = parse( + schema_name, + root, + db_schema, + method_str, + path, + get, + body_as_string.as_deref(), + headers, + cookies, + max_rows, + ) + .map_err(RestError::SubzeroCore)?; + + let role_str = match role { + Some(r) => r, + None => "", + }; + + replace_select_star(db_schema, schema_name, role_str, &mut api_request.query)?; + + // TODO: this is not relevant when acting as PostgREST but will be useful + // in the context of DBX where they need internal permissions + // if !disable_internal_permissions { + // check_privileges(db_schema, schema_name, role_str, &api_request)?; + // } + + check_safe_functions(&api_request, &db_allowed_select_functions)?; + + // TODO: this is not relevant when acting as PostgREST but will be useful + // in the context of DBX where they need internal permissions + // if !disable_internal_permissions { + // insert_policy_conditions(db_schema, schema_name, role_str, &mut api_request.query)?; + // } + + let env_role = Some(role_str); + + // construct the env (passed in to the sql context as GUCs) + let 
empty_json = "{}".to_string(); + let headers_env = serde_json::to_string(&api_request.headers).unwrap_or(empty_json.clone()); + let cookies_env = serde_json::to_string(&api_request.cookies).unwrap_or(empty_json.clone()); + let get_env = serde_json::to_string(&api_request.get).unwrap_or(empty_json.clone()); + let jwt_claims_env = jwt_claims + .as_ref() + .map(|v| serde_json::to_string(v).unwrap_or(empty_json.clone())) + .unwrap_or(if let Some(r) = env_role { + let claims: HashMap<&str, &str> = HashMap::from([("role", r)]); + serde_json::to_string(&claims).unwrap_or(empty_json.clone()) + } else { + empty_json.clone() + }); + let mut search_path = vec![api_request.schema_name]; + if let Some(extra) = &db_extra_search_path { + search_path.extend(extra.iter().map(|s| s.as_str())); + } + let search_path_str = search_path + .into_iter() + .filter(|s| !s.is_empty()) + .collect::>() + .join(","); + let mut env: HashMap<&str, &str> = HashMap::from([ + ("request.method", api_request.method), + ("request.path", api_request.path), + ("request.headers", &headers_env), + ("request.cookies", &cookies_env), + ("request.get", &get_env), + ("request.jwt.claims", &jwt_claims_env), + ("search_path", &search_path_str), + ]); + if let Some(r) = env_role { + env.insert("role", r); + } + + // generate the sql statements + let (env_statement, env_parameters, _) = generate(fmt_env_query(&env)); + let (main_statement, main_parameters, _) = generate(fmt_main_query( + db_schema, + api_request.schema_name, + &api_request, + &env, + )?); + + let mut headers = vec![ + (&NEON_REQUEST_ID, uuid_to_header_value(ctx.session_id())), + ( + &CONN_STRING, + HeaderValue::from_str(connection_string).expect("invalid connection string"), + ), + (&AUTHORIZATION, auth_header.clone()), + ( + &TXN_ISOLATION_LEVEL, + HeaderValue::from_static("ReadCommitted"), + ), + (&ALLOW_POOL, HEADER_VALUE_TRUE), + ]; + + if api_request.read_only { + headers.push((&TXN_READ_ONLY, HEADER_VALUE_TRUE)); + } + + // convert the parameters from subzero core representation to the local proxy repr. + let req_body = serde_json::to_string(&BatchQueryData { + queries: vec![ + QueryData { + query: env_statement.into(), + params: env_parameters + .iter() + .map(|p| to_sql_param(&p.to_param())) + .collect(), + }, + QueryData { + query: main_statement.into(), + params: main_parameters + .iter() + .map(|p| to_sql_param(&p.to_param())) + .collect(), + }, + ], + }) + .map_err(|e| RestError::JsonConversion(JsonConversionError::ParseJsonError(e)))?; + + // todo: map body to count egress + let _metrics = client.metrics(ctx); // FIXME: is everything in the context set correctly? + + // send the request to the local proxy + let response = make_raw_local_proxy_request(&mut client, headers, req_body).await?; + let (parts, body) = response.into_parts(); + + let max_response = config.http_config.max_response_size_bytes; + let bytes = read_body_with_limit(body, max_response) + .await + .map_err(ReadPayloadError::from)?; + + // if the response status is greater than 399, then it is an error + // FIXME: check if there are other error codes or shapes of the response + if parts.status.as_u16() > 399 { + // turn this postgres error from the json into PostgresError + let postgres_error = serde_json::from_slice(&bytes) + .map_err(|e| RestError::SubzeroCore(JsonDeserialize { source: e }))?; + + return Err(RestError::Postgres(postgres_error)); + } + + #[derive(Deserialize)] + struct QueryResults { + /// we run two queries, so we want only two results. 
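+        /// The tuple order matches `BatchQueryData` above: the env-setting statement
+        /// first, then the main PostgREST-style statement.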
+ results: (EnvRows, MainRows), + } + + /// `env_statement` returns nothing of interest to us + #[derive(Deserialize)] + struct EnvRows {} + + #[derive(Deserialize)] + struct MainRows { + /// `main_statement` only returns a single row. + rows: [MainRow; 1], + } + + #[derive(Deserialize)] + struct MainRow { + body: String, + page_total: Option, + total_result_set: Option, + response_headers: Option, + response_status: Option, + } + + let results: QueryResults = serde_json::from_slice(&bytes) + .map_err(|e| RestError::SubzeroCore(JsonDeserialize { source: e }))?; + + let QueryResults { + results: (_, MainRows { rows: [row] }), + } = results; + + // build the intermediate response object + let api_response = ApiResponse { + page_total: row.page_total.map_or(0, |v| v.parse::().unwrap_or(0)), + total_result_set: row.total_result_set.map(|v| v.parse::().unwrap_or(0)), + top_level_offset: 0, // FIXME: check why this is 0 + response_headers: row.response_headers, + response_status: row.response_status, + body: row.body, + }; + + // TODO: rollback the transaction if the page_total is not 1 and the accept_content_type is SingularJSON + // we can not do this in the context of proxy for now + // if api_request.accept_content_type == SingularJSON && api_response.page_total != 1 { + // // rollback the transaction here + // return Err(RestError::SubzeroCore(SingularityError { + // count: api_response.page_total, + // content_type: "application/vnd.pgrst.object+json".to_string(), + // })); + // } + + // TODO: rollback the transaction if the page_total is not 1 and the method is PUT + // we can not do this in the context of proxy for now + // if api_request.method == Method::PUT && api_response.page_total != 1 { + // // Makes sure the querystring pk matches the payload pk + // // e.g. PUT /items?id=eq.1 { "id" : 1, .. } is accepted, + // // PUT /items?id=eq.14 { "id" : 2, .. } is rejected. + // // If this condition is not satisfied then nothing is inserted, + // // rollback the transaction here + // return Err(RestError::SubzeroCore(PutMatchingPkError)); + // } + + // create and return the response to the client + // this section mostly deals with setting the right headers according to PostgREST specs + let page_total = api_response.page_total; + let total_result_set = api_response.total_result_set; + let top_level_offset = api_response.top_level_offset; + let response_content_type = match (&api_request.accept_content_type, &api_request.query.node) { + (SingularJSON, _) + | ( + _, + FunctionCall { + returns_single: true, + is_scalar: false, + .. + }, + ) => SingularJSON, + (TextCSV, _) => TextCSV, + _ => ApplicationJSON, + }; + + // check if the SQL env set some response headers (happens when we called a rpc function) + if let Some(response_headers_str) = api_response.response_headers { + let Ok(headers_json) = + serde_json::from_str::>>(response_headers_str.as_str()) + else { + return Err(RestError::SubzeroCore(GucHeadersError)); + }; + + response_headers.extend(headers_json.into_iter().flatten()); + } + + // calculate and set the content range header + let lower = top_level_offset as i64; + let upper = top_level_offset as i64 + page_total as i64 - 1; + let total = total_result_set.map(|t| t as i64); + let content_range = match (&method, &api_request.query.node) { + (&Method::POST, Insert { .. }) => content_range_header(1, 0, total), + (&Method::DELETE, Delete { .. 
}) => content_range_header(1, upper, total), + _ => content_range_header(lower, upper, total), + }; + response_headers.push(("Content-Range".to_string(), content_range)); + + // calculate the status code + #[rustfmt::skip] + let mut status = match (&method, &api_request.query.node, page_total, &api_request.preferences) { + (&Method::POST, Insert { .. }, ..) => 201, + (&Method::DELETE, Delete { .. }, _, Some(Preferences {representation: Some(Representation::Full),..}),) => 200, + (&Method::DELETE, Delete { .. }, ..) => 204, + (&Method::PATCH, Update { columns, .. }, 0, _) if !columns.is_empty() => 404, + (&Method::PATCH, Update { .. }, _,Some(Preferences {representation: Some(Representation::Full),..}),) => 200, + (&Method::PATCH, Update { .. }, ..) => 204, + (&Method::PUT, Insert { .. },_,Some(Preferences {representation: Some(Representation::Full),..}),) => 200, + (&Method::PUT, Insert { .. }, ..) => 204, + _ => content_range_status(lower, upper, total), + }; + + // add the preference-applied header + if let Some(Preferences { + resolution: Some(r), + .. + }) = api_request.preferences + { + response_headers.push(( + "Preference-Applied".to_string(), + match r { + MergeDuplicates => "resolution=merge-duplicates".to_string(), + IgnoreDuplicates => "resolution=ignore-duplicates".to_string(), + }, + )); + } + + // check if the SQL env set some response status (happens when we called a rpc function) + if let Some(response_status_str) = api_response.response_status { + status = response_status_str + .parse::() + .map_err(|_| RestError::SubzeroCore(GucStatusError))?; + } + + // set the content type header + // TODO: move this to a subzero function + // as_header_value(&self) -> Option<&str> + let http_content_type = match response_content_type { + SingularJSON => Ok("application/vnd.pgrst.object+json"), + TextCSV => Ok("text/csv"), + ApplicationJSON => Ok("application/json"), + Other(t) => Err(RestError::SubzeroCore(ContentTypeError { + message: format!("None of these Content-Types are available: {t}"), + })), + }?; + + // build the response body + let response_body = Full::new(Bytes::from(api_response.body)) + .map_err(|never| match never {}) + .boxed(); + + // build the response + let mut response = Response::builder() + .status(StatusCode::from_u16(status).unwrap_or(StatusCode::INTERNAL_SERVER_ERROR)) + .header(CONTENT_TYPE, http_content_type); + + // Add all headers from response_headers vector + for (header_name, header_value) in response_headers { + response = response.header(header_name, header_value); + } + + // add the body and return the response + response.body(response_body).map_err(|_| { + RestError::SubzeroCore(InternalError { + message: "Failed to build response".to_string(), + }) + }) +} diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 8a14f804b6..f254b41b5b 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -64,7 +64,7 @@ enum Payload { Batch(BatchQueryData), } -static HEADER_VALUE_TRUE: HeaderValue = HeaderValue::from_static("true"); +pub(super) const HEADER_VALUE_TRUE: HeaderValue = HeaderValue::from_static("true"); fn bytes_to_pg_text<'de, D>(deserializer: D) -> Result>, D::Error> where diff --git a/proxy/src/util.rs b/proxy/src/util.rs index 0291216d94..c89ebab008 100644 --- a/proxy/src/util.rs +++ b/proxy/src/util.rs @@ -20,3 +20,13 @@ pub async fn run_until( Either::Right((f2, _)) => Err(f2), } } + +pub fn deserialize_json_string<'de, D, T>(deserializer: D) -> Result +where + T: 
for<'de2> serde::Deserialize<'de2>, + D: serde::Deserializer<'de>, +{ + use serde::Deserialize; + let s = String::deserialize(deserializer)?; + serde_json::from_str(&s).map_err(::custom) +} diff --git a/proxy/subzero_core/.gitignore b/proxy/subzero_core/.gitignore new file mode 100644 index 0000000000..f2f9e58ec3 --- /dev/null +++ b/proxy/subzero_core/.gitignore @@ -0,0 +1,2 @@ +target +Cargo.lock \ No newline at end of file diff --git a/proxy/subzero_core/Cargo.toml b/proxy/subzero_core/Cargo.toml new file mode 100644 index 0000000000..13185873d0 --- /dev/null +++ b/proxy/subzero_core/Cargo.toml @@ -0,0 +1,12 @@ +# This is a stub for the subzero-core crate. +[package] +name = "subzero-core" +version = "3.0.1" +edition = "2024" +publish = false # "private"! + +[features] +default = [] +postgresql = [] + +[dependencies] diff --git a/proxy/subzero_core/src/lib.rs b/proxy/subzero_core/src/lib.rs new file mode 100644 index 0000000000..b99246b98b --- /dev/null +++ b/proxy/subzero_core/src/lib.rs @@ -0,0 +1 @@ +// This is a stub for the subzero-core crate. diff --git a/pyproject.toml b/pyproject.toml index e992e81fe7..7631a05942 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,6 +50,7 @@ types-pyyaml = "^6.0.12.20240917" testcontainers = "^4.9.0" # Install a release candidate of `jsonnet`, as it supports Python 3.13 jsonnet = "^0.21.0-rc2" +requests-unixsocket = "^0.4.1" [tool.poetry.group.dev.dependencies] mypy = "==1.13.0" diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 6955028c73..56822b5c25 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -58,6 +58,7 @@ metrics.workspace = true pem.workspace = true postgres_backend.workspace = true postgres_ffi.workspace = true +postgres_ffi_types.workspace = true postgres_versioninfo.workspace = true pq_proto.workspace = true remote_storage.workspace = true @@ -71,6 +72,7 @@ http-utils.workspace = true utils.workspace = true wal_decoder.workspace = true env_logger.workspace = true +nix.workspace = true workspace_hack.workspace = true diff --git a/safekeeper/src/auth.rs b/safekeeper/src/auth.rs index 81c79fae30..008f903a89 100644 --- a/safekeeper/src/auth.rs +++ b/safekeeper/src/auth.rs @@ -21,7 +21,8 @@ pub fn check_permission(claims: &Claims, tenant_id: Option) -> Result< | Scope::GenerationsApi | Scope::Infra | Scope::Scrubber - | Scope::ControllerPeer, + | Scope::ControllerPeer + | Scope::TenantEndpoint, _, ) => Err(AuthError( format!( diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 79cf2f9149..2ec541b6f0 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -17,8 +17,9 @@ use http_utils::tls_certs::ReloadingCertificateResolver; use metrics::set_build_info_metric; use remote_storage::RemoteStorageConfig; use safekeeper::defaults::{ - DEFAULT_CONTROL_FILE_SAVE_INTERVAL, DEFAULT_EVICTION_MIN_RESIDENT, DEFAULT_HEARTBEAT_TIMEOUT, - DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES, + DEFAULT_CONTROL_FILE_SAVE_INTERVAL, DEFAULT_EVICTION_MIN_RESIDENT, + DEFAULT_GLOBAL_DISK_CHECK_INTERVAL, DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, + DEFAULT_MAX_GLOBAL_DISK_USAGE_RATIO, DEFAULT_MAX_OFFLOADER_LAG_BYTES, DEFAULT_MAX_REELECT_OFFLOADER_LAG_BYTES, DEFAULT_MAX_TIMELINE_DISK_USAGE_BYTES, DEFAULT_PARTIAL_BACKUP_CONCURRENCY, DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR, DEFAULT_SSL_CERT_FILE, DEFAULT_SSL_CERT_RELOAD_PERIOD, DEFAULT_SSL_KEY_FILE, @@ -42,6 +43,12 @@ use utils::metrics_collector::{METRICS_COLLECTION_INTERVAL, 
METRICS_COLLECTOR}; use utils::sentry_init::init_sentry; use utils::{pid_file, project_build_tag, project_git_version, tcp_listener}; +use safekeeper::hadron::{ + GLOBAL_DISK_LIMIT_EXCEEDED, get_filesystem_capacity, get_filesystem_usage, +}; +use safekeeper::metrics::GLOBAL_DISK_UTIL_CHECK_SECONDS; +use std::sync::atomic::Ordering; + #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; @@ -256,6 +263,15 @@ struct Args { /* BEGIN_HADRON */ #[arg(long)] enable_pull_timeline_on_startup: bool, + /// How often to scan entire data-dir for total disk usage + #[arg(long, value_parser=humantime::parse_duration, default_value = DEFAULT_GLOBAL_DISK_CHECK_INTERVAL)] + global_disk_check_interval: Duration, + /// The portion of the filesystem capacity that can be used by all timelines. + /// A circuit breaker will trip and reject all WAL writes if the total usage + /// exceeds this ratio. + /// Set to 0 to disable the global disk usage limit. + #[arg(long, default_value_t = DEFAULT_MAX_GLOBAL_DISK_USAGE_RATIO)] + max_global_disk_usage_ratio: f64, /* END_HADRON */ } @@ -444,6 +460,8 @@ async fn main() -> anyhow::Result<()> { advertise_pg_addr_tenant_only: None, enable_pull_timeline_on_startup: args.enable_pull_timeline_on_startup, hcc_base_url: None, + global_disk_check_interval: args.global_disk_check_interval, + max_global_disk_usage_ratio: args.max_global_disk_usage_ratio, /* END_HADRON */ }); @@ -618,6 +636,49 @@ async fn start_safekeeper(conf: Arc) -> Result<()> { .map(|res| ("Timeline map housekeeping".to_owned(), res)); tasks_handles.push(Box::pin(timeline_housekeeping_handle)); + /* BEGIN_HADRON */ + // Spawn global disk usage watcher task, if a global disk usage limit is specified. + let interval = conf.global_disk_check_interval; + let data_dir = conf.workdir.clone(); + // Use the safekeeper data directory to compute filesystem capacity. This only runs once on startup, so + // there is little point to continue if we can't have the proper protections in place. + let fs_capacity_bytes = get_filesystem_capacity(data_dir.as_std_path()) + .expect("Failed to get filesystem capacity for data directory"); + let limit: u64 = (conf.max_global_disk_usage_ratio * fs_capacity_bytes as f64) as u64; + if limit > 0 { + let disk_usage_watch_handle = BACKGROUND_RUNTIME + .handle() + .spawn(async move { + // Use Tokio interval to preserve fixed cadence between filesystem utilization checks + let mut ticker = tokio::time::interval(interval); + ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay); + + loop { + ticker.tick().await; + let data_dir_clone = data_dir.clone(); + let check_start = Instant::now(); + + let usage = tokio::task::spawn_blocking(move || { + get_filesystem_usage(data_dir_clone.as_std_path()) + }) + .await + .unwrap_or(0); + + let elapsed = check_start.elapsed().as_secs_f64(); + GLOBAL_DISK_UTIL_CHECK_SECONDS.observe(elapsed); + if usage > limit { + warn!( + "Global disk usage exceeded limit. 
Usage: {} bytes, limit: {} bytes", + usage, limit + ); + } + GLOBAL_DISK_LIMIT_EXCEEDED.store(usage > limit, Ordering::Relaxed); + } + }) + .map(|res| ("Global disk usage watcher".to_string(), res)); + tasks_handles.push(Box::pin(disk_usage_watch_handle)); + } + /* END_HADRON */ if let Some(pg_listener_tenant_only) = pg_listener_tenant_only { let wal_service_handle = current_thread_rt .as_ref() diff --git a/safekeeper/src/hadron.rs b/safekeeper/src/hadron.rs index b41bf2c3da..8c6a912166 100644 --- a/safekeeper/src/hadron.rs +++ b/safekeeper/src/hadron.rs @@ -1,12 +1,17 @@ +use once_cell::sync::Lazy; use pem::Pem; use safekeeper_api::models::PullTimelineRequest; -use std::{collections::HashMap, env::VarError, net::IpAddr, sync::Arc, time::Duration}; +use std::{ + collections::HashMap, env::VarError, net::IpAddr, sync::Arc, sync::atomic::AtomicBool, + time::Duration, +}; use tokio::time::sleep; use tokio_util::sync::CancellationToken; use url::Url; -use utils::{backoff, id::TenantTimelineId, ip_address}; +use utils::{backoff, critical_timeline, id::TenantTimelineId, ip_address}; + +use anyhow::{Result, anyhow}; -use anyhow::Result; use pageserver_api::controller_api::{ AvailabilityZone, NodeRegisterRequest, SafekeeperTimeline, SafekeeperTimelinesResponse, }; @@ -346,6 +351,70 @@ pub async fn hcc_pull_timelines( Ok(()) } +/// true if the last background scan found total usage > limit +pub static GLOBAL_DISK_LIMIT_EXCEEDED: Lazy = Lazy::new(|| AtomicBool::new(false)); + +/// Returns filesystem usage in bytes for the filesystem containing the given path. +// Need to suppress the clippy::unnecessary_cast warning because the casts on the block count and the +// block size are required on macOS (they are 32-bit integers on macOS, apparantly). +#[allow(clippy::unnecessary_cast)] +pub fn get_filesystem_usage(path: &std::path::Path) -> u64 { + // Allow overriding disk usage via failpoint for tests + fail::fail_point!("sk-global-disk-usage", |val| { + // val is Option; parse payload if present + val.and_then(|s| s.parse::().ok()).unwrap_or(0) + }); + + // Call statvfs(3) for filesystem usage + use nix::sys::statvfs::statvfs; + match statvfs(path) { + Ok(stat) => { + // fragment size (f_frsize) if non-zero else block size (f_bsize) + let frsize = stat.fragment_size(); + let blocksz = if frsize > 0 { + frsize + } else { + stat.block_size() + }; + // used blocks = total blocks - available blocks for unprivileged + let used_blocks = stat.blocks().saturating_sub(stat.blocks_available()); + used_blocks as u64 * blocksz as u64 + } + Err(e) => { + // The global disk usage watcher aren't associated with a tenant or timeline, so we just + // pass placeholder (all-zero) tenant and timeline IDs to the critical!() macro. + let placeholder_ttid = TenantTimelineId::empty(); + critical_timeline!( + placeholder_ttid.tenant_id, + placeholder_ttid.timeline_id, + "Global disk usage watcher failed to read filesystem usage: {:?}", + e + ); + 0 + } + } +} + +/// Returns the total capacity of the current working directory's filesystem in bytes. 
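+/// At startup this capacity feeds the circuit-breaker threshold
+/// `limit = max_global_disk_usage_ratio * capacity`; for example (illustrative numbers),
+/// a ratio of 0.9 on a 100 GiB filesystem trips the breaker once total usage exceeds
+/// roughly 90 GiB.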
+#[allow(clippy::unnecessary_cast)] +pub fn get_filesystem_capacity(path: &std::path::Path) -> Result { + // Call statvfs(3) for filesystem stats + use nix::sys::statvfs::statvfs; + match statvfs(path) { + Ok(stat) => { + // fragment size (f_frsize) if non-zero else block size (f_bsize) + let frsize = stat.fragment_size(); + let blocksz = if frsize > 0 { + frsize + } else { + stat.block_size() + }; + Ok(stat.blocks() as u64 * blocksz as u64) + } + Err(e) => Err(anyhow!("Failed to read filesystem capacity: {:?}", e)), + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index a0ee2facb5..c9d8e7d3b0 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -33,11 +33,13 @@ use utils::id::{TenantId, TenantTimelineId, TimelineId}; use utils::lsn::Lsn; use crate::debug_dump::TimelineDigestRequest; +use crate::hadron::{get_filesystem_capacity, get_filesystem_usage}; use crate::safekeeper::TermLsn; use crate::timelines_global_map::DeleteOrExclude; use crate::{ GlobalTimelines, SafeKeeperConf, copy_timeline, debug_dump, patch_control_file, pull_timeline, }; +use serde_json::json; /// Healthcheck handler. async fn status_handler(request: Request) -> Result, ApiError> { @@ -127,6 +129,21 @@ async fn utilization_handler(request: Request) -> Result, A json_response(StatusCode::OK, utilization) } +/// Returns filesystem capacity and current utilization for the safekeeper data directory. +async fn filesystem_usage_handler(request: Request) -> Result, ApiError> { + check_permission(&request, None)?; + let conf = get_conf(&request); + let path = conf.workdir.as_std_path(); + let capacity = get_filesystem_capacity(path).map_err(ApiError::InternalServerError)?; + let usage = get_filesystem_usage(path); + let resp = json!({ + "data_dir": path, + "capacity_bytes": capacity, + "usage_bytes": usage, + }); + json_response(StatusCode::OK, resp) +} + /// List all (not deleted) timelines. /// Note: it is possible to do the same with debug_dump. async fn timeline_list_handler(request: Request) -> Result, ApiError> { @@ -730,6 +747,11 @@ pub fn make_router( }) }) .get("/v1/utilization", |r| request_span(r, utilization_handler)) + /* BEGIN_HADRON */ + .get("/v1/debug/filesystem_usage", |r| { + request_span(r, filesystem_usage_handler) + }) + /* END_HADRON */ .delete("/v1/tenant/:tenant_id", |r| { request_span(r, tenant_delete_handler) }) diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 02533b804d..c6f9cc29e5 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -50,6 +50,7 @@ pub mod wal_storage; pub mod test_utils; mod timelines_global_map; + use std::sync::Arc; pub use timelines_global_map::GlobalTimelines; @@ -83,6 +84,10 @@ pub mod defaults { pub const DEFAULT_SSL_KEY_FILE: &str = "server.key"; pub const DEFAULT_SSL_CERT_FILE: &str = "server.crt"; pub const DEFAULT_SSL_CERT_RELOAD_PERIOD: &str = "60s"; + + // Global disk watcher defaults + pub const DEFAULT_GLOBAL_DISK_CHECK_INTERVAL: &str = "60s"; + pub const DEFAULT_MAX_GLOBAL_DISK_USAGE_RATIO: f64 = 0.0; } #[derive(Debug, Clone)] @@ -116,6 +121,10 @@ pub struct SafeKeeperConf { /* BEGIN_HADRON */ pub max_reelect_offloader_lag_bytes: u64, pub max_timeline_disk_usage_bytes: u64, + /// How often to check the working directory's filesystem for total disk usage. + pub global_disk_check_interval: Duration, + /// The portion of the filesystem capacity that can be used by all timelines. 
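+    /// Populated from the `--max-global-disk-usage-ratio` CLI flag; the default of 0.0
+    /// disables the check entirely.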
+ pub max_global_disk_usage_ratio: f64, /* END_HADRON */ pub backup_parallel_jobs: usize, pub wal_backup_enabled: bool, @@ -173,6 +182,8 @@ impl SafeKeeperConf { /* BEGIN_HADRON */ max_reelect_offloader_lag_bytes: defaults::DEFAULT_MAX_REELECT_OFFLOADER_LAG_BYTES, max_timeline_disk_usage_bytes: defaults::DEFAULT_MAX_TIMELINE_DISK_USAGE_BYTES, + global_disk_check_interval: Duration::from_secs(60), + max_global_disk_usage_ratio: defaults::DEFAULT_MAX_GLOBAL_DISK_USAGE_RATIO, /* END_HADRON */ current_thread_runtime: false, walsenders_keep_horizon: false, @@ -235,10 +246,13 @@ pub static WAL_BACKUP_RUNTIME: Lazy = Lazy::new(|| { .expect("Failed to create WAL backup runtime") }); +/// Hadron: Dedicated runtime for infrequent background tasks. pub static BACKGROUND_RUNTIME: Lazy = Lazy::new(|| { tokio::runtime::Builder::new_multi_thread() - .thread_name("background worker") - .worker_threads(1) // there is only one task now (ssl certificate reloading), having more threads doesn't make sense + .thread_name("Hadron background worker") + // One worker thread is enough, as most of the actual tasks run on blocking threads + // which has it own thread pool. + .worker_threads(1) .enable_all() .build() .expect("Failed to create background runtime") diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index e1af51c115..b07852aaee 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -963,3 +963,17 @@ async fn collect_timeline_metrics(global_timelines: Arc) -> Vec } res } + +/* BEGIN_HADRON */ +// Metrics reporting the time spent to perform each safekeeper filesystem utilization check. +pub static GLOBAL_DISK_UTIL_CHECK_SECONDS: Lazy = Lazy::new(|| { + // Buckets from 1ms up to 10s + let buckets = vec![0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0]; + register_histogram!( + "safekeeper_global_disk_utilization_check_seconds", + "Seconds spent to perform each safekeeper filesystem utilization check", + buckets + ) + .expect("Failed to register safekeeper_global_disk_utilization_check_seconds histogram") +}); +/* END_HADRON */ diff --git a/safekeeper/src/rate_limit.rs b/safekeeper/src/rate_limit.rs index 72373b5786..0e697ade57 100644 --- a/safekeeper/src/rate_limit.rs +++ b/safekeeper/src/rate_limit.rs @@ -44,6 +44,6 @@ impl RateLimiter { /// Generate a random duration that is a fraction of the given duration. 
pub fn rand_duration(duration: &std::time::Duration) -> std::time::Duration { - let randf64 = rand::thread_rng().gen_range(0.0..1.0); + let randf64 = rand::rng().random_range(0.0..1.0); duration.mul_f64(randf64) } diff --git a/safekeeper/src/send_interpreted_wal.rs b/safekeeper/src/send_interpreted_wal.rs index 72a436e25f..671798298b 100644 --- a/safekeeper/src/send_interpreted_wal.rs +++ b/safekeeper/src/send_interpreted_wal.rs @@ -742,7 +742,7 @@ mod tests { use std::str::FromStr; use std::time::Duration; - use pageserver_api::shard::{ShardIdentity, ShardStripeSize}; + use pageserver_api::shard::{DEFAULT_STRIPE_SIZE, ShardIdentity}; use postgres_ffi::{MAX_SEND_SIZE, PgMajorVersion}; use tokio::sync::mpsc::error::TryRecvError; use utils::id::{NodeId, TenantTimelineId}; @@ -786,19 +786,13 @@ mod tests { MAX_SEND_SIZE, ); - let shard_0 = ShardIdentity::new( - ShardNumber(0), - ShardCount(SHARD_COUNT), - ShardStripeSize::default(), - ) - .unwrap(); + let shard_0 = + ShardIdentity::new(ShardNumber(0), ShardCount(SHARD_COUNT), DEFAULT_STRIPE_SIZE) + .unwrap(); - let shard_1 = ShardIdentity::new( - ShardNumber(1), - ShardCount(SHARD_COUNT), - ShardStripeSize::default(), - ) - .unwrap(); + let shard_1 = + ShardIdentity::new(ShardNumber(1), ShardCount(SHARD_COUNT), DEFAULT_STRIPE_SIZE) + .unwrap(); let mut shards = HashMap::new(); @@ -806,7 +800,7 @@ mod tests { let shard_id = ShardIdentity::new( ShardNumber(shard_number), ShardCount(SHARD_COUNT), - ShardStripeSize::default(), + DEFAULT_STRIPE_SIZE, ) .unwrap(); let (tx, rx) = tokio::sync::mpsc::channel::(MSG_COUNT * 2); @@ -934,12 +928,9 @@ mod tests { MAX_SEND_SIZE, ); - let shard_0 = ShardIdentity::new( - ShardNumber(0), - ShardCount(SHARD_COUNT), - ShardStripeSize::default(), - ) - .unwrap(); + let shard_0 = + ShardIdentity::new(ShardNumber(0), ShardCount(SHARD_COUNT), DEFAULT_STRIPE_SIZE) + .unwrap(); struct Sender { tx: Option>, @@ -1088,19 +1079,13 @@ mod tests { WAL_READER_BATCH_SIZE, ); - let shard_0 = ShardIdentity::new( - ShardNumber(0), - ShardCount(SHARD_COUNT), - ShardStripeSize::default(), - ) - .unwrap(); + let shard_0 = + ShardIdentity::new(ShardNumber(0), ShardCount(SHARD_COUNT), DEFAULT_STRIPE_SIZE) + .unwrap(); - let shard_1 = ShardIdentity::new( - ShardNumber(1), - ShardCount(SHARD_COUNT), - ShardStripeSize::default(), - ) - .unwrap(); + let shard_1 = + ShardIdentity::new(ShardNumber(1), ShardCount(SHARD_COUNT), DEFAULT_STRIPE_SIZE) + .unwrap(); let mut shards = HashMap::new(); @@ -1108,7 +1093,7 @@ mod tests { let shard_id = ShardIdentity::new( ShardNumber(shard_number), ShardCount(SHARD_COUNT), - ShardStripeSize::default(), + DEFAULT_STRIPE_SIZE, ) .unwrap(); let (tx, rx) = tokio::sync::mpsc::channel::(MSG_COUNT * 2); diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 177e759db5..5891fa88a4 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -12,7 +12,8 @@ use futures::FutureExt; use itertools::Itertools; use parking_lot::Mutex; use postgres_backend::{CopyStreamHandlerEnd, PostgresBackend, PostgresBackendReader, QueryError}; -use postgres_ffi::{MAX_SEND_SIZE, PgMajorVersion, TimestampTz, get_current_timestamp}; +use postgres_ffi::{MAX_SEND_SIZE, PgMajorVersion, get_current_timestamp}; +use postgres_ffi_types::TimestampTz; use pq_proto::{BeMessage, WalSndKeepAlive, XLogDataBody}; use safekeeper_api::Term; use safekeeper_api::models::{ diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index dbe510a019..b8774b30ea 100644 --- a/safekeeper/src/timeline.rs +++ 
b/safekeeper/src/timeline.rs @@ -29,6 +29,8 @@ use utils::sync::gate::Gate; use crate::metrics::{ FullTimelineInfo, MISC_OPERATION_SECONDS, WAL_STORAGE_LIMIT_ERRORS, WalStorageMetrics, }; + +use crate::hadron::GLOBAL_DISK_LIMIT_EXCEEDED; use crate::rate_limit::RateLimiter; use crate::receive_wal::WalReceivers; use crate::safekeeper::{AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, TermLsn}; @@ -425,6 +427,9 @@ impl From for ApiError { TimelineError::NotFound(ttid) => { ApiError::NotFound(anyhow!("timeline {} not found", ttid).into()) } + TimelineError::Deleted(ttid) => { + ApiError::NotFound(anyhow!("timeline {} deleted", ttid).into()) + } _ => ApiError::InternalServerError(anyhow!("{}", te)), } } @@ -1081,6 +1086,11 @@ impl WalResidentTimeline { ); } } + + if GLOBAL_DISK_LIMIT_EXCEEDED.load(Ordering::Relaxed) { + bail!("Global disk usage exceeded limit"); + } + Ok(()) } // END HADRON diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 0e8dfd64c3..03c8f7e84a 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -8,7 +8,7 @@ use std::time::Duration; use anyhow::{Context, Result}; use camino::{Utf8Path, Utf8PathBuf}; use futures::StreamExt; -use futures::stream::FuturesOrdered; +use futures::stream::{self, FuturesOrdered}; use postgres_ffi::v14::xlog_utils::XLogSegNoOffsetToRecPtr; use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo}; use remote_storage::{ @@ -723,8 +723,6 @@ pub async fn copy_s3_segments( from_segment: XLogSegNo, to_segment: XLogSegNo, ) -> Result<()> { - const SEGMENTS_PROGRESS_REPORT_INTERVAL: u64 = 1024; - let remote_dst_path = remote_timeline_path(dst_ttid)?; let cancel = CancellationToken::new(); @@ -744,27 +742,69 @@ pub async fn copy_s3_segments( .filter_map(|o| o.key.object_name().map(ToOwned::to_owned)) .collect::>(); - debug!( + info!( "these segments have already been uploaded: {:?}", uploaded_segments ); - for segno in from_segment..to_segment { - if segno % SEGMENTS_PROGRESS_REPORT_INTERVAL == 0 { - info!("copied all segments from {} until {}", from_segment, segno); - } + /* BEGIN_HADRON */ + // Copying multiple segments async. 
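+    // The shape of the pipeline below, roughly (`copy_one` is shorthand here, not a real helper):
+    //   stream::iter(from_segment..to_segment)
+    //       .map(|segno| async move { copy_one(segno).await })
+    //       .buffer_unordered(32)
+    // keeps at most 32 copy futures in flight at once; each future retries its own segment
+    // up to MAX_RETRIES times, and the first unrecoverable error aborts the whole copy.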
+ let mut copy_stream = stream::iter(from_segment..to_segment) + .map(|segno| { + let segment_name = XLogFileName(PG_TLI, segno, wal_seg_size); + let remote_dst_path = remote_dst_path.clone(); + let cancel = cancel.clone(); - let segment_name = XLogFileName(PG_TLI, segno, wal_seg_size); - if uploaded_segments.contains(&segment_name) { - continue; - } - debug!("copying segment {}", segment_name); + async move { + if uploaded_segments.contains(&segment_name) { + return Ok(()); + } - let from = remote_timeline_path(src_ttid)?.join(&segment_name); - let to = remote_dst_path.join(&segment_name); + if segno % 1000 == 0 { + info!("copying segment {} {}", segno, segment_name); + } - storage.copy_object(&from, &to, &cancel).await?; + let from = remote_timeline_path(src_ttid)?.join(&segment_name); + let to = remote_dst_path.join(&segment_name); + + // Retry logic: retry up to 10 times with 1 second delay + let mut retry_count = 0; + const MAX_RETRIES: u32 = 10; + + loop { + match storage.copy_object(&from, &to, &cancel).await { + Ok(()) => return Ok(()), + Err(e) => { + if cancel.is_cancelled() { + // Don't retry if cancellation was requested + return Err(e); + } + + retry_count += 1; + if retry_count >= MAX_RETRIES { + error!( + "Failed to copy segment {} after {} retries: {}", + segment_name, MAX_RETRIES, e + ); + return Err(e); + } + warn!( + "Failed to copy segment {} (attempt {}/{}): {}, retrying...", + segment_name, retry_count, MAX_RETRIES, e + ); + tokio::time::sleep(Duration::from_secs(1)).await; + } + } + } + } + }) + .buffer_unordered(32); // Limit to 32 concurrent uploads + + // Process results, stopping on first error + while let Some(result) = copy_stream.next().await { + result?; } + /* END_HADRON */ info!( "finished copying segments from {} until {}", diff --git a/safekeeper/tests/random_test.rs b/safekeeper/tests/random_test.rs index e29b58836a..7e7d2390e9 100644 --- a/safekeeper/tests/random_test.rs +++ b/safekeeper/tests/random_test.rs @@ -16,7 +16,7 @@ fn test_random_schedules() -> anyhow::Result<()> { let mut config = TestConfig::new(Some(clock)); for _ in 0..500 { - let seed: u64 = rand::thread_rng().r#gen(); + let seed: u64 = rand::rng().random(); config.network = generate_network_opts(seed); let test = config.start(seed); diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs index 393df6228e..30d3ab1a87 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper.rs @@ -195,6 +195,8 @@ pub fn run_server(os: NodeOs, disk: Arc) -> Result<()> { enable_pull_timeline_on_startup: false, advertise_pg_addr_tenant_only: None, hcc_base_url: None, + global_disk_check_interval: Duration::from_secs(10), + max_global_disk_usage_ratio: 0.0, /* END_HADRON */ }; diff --git a/safekeeper/tests/walproposer_sim/simulation.rs b/safekeeper/tests/walproposer_sim/simulation.rs index edd3bf2d9e..595cc7ab64 100644 --- a/safekeeper/tests/walproposer_sim/simulation.rs +++ b/safekeeper/tests/walproposer_sim/simulation.rs @@ -394,13 +394,13 @@ pub fn generate_schedule(seed: u64) -> Schedule { let mut schedule = Vec::new(); let mut time = 0; - let cnt = rng.gen_range(1..100); + let cnt = rng.random_range(1..100); for _ in 0..cnt { - time += rng.gen_range(0..500); - let action = match rng.gen_range(0..3) { - 0 => TestAction::WriteTx(rng.gen_range(1..10)), - 1 => TestAction::RestartSafekeeper(rng.gen_range(0..3)), + time += rng.random_range(0..500); + let action = match rng.random_range(0..3) { + 0 => 
TestAction::WriteTx(rng.random_range(1..10)), + 1 => TestAction::RestartSafekeeper(rng.random_range(0..3)), 2 => TestAction::RestartWalProposer, _ => unreachable!(), }; @@ -413,13 +413,13 @@ pub fn generate_schedule(seed: u64) -> Schedule { pub fn generate_network_opts(seed: u64) -> NetworkOptions { let mut rng = rand::rngs::StdRng::seed_from_u64(seed); - let timeout = rng.gen_range(100..2000); - let max_delay = rng.gen_range(1..2 * timeout); - let min_delay = rng.gen_range(1..=max_delay); + let timeout = rng.random_range(100..2000); + let max_delay = rng.random_range(1..2 * timeout); + let min_delay = rng.random_range(1..=max_delay); - let max_fail_prob = rng.gen_range(0.0..0.9); - let connect_fail_prob = rng.gen_range(0.0..max_fail_prob); - let send_fail_prob = rng.gen_range(0.0..connect_fail_prob); + let max_fail_prob = rng.random_range(0.0..0.9); + let connect_fail_prob = rng.random_range(0.0..max_fail_prob); + let send_fail_prob = rng.random_range(0.0..connect_fail_prob); NetworkOptions { keepalive_timeout: Some(timeout), diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index 143f4241f4..d67be6d469 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -52,6 +52,7 @@ tokio-rustls.workspace = true tokio-util.workspace = true tokio.workspace = true tracing.workspace = true +uuid.workspace = true measured.workspace = true rustls.workspace = true scopeguard.workspace = true @@ -63,6 +64,7 @@ tokio-postgres-rustls.workspace = true diesel = { version = "2.2.6", features = [ "serde_json", "chrono", + "uuid", ] } diesel-async = { version = "0.5.2", features = ["postgres", "bb8", "async-connection-wrapper"] } diesel_migrations = { version = "2.2.0" } diff --git a/storage_controller/migrations/2025-07-08-114340_sk_set_notified_generation/down.sql b/storage_controller/migrations/2025-07-08-114340_sk_set_notified_generation/down.sql new file mode 100644 index 0000000000..27d6048cd3 --- /dev/null +++ b/storage_controller/migrations/2025-07-08-114340_sk_set_notified_generation/down.sql @@ -0,0 +1 @@ +ALTER TABLE timelines DROP sk_set_notified_generation; diff --git a/storage_controller/migrations/2025-07-08-114340_sk_set_notified_generation/up.sql b/storage_controller/migrations/2025-07-08-114340_sk_set_notified_generation/up.sql new file mode 100644 index 0000000000..50178ab6a3 --- /dev/null +++ b/storage_controller/migrations/2025-07-08-114340_sk_set_notified_generation/up.sql @@ -0,0 +1 @@ +ALTER TABLE timelines ADD sk_set_notified_generation INTEGER NOT NULL DEFAULT 1; diff --git a/storage_controller/migrations/2025-07-17-000001_hadron_safekeepers/down.sql b/storage_controller/migrations/2025-07-17-000001_hadron_safekeepers/down.sql new file mode 100644 index 0000000000..b45b45e438 --- /dev/null +++ b/storage_controller/migrations/2025-07-17-000001_hadron_safekeepers/down.sql @@ -0,0 +1,2 @@ +DROP TABLE hadron_safekeepers; +DROP TABLE hadron_timeline_safekeepers; diff --git a/storage_controller/migrations/2025-07-17-000001_hadron_safekeepers/up.sql b/storage_controller/migrations/2025-07-17-000001_hadron_safekeepers/up.sql new file mode 100644 index 0000000000..6cee981efc --- /dev/null +++ b/storage_controller/migrations/2025-07-17-000001_hadron_safekeepers/up.sql @@ -0,0 +1,17 @@ +-- hadron_safekeepers keep track of all Safe Keeper nodes that exist in the system. +-- Upon startup, each Safe Keeper reaches out to the hadron cluster coordinator to register its node ID and listen addresses. 
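+-- Illustrative registration row (node ID, addresses and ports are made-up example values):
+--   INSERT INTO hadron_safekeepers VALUES (1000000, 'safe-keeper-1-0.svc', 7676, 'safe-keeper-1-0.svc', 5454);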
+ +CREATE TABLE hadron_safekeepers ( + sk_node_id BIGINT PRIMARY KEY NOT NULL, + listen_http_addr VARCHAR NOT NULL, + listen_http_port INTEGER NOT NULL, + listen_pg_addr VARCHAR NOT NULL, + listen_pg_port INTEGER NOT NULL +); + +CREATE TABLE hadron_timeline_safekeepers ( + timeline_id VARCHAR NOT NULL, + sk_node_id BIGINT NOT NULL, + legacy_endpoint_id UUID DEFAULT NULL, + PRIMARY KEY(timeline_id, sk_node_id) +); diff --git a/storage_controller/src/auth.rs b/storage_controller/src/auth.rs index ef47abf8c7..8f15f0f072 100644 --- a/storage_controller/src/auth.rs +++ b/storage_controller/src/auth.rs @@ -1,4 +1,5 @@ use utils::auth::{AuthError, Claims, Scope}; +use uuid::Uuid; pub fn check_permission(claims: &Claims, required_scope: Scope) -> Result<(), AuthError> { if claims.scope != required_scope { @@ -7,3 +8,14 @@ pub fn check_permission(claims: &Claims, required_scope: Scope) -> Result<(), Au Ok(()) } + +#[allow(dead_code)] +pub fn check_endpoint_permission(claims: &Claims, endpoint_id: Uuid) -> Result<(), AuthError> { + if claims.scope != Scope::TenantEndpoint { + return Err(AuthError("Scope mismatch. Permission denied".into())); + } + if claims.endpoint_id != Some(endpoint_id) { + return Err(AuthError("Endpoint id mismatch. Permission denied".into())); + } + Ok(()) +} diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index ab37a207e4..fb03412f3c 100644 --- a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -810,6 +810,7 @@ impl ComputeHook { let send_locked = tokio::select! { guard = send_lock.lock_owned() => {guard}, _ = cancel.cancelled() => { + tracing::info!("Notification cancelled while waiting for lock"); return Err(NotifyError::ShuttingDown) } }; @@ -851,11 +852,32 @@ impl ComputeHook { let notify_url = compute_hook_url.as_ref().unwrap(); self.do_notify(notify_url, &request, cancel).await } else { - self.do_notify_local::(&request).await.map_err(|e| { + match self.do_notify_local::(&request).await.map_err(|e| { // This path is for testing only, so munge the error into our prod-style error type. - tracing::error!("neon_local notification hook failed: {e}"); - NotifyError::Fatal(StatusCode::INTERNAL_SERVER_ERROR) - }) + if e.to_string().contains("refresh-configuration-pending") { + // If the error message mentions "refresh-configuration-pending", it means the compute node + // rejected our notification request because it already trying to reconfigure itself. We + // can proceed with the rest of the reconcliation process as the compute node already + // discovers the need to reconfigure and will eventually update its configuration once + // we update the pageserver mappings. In fact, it is important that we continue with + // reconcliation to make sure we update the pageserver mappings to unblock the compute node. + tracing::info!("neon_local notification hook failed: {e}"); + tracing::info!("Notification failed likely due to compute node self-reconfiguration, will retry."); + Ok(()) + } else { + tracing::error!("neon_local notification hook failed: {e}"); + Err(NotifyError::Fatal(StatusCode::INTERNAL_SERVER_ERROR)) + } + }) { + // Compute node accepted the notification request. Ok to proceed. + Ok(_) => Ok(()), + // Compute node rejected our request but it is already self-reconfiguring. Ok to proceed. + Err(Ok(_)) => Ok(()), + // Fail the reconciliation attempt in all other cases. Recall that this whole code path involving + // neon_local is for testing only. 
In production we always retry failed reconcliations so we + // don't have any deadends here. + Err(Err(e)) => Err(e), + } }; match result { diff --git a/storage_controller/src/hadron_utils.rs b/storage_controller/src/hadron_utils.rs new file mode 100644 index 0000000000..8bfbe8e575 --- /dev/null +++ b/storage_controller/src/hadron_utils.rs @@ -0,0 +1,44 @@ +use std::collections::BTreeMap; + +use rand::Rng; +use utils::shard::TenantShardId; + +static CHARSET: &[u8] = b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789!@#$%^&*()"; + +/// Generate a random string of `length` that can be used as a password. The generated string +/// contains alphanumeric characters and special characters (!@#$%^&*()) +pub fn generate_random_password(length: usize) -> String { + let mut rng = rand::rng(); + (0..length) + .map(|_| { + let idx = rng.random_range(0..CHARSET.len()); + CHARSET[idx] as char + }) + .collect() +} + +pub(crate) struct TenantShardSizeMap { + #[expect(dead_code)] + pub map: BTreeMap, +} + +impl TenantShardSizeMap { + pub fn new(map: BTreeMap) -> Self { + Self { map } + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_generate_random_password() { + let pwd1 = generate_random_password(10); + assert_eq!(pwd1.len(), 10); + let pwd2 = generate_random_password(10); + assert_ne!(pwd1, pwd2); + assert!(pwd1.chars().all(|c| CHARSET.contains(&(c as u8)))); + assert!(pwd2.chars().all(|c| CHARSET.contains(&(c as u8)))); + } +} diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 62fc212e12..ff73719adb 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -48,7 +48,10 @@ use crate::metrics::{ }; use crate::persistence::SafekeeperUpsert; use crate::reconciler::ReconcileError; -use crate::service::{LeadershipStatus, RECONCILE_TIMEOUT, STARTUP_RECONCILE_TIMEOUT, Service}; +use crate::service::{ + LeadershipStatus, RECONCILE_TIMEOUT, STARTUP_RECONCILE_TIMEOUT, Service, + TenantMutationLocations, +}; /// State available to HTTP request handlers pub struct HttpState { @@ -734,83 +737,104 @@ async fn handle_tenant_timeline_passthrough( path ); - // Find the node that holds shard zero - let (node, tenant_shard_id) = if tenant_or_shard_id.is_unsharded() { - service + let tenant_shard_id = if tenant_or_shard_id.is_unsharded() { + // If the request contains only tenant ID, find the node that holds shard zero + let (_, shard_id) = service .tenant_shard0_node(tenant_or_shard_id.tenant_id) - .await? + .await?; + shard_id } else { - ( - service.tenant_shard_node(tenant_or_shard_id).await?, - tenant_or_shard_id, - ) + tenant_or_shard_id }; - // Callers will always pass an unsharded tenant ID. Before proxying, we must - // rewrite this to a shard-aware shard zero ID. 
- let path = format!("{path}"); - let tenant_str = tenant_or_shard_id.tenant_id.to_string(); - let tenant_shard_str = format!("{tenant_shard_id}"); - let path = path.replace(&tenant_str, &tenant_shard_str); + let service_inner = service.clone(); - let latency = &METRICS_REGISTRY - .metrics_group - .storage_controller_passthrough_request_latency; - - let path_label = path_without_ids(&path) - .split('/') - .filter(|token| !token.is_empty()) - .collect::>() - .join("_"); - let labels = PageserverRequestLabelGroup { - pageserver_id: &node.get_id().to_string(), - path: &path_label, - method: crate::metrics::Method::Get, - }; - - let _timer = latency.start_timer(labels.clone()); - - let client = mgmt_api::Client::new( - service.get_http_client().clone(), - node.base_url(), - service.get_config().pageserver_jwt_token.as_deref(), - ); - let resp = client.op_raw(method, path).await.map_err(|e| - // We return 503 here because if we can't successfully send a request to the pageserver, - // either we aren't available or the pageserver is unavailable. - ApiError::ResourceUnavailable(format!("Error sending pageserver API request to {node}: {e}").into()))?; - - if !resp.status().is_success() { - let error_counter = &METRICS_REGISTRY - .metrics_group - .storage_controller_passthrough_request_error; - error_counter.inc(labels); - } - - // Transform 404 into 503 if we raced with a migration - if resp.status() == reqwest::StatusCode::NOT_FOUND { - // Look up node again: if we migrated it will be different - let new_node = service.tenant_shard_node(tenant_shard_id).await?; - if new_node.get_id() != node.get_id() { - // Rather than retry here, send the client a 503 to prompt a retry: this matches - // the pageserver's use of 503, and all clients calling this API should retry on 503. - return Err(ApiError::ResourceUnavailable( - format!("Pageserver {node} returned 404, was migrated to {new_node}").into(), - )); + service.tenant_shard_remote_mutation(tenant_shard_id, |locations| async move { + let TenantMutationLocations(locations) = locations; + if locations.is_empty() { + return Err(ApiError::NotFound(anyhow::anyhow!("Tenant {} not found", tenant_or_shard_id.tenant_id).into())); } - } - // We have a reqest::Response, would like a http::Response - let mut builder = hyper::Response::builder().status(map_reqwest_hyper_status(resp.status())?); - for (k, v) in resp.headers() { - builder = builder.header(k.as_str(), v.as_bytes()); - } + let (tenant_or_shard_id, locations) = locations.into_iter().next().unwrap(); + let node = locations.latest.node; - let response = builder - .body(Body::wrap_stream(resp.bytes_stream())) - .map_err(|e| ApiError::InternalServerError(e.into()))?; + // Callers will always pass an unsharded tenant ID. Before proxying, we must + // rewrite this to a shard-aware shard zero ID. 
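+            // Illustrative example: "/v1/tenant/<tenant_id>/timeline/<timeline_id>" is rewritten to
+            // "/v1/tenant/<tenant_id>-<shard zero suffix>/timeline/<timeline_id>" before being proxied
+            // to the pageserver that currently hosts shard zero.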
+ let path = format!("{path}"); + let tenant_str = tenant_or_shard_id.tenant_id.to_string(); + let tenant_shard_str = format!("{tenant_shard_id}"); + let path = path.replace(&tenant_str, &tenant_shard_str); - Ok(response) + let latency = &METRICS_REGISTRY + .metrics_group + .storage_controller_passthrough_request_latency; + + let path_label = path_without_ids(&path) + .split('/') + .filter(|token| !token.is_empty()) + .collect::>() + .join("_"); + let labels = PageserverRequestLabelGroup { + pageserver_id: &node.get_id().to_string(), + path: &path_label, + method: crate::metrics::Method::Get, + }; + + let _timer = latency.start_timer(labels.clone()); + + let client = mgmt_api::Client::new( + service_inner.get_http_client().clone(), + node.base_url(), + service_inner.get_config().pageserver_jwt_token.as_deref(), + ); + let resp = client.op_raw(method, path).await.map_err(|e| + // We return 503 here because if we can't successfully send a request to the pageserver, + // either we aren't available or the pageserver is unavailable. + ApiError::ResourceUnavailable(format!("Error sending pageserver API request to {node}: {e}").into()))?; + + if !resp.status().is_success() { + let error_counter = &METRICS_REGISTRY + .metrics_group + .storage_controller_passthrough_request_error; + error_counter.inc(labels); + } + let resp_staus = resp.status(); + + // We have a reqest::Response, would like a http::Response + let mut builder = hyper::Response::builder().status(map_reqwest_hyper_status(resp_staus)?); + for (k, v) in resp.headers() { + builder = builder.header(k.as_str(), v.as_bytes()); + } + let resp_bytes = resp + .bytes() + .await + .map_err(|e| ApiError::InternalServerError(e.into()))?; + // Inspect 404 errors: at this point, we know that the tenant exists, but the pageserver we route + // the request to might not yet be ready. Therefore, if it is a _tenant_ not found error, we can + // convert it into a 503. TODO: we should make this part of the check in `tenant_shard_remote_mutation`. + // However, `tenant_shard_remote_mutation` currently cannot inspect the HTTP error response body, + // so we have to do it here instead. + if resp_staus == reqwest::StatusCode::NOT_FOUND { + let resp_str = std::str::from_utf8(&resp_bytes) + .map_err(|e| ApiError::InternalServerError(e.into()))?; + // We only handle "tenant not found" errors; other 404s like timeline not found should + // be forwarded as-is. + if Service::is_tenant_not_found_error(resp_str, tenant_or_shard_id.tenant_id) { + // Rather than retry here, send the client a 503 to prompt a retry: this matches + // the pageserver's use of 503, and all clients calling this API should retry on 503. + return Err(ApiError::ResourceUnavailable( + format!( + "Pageserver {node} returned tenant 404 due to ongoing migration, retry later" + ) + .into(), + )); + } + } + let response = builder + .body(Body::from(resp_bytes)) + .map_err(|e| ApiError::InternalServerError(e.into()))?; + Ok(response) + }).await? 
} async fn handle_tenant_locate( @@ -1091,9 +1115,10 @@ async fn handle_node_delete(req: Request) -> Result, ApiErr let state = get_state(&req); let node_id: NodeId = parse_request_param(&req, "node_id")?; + let force: bool = parse_query_param(&req, "force")?.unwrap_or(false); json_response( StatusCode::OK, - state.service.start_node_delete(node_id).await?, + state.service.start_node_delete(node_id, force).await?, ) } @@ -2597,6 +2622,17 @@ pub fn make_router( ) }, ) + // Tenant timeline mark_invisible passthrough to shard zero + .put( + "/v1/tenant/:tenant_id/timeline/:timeline_id/mark_invisible", + |r| { + tenant_service_handler( + r, + handle_tenant_timeline_passthrough, + RequestName("v1_tenant_timeline_mark_invisible_passthrough"), + ) + }, + ) // Tenant detail GET passthrough to shard zero: .get("/v1/tenant/:tenant_id", |r| { tenant_service_handler( @@ -2615,17 +2651,6 @@ pub fn make_router( RequestName("v1_tenant_passthrough"), ) }) - // Tenant timeline mark_invisible passthrough to shard zero - .put( - "/v1/tenant/:tenant_id/timeline/:timeline_id/mark_invisible", - |r| { - tenant_service_handler( - r, - handle_tenant_timeline_passthrough, - RequestName("v1_tenant_timeline_mark_invisible_passthrough"), - ) - }, - ) } #[cfg(test)] diff --git a/storage_controller/src/lib.rs b/storage_controller/src/lib.rs index 36e3c5dc6c..24b06da83a 100644 --- a/storage_controller/src/lib.rs +++ b/storage_controller/src/lib.rs @@ -6,6 +6,7 @@ extern crate hyper0 as hyper; mod auth; mod background_node_operations; mod compute_hook; +pub mod hadron_utils; mod heartbeater; pub mod http; mod id_lock_map; diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 5d21feeb10..34d4ac6fba 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -225,6 +225,10 @@ struct Cli { #[arg(long)] shard_split_request_timeout: Option, + + /// **Feature Flag** Whether the storage controller should act to rectify pageserver-reported local disk loss. + #[arg(long, default_value = "false")] + handle_ps_local_disk_loss: bool, } enum StrictMode { @@ -477,6 +481,7 @@ async fn async_main() -> anyhow::Result<()> { .shard_split_request_timeout .map(humantime::Duration::into) .unwrap_or(Duration::MAX), + handle_ps_local_disk_loss: args.handle_ps_local_disk_loss, }; // Validate that we can connect to the database diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs index 8738386968..9c34b34044 100644 --- a/storage_controller/src/metrics.rs +++ b/storage_controller/src/metrics.rs @@ -76,8 +76,8 @@ pub(crate) struct StorageControllerMetricGroup { /// How many shards would like to reconcile but were blocked by concurrency limits pub(crate) storage_controller_pending_reconciles: measured::Gauge, - /// How many shards are keep-failing and will be ignored when considering to run optimizations - pub(crate) storage_controller_keep_failing_reconciles: measured::Gauge, + /// How many shards are stuck and will be ignored when considering to run optimizations + pub(crate) storage_controller_stuck_reconciles: measured::Gauge, /// HTTP request status counters for handled requests pub(crate) storage_controller_http_request_status: @@ -151,6 +151,29 @@ pub(crate) struct StorageControllerMetricGroup { /// Indicator of completed safekeeper reconciles, broken down by safekeeper. 
pub(crate) storage_controller_safekeeper_reconciles_complete: measured::CounterVec, + + /* BEGIN HADRON */ + /// Hadron `config_watcher` reconciliation runs completed, broken down by success/failure. + pub(crate) storage_controller_config_watcher_complete: + measured::CounterVec, + + /// Hadron long waits for node state changes during drain and fill. + pub(crate) storage_controller_drain_and_fill_long_waits: measured::Counter, + + /// Set to 1 if we detect any page server pods with pending node pool rotation annotations. + /// Requires manual reset after oncall investigation. + pub(crate) storage_controller_ps_node_pool_rotation_pending: measured::Gauge, + + /// Hadron storage scrubber status. + pub(crate) storage_controller_storage_scrub_status: + measured::CounterVec, + + /// Desired number of pageservers managed by the storage controller + pub(crate) storage_controller_num_pageservers_desired: measured::Gauge, + + /// Desired number of safekeepers managed by the storage controller + pub(crate) storage_controller_num_safekeeper_desired: measured::Gauge, + /* END HADRON */ } impl StorageControllerMetrics { @@ -173,6 +196,10 @@ impl Default for StorageControllerMetrics { .storage_controller_reconcile_complete .init_all_dense(); + + metrics_group + .storage_controller_config_watcher_complete + .init_all_dense(); + Self { metrics_group, encoder: Mutex::new(measured::text::BufferedTextEncoder::new()), @@ -262,11 +289,48 @@ pub(crate) struct ReconcileLongRunningLabelGroup<'a> { pub(crate) sequence: &'a str, } +#[derive(measured::LabelGroup, Clone)] +#[label(set = StorageScrubberLabelGroupSet)] +pub(crate) struct StorageScrubberLabelGroup<'a> { + #[label(dynamic_with = lasso::ThreadedRodeo, default)] + pub(crate) tenant_id: &'a str, + #[label(dynamic_with = lasso::ThreadedRodeo, default)] + pub(crate) shard_number: &'a str, + #[label(dynamic_with = lasso::ThreadedRodeo, default)] + pub(crate) timeline_id: &'a str, + pub(crate) outcome: StorageScrubberOutcome, +} + +#[derive(FixedCardinalityLabel, Clone, Copy)] +pub(crate) enum StorageScrubberOutcome { + PSOk, + PSWarning, + PSError, + PSOrphan, + SKOk, + SKError, +} + +#[derive(measured::LabelGroup)] +#[label(set = ConfigWatcherCompleteLabelGroupSet)] +pub(crate) struct ConfigWatcherCompleteLabelGroup { + // Reuse the ReconcileOutcome from the SC's reconciliation metrics. + pub(crate) status: ReconcileOutcome, +} + #[derive(FixedCardinalityLabel, Clone, Copy)] pub(crate) enum ReconcileOutcome { + // Successfully reconciled everything. #[label(rename = "ok")] Success, + // Used by tenant-shard reconciler only. Reconciled pageserver state successfully, + // but failed to deliver the compute notification. This error is typically transient, + // but if its occurrence keeps increasing, it should be investigated. + #[label(rename = "ok_no_notify")] + SuccessNoNotify, + // We failed to reconcile some state and the reconciliation will be retried. Error, + // Reconciliation was cancelled. Cancel, } diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs index 6642c72f3c..63c82b5682 100644 --- a/storage_controller/src/node.rs +++ b/storage_controller/src/node.rs @@ -51,6 +51,39 @@ pub(crate) struct Node { cancel: CancellationToken, } +#[allow(dead_code)] +const ONE_MILLION: i64 = 1000000; + +/// Converts a pool ID to a large number that can be used to assign unique IDs to pods in StatefulSets. +/// For example, if pool_id is 1, then the pods have NodeIds 1000000, 1000001, 1000002, etc.
+/// If pool_id is None, then the pods have NodeIds 0, 1, 2, etc. +#[allow(dead_code)] +pub fn transform_pool_id(pool_id: Option) -> i64 { + match pool_id { + Some(id) => (id as i64) * ONE_MILLION, + None => 0, + } +} + +#[allow(dead_code)] +pub fn get_pool_id_from_node_id(node_id: i64) -> i32 { + (node_id / ONE_MILLION) as i32 +} + +/// Example pod name: page-server-0-1, safe-keeper-1-0 +#[allow(dead_code)] +pub fn get_node_id_from_pod_name(pod_name: &str) -> anyhow::Result { + let parts: Vec<&str> = pod_name.split('-').collect(); + if parts.len() != 4 { + return Err(anyhow::anyhow!("Invalid pod name: {}", pod_name)); + } + let pool_id = parts[2].parse::()?; + let node_offset = parts[3].parse::()?; + let node_id = transform_pool_id(Some(pool_id)) + node_offset; + + Ok(NodeId(node_id as u64)) +} + /// When updating [`Node::availability`] we use this type to indicate to the caller /// whether/how they changed it. pub(crate) enum AvailabilityTransition { @@ -403,3 +436,25 @@ impl std::fmt::Debug for Node { write!(f, "{} ({})", self.id, self.listen_http_addr) } } + +#[cfg(test)] +mod tests { + use utils::id::NodeId; + + use crate::node::get_node_id_from_pod_name; + + #[test] + fn test_get_node_id_from_pod_name() { + let pod_name = "page-server-3-12"; + let node_id = get_node_id_from_pod_name(pod_name).unwrap(); + assert_eq!(node_id, NodeId(3000012)); + + let pod_name = "safe-keeper-1-0"; + let node_id = get_node_id_from_pod_name(pod_name).unwrap(); + assert_eq!(node_id, NodeId(1000000)); + + let pod_name = "invalid-pod-name"; + let result = get_node_id_from_pod_name(pod_name); + assert!(result.is_err()); + } +} diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs index da0687895a..9e829e252d 100644 --- a/storage_controller/src/pageserver_client.rs +++ b/storage_controller/src/pageserver_client.rs @@ -14,6 +14,8 @@ use reqwest::StatusCode; use utils::id::{NodeId, TenantId, TimelineId}; use utils::lsn::Lsn; +use crate::hadron_utils::TenantShardSizeMap; + /// Thin wrapper around [`pageserver_client::mgmt_api::Client`]. It allows the storage /// controller to collect metrics in a non-intrusive manner. 
#[derive(Debug, Clone)] @@ -86,6 +88,31 @@ impl PageserverClient { ) } + #[expect(dead_code)] + pub(crate) async fn tenant_timeline_compact( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + force_image_layer_creation: bool, + wait_until_done: bool, + ) -> Result<()> { + measured_request!( + "tenant_timeline_compact", + crate::metrics::Method::Put, + &self.node_id_label, + self.inner + .tenant_timeline_compact( + tenant_shard_id, + timeline_id, + force_image_layer_creation, + true, + false, + wait_until_done, + ) + .await + ) + } + /* BEGIN_HADRON */ pub(crate) async fn tenant_timeline_describe( &self, @@ -101,6 +128,17 @@ impl PageserverClient { .await ) } + + #[expect(dead_code)] + pub(crate) async fn list_tenant_visible_size(&self) -> Result { + measured_request!( + "list_tenant_visible_size", + crate::metrics::Method::Get, + &self.node_id_label, + self.inner.list_tenant_visible_size().await + ) + .map(TenantShardSizeMap::new) + } /* END_HADRON */ pub(crate) async fn tenant_scan_remote_storage( @@ -365,6 +403,16 @@ impl PageserverClient { ) } + #[expect(dead_code)] + pub(crate) async fn reset_alert_gauges(&self) -> Result<()> { + measured_request!( + "reset_alert_gauges", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner.reset_alert_gauges().await + ) + } + pub(crate) async fn wait_lsn( &self, tenant_shard_id: TenantShardId, diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index ed9a268064..619b5f69b8 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -129,7 +129,10 @@ pub(crate) enum DatabaseOperation { UpdateLeader, SetPreferredAzs, InsertTimeline, + UpdateTimeline, UpdateTimelineMembership, + UpdateCplaneNotifiedGeneration, + UpdateSkSetNotifiedGeneration, GetTimeline, InsertTimelineReconcile, RemoveTimelineReconcile, @@ -1463,9 +1466,41 @@ impl Persistence { .await } + /// Update an already present timeline. + /// VERY UNSAFE FUNCTION: this overrides in-progress migrations. Don't use this unless neccessary. + pub(crate) async fn update_timeline_unsafe( + &self, + entry: TimelineUpdate, + ) -> DatabaseResult { + use crate::schema::timelines; + + let entry = &entry; + self.with_measured_conn(DatabaseOperation::UpdateTimeline, move |conn| { + Box::pin(async move { + let inserted_updated = diesel::update(timelines::table) + .filter(timelines::tenant_id.eq(&entry.tenant_id)) + .filter(timelines::timeline_id.eq(&entry.timeline_id)) + .set(entry) + .execute(conn) + .await?; + + match inserted_updated { + 0 => Ok(false), + 1 => Ok(true), + _ => Err(DatabaseError::Logical(format!( + "unexpected number of rows ({inserted_updated})" + ))), + } + }) + }) + .await + } + /// Update timeline membership configuration in the database. /// Perform a compare-and-swap (CAS) operation on the timeline's generation. /// The `new_generation` must be the next (+1) generation after the one in the database. + /// Also inserts reconcile_requests to safekeeper_timeline_pending_ops table in the same + /// transaction. 
pub(crate) async fn update_timeline_membership( &self, tenant_id: TenantId, @@ -1473,8 +1508,11 @@ impl Persistence { new_generation: SafekeeperGeneration, sk_set: &[NodeId], new_sk_set: Option<&[NodeId]>, + reconcile_requests: &[TimelinePendingOpPersistence], ) -> DatabaseResult<()> { - use crate::schema::timelines::dsl; + use crate::schema::safekeeper_timeline_pending_ops as stpo; + use crate::schema::timelines; + use diesel::query_dsl::methods::FilterDsl; let prev_generation = new_generation.previous().unwrap(); @@ -1482,14 +1520,15 @@ impl Persistence { let timeline_id = &timeline_id; self.with_measured_conn(DatabaseOperation::UpdateTimelineMembership, move |conn| { Box::pin(async move { - let updated = diesel::update(dsl::timelines) - .filter(dsl::tenant_id.eq(&tenant_id.to_string())) - .filter(dsl::timeline_id.eq(&timeline_id.to_string())) - .filter(dsl::generation.eq(prev_generation.into_inner() as i32)) + let updated = diesel::update(timelines::table) + .filter(timelines::tenant_id.eq(&tenant_id.to_string())) + .filter(timelines::timeline_id.eq(&timeline_id.to_string())) + .filter(timelines::generation.eq(prev_generation.into_inner() as i32)) .set(( - dsl::generation.eq(new_generation.into_inner() as i32), - dsl::sk_set.eq(sk_set.iter().map(|id| id.0 as i64).collect::>()), - dsl::new_sk_set.eq(new_sk_set + timelines::generation.eq(new_generation.into_inner() as i32), + timelines::sk_set + .eq(sk_set.iter().map(|id| id.0 as i64).collect::>()), + timelines::new_sk_set.eq(new_sk_set .map(|set| set.iter().map(|id| id.0 as i64).collect::>())), )) .execute(conn) @@ -1499,20 +1538,123 @@ impl Persistence { 0 => { // TODO(diko): It makes sense to select the current generation // and include it in the error message for better debuggability. - Err(DatabaseError::Cas( + return Err(DatabaseError::Cas( "Failed to update membership configuration".to_string(), - )) + )); + } + 1 => {} + _ => { + return Err(DatabaseError::Logical(format!( + "unexpected number of rows ({updated})" + ))); + } + }; + + for req in reconcile_requests { + let inserted_updated = diesel::insert_into(stpo::table) + .values(req) + .on_conflict((stpo::tenant_id, stpo::timeline_id, stpo::sk_id)) + .do_update() + .set(req) + .filter(stpo::generation.lt(req.generation)) + .execute(conn) + .await?; + + if inserted_updated > 1 { + return Err(DatabaseError::Logical(format!( + "unexpected number of rows ({inserted_updated})" + ))); } - 1 => Ok(()), - _ => Err(DatabaseError::Logical(format!( - "unexpected number of rows ({updated})" - ))), } + + Ok(()) }) }) .await } + /// Update the cplane notified generation for a timeline. + /// Perform a compare-and-swap (CAS) operation on the timeline's cplane notified generation. + /// The update will fail if the specified generation is less than the cplane notified generation + /// in the database. 
+ pub(crate) async fn update_cplane_notified_generation( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + generation: SafekeeperGeneration, + ) -> DatabaseResult<()> { + use crate::schema::timelines::dsl; + + let tenant_id = &tenant_id; + let timeline_id = &timeline_id; + self.with_measured_conn( + DatabaseOperation::UpdateCplaneNotifiedGeneration, + move |conn| { + Box::pin(async move { + let updated = diesel::update(dsl::timelines) + .filter(dsl::tenant_id.eq(&tenant_id.to_string())) + .filter(dsl::timeline_id.eq(&timeline_id.to_string())) + .filter(dsl::cplane_notified_generation.le(generation.into_inner() as i32)) + .set(dsl::cplane_notified_generation.eq(generation.into_inner() as i32)) + .execute(conn) + .await?; + + match updated { + 0 => Err(DatabaseError::Cas( + "Failed to update cplane notified generation".to_string(), + )), + 1 => Ok(()), + _ => Err(DatabaseError::Logical(format!( + "unexpected number of rows ({updated})" + ))), + } + }) + }, + ) + .await + } + + /// Update the sk set notified generation for a timeline. + /// Perform a compare-and-swap (CAS) operation on the timeline's sk set notified generation. + /// The update will fail if the specified generation is less than the sk set notified generation + /// in the database. + pub(crate) async fn update_sk_set_notified_generation( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + generation: SafekeeperGeneration, + ) -> DatabaseResult<()> { + use crate::schema::timelines::dsl; + + let tenant_id = &tenant_id; + let timeline_id = &timeline_id; + self.with_measured_conn( + DatabaseOperation::UpdateSkSetNotifiedGeneration, + move |conn| { + Box::pin(async move { + let updated = diesel::update(dsl::timelines) + .filter(dsl::tenant_id.eq(&tenant_id.to_string())) + .filter(dsl::timeline_id.eq(&timeline_id.to_string())) + .filter(dsl::sk_set_notified_generation.le(generation.into_inner() as i32)) + .set(dsl::sk_set_notified_generation.eq(generation.into_inner() as i32)) + .execute(conn) + .await?; + + match updated { + 0 => Err(DatabaseError::Cas( + "Failed to update sk set notified generation".to_string(), + )), + 1 => Ok(()), + _ => Err(DatabaseError::Logical(format!( + "unexpected number of rows ({updated})" + ))), + } + }) + }, + ) + .await + } + /// Load timeline from db. Returns `None` if not present. pub(crate) async fn get_timeline( &self, @@ -2462,6 +2604,7 @@ pub(crate) struct TimelinePersistence { pub(crate) new_sk_set: Option>, pub(crate) cplane_notified_generation: i32, pub(crate) deleted_at: Option>, + pub(crate) sk_set_notified_generation: i32, } /// This is separate from [TimelinePersistence] only because postgres allows NULLs @@ -2480,6 +2623,7 @@ pub(crate) struct TimelineFromDb { pub(crate) new_sk_set: Option>>, pub(crate) cplane_notified_generation: i32, pub(crate) deleted_at: Option>, + pub(crate) sk_set_notified_generation: i32, } impl TimelineFromDb { @@ -2499,10 +2643,23 @@ impl TimelineFromDb { new_sk_set, cplane_notified_generation: self.cplane_notified_generation, deleted_at: self.deleted_at, + sk_set_notified_generation: self.sk_set_notified_generation, } } } +// This is separate from TimelinePersistence because we don't want to touch generation and deleted_at values for the update. 
+#[derive(AsChangeset)] +#[diesel(table_name = crate::schema::timelines)] +#[diesel(treat_none_as_null = true)] +pub(crate) struct TimelineUpdate { + pub(crate) tenant_id: String, + pub(crate) timeline_id: String, + pub(crate) start_lsn: LsnWrapper, + pub(crate) sk_set: Vec, + pub(crate) new_sk_set: Option>, +} + #[derive(Insertable, AsChangeset, Queryable, Selectable, Clone)] #[diesel(table_name = crate::schema::safekeeper_timeline_pending_ops)] pub(crate) struct TimelinePendingOpPersistence { diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index a2fba0fa56..d1590ec75e 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -862,11 +862,11 @@ impl Reconciler { Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => { if refreshed { tracing::info!( - node_id=%node.get_id(), "Observed configuration correct after refresh. Notifying compute."); + node_id=%node.get_id(), "[Attached] Observed configuration correct after refresh. Notifying compute."); self.compute_notify().await?; } else { // Nothing to do - tracing::info!(node_id=%node.get_id(), "Observed configuration already correct."); + tracing::info!(node_id=%node.get_id(), "[Attached] Observed configuration already correct."); } } observed => { @@ -945,17 +945,17 @@ impl Reconciler { match self.observed.locations.get(&node.get_id()) { Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => { // Nothing to do - tracing::info!(node_id=%node.get_id(), "Observed configuration already correct.") + tracing::info!(node_id=%node.get_id(), "[Secondary] Observed configuration already correct.") } _ => { // Only try and configure secondary locations on nodes that are available. This // allows the reconciler to "succeed" while some secondaries are offline (e.g. 
after // a node failure, where the failed node will have a secondary intent) if node.is_available() { - tracing::info!(node_id=%node.get_id(), "Observed configuration requires update."); + tracing::info!(node_id=%node.get_id(), "[Secondary] Observed configuration requires update."); changes.push((node.clone(), wanted_conf)) } else { - tracing::info!(node_id=%node.get_id(), "Skipping configuration as secondary, node is unavailable"); + tracing::info!(node_id=%node.get_id(), "[Secondary] Skipping configuration as secondary, node is unavailable"); self.observed .locations .insert(node.get_id(), ObservedStateLocation { conf: None }); @@ -1066,6 +1066,9 @@ impl Reconciler { } result } else { + tracing::info!( + "Compute notification is skipped because the tenant shard does not have an attached (primary) location" + ); Ok(()) } } diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index b86b4dfab1..23f002d32a 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -981,7 +981,7 @@ mod tests { use pageserver_api::models::utilization::test_utilization; use pageserver_api::shard::ShardIdentity; use utils::id::TenantId; - use utils::shard::{ShardCount, ShardNumber, TenantShardId}; + use utils::shard::{ShardCount, ShardNumber, ShardStripeSize, TenantShardId}; use super::*; use crate::tenant_shard::IntentState; @@ -1337,7 +1337,7 @@ mod tests { let shard_identity = ShardIdentity::new( tenant_shard_id.shard_number, tenant_shard_id.shard_count, - pageserver_api::shard::ShardStripeSize(1), + ShardStripeSize(1), ) .unwrap(); let mut shard = TenantShard::new( @@ -1411,7 +1411,7 @@ mod tests { let shard_identity = ShardIdentity::new( tenant_shard_id.shard_number, tenant_shard_id.shard_count, - pageserver_api::shard::ShardStripeSize(1), + ShardStripeSize(1), ) .unwrap(); let mut shard = TenantShard::new( @@ -1573,7 +1573,7 @@ mod tests { let shard_identity = ShardIdentity::new( tenant_shard_id.shard_number, tenant_shard_id.shard_count, - pageserver_api::shard::ShardStripeSize(1), + ShardStripeSize(1), ) .unwrap(); // 1 attached and 1 secondary. diff --git a/storage_controller/src/schema.rs b/storage_controller/src/schema.rs index 312f7e0b0e..def519c168 100644 --- a/storage_controller/src/schema.rs +++ b/storage_controller/src/schema.rs @@ -13,6 +13,24 @@ diesel::table! { } } +diesel::table! { + hadron_safekeepers (sk_node_id) { + sk_node_id -> Int8, + listen_http_addr -> Varchar, + listen_http_port -> Int4, + listen_pg_addr -> Varchar, + listen_pg_port -> Int4, + } +} + +diesel::table! { + hadron_timeline_safekeepers (timeline_id, sk_node_id) { + timeline_id -> Varchar, + sk_node_id -> Int8, + legacy_endpoint_id -> Nullable, + } +} + diesel::table! { metadata_health (tenant_id, shard_number, shard_count) { tenant_id -> Varchar, @@ -100,11 +118,14 @@ diesel::table! 
{ new_sk_set -> Nullable>>, cplane_notified_generation -> Int4, deleted_at -> Nullable, + sk_set_notified_generation -> Int4, } } diesel::allow_tables_to_appear_in_same_query!( controllers, + hadron_safekeepers, + hadron_timeline_safekeepers, metadata_health, nodes, safekeeper_timeline_pending_ops, diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 638cb410fa..8f5efe8ac4 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -211,9 +211,9 @@ pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128; pub const PRIORITY_RECONCILER_CONCURRENCY_DEFAULT: usize = 256; pub const SAFEKEEPER_RECONCILER_CONCURRENCY_DEFAULT: usize = 32; -// Number of consecutive reconciliation errors, occured for one shard, +// Number of consecutive reconciliations that have occurred for one shard, // after which the shard is ignored when considering to run optimizations. -const MAX_CONSECUTIVE_RECONCILIATION_ERRORS: usize = 5; +const MAX_CONSECUTIVE_RECONCILES: usize = 10; // Depth of the channel used to enqueue shards for reconciliation when they can't do it immediately. // This channel is finite-size to avoid using excessive memory if we get into a state where reconciles are finishing more slowly @@ -487,6 +487,9 @@ pub struct Config { /// Timeout used for HTTP client of split requests. [`Duration::MAX`] if None. pub shard_split_request_timeout: Duration, + + // Feature flag: Whether the storage controller should act to rectify pageserver-reported local disk loss. + pub handle_ps_local_disk_loss: bool, } impl From for ApiError { @@ -698,47 +701,70 @@ pub(crate) enum ReconcileResultRequest { } #[derive(Clone)] -struct MutationLocation { - node: Node, - generation: Generation, +pub(crate) struct MutationLocation { + pub(crate) node: Node, + pub(crate) generation: Generation, } #[derive(Clone)] -struct ShardMutationLocations { - latest: MutationLocation, - other: Vec, +pub(crate) struct ShardMutationLocations { + pub(crate) latest: MutationLocation, + pub(crate) other: Vec, } #[derive(Default, Clone)] -struct TenantMutationLocations(BTreeMap); +pub(crate) struct TenantMutationLocations(pub BTreeMap); struct ReconcileAllResult { spawned_reconciles: usize, - keep_failing_reconciles: usize, + stuck_reconciles: usize, has_delayed_reconciles: bool, } impl ReconcileAllResult { fn new( spawned_reconciles: usize, - keep_failing_reconciles: usize, + stuck_reconciles: usize, has_delayed_reconciles: bool, ) -> Self { assert!( - spawned_reconciles >= keep_failing_reconciles, - "It is impossible to have more keep-failing reconciles than spawned reconciles" + spawned_reconciles >= stuck_reconciles, + "It is impossible to have less spawned reconciles than stuck reconciles" ); Self { spawned_reconciles, - keep_failing_reconciles, + stuck_reconciles, has_delayed_reconciles, } } /// We can run optimizations only if we don't have any delayed reconciles and - /// all spawned reconciles are also keep-failing reconciles. + /// all spawned reconciles are also stuck reconciles. 
fn can_run_optimizations(&self) -> bool { - !self.has_delayed_reconciles && self.spawned_reconciles == self.keep_failing_reconciles + !self.has_delayed_reconciles && self.spawned_reconciles == self.stuck_reconciles + } +} + +enum TenantIdOrShardId { + TenantId(TenantId), + TenantShardId(TenantShardId), +} + +impl TenantIdOrShardId { + fn tenant_id(&self) -> TenantId { + match self { + TenantIdOrShardId::TenantId(tenant_id) => *tenant_id, + TenantIdOrShardId::TenantShardId(tenant_shard_id) => tenant_shard_id.tenant_id, + } + } + + fn matches(&self, tenant_shard_id: &TenantShardId) -> bool { + match self { + TenantIdOrShardId::TenantId(tenant_id) => tenant_shard_id.tenant_id == *tenant_id, + TenantIdOrShardId::TenantShardId(this_tenant_shard_id) => { + this_tenant_shard_id == tenant_shard_id + } + } } } @@ -1482,7 +1508,6 @@ impl Service { match result.result { Ok(()) => { - tenant.consecutive_errors_count = 0; tenant.apply_observed_deltas(deltas); tenant.waiter.advance(result.sequence); } @@ -1501,8 +1526,6 @@ impl Service { } } - tenant.consecutive_errors_count = tenant.consecutive_errors_count.saturating_add(1); - // Ordering: populate last_error before advancing error_seq, // so that waiters will see the correct error after waiting. tenant.set_last_error(result.sequence, e); @@ -1514,6 +1537,8 @@ impl Service { } } + tenant.consecutive_reconciles_count = tenant.consecutive_reconciles_count.saturating_add(1); + // If we just finished detaching all shards for a tenant, it might be time to drop it from memory. if tenant.policy == PlacementPolicy::Detached { // We may only drop a tenant from memory while holding the exclusive lock on the tenant ID: this protects us @@ -2366,6 +2391,33 @@ impl Service { tenants: Vec::new(), }; + // [Hadron] If the pageserver reports in the reattach message that it has an empty disk, it's possible that it just + // recovered from a local disk failure. The response of the reattach request will contain a list of tenants but it + // will not be honored by the pageserver in this case (disk failure). We should make sure we clear any observed + // locations of tenants attached to the node so that the reconciler will discover the discrepancy and reconfigure the + // missing tenants on the node properly. + if self.config.handle_ps_local_disk_loss && reattach_req.empty_local_disk.unwrap_or(false) { + tracing::info!( + "Pageserver {node_id} reports empty local disk, clearing observed locations referencing the pageserver for all tenants", + node_id = reattach_req.node_id + ); + let mut num_tenant_shards_affected = 0; + for (tenant_shard_id, shard) in tenants.iter_mut() { + if shard + .observed + .locations + .remove(&reattach_req.node_id) + .is_some() + { + tracing::info!("Cleared observed location for tenant shard {tenant_shard_id}"); + num_tenant_shards_affected += 1; + } + } + tracing::info!( + "Cleared observed locations for {num_tenant_shards_affected} tenant shards" + ); + } + // TODO: cancel/restart any running reconciliation for this tenant, it might be trying // to call location_conf API with an old generation. Wait for cancellation to complete // before responding to this request.
Requires well implemented CancellationToken logic @@ -4752,6 +4804,38 @@ impl Service { Ok(()) } + pub(crate) fn is_tenant_not_found_error(body: &str, tenant_id: TenantId) -> bool { + body.contains(&format!("tenant {tenant_id}")) + } + + fn process_result_and_passthrough_errors( + &self, + tenant_id: TenantId, + results: Vec<(Node, Result)>, + ) -> Result, ApiError> { + let mut processed_results: Vec<(Node, T)> = Vec::with_capacity(results.len()); + for (node, res) in results { + match res { + Ok(res) => processed_results.push((node, res)), + Err(mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, body)) + if Self::is_tenant_not_found_error(&body, tenant_id) => + { + // If there's a tenant not found, we are still in the process of attaching the tenant. + // Return 503 so that the client can retry. + return Err(ApiError::ResourceUnavailable( + format!( + "Timeline is not attached to the pageserver {} yet, please retry", + node.get_id() + ) + .into(), + )); + } + Err(e) => return Err(passthrough_api_error(&node, e)), + } + } + Ok(processed_results) + } + pub(crate) async fn tenant_timeline_lsn_lease( &self, tenant_id: TenantId, @@ -4765,91 +4849,48 @@ impl Service { ) .await; - let mut retry_if_not_attached = false; - let targets = { - let locked = self.inner.read().unwrap(); - let mut targets = Vec::new(); + self.tenant_remote_mutation(tenant_id, |locations| async move { + if locations.0.is_empty() { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant not found").into(), + )); + } - // If the request got an unsharded tenant id, then apply - // the operation to all shards. Otherwise, apply it to a specific shard. - let shards_range = TenantShardId::tenant_range(tenant_id); + let results = self + .tenant_for_shards_api( + locations + .0 + .iter() + .map(|(tenant_shard_id, ShardMutationLocations { latest, .. })| { + (*tenant_shard_id, latest.node.clone()) + }) + .collect(), + |tenant_shard_id, client| async move { + client + .timeline_lease_lsn(tenant_shard_id, timeline_id, lsn) + .await + }, + 1, + 1, + SHORT_RECONCILE_TIMEOUT, + &self.cancel, + ) + .await; - for (tenant_shard_id, shard) in locked.tenants.range(shards_range) { - if let Some(node_id) = shard.intent.get_attached() { - let node = locked - .nodes - .get(node_id) - .expect("Pageservers may not be deleted while referenced"); - - targets.push((*tenant_shard_id, node.clone())); - - if let Some(location) = shard.observed.locations.get(node_id) { - if let Some(ref conf) = location.conf { - if conf.mode != LocationConfigMode::AttachedSingle - && conf.mode != LocationConfigMode::AttachedMulti - { - // If the shard is attached as secondary, we need to retry if 404. - retry_if_not_attached = true; - } - // If the shard is attached as primary, we should succeed. - } else { - // Location conf is not available yet, retry if 404. - retry_if_not_attached = true; - } - } else { - // The shard is not attached to the intended pageserver yet, retry if 404. 
- retry_if_not_attached = true; - } + let leases = self.process_result_and_passthrough_errors(tenant_id, results)?; + let mut valid_until = None; + for (_, lease) in leases { + if let Some(ref mut valid_until) = valid_until { + *valid_until = std::cmp::min(*valid_until, lease.valid_until); + } else { + valid_until = Some(lease.valid_until); } } - targets - }; - - let res = self - .tenant_for_shards_api( - targets, - |tenant_shard_id, client| async move { - client - .timeline_lease_lsn(tenant_shard_id, timeline_id, lsn) - .await - }, - 1, - 1, - SHORT_RECONCILE_TIMEOUT, - &self.cancel, - ) - .await; - - let mut valid_until = None; - for (node, r) in res { - match r { - Ok(lease) => { - if let Some(ref mut valid_until) = valid_until { - *valid_until = std::cmp::min(*valid_until, lease.valid_until); - } else { - valid_until = Some(lease.valid_until); - } - } - Err(mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, _)) - if retry_if_not_attached => - { - // This is expected if the attach is not finished yet. Return 503 so that the client can retry. - return Err(ApiError::ResourceUnavailable( - format!( - "Timeline is not attached to the pageserver {} yet, please retry", - node.get_id() - ) - .into(), - )); - } - Err(e) => { - return Err(passthrough_api_error(&node, e)); - } - } - } - Ok(LsnLease { - valid_until: valid_until.unwrap_or_else(SystemTime::now), + Ok(LsnLease { + valid_until: valid_until.unwrap_or_else(SystemTime::now), + }) }) + .await? } pub(crate) async fn tenant_timeline_download_heatmap_layers( @@ -4996,11 +5037,37 @@ impl Service { /// - Looks up the shards and the nodes where they were most recently attached /// - Guarantees that after the inner function returns, the shards' generations haven't moved on: this /// ensures that the remote operation acted on the most recent generation, and is therefore durable. - async fn tenant_remote_mutation( + pub(crate) async fn tenant_remote_mutation( &self, tenant_id: TenantId, op: O, ) -> Result + where + O: FnOnce(TenantMutationLocations) -> F, + F: std::future::Future, + { + self.tenant_remote_mutation_inner(TenantIdOrShardId::TenantId(tenant_id), op) + .await + } + + pub(crate) async fn tenant_shard_remote_mutation( + &self, + tenant_shard_id: TenantShardId, + op: O, + ) -> Result + where + O: FnOnce(TenantMutationLocations) -> F, + F: std::future::Future, + { + self.tenant_remote_mutation_inner(TenantIdOrShardId::TenantShardId(tenant_shard_id), op) + .await + } + + async fn tenant_remote_mutation_inner( + &self, + tenant_id_or_shard_id: TenantIdOrShardId, + op: O, + ) -> Result where O: FnOnce(TenantMutationLocations) -> F, F: std::future::Future, @@ -5012,7 +5079,13 @@ impl Service { // run concurrently with reconciliations, and it is not guaranteed that the node we find here // will still be the latest when we're done: we will check generations again at the end of // this function to handle that. - let generations = self.persistence.tenant_generations(tenant_id).await?; + let generations = self + .persistence + .tenant_generations(tenant_id_or_shard_id.tenant_id()) + .await? + .into_iter() + .filter(|i| tenant_id_or_shard_id.matches(&i.tenant_shard_id)) + .collect::>(); if generations .iter() @@ -5026,9 +5099,14 @@ impl Service { // One or more shards has not been attached to a pageserver. 
Check if this is because it's configured // to be detached (409: caller should give up), or because it's meant to be attached but isn't yet (503: caller should retry) let locked = self.inner.read().unwrap(); - for (shard_id, shard) in - locked.tenants.range(TenantShardId::tenant_range(tenant_id)) - { + let tenant_shards = locked + .tenants + .range(TenantShardId::tenant_range( + tenant_id_or_shard_id.tenant_id(), + )) + .filter(|(shard_id, _)| tenant_id_or_shard_id.matches(shard_id)) + .collect::>(); + for (shard_id, shard) in tenant_shards { match shard.policy { PlacementPolicy::Attached(_) => { // This shard is meant to be attached: the caller is not wrong to try and @@ -5138,7 +5216,14 @@ impl Service { // Post-check: are all the generations of all the shards the same as they were initially? This proves that // our remote operation executed on the latest generation and is therefore persistent. { - let latest_generations = self.persistence.tenant_generations(tenant_id).await?; + let latest_generations = self + .persistence + .tenant_generations(tenant_id_or_shard_id.tenant_id()) + .await? + .into_iter() + .filter(|i| tenant_id_or_shard_id.matches(&i.tenant_shard_id)) + .collect::>(); + if latest_generations .into_iter() .map( @@ -5267,6 +5352,8 @@ impl Service { status_code } /// When you know the TenantId but not a specific shard, and would like to get the node holding shard 0. + /// + /// Returns the node, tenant shard id, and whether it is consistent with the observed state. pub(crate) async fn tenant_shard0_node( &self, tenant_id: TenantId, @@ -5293,6 +5380,8 @@ impl Service { /// When you need to send an HTTP request to the pageserver that holds a shard of a tenant, this /// function looks up and returns node. If the shard isn't found, returns Err(ApiError::NotFound) + /// + /// Returns the intent node and whether it is consistent with the observed state. pub(crate) async fn tenant_shard_node( &self, tenant_shard_id: TenantShardId, @@ -5360,7 +5449,7 @@ impl Service { "Shard refers to nonexistent node" ))); }; - + // As a reconciliation is in flight, we do not have the observed state yet, and therefore we assume it is always inconsistent. 
Ok(node.clone()) } @@ -7335,6 +7424,7 @@ impl Service { self: &Arc, node_id: NodeId, policy_on_start: NodeSchedulingPolicy, + force: bool, cancel: CancellationToken, ) -> Result<(), OperationError> { let reconciler_config = ReconcilerConfigBuilder::new(ReconcilerPriority::Normal).build(); @@ -7342,23 +7432,27 @@ impl Service { let mut waiters: Vec = Vec::new(); let mut tid_iter = create_shared_shard_iterator(self.clone()); + let reset_node_policy_on_cancel = || async { + match self + .node_configure(node_id, None, Some(policy_on_start)) + .await + { + Ok(()) => OperationError::Cancelled, + Err(err) => { + OperationError::FinalizeError( + format!( + "Failed to finalise delete cancel of {} by setting scheduling policy to {}: {}", + node_id, String::from(policy_on_start), err + ) + .into(), + ) + } + } + }; + while !tid_iter.finished() { if cancel.is_cancelled() { - match self - .node_configure(node_id, None, Some(policy_on_start)) - .await - { - Ok(()) => return Err(OperationError::Cancelled), - Err(err) => { - return Err(OperationError::FinalizeError( - format!( - "Failed to finalise delete cancel of {} by setting scheduling policy to {}: {}", - node_id, String::from(policy_on_start), err - ) - .into(), - )); - } - } + return Err(reset_node_policy_on_cancel().await); } operation_utils::validate_node_state( @@ -7427,8 +7521,18 @@ impl Service { nodes, reconciler_config, ); - if let Some(some) = waiter { - waiters.push(some); + + if force { + // Here we remove an existing observed location for the node we're removing, and it will + // not be re-added by a reconciler's completion because we filter out removed nodes in + // process_result. + // + // Note that we update the shard's observed state _after_ calling maybe_configured_reconcile_shard: + // that means any reconciles we spawned will know about the node we're deleting, + // enabling them to do live migrations if it's still online. + tenant_shard.observed.locations.remove(&node_id); + } else if let Some(waiter) = waiter { + waiters.push(waiter); } } } @@ -7442,21 +7546,7 @@ impl Service { while !waiters.is_empty() { if cancel.is_cancelled() { - match self - .node_configure(node_id, None, Some(policy_on_start)) - .await - { - Ok(()) => return Err(OperationError::Cancelled), - Err(err) => { - return Err(OperationError::FinalizeError( - format!( - "Failed to finalise drain cancel of {} by setting scheduling policy to {}: {}", - node_id, String::from(policy_on_start), err - ) - .into(), - )); - } - } + return Err(reset_node_policy_on_cancel().await); } tracing::info!("Awaiting {} pending delete reconciliations", waiters.len()); @@ -7466,6 +7556,12 @@ impl Service { .await; } + let pf = pausable_failpoint!("delete-node-after-reconciles-spawned", &cancel); + if pf.is_err() { + // An error from pausable_failpoint indicates the cancel token was triggered. 
+ return Err(reset_node_policy_on_cancel().await); + } + self.persistence .set_tombstone(node_id) .await @@ -8061,6 +8157,7 @@ impl Service { pub(crate) async fn start_node_delete( self: &Arc, node_id: NodeId, + force: bool, ) -> Result<(), ApiError> { let (ongoing_op, node_policy, schedulable_nodes_count) = { let locked = self.inner.read().unwrap(); @@ -8130,7 +8227,7 @@ impl Service { tracing::info!("Delete background operation starting"); let res = service - .delete_node(node_id, policy_on_start, cancel) + .delete_node(node_id, policy_on_start, force, cancel) .await; match res { Ok(()) => { @@ -8582,7 +8679,7 @@ impl Service { // This function is an efficient place to update lazy statistics, since we are walking // all tenants. let mut pending_reconciles = 0; - let mut keep_failing_reconciles = 0; + let mut stuck_reconciles = 0; let mut az_violations = 0; // If we find any tenants to drop from memory, stash them to offload after @@ -8618,30 +8715,32 @@ impl Service { // Eventual consistency: if an earlier reconcile job failed, and the shard is still // dirty, spawn another one - let consecutive_errors_count = shard.consecutive_errors_count; if self .maybe_reconcile_shard(shard, &pageservers, ReconcilerPriority::Normal) .is_some() { spawned_reconciles += 1; - // Count shards that are keep-failing. We still want to reconcile them - // to avoid a situation where a shard is stuck. - // But we don't want to consider them when deciding to run optimizations. - if consecutive_errors_count >= MAX_CONSECUTIVE_RECONCILIATION_ERRORS { + if shard.consecutive_reconciles_count >= MAX_CONSECUTIVE_RECONCILES { + // Count shards that are stuck, but we still want to reconcile them. + // We don't want to consider them when deciding to run optimizations. tracing::warn!( tenant_id=%shard.tenant_shard_id.tenant_id, shard_id=%shard.tenant_shard_id.shard_slug(), - "Shard reconciliation is keep-failing: {} errors", - consecutive_errors_count + "Shard reconciliation is stuck: {} consecutive launches", + shard.consecutive_reconciles_count ); - keep_failing_reconciles += 1; + stuck_reconciles += 1; + } + } else { + if shard.delayed_reconcile { + // Shard wanted to reconcile but for some reason couldn't. + pending_reconciles += 1; } - } else if shard.delayed_reconcile { - // Shard wanted to reconcile but for some reason couldn't. - pending_reconciles += 1; - } + // Reset the counter when we don't need to launch a reconcile. + shard.consecutive_reconciles_count = 0; + } // If this tenant is detached, try dropping it from memory. This is usually done // proactively in [`Self::process_results`], but we do it here to handle the edge // case where a reconcile completes while someone else is holding an op lock for the tenant.
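// Illustrative sketch (not part of the patch above): the stuck-shard accounting introduced in the
// reconcile_all hunk, reduced to a self-contained form. `ShardState`, `needs_reconcile`, and the
// inline counter increment are illustrative stand-ins; in the real code the counter is incremented
// in `process_result` and only read/reset here.
const MAX_CONSECUTIVE_RECONCILES_SKETCH: usize = 10;

struct ShardState {
    consecutive_reconciles_count: usize,
    delayed_reconcile: bool,
}

struct ReconcileAllResultSketch {
    spawned_reconciles: usize,
    stuck_reconciles: usize,
    has_delayed_reconciles: bool,
}

impl ReconcileAllResultSketch {
    // Optimizations may run only when nothing is waiting to reconcile and every reconcile
    // we did spawn belongs to a shard that is already considered stuck.
    fn can_run_optimizations(&self) -> bool {
        !self.has_delayed_reconciles && self.spawned_reconciles == self.stuck_reconciles
    }
}

fn reconcile_all_sketch<F: Fn(&ShardState) -> bool>(
    shards: &mut [ShardState],
    needs_reconcile: F,
) -> ReconcileAllResultSketch {
    let (mut spawned, mut stuck, mut delayed) = (0usize, 0usize, false);
    for shard in shards.iter_mut() {
        if needs_reconcile(shard) {
            spawned += 1;
            shard.consecutive_reconciles_count += 1;
            // A shard that keeps being re-launched is "stuck": it is still reconciled,
            // but ignored when deciding whether optimizations may run.
            if shard.consecutive_reconciles_count >= MAX_CONSECUTIVE_RECONCILES_SKETCH {
                stuck += 1;
            }
        } else {
            delayed |= shard.delayed_reconcile;
            // No reconcile needed: the streak is broken, so reset the counter.
            shard.consecutive_reconciles_count = 0;
        }
    }
    ReconcileAllResultSketch {
        spawned_reconciles: spawned,
        stuck_reconciles: stuck,
        has_delayed_reconciles: delayed,
    }
}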
@@ -8677,14 +8776,10 @@ impl Service { metrics::METRICS_REGISTRY .metrics_group - .storage_controller_keep_failing_reconciles - .set(keep_failing_reconciles as i64); + .storage_controller_stuck_reconciles + .set(stuck_reconciles as i64); - ReconcileAllResult::new( - spawned_reconciles, - keep_failing_reconciles, - has_delayed_reconciles, - ) + ReconcileAllResult::new(spawned_reconciles, stuck_reconciles, has_delayed_reconciles) } /// `optimize` in this context means identifying shards which have valid scheduled locations, but diff --git a/storage_controller/src/service/chaos_injector.rs b/storage_controller/src/service/chaos_injector.rs index 4087de200a..0efeef4e80 100644 --- a/storage_controller/src/service/chaos_injector.rs +++ b/storage_controller/src/service/chaos_injector.rs @@ -3,8 +3,8 @@ use std::sync::Arc; use std::time::Duration; use pageserver_api::controller_api::ShardSchedulingPolicy; -use rand::seq::SliceRandom; -use rand::{Rng, thread_rng}; +use rand::Rng; +use rand::seq::{IndexedRandom, SliceRandom}; use tokio_util::sync::CancellationToken; use utils::id::NodeId; use utils::shard::TenantShardId; @@ -72,7 +72,7 @@ impl ChaosInjector { let cron_interval = self.get_cron_interval_sleep_future(); let chaos_type = tokio::select! { _ = interval.tick() => { - if thread_rng().gen_bool(0.5) { + if rand::rng().random_bool(0.5) { ChaosEvent::MigrationsToSecondary } else { ChaosEvent::GracefulMigrationsAnywhere @@ -134,7 +134,7 @@ impl ChaosInjector { let Some(new_location) = shard .intent .get_secondary() - .choose(&mut thread_rng()) + .choose(&mut rand::rng()) .cloned() else { tracing::info!( @@ -190,7 +190,7 @@ impl ChaosInjector { // Pick our victims: use a hand-rolled loop rather than choose_multiple() because we want // to take the mutable refs from our candidates rather than ref'ing them. while !candidates.is_empty() && victims.len() < batch_size { - let i = thread_rng().gen_range(0..candidates.len()); + let i = rand::rng().random_range(0..candidates.len()); victims.push(candidates.swap_remove(i)); } @@ -210,7 +210,7 @@ impl ChaosInjector { }) .collect::>(); - let Some(victim_node) = candidate_nodes.choose(&mut thread_rng()) else { + let Some(victim_node) = candidate_nodes.choose(&mut rand::rng()) else { // This can happen if e.g. we are in a small region with only one pageserver per AZ. 
tracing::info!( "no candidate nodes found for migrating shard {tenant_shard_id} within its home AZ", @@ -264,7 +264,7 @@ impl ChaosInjector { out_of_home_az.len() ); - out_of_home_az.shuffle(&mut thread_rng()); + out_of_home_az.shuffle(&mut rand::rng()); victims.extend(out_of_home_az.into_iter().take(batch_size)); } else { tracing::info!( @@ -274,7 +274,7 @@ impl ChaosInjector { ); victims.extend(out_of_home_az); - in_home_az.shuffle(&mut thread_rng()); + in_home_az.shuffle(&mut rand::rng()); victims.extend(in_home_az.into_iter().take(batch_size - victims.len())); } diff --git a/storage_controller/src/service/safekeeper_service.rs b/storage_controller/src/service/safekeeper_service.rs index 7521d7bd86..bc77a1a6b8 100644 --- a/storage_controller/src/service/safekeeper_service.rs +++ b/storage_controller/src/service/safekeeper_service.rs @@ -10,6 +10,7 @@ use crate::id_lock_map::trace_shared_lock; use crate::metrics; use crate::persistence::{ DatabaseError, SafekeeperTimelineOpKind, TimelinePendingOpPersistence, TimelinePersistence, + TimelineUpdate, }; use crate::safekeeper::Safekeeper; use crate::safekeeper_client::SafekeeperClient; @@ -311,6 +312,7 @@ impl Service { new_sk_set: None, cplane_notified_generation: 0, deleted_at: None, + sk_set_notified_generation: 0, }; let inserted = self .persistence @@ -454,19 +456,34 @@ impl Service { let persistence = TimelinePersistence { tenant_id: req.tenant_id.to_string(), timeline_id: req.timeline_id.to_string(), - start_lsn: Lsn::INVALID.into(), + start_lsn: req.start_lsn.into(), generation: 1, sk_set: req.sk_set.iter().map(|sk_id| sk_id.0 as i64).collect(), new_sk_set: None, cplane_notified_generation: 1, deleted_at: None, + sk_set_notified_generation: 1, }; - let inserted = self.persistence.insert_timeline(persistence).await?; + let inserted = self + .persistence + .insert_timeline(persistence.clone()) + .await?; if inserted { tracing::info!("imported timeline into db"); - } else { - tracing::info!("didn't import timeline into db, as it is already present in db"); + return Ok(()); } + tracing::info!("timeline already present in db, updating"); + + let update = TimelineUpdate { + tenant_id: persistence.tenant_id, + timeline_id: persistence.timeline_id, + start_lsn: persistence.start_lsn, + sk_set: persistence.sk_set, + new_sk_set: persistence.new_sk_set, + }; + self.persistence.update_timeline_unsafe(update).await?; + tracing::info!("timeline updated"); + Ok(()) } @@ -879,17 +896,21 @@ impl Service { /// If min_position is not None, validates that majority of safekeepers /// reached at least min_position. /// + /// If update_notified_generation is set, also updates sk_set_notified_generation + /// in the timelines table. + /// /// Return responses from safekeepers in the input order. async fn tenant_timeline_set_membership_quorum( self: &Arc, tenant_id: TenantId, timeline_id: TimelineId, safekeepers: &[Safekeeper], - config: &membership::Configuration, + mconf: &membership::Configuration, min_position: Option<(Term, Lsn)>, + update_notified_generation: bool, ) -> Result>, ApiError> { let req = TimelineMembershipSwitchRequest { - mconf: config.clone(), + mconf: mconf.clone(), }; const SK_SET_MEM_TIMELINE_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); @@ -930,28 +951,34 @@ impl Service { .await?; for res in results.iter().flatten() { - if res.current_conf.generation > config.generation { + if res.current_conf.generation > mconf.generation { // Antoher switch_membership raced us. 
return Err(ApiError::Conflict(format!( "received configuration with generation {} from safekeeper, but expected {}", - res.current_conf.generation, config.generation + res.current_conf.generation, mconf.generation ))); - } else if res.current_conf.generation < config.generation { + } else if res.current_conf.generation < mconf.generation { // Note: should never happen. // If we get a response, it should be at least the sent generation. tracing::error!( "received configuration with generation {} from safekeeper, but expected {}", res.current_conf.generation, - config.generation + mconf.generation ); return Err(ApiError::InternalServerError(anyhow::anyhow!( "received configuration with generation {} from safekeeper, but expected {}", res.current_conf.generation, - config.generation + mconf.generation ))); } } + if update_notified_generation { + self.persistence + .update_sk_set_notified_generation(tenant_id, timeline_id, mconf.generation) + .await?; + } + Ok(results) } @@ -1020,17 +1047,22 @@ impl Service { } /// Exclude a timeline from safekeepers in parallel with retries. - /// If an exclude request is unsuccessful, it will be added to - /// the reconciler, and after that the function will succeed. - async fn tenant_timeline_safekeeper_exclude( + /// + /// Assumes that the exclude requests are already persistent in the database. + /// + /// The function does best effort: if an exclude request is unsuccessful, + /// it will be added to the in-memory reconciler, and the function will succeed anyway. + /// + /// Might fail if there is error accessing the database. + async fn tenant_timeline_safekeeper_exclude_reconcile( self: &Arc, tenant_id: TenantId, timeline_id: TimelineId, safekeepers: &[Safekeeper], - config: &membership::Configuration, + mconf: &membership::Configuration, ) -> Result<(), ApiError> { let req = TimelineMembershipSwitchRequest { - mconf: config.clone(), + mconf: mconf.clone(), }; const SK_EXCLUDE_TIMELINE_TIMEOUT: Duration = Duration::from_secs(30); @@ -1048,25 +1080,32 @@ impl Service { let mut reconcile_requests = Vec::new(); - for (idx, res) in results.iter().enumerate() { - if res.is_err() { - let sk_id = safekeepers[idx].skp.id; - let pending_op = TimelinePendingOpPersistence { - tenant_id: tenant_id.to_string(), - timeline_id: timeline_id.to_string(), - generation: config.generation.into_inner() as i32, - op_kind: SafekeeperTimelineOpKind::Exclude, - sk_id, - }; - tracing::info!("writing pending exclude op for sk id {sk_id}"); - self.persistence.insert_pending_op(pending_op).await?; + fail::fail_point!("sk-migration-step-9-mid-exclude", |_| { + Err(ApiError::BadRequest(anyhow::anyhow!( + "failpoint sk-migration-step-9-mid-exclude" + ))) + }); + for (idx, res) in results.iter().enumerate() { + let sk_id = safekeepers[idx].skp.id; + let generation = mconf.generation.into_inner(); + + if res.is_ok() { + self.persistence + .remove_pending_op( + tenant_id, + Some(timeline_id), + NodeId(sk_id as u64), + generation, + ) + .await?; + } else { let req = ScheduleRequest { safekeeper: Box::new(safekeepers[idx].clone()), host_list: Vec::new(), tenant_id, timeline_id: Some(timeline_id), - generation: config.generation.into_inner(), + generation, kind: SafekeeperTimelineOpKind::Exclude, }; reconcile_requests.push(req); @@ -1193,6 +1232,22 @@ impl Service { } // It it is the same new_sk_set, we can continue the migration (retry). 
} else { + let prev_finished = timeline.cplane_notified_generation == timeline.generation + && timeline.sk_set_notified_generation == timeline.generation; + + if !prev_finished { + // The previous migration is committed, but the finish step failed. + // Safekeepers/cplane might not know about the last membership configuration. + // Retry the finish step to ensure smooth migration. + self.finish_safekeeper_migration_retry(tenant_id, timeline_id, &timeline) + .await?; + } + + if cur_sk_set == new_sk_set { + tracing::info!("timeline is already at the desired safekeeper set"); + return Ok(()); + } + // 3. No active migration yet. // Increment current generation and put desired_set to new_sk_set. generation = generation.next(); @@ -1204,8 +1259,15 @@ impl Service { generation, &cur_sk_set, Some(&new_sk_set), + &[], ) .await?; + + fail::fail_point!("sk-migration-after-step-3", |_| { + Err(ApiError::BadRequest(anyhow::anyhow!( + "failpoint sk-migration-after-step-3" + ))) + }); } let cur_safekeepers = self.get_safekeepers(&timeline.sk_set)?; @@ -1234,6 +1296,7 @@ impl Service { &cur_safekeepers, &joint_config, None, // no min position + true, // update notified generation ) .await?; @@ -1251,6 +1314,12 @@ impl Service { "safekeepers set membership updated", ); + fail::fail_point!("sk-migration-after-step-4", |_| { + Err(ApiError::BadRequest(anyhow::anyhow!( + "failpoint sk-migration-after-step-4" + ))) + }); + // 5. Initialize timeline on safekeeper(s) from new_sk_set where it doesn't exist yet // by doing pull_timeline from the majority of the current set. @@ -1270,6 +1339,12 @@ impl Service { ) .await?; + fail::fail_point!("sk-migration-after-step-5", |_| { + Err(ApiError::BadRequest(anyhow::anyhow!( + "failpoint sk-migration-after-step-5" + ))) + }); + // 6. Call POST bump_term(sync_term) on safekeepers from the new set. Success on majority is enough. // TODO(diko): do we need to bump timeline term? @@ -1285,9 +1360,16 @@ impl Service { &new_safekeepers, &joint_config, Some(sync_position), + false, // we're just waiting for sync position, don't update notified generation ) .await?; + fail::fail_point!("sk-migration-after-step-7", |_| { + Err(ApiError::BadRequest(anyhow::anyhow!( + "failpoint sk-migration-after-step-7" + ))) + }); + // 8. Create new_conf: Configuration incrementing joint_conf generation and // having new safekeeper set as sk_set and None new_sk_set. @@ -1299,45 +1381,55 @@ impl Service { new_members: None, }; - self.persistence - .update_timeline_membership(tenant_id, timeline_id, generation, &new_sk_set, None) - .await?; - - // TODO(diko): at this point we have already updated the timeline in the database, - // but we still need to notify safekeepers and cplane about the new configuration, - // and put delition of the timeline from the old safekeepers into the reconciler. - // Ideally it should be done atomically, but now it's not. - // Worst case: the timeline is not deleted from old safekeepers, - // the compute may require both quorums till the migration is retried and completed. 
- - self.tenant_timeline_set_membership_quorum( - tenant_id, - timeline_id, - &new_safekeepers, - &new_conf, - None, // no min position - ) - .await?; - let new_ids: HashSet = new_safekeepers.iter().map(|sk| sk.get_id()).collect(); let exclude_safekeepers = cur_safekeepers .into_iter() .filter(|sk| !new_ids.contains(&sk.get_id())) .collect::>(); - self.tenant_timeline_safekeeper_exclude( + let exclude_requests = exclude_safekeepers + .iter() + .map(|sk| TimelinePendingOpPersistence { + sk_id: sk.skp.id, + tenant_id: tenant_id.to_string(), + timeline_id: timeline_id.to_string(), + generation: generation.into_inner() as i32, + op_kind: SafekeeperTimelineOpKind::Exclude, + }) + .collect::>(); + + self.persistence + .update_timeline_membership( + tenant_id, + timeline_id, + generation, + &new_sk_set, + None, + &exclude_requests, + ) + .await?; + + fail::fail_point!("sk-migration-after-step-8", |_| { + Err(ApiError::BadRequest(anyhow::anyhow!( + "failpoint sk-migration-after-step-8" + ))) + }); + + // At this point we have already updated the timeline in the database, so the final + // membership configuration is committed and the migration is not abortable anymore. + // But safekeepers and cplane/compute still need to be notified about the new configuration. + // The [`Self::finish_safekeeper_migration`] does exactly that: notifies everyone about + // the new configuration and reconciles excluded safekeepers. + // If it fails, the safekeeper migration call should be retried. + + self.finish_safekeeper_migration( tenant_id, timeline_id, - &exclude_safekeepers, + &new_safekeepers, &new_conf, + &exclude_safekeepers, ) .await?; - // Notify cplane/compute about the membership change AFTER changing the membership on safekeepers. - // This way the compute will stop talking to excluded safekeepers only after we stop requiring to - // collect a quorum from them. - self.cplane_notify_safekeepers(tenant_id, timeline_id, &new_conf) - .await?; - Ok(()) } @@ -1381,6 +1473,130 @@ impl Service { ApiError::InternalServerError(anyhow::anyhow!( "failed to notify cplane about safekeeper membership change: {err}" )) - }) + })?; + + self.persistence + .update_cplane_notified_generation(tenant_id, timeline_id, mconf.generation) + .await?; + + Ok(()) + } + + /// Finish safekeeper migration. + /// + /// It is the last step of the safekeeper migration. + /// + /// Notifies safekeepers and cplane about the final membership configuration, + /// reconciles excluded safekeepers and updates *_notified_generation in the database. + async fn finish_safekeeper_migration( + self: &Arc, + tenant_id: TenantId, + timeline_id: TimelineId, + new_safekeepers: &[Safekeeper], + new_conf: &membership::Configuration, + exclude_safekeepers: &[Safekeeper], + ) -> Result<(), ApiError> { + // 9. Call PUT configuration on safekeepers from the new set, delivering them new_conf. + // Also try to exclude safekeepers and notify cplane about the membership change.
+ + self.tenant_timeline_set_membership_quorum( + tenant_id, + timeline_id, + new_safekeepers, + new_conf, + None, // no min position + true, // update notified generation + ) + .await?; + + fail::fail_point!("sk-migration-step-9-after-set-membership", |_| { + Err(ApiError::BadRequest(anyhow::anyhow!( + "failpoint sk-migration-step-9-after-set-membership" + ))) + }); + + self.tenant_timeline_safekeeper_exclude_reconcile( + tenant_id, + timeline_id, + exclude_safekeepers, + new_conf, + ) + .await?; + + fail::fail_point!("sk-migration-step-9-after-exclude", |_| { + Err(ApiError::BadRequest(anyhow::anyhow!( + "failpoint sk-migration-step-9-after-exclude" + ))) + }); + + // Notify cplane/compute about the membership change AFTER changing the membership on safekeepers. + // This way the compute will stop talking to excluded safekeepers only after we stop requiring to + // collect a quorum from them. + self.cplane_notify_safekeepers(tenant_id, timeline_id, new_conf) + .await?; + + fail::fail_point!("sk-migration-after-step-9", |_| { + Err(ApiError::BadRequest(anyhow::anyhow!( + "failpoint sk-migration-after-step-9" + ))) + }); + + Ok(()) + } + + /// Same as [`Self::finish_safekeeper_migration`], but restores the migration state from the database. + /// It's used when the migration failed during the finish step and we need to retry it. + async fn finish_safekeeper_migration_retry( + self: &Arc, + tenant_id: TenantId, + timeline_id: TimelineId, + timeline: &TimelinePersistence, + ) -> Result<(), ApiError> { + if timeline.new_sk_set.is_some() { + // Logical error, should never happen. + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "can't finish timeline migration for {tenant_id}/{timeline_id}: new_sk_set is not None" + ))); + } + + let cur_safekeepers = self.get_safekeepers(&timeline.sk_set)?; + let cur_sk_member_set = + Self::make_member_set(&cur_safekeepers).map_err(ApiError::InternalServerError)?; + + let mconf = membership::Configuration { + generation: SafekeeperGeneration::new(timeline.generation as u32), + members: cur_sk_member_set, + new_members: None, + }; + + // We might have failed between committing reconciliation requests and adding them to the in-memory reconciler. + // Reload them from the database. + let pending_ops = self + .persistence + .list_pending_ops_for_timeline(tenant_id, timeline_id) + .await?; + + let mut exclude_sk_ids = Vec::new(); + + for op in pending_ops { + if op.op_kind == SafekeeperTimelineOpKind::Exclude + && op.generation == timeline.generation + { + exclude_sk_ids.push(op.sk_id); + } + } + + let exclude_safekeepers = self.get_safekeepers(&exclude_sk_ids)?; + + self.finish_safekeeper_migration( + tenant_id, + timeline_id, + &cur_safekeepers, + &mconf, + &exclude_safekeepers, + ) + .await?; + + Ok(()) } } diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 0bfca5385e..f60378470e 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -131,14 +131,16 @@ pub(crate) struct TenantShard { #[serde(serialize_with = "read_last_error")] pub(crate) last_error: std::sync::Arc>>>, - /// Number of consecutive reconciliation errors that have occurred for this shard. + /// Number of consecutive [`crate::service::Service::reconcile_all`] iterations that have + /// scheduled a reconciliation for this shard. /// - /// When this count reaches MAX_CONSECUTIVE_RECONCILIATION_ERRORS, the tenant shard - /// will be countered as keep-failing in `reconcile_all` calculations.
This will lead to
-    /// allowing optimizations to run even with some failing shards.
+    /// If this reaches `MAX_CONSECUTIVE_RECONCILES`, the shard is considered "stuck" and will be
+    /// ignored when deciding whether optimizations can run. This includes both successful and failed
+    /// reconciliations.
     ///
-    /// The counter is reset to 0 after a successful reconciliation.
-    pub(crate) consecutive_errors_count: usize,
+    /// Incremented in [`crate::service::Service::process_result`], and reset to 0 when
+    /// [`crate::service::Service::reconcile_all`] determines no reconciliation is needed for this shard.
+    pub(crate) consecutive_reconciles_count: usize,

     /// If we have a pending compute notification that for some reason we weren't able to send,
     /// set this to true. If this is set, calls to [`Self::get_reconcile_needed`] will return Yes
@@ -603,7 +605,7 @@ impl TenantShard {
             waiter: Arc::new(SeqWait::new(Sequence(0))),
             error_waiter: Arc::new(SeqWait::new(Sequence(0))),
             last_error: Arc::default(),
-            consecutive_errors_count: 0,
+            consecutive_reconciles_count: 0,
             pending_compute_notification: false,
             scheduling_policy: ShardSchedulingPolicy::default(),
             preferred_node: None,
@@ -1272,7 +1274,9 @@ impl TenantShard {
     }

     /// Return true if the optimization was really applied: it will not be applied if the optimization's
-    /// sequence is behind this tenant shard's
+    /// sequence is behind this tenant shard's or if the intent state proposed by the optimization
+    /// is not compatible with the current intent state. The latter may happen when the background
+    /// reconcile loop runs concurrently with HTTP-driven optimizations.
     pub(crate) fn apply_optimization(
         &mut self,
         scheduler: &mut Scheduler,
@@ -1282,6 +1286,15 @@ impl TenantShard {
             return false;
         }

+        if !self.validate_optimization(&optimization) {
+            tracing::info!(
+                "Skipping optimization for {} because it does not match current intent: {:?}",
+                self.tenant_shard_id,
+                optimization,
+            );
+            return false;
+        }
+
         metrics::METRICS_REGISTRY
             .metrics_group
             .storage_controller_schedule_optimization
@@ -1322,6 +1335,34 @@ impl TenantShard {
         true
     }

+    /// Check that the desired modifications to the intent state are compatible with
+    /// the current intent state.
+    fn validate_optimization(&self, optimization: &ScheduleOptimization) -> bool {
+        match optimization.action {
+            ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment {
+                old_attached_node_id,
+                new_attached_node_id,
+            }) => {
+                self.intent.attached == Some(old_attached_node_id)
+                    && self.intent.secondary.contains(&new_attached_node_id)
+            }
+            ScheduleOptimizationAction::ReplaceSecondary(ReplaceSecondary {
+                old_node_id: _,
+                new_node_id,
+            }) => {
+                // It's legal to remove a secondary that is not present in the intent state
+                !self.intent.secondary.contains(&new_node_id)
+            }
+            ScheduleOptimizationAction::CreateSecondary(new_node_id) => {
+                !self.intent.secondary.contains(&new_node_id)
+            }
+            ScheduleOptimizationAction::RemoveSecondary(_) => {
+                // It's legal to remove a secondary that is not present in the intent state
+                true
+            }
+        }
+    }
+
     /// When a shard has several secondary locations, we need to pick one in situations where
     /// we promote one of them to an attached location:
     ///  - When draining a node for restart
@@ -1570,7 +1611,13 @@ impl TenantShard {
         // Update result counter
         let outcome_label = match &result {
-            Ok(_) => ReconcileOutcome::Success,
+            Ok(_) => {
+                if reconciler.compute_notify_failure {
+                    ReconcileOutcome::SuccessNoNotify
+                } else {
+                    ReconcileOutcome::Success
+                }
+            }
Err(ReconcileError::Cancel) => ReconcileOutcome::Cancel, Err(_) => ReconcileOutcome::Error, }; @@ -1869,7 +1916,7 @@ impl TenantShard { waiter: Arc::new(SeqWait::new(Sequence::initial())), error_waiter: Arc::new(SeqWait::new(Sequence::initial())), last_error: Arc::default(), - consecutive_errors_count: 0, + consecutive_reconciles_count: 0, pending_compute_notification: false, delayed_reconcile: false, scheduling_policy: serde_json::from_str(&tsp.scheduling_policy).unwrap(), diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py index 1d278095ce..c43445e89d 100644 --- a/test_runner/fixtures/endpoint/http.py +++ b/test_runner/fixtures/endpoint/http.py @@ -66,6 +66,12 @@ class EndpointHttpClient(requests.Session): res.raise_for_status() return res.json() + def autoscaling_metrics(self): + res = self.get(f"http://localhost:{self.external_port}/autoscaling_metrics") + res.raise_for_status() + log.debug("raw compute metrics: %s", res.text) + return res.text + def prewarm_lfc_status(self) -> dict[str, str]: res = self.get(self.prewarm_url) res.raise_for_status() diff --git a/test_runner/fixtures/neon_api.py b/test_runner/fixtures/neon_api.py index a41856fb5a..c4dd9f5c2e 100644 --- a/test_runner/fixtures/neon_api.py +++ b/test_runner/fixtures/neon_api.py @@ -24,6 +24,7 @@ def connection_parameters_to_env(params: dict[str, str]) -> dict[str, str]: # Some API calls not yet implemented. # You may want to copy not-yet-implemented methods from the PR https://github.com/neondatabase/neon/pull/11305 +@final class NeonAPI: def __init__(self, neon_api_key: str, neon_api_base_url: str): self.__neon_api_key = neon_api_key @@ -171,7 +172,7 @@ class NeonAPI: protected: bool | None = None, archived: bool | None = None, init_source: str | None = None, - add_endpoint=True, + add_endpoint: bool = True, ) -> dict[str, Any]: data: dict[str, Any] = {} if add_endpoint: diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py index 1abd3396e4..5ad00d155e 100644 --- a/test_runner/fixtures/neon_cli.py +++ b/test_runner/fixtures/neon_cli.py @@ -400,6 +400,7 @@ class NeonLocalCli(AbstractNeonCli): timeout_in_seconds: int | None = None, instance_id: int | None = None, base_port: int | None = None, + handle_ps_local_disk_loss: bool | None = None, ): cmd = ["storage_controller", "start"] if timeout_in_seconds is not None: @@ -408,6 +409,10 @@ class NeonLocalCli(AbstractNeonCli): cmd.append(f"--instance-id={instance_id}") if base_port is not None: cmd.append(f"--base-port={base_port}") + if handle_ps_local_disk_loss is not None: + cmd.append( + f"--handle-ps-local-disk-loss={'true' if handle_ps_local_disk_loss else 'false'}" + ) return self.raw_cli(cmd) def storage_controller_stop(self, immediate: bool, instance_id: int | None = None): @@ -503,6 +508,7 @@ class NeonLocalCli(AbstractNeonCli): pageserver_id: int | None = None, allow_multiple=False, update_catalog: bool = False, + privileged_role_name: str | None = None, ) -> subprocess.CompletedProcess[str]: args = [ "endpoint", @@ -534,6 +540,8 @@ class NeonLocalCli(AbstractNeonCli): args.extend(["--allow-multiple"]) if update_catalog: args.extend(["--update-catalog"]) + if privileged_role_name is not None: + args.extend(["--privileged-role-name", privileged_role_name]) res = self.raw_cli(args) res.check_returncode() diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index b9fff05c6c..88919fe888 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ 
b/test_runner/fixtures/neon_fixtures.py @@ -728,7 +728,7 @@ class NeonEnvBuilder: # NB: neon_local rewrites postgresql.conf on each start based on neon_local config. No need to patch it. # However, in this new NeonEnv, the pageservers and safekeepers listen on different ports, and the storage # controller will currently reject re-attach requests from them because the NodeMetadata isn't identical. - # So, from_repo_dir patches up the the storcon database. + # So, from_repo_dir patches up the storcon database. patch_script_path = self.repo_dir / "storage_controller_db.startup.sql" assert not patch_script_path.exists() patch_script = "" @@ -1938,9 +1938,12 @@ class NeonStorageController(MetricsGetter, LogUtils): timeout_in_seconds: int | None = None, instance_id: int | None = None, base_port: int | None = None, + handle_ps_local_disk_loss: bool | None = None, ) -> Self: assert not self.running - self.env.neon_cli.storage_controller_start(timeout_in_seconds, instance_id, base_port) + self.env.neon_cli.storage_controller_start( + timeout_in_seconds, instance_id, base_port, handle_ps_local_disk_loss + ) self.running = True return self @@ -2119,11 +2122,14 @@ class NeonStorageController(MetricsGetter, LogUtils): headers=self.headers(TokenScope.ADMIN), ) - def node_delete(self, node_id): + def node_delete(self, node_id, force: bool = False): log.info(f"node_delete({node_id})") + query = f"{self.api}/control/v1/node/{node_id}/delete" + if force: + query += "?force=true" self.request( "PUT", - f"{self.api}/control/v1/node/{node_id}/delete", + query, headers=self.headers(TokenScope.ADMIN), ) @@ -2835,10 +2841,13 @@ class NeonProxiedStorageController(NeonStorageController): timeout_in_seconds: int | None = None, instance_id: int | None = None, base_port: int | None = None, + handle_ps_local_disk_loss: bool | None = None, ) -> Self: assert instance_id is not None and base_port is not None - self.env.neon_cli.storage_controller_start(timeout_in_seconds, instance_id, base_port) + self.env.neon_cli.storage_controller_start( + timeout_in_seconds, instance_id, base_port, handle_ps_local_disk_loss + ) self.instances[instance_id] = {"running": True} self.running = True @@ -4118,6 +4127,294 @@ class NeonAuthBroker: self._popen.kill() +class NeonLocalProxy(LogUtils): + """ + An object managing a local_proxy instance for rest broker testing. + The local_proxy serves as a direct connection to VanillaPostgres. 
+ """ + + def __init__( + self, + neon_binpath: Path, + test_output_dir: Path, + http_port: int, + metrics_port: int, + vanilla_pg: VanillaPostgres, + config_path: Path | None = None, + ): + self.neon_binpath = neon_binpath + self.test_output_dir = test_output_dir + self.http_port = http_port + self.metrics_port = metrics_port + self.vanilla_pg = vanilla_pg + self.config_path = config_path or (test_output_dir / "local_proxy.json") + self.host = "127.0.0.1" + self.running = False + self.logfile = test_output_dir / "local_proxy.log" + self._popen: subprocess.Popen[bytes] | None = None + super().__init__(logfile=self.logfile) + + def start(self) -> Self: + assert self._popen is None + assert not self.running + + # Ensure vanilla_pg is running + if not self.vanilla_pg.is_running(): + self.vanilla_pg.start() + + args = [ + str(self.neon_binpath / "local_proxy"), + "--http", + f"{self.host}:{self.http_port}", + "--metrics", + f"{self.host}:{self.metrics_port}", + "--postgres", + f"127.0.0.1:{self.vanilla_pg.default_options['port']}", + "--config-path", + str(self.config_path), + "--disable-pg-session-jwt", + ] + + logfile = open(self.logfile, "w") + self._popen = subprocess.Popen(args, stdout=logfile, stderr=logfile) + self.running = True + self._wait_until_ready() + return self + + def stop(self) -> Self: + if self._popen is not None and self.running: + self._popen.terminate() + try: + self._popen.wait(timeout=5) + except subprocess.TimeoutExpired: + log.warning("failed to gracefully terminate local_proxy; killing") + self._popen.kill() + self.running = False + return self + + def get_binary_version(self) -> str: + """Get the version string of the local_proxy binary""" + try: + result = subprocess.run( + [str(self.neon_binpath / "local_proxy"), "--version"], + capture_output=True, + text=True, + timeout=10, + ) + return result.stdout.strip() + except (subprocess.TimeoutExpired, subprocess.CalledProcessError): + return "" + + @backoff.on_exception(backoff.expo, requests.exceptions.RequestException, max_time=10) + def _wait_until_ready(self): + assert self._popen and self._popen.poll() is None, ( + "Local proxy exited unexpectedly. Check test log." + ) + requests.get(f"http://{self.host}:{self.http_port}/metrics") + + def get_metrics(self) -> str: + response = requests.get(f"http://{self.host}:{self.metrics_port}/metrics") + return response.text + + def assert_no_errors(self): + # Define allowed error patterns for local_proxy + allowed_errors = [ + # Add patterns as needed + ] + not_allowed = [ + "error", + "panic", + "failed", + ] + + for na in not_allowed: + if na not in allowed_errors: + assert not self.log_contains(na), f"Found disallowed error pattern: {na}" + + def __enter__(self) -> Self: + return self + + def __exit__( + self, + exc_type: type[BaseException] | None, + exc_value: BaseException | None, + traceback: TracebackType | None, + ): + self.stop() + + +class NeonRestBrokerProxy(LogUtils): + """ + An object managing a proxy instance configured as both auth broker and rest broker. + This is the main proxy binary with --is-auth-broker and --is-rest-broker flags. 
+ """ + + def __init__( + self, + neon_binpath: Path, + test_output_dir: Path, + wss_port: int, + http_port: int, + mgmt_port: int, + config_path: Path | None = None, + ): + self.neon_binpath = neon_binpath + self.test_output_dir = test_output_dir + self.wss_port = wss_port + self.http_port = http_port + self.mgmt_port = mgmt_port + self.config_path = config_path or (test_output_dir / "rest_broker_proxy.json") + self.host = "127.0.0.1" + self.running = False + self.logfile = test_output_dir / "rest_broker_proxy.log" + self._popen: subprocess.Popen[Any] | None = None + + def start(self) -> Self: + if self.running: + return self + + # Generate self-signed TLS certificates + cert_path = self.test_output_dir / "server.crt" + key_path = self.test_output_dir / "server.key" + + if not cert_path.exists() or not key_path.exists(): + import subprocess + + log.info("Generating self-signed TLS certificate for rest broker") + subprocess.run( + [ + "openssl", + "req", + "-new", + "-x509", + "-days", + "365", + "-nodes", + "-text", + "-out", + str(cert_path), + "-keyout", + str(key_path), + "-subj", + "/CN=*.local.neon.build", + ], + check=True, + ) + + log.info( + f"Starting rest broker proxy on WSS port {self.wss_port}, HTTP port {self.http_port}" + ) + + cmd = [ + str(self.neon_binpath / "proxy"), + "-c", + str(cert_path), + "-k", + str(key_path), + "--is-auth-broker", + "true", + "--is-rest-broker", + "true", + "--wss", + f"{self.host}:{self.wss_port}", + "--http", + f"{self.host}:{self.http_port}", + "--mgmt", + f"{self.host}:{self.mgmt_port}", + "--auth-backend", + "local", + "--config-path", + str(self.config_path), + ] + + log.info(f"Starting rest broker proxy with command: {' '.join(cmd)}") + + with open(self.logfile, "w") as logfile: + self._popen = subprocess.Popen( + cmd, + stdout=logfile, + stderr=subprocess.STDOUT, + cwd=self.test_output_dir, + env={ + **os.environ, + "RUST_LOG": "info", + "LOGFMT": "text", + "OTEL_SDK_DISABLED": "true", + }, + ) + + self.running = True + self._wait_until_ready() + return self + + def stop(self) -> Self: + if not self.running: + return self + + log.info("Stopping rest broker proxy") + + if self._popen is not None: + self._popen.terminate() + try: + self._popen.wait(timeout=10) + except subprocess.TimeoutExpired: + log.warning("failed to gracefully terminate rest broker proxy; killing") + self._popen.kill() + + self.running = False + return self + + def get_binary_version(self) -> str: + cmd = [str(self.neon_binpath / "proxy"), "--version"] + res = subprocess.run(cmd, capture_output=True, text=True, check=True) + return res.stdout.strip() + + @backoff.on_exception(backoff.expo, requests.exceptions.RequestException, max_time=10) + def _wait_until_ready(self): + # Check if the WSS port is ready using a simple HTTPS request + # REST API is served on the WSS port with HTTPS + requests.get(f"https://{self.host}:{self.wss_port}/", timeout=1, verify=False) + # Any response (even error) means the server is up - we just need to connect + + def get_metrics(self) -> str: + # Metrics are still on the HTTP port + response = requests.get(f"http://{self.host}:{self.http_port}/metrics", timeout=5) + response.raise_for_status() + return response.text + + def assert_no_errors(self): + # Define allowed error patterns for rest broker proxy + allowed_errors = [ + "connection closed before message completed", + "connection reset by peer", + "broken pipe", + "client disconnected", + "Authentication failed", + "connection timed out", + "no connection available", + "Pool dropped", + 
] + + with open(self.logfile) as f: + for line in f: + if "ERROR" in line or "FATAL" in line: + if not any(allowed in line for allowed in allowed_errors): + raise AssertionError( + f"Found error in rest broker proxy log: {line.strip()}" + ) + + def __enter__(self) -> Self: + return self + + def __exit__( + self, + exc_type: type[BaseException] | None, + exc_value: BaseException | None, + traceback: TracebackType | None, + ): + self.stop() + + @pytest.fixture(scope="function") def link_proxy( port_distributor: PortDistributor, neon_binpath: Path, test_output_dir: Path @@ -4200,6 +4497,81 @@ def static_proxy( yield proxy +@pytest.fixture(scope="function") +def local_proxy( + vanilla_pg: VanillaPostgres, + port_distributor: PortDistributor, + neon_binpath: Path, + test_output_dir: Path, +) -> Iterator[NeonLocalProxy]: + """Local proxy that connects directly to vanilla postgres for rest broker testing.""" + + # Start vanilla_pg without database bootstrapping + vanilla_pg.start() + + http_port = port_distributor.get_port() + metrics_port = port_distributor.get_port() + + with NeonLocalProxy( + neon_binpath=neon_binpath, + test_output_dir=test_output_dir, + http_port=http_port, + metrics_port=metrics_port, + vanilla_pg=vanilla_pg, + ) as proxy: + proxy.start() + yield proxy + + +@pytest.fixture(scope="function") +def local_proxy_fixed_port( + vanilla_pg: VanillaPostgres, + neon_binpath: Path, + test_output_dir: Path, +) -> Iterator[NeonLocalProxy]: + """Local proxy that connects directly to vanilla postgres on the hardcoded port 7432.""" + + # Start vanilla_pg without database bootstrapping + vanilla_pg.start() + + # Use the hardcoded port that the rest broker proxy expects + http_port = 7432 + metrics_port = 7433 # Use a different port for metrics + + with NeonLocalProxy( + neon_binpath=neon_binpath, + test_output_dir=test_output_dir, + http_port=http_port, + metrics_port=metrics_port, + vanilla_pg=vanilla_pg, + ) as proxy: + proxy.start() + yield proxy + + +@pytest.fixture(scope="function") +def rest_broker_proxy( + port_distributor: PortDistributor, + neon_binpath: Path, + test_output_dir: Path, +) -> Iterator[NeonRestBrokerProxy]: + """Rest broker proxy that handles both auth broker and rest broker functionality.""" + + wss_port = port_distributor.get_port() + http_port = port_distributor.get_port() + mgmt_port = port_distributor.get_port() + + with NeonRestBrokerProxy( + neon_binpath=neon_binpath, + test_output_dir=test_output_dir, + wss_port=wss_port, + http_port=http_port, + mgmt_port=mgmt_port, + ) as proxy: + proxy.start() + yield proxy + + @pytest.fixture(scope="function") def neon_authorize_jwk() -> jwk.JWK: kid = str(uuid.uuid4()) @@ -4324,6 +4696,7 @@ class Endpoint(PgProtocol, LogUtils): pageserver_id: int | None = None, allow_multiple: bool = False, update_catalog: bool = False, + privileged_role_name: str | None = None, ) -> Self: """ Create a new Postgres endpoint. 
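The `privileged_role_name` parameter added to `Endpoint.create` above flows through `NeonLocalCli.endpoint_create` to the new `--privileged-role-name` flag. A hedged usage sketch; the role name is only an example, and the regression test later in this patch follows the same pattern:

```python
# Sketch only: create an endpoint whose privileged role is named something other than
# neon_superuser. update_catalog=True mirrors what the test in this patch does so the
# role is actually bootstrapped in the catalog.
def start_endpoint_with_custom_superuser(env, branch: str = "main"):
    ep = env.endpoints.create(
        branch,
        privileged_role_name="my_superuser",  # example name, not a fixed convention
        update_catalog=True,
    )
    ep.start()
    return ep
```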
@@ -4351,6 +4724,7 @@ class Endpoint(PgProtocol, LogUtils): pageserver_id=pageserver_id, allow_multiple=allow_multiple, update_catalog=update_catalog, + privileged_role_name=privileged_role_name, ) path = Path("endpoints") / self.endpoint_id / "pgdata" self.pgdata_dir = self.env.repo_dir / path @@ -4800,6 +5174,7 @@ class EndpointFactory: config_lines: list[str] | None = None, pageserver_id: int | None = None, update_catalog: bool = False, + privileged_role_name: str | None = None, ) -> Endpoint: ep = Endpoint( self.env, @@ -4823,6 +5198,7 @@ class EndpointFactory: config_lines=config_lines, pageserver_id=pageserver_id, update_catalog=update_catalog, + privileged_role_name=privileged_role_name, ) def stop_all(self, fail_on_error=True) -> Self: @@ -5417,6 +5793,7 @@ SKIP_FILES = frozenset( "postmaster.pid", "pg_control", "pg_dynshmem", + "neon-communicator.socket", ) ) diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index 59249f31ad..007f80ee5e 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -152,6 +152,8 @@ DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [ ".*reconciler.*neon_local error.*", # Tenant rate limits may fire in tests that submit lots of API requests. ".*tenant \\S+ is rate limited.*", + # Reconciliations may get stuck/delayed e.g. in chaos tests. + ".*background_reconcile: Shard reconciliation is stuck.*", ] diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 23b9d1c8c9..f95b0ee4d1 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -847,7 +847,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): return res_json def timeline_lsn_lease( - self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, lsn: Lsn + self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, lsn: Lsn, **kwargs ): data = { "lsn": str(lsn), @@ -857,6 +857,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): res = self.post( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/lsn_lease", json=data, + **kwargs, ) self.verbose_error(res) res_json = res.json() diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 0d7345cc82..1f80c2a290 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -741,3 +741,29 @@ def shared_buffers_for_max_cu(max_cu: float) -> str: sharedBuffersMb = int(max(128, (1023 + maxBackends * 256) / 1024)) sharedBuffers = int(sharedBuffersMb * 1024 / 8) return str(sharedBuffers) + + +def skip_if_proxy_lacks_rest_broker(reason: str = "proxy was built without 'rest_broker' feature"): + # Determine the binary path using the same logic as neon_binpath fixture + def has_rest_broker_feature(): + # Find the neon binaries + if env_neon_bin := os.environ.get("NEON_BIN"): + binpath = Path(env_neon_bin) + else: + base_dir = Path(__file__).parents[2] # Same as BASE_DIR in paths.py + build_type = os.environ.get("BUILD_TYPE", "debug") + binpath = base_dir / "target" / build_type + + proxy_bin = binpath / "proxy" + if not proxy_bin.exists(): + return False + + try: + cmd = [str(proxy_bin), "--help"] + result = subprocess.run(cmd, capture_output=True, text=True, check=True, timeout=10) + help_output = result.stdout + return "--is-rest-broker" in help_output + except (subprocess.CalledProcessError, subprocess.TimeoutExpired, FileNotFoundError): + return 
False + + return pytest.mark.skipif(not has_rest_broker_feature(), reason=reason) diff --git a/test_runner/performance/test_sharding_autosplit.py b/test_runner/performance/test_sharding_autosplit.py index 0bb210db23..1b77831b75 100644 --- a/test_runner/performance/test_sharding_autosplit.py +++ b/test_runner/performance/test_sharding_autosplit.py @@ -73,6 +73,11 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): ".*Local notification hook failed.*", ".*Marking shard.*for notification retry.*", ".*Failed to notify compute.*", + # As an optimization, the storage controller kicks the downloads on the secondary + # after the shard split. However, secondaries are created async, so it's possible + # that the intent state was modified, but the actual secondary hasn't been created, + # which results in an error. + ".*Error calling secondary download after shard split.*", ] ) diff --git a/test_runner/random_ops/test_random_ops.py b/test_runner/random_ops/test_random_ops.py index b85ea16315..f8ca3f607f 100644 --- a/test_runner/random_ops/test_random_ops.py +++ b/test_runner/random_ops/test_random_ops.py @@ -711,6 +711,9 @@ def test_api_random( # To not go to the past where pgbench tables do not exist time.sleep(1) project.min_time = datetime.now(UTC) + # To not go to the past where pgbench tables do not exist + time.sleep(1) + project.min_time = datetime.now(UTC) for _ in range(num_operations): log.info("Starting action #%s", _ + 1) while not do_action( diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index 1209b3a818..0d92bf8406 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -24,10 +24,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder): [ ".*get_values_reconstruct_data for layer .*", ".*could not find data for key.*", - ".*is not active. Current state: Broken.*", ".*will not become active. Current state: Broken.*", - ".*failed to load metadata.*", - ".*load failed.*load local timeline.*", ".*: layer load failed, assuming permanent failure:.*", ".*failed to get checkpoint bytes.*", ".*failed to get control bytes.*", diff --git a/test_runner/regress/test_communicator_metrics_exporter.py b/test_runner/regress/test_communicator_metrics_exporter.py new file mode 100644 index 0000000000..0e3e76910a --- /dev/null +++ b/test_runner/regress/test_communicator_metrics_exporter.py @@ -0,0 +1,54 @@ +from __future__ import annotations + +import os +from typing import TYPE_CHECKING + +import pytest +import requests +import requests_unixsocket # type: ignore [import-untyped] +from fixtures.metrics import parse_metrics + +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnv + +NEON_COMMUNICATOR_SOCKET_NAME = "neon-communicator.socket" + + +def test_communicator_metrics(neon_simple_env: NeonEnv): + """ + Test the communicator's built-in HTTP prometheus exporter + """ + env = neon_simple_env + + endpoint = env.endpoints.create("main") + endpoint.start() + + # Change current directory to the data directory, so that we can use + # a short relative path to refer to the socket. (There's a 100 char + # limitation on the path.) + os.chdir(str(endpoint.pgdata_dir)) + session = requests_unixsocket.Session() + r = session.get(f"http+unix://{NEON_COMMUNICATOR_SOCKET_NAME}/metrics") + assert r.status_code == 200, f"got response {r.status_code}: {r.text}" + + # quick test that the endpoint returned something expected. 
(We don't validate + # that the metrics returned are sensible.) + m = parse_metrics(r.text) + m.query_one("lfc_hits") + m.query_one("lfc_misses") + + # Test panic handling. The /debug/panic endpoint raises a Rust panic. It's + # expected to unwind and drop the HTTP connection without response, but not + # kill the process or the server. + with pytest.raises( + requests.ConnectionError, match="Remote end closed connection without response" + ): + r = session.get(f"http+unix://{NEON_COMMUNICATOR_SOCKET_NAME}/debug/panic") + assert r.status_code == 500 + + # Test that subsequent requests after the panic still work. + r = session.get(f"http+unix://{NEON_COMMUNICATOR_SOCKET_NAME}/metrics") + assert r.status_code == 200, f"got response {r.status_code}: {r.text}" + m = parse_metrics(r.text) + m.query_one("lfc_hits") + m.query_one("lfc_misses") diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 963a19d640..76485c8321 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -687,7 +687,7 @@ def test_sharding_compaction( for _i in range(0, 10): # Each of these does some writes then a checkpoint: because we set image_creation_threshold to 1, # these should result in image layers each time we write some data into a shard, and also shards - # recieving less data hitting their "empty image layer" path (wherre they should skip writing the layer, + # receiving less data hitting their "empty image layer" path (where they should skip writing the layer, # rather than asserting) workload.churn_rows(64) diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index a4d2bf8d9b..734887c5b3 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -187,19 +187,21 @@ def test_create_snapshot( env.pageserver.stop() env.storage_controller.stop() - # Directory `compatibility_snapshot_dir` is uploaded to S3 in a workflow, keep the name in sync with it - compatibility_snapshot_dir = ( + # Directory `new_compatibility_snapshot_dir` is uploaded to S3 in a workflow, keep the name in sync with it + new_compatibility_snapshot_dir = ( top_output_dir / f"compatibility_snapshot_pg{pg_version.v_prefixed}" ) - if compatibility_snapshot_dir.exists(): - shutil.rmtree(compatibility_snapshot_dir) + if new_compatibility_snapshot_dir.exists(): + shutil.rmtree(new_compatibility_snapshot_dir) shutil.copytree( test_output_dir, - compatibility_snapshot_dir, - ignore=shutil.ignore_patterns("pg_dynshmem"), + new_compatibility_snapshot_dir, + ignore=shutil.ignore_patterns("pg_dynshmem", "neon-communicator.socket"), ) + log.info(f"Copied new compatibility snapshot dir to: {new_compatibility_snapshot_dir}") + # check_neon_works does recovery from WAL => the compatibility snapshot's WAL is old => will log this warning ingest_lag_log_line = ".*ingesting record with timestamp lagging more than wait_lsn_timeout.*" @@ -218,6 +220,7 @@ def test_backward_compatibility( """ Test that the new binaries can read old data """ + log.info(f"Using snapshot dir at {compatibility_snapshot_dir}") neon_env_builder.num_safekeepers = 3 env = neon_env_builder.from_repo_dir(compatibility_snapshot_dir / "repo") env.pageserver.allowed_errors.append(ingest_lag_log_line) @@ -242,7 +245,6 @@ def test_forward_compatibility( test_output_dir: Path, top_output_dir: Path, pg_version: PgVersion, - compatibility_snapshot_dir: Path, compute_reconfigure_listener: ComputeReconfigure, ): """ @@ -266,8 
+268,14 @@ def test_forward_compatibility( neon_env_builder.neon_binpath = neon_env_builder.compatibility_neon_binpath neon_env_builder.pg_distrib_dir = neon_env_builder.compatibility_pg_distrib_dir + # Note that we are testing with new data, so we should use `new_compatibility_snapshot_dir`, which is created by test_create_snapshot. + new_compatibility_snapshot_dir = ( + top_output_dir / f"compatibility_snapshot_pg{pg_version.v_prefixed}" + ) + + log.info(f"Using snapshot dir at {new_compatibility_snapshot_dir}") env = neon_env_builder.from_repo_dir( - compatibility_snapshot_dir / "repo", + new_compatibility_snapshot_dir / "repo", ) # there may be an arbitrary number of unrelated tests run between create_snapshot and here env.pageserver.allowed_errors.append(ingest_lag_log_line) @@ -296,7 +304,7 @@ def test_forward_compatibility( check_neon_works( env, test_output_dir=test_output_dir, - sql_dump_path=compatibility_snapshot_dir / "dump.sql", + sql_dump_path=new_compatibility_snapshot_dir / "dump.sql", repo_dir=env.repo_dir, ) diff --git a/test_runner/regress/test_feature_flag.py b/test_runner/regress/test_feature_flag.py index c6c192b6f1..6c1e3484fa 100644 --- a/test_runner/regress/test_feature_flag.py +++ b/test_runner/regress/test_feature_flag.py @@ -50,11 +50,15 @@ def test_feature_flag(neon_env_builder: NeonEnvBuilder): )["result"] ) + env.endpoints.create_start("main") # trigger basebackup env.pageserver.http_client().force_refresh_feature_flag(env.initial_tenant) # Check if the properties exist result = env.pageserver.http_client().evaluate_feature_flag_multivariate( env.initial_tenant, "test-feature-flag" ) + assert "tenant_remote_size_mb" in result["properties"] + assert "tenant_db_count_max" in result["properties"] + assert "tenant_rel_count_max" in result["properties"] assert "tenant_id" in result["properties"] diff --git a/test_runner/regress/test_hcc_handling_ps_data_loss.py b/test_runner/regress/test_hcc_handling_ps_data_loss.py new file mode 100644 index 0000000000..35d3b72923 --- /dev/null +++ b/test_runner/regress/test_hcc_handling_ps_data_loss.py @@ -0,0 +1,47 @@ +import shutil + +from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.utils import query_scalar + + +def test_hcc_handling_ps_data_loss( + neon_env_builder: NeonEnvBuilder, +): + """ + Test that following a pageserver local data loss event, the system can recover automatically (i.e. + rehydrating the restarted pageserver from remote storage) without manual intervention. The + pageserver indicates to the storage controller that it has restarted without any local tenant + data in its "reattach" request and the storage controller uses this information to detect the + data loss condition and reconfigure the pageserver as necessary. + """ + env = neon_env_builder.init_configs() + env.broker.start() + env.storage_controller.start(handle_ps_local_disk_loss=True) + env.pageserver.start() + for sk in env.safekeepers: + sk.start() + + # create new nenant + tenant_id, _ = env.create_tenant(shard_count=4) + + endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) + with endpoint.cursor() as cur: + cur.execute("SELECT pg_logical_emit_message(false, 'neon-test', 'between inserts')") + cur.execute("CREATE DATABASE testdb") + + with endpoint.cursor(dbname="testdb") as cur: + cur.execute("CREATE TABLE tbl_one_hundred_rows AS SELECT generate_series(1,100)") + endpoint.stop() + + # Kill the pageserver, remove the `tenants/` directory, and restart. 
This simulates a pageserver + # that restarted with the same ID but has lost all its local disk data. + env.pageserver.stop(immediate=True) + shutil.rmtree(env.pageserver.tenant_dir()) + env.pageserver.start() + + # Test that the endpoint can start and query the database after the pageserver restarts. This + # indirectly tests that the pageserver was able to rehydrate the tenant data it lost from remote + # storage automatically. + endpoint.start() + with endpoint.cursor(dbname="testdb") as cur: + assert query_scalar(cur, "SELECT count(*) FROM tbl_one_hundred_rows") == 100 diff --git a/test_runner/regress/test_lfc_working_set_approximation.py b/test_runner/regress/test_lfc_working_set_approximation.py index a28bc3d047..2ee15b60fd 100644 --- a/test_runner/regress/test_lfc_working_set_approximation.py +++ b/test_runner/regress/test_lfc_working_set_approximation.py @@ -6,6 +6,7 @@ from typing import TYPE_CHECKING import pytest from fixtures.log_helper import log +from fixtures.metrics import parse_metrics from fixtures.utils import USE_LFC, query_scalar if TYPE_CHECKING: @@ -75,10 +76,24 @@ WITH (fillfactor='100'); cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid = 104242") cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid = 204242") # verify working set size after some index access of a few select pages only - blocks = query_scalar(cur, "select approximate_working_set_size(true)") + blocks = query_scalar(cur, "select approximate_working_set_size(false)") log.info(f"working set size after some index access of a few select pages only {blocks}") assert blocks < 20 + # Also test the metrics from the /autoscaling_metrics endpoint + autoscaling_metrics = endpoint.http_client().autoscaling_metrics() + log.debug(f"Raw metrics: {autoscaling_metrics}") + m = parse_metrics(autoscaling_metrics) + + http_estimate = m.query_one( + "lfc_approximate_working_set_size_windows", + { + "duration_seconds": "60", + }, + ).value + log.info(f"http estimate: {http_estimate}, blocks: {blocks}") + assert http_estimate > 0 and http_estimate < 20 + @pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping") def test_sliding_working_set_approximation(neon_simple_env: NeonEnv): diff --git a/test_runner/regress/test_neon_superuser.py b/test_runner/regress/test_neon_superuser.py index f99d79e138..9a28f22e78 100644 --- a/test_runner/regress/test_neon_superuser.py +++ b/test_runner/regress/test_neon_superuser.py @@ -103,3 +103,90 @@ def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion): query = "DROP SUBSCRIPTION sub CASCADE" log.info(f"Dropping subscription: {query}") cur.execute(query) + + +def test_privileged_role_override(neon_simple_env: NeonEnv, pg_version: PgVersion): + """ + Test that we can override the privileged role for an endpoint and when we do it, + everything is correctly bootstrapped inside Postgres and we don't have neon_superuser + role in the database. 
+ """ + PRIVILEGED_ROLE_NAME = "my_superuser" + + env = neon_simple_env + env.create_branch("test_privileged_role_override") + ep = env.endpoints.create( + "test_privileged_role_override", + privileged_role_name=PRIVILEGED_ROLE_NAME, + update_catalog=True, + ) + + ep.start() + + ep.wait_for_migrations() + + member_roles = [ + "pg_read_all_data", + "pg_write_all_data", + "pg_monitor", + "pg_signal_backend", + ] + + non_member_roles = [ + "pg_execute_server_program", + "pg_read_server_files", + "pg_write_server_files", + ] + + role_attributes = { + "rolsuper": False, + "rolinherit": True, + "rolcreaterole": True, + "rolcreatedb": True, + "rolcanlogin": False, + "rolreplication": True, + "rolconnlimit": -1, + "rolbypassrls": True, + } + + if pg_version >= PgVersion.V15: + non_member_roles.append("pg_checkpoint") + + if pg_version >= PgVersion.V16: + member_roles.append("pg_create_subscription") + non_member_roles.append("pg_use_reserved_connections") + + with ep.cursor() as cur: + cur.execute(f"SELECT rolname FROM pg_roles WHERE rolname = '{PRIVILEGED_ROLE_NAME}'") + assert cur.fetchall()[0][0] == PRIVILEGED_ROLE_NAME + + cur.execute("SELECT rolname FROM pg_roles WHERE rolname = 'neon_superuser'") + assert len(cur.fetchall()) == 0 + + cur.execute("SHOW neon.privileged_role_name") + assert cur.fetchall()[0][0] == PRIVILEGED_ROLE_NAME + + # check PRIVILEGED_ROLE_NAME role is created + cur.execute(f"select * from pg_roles where rolname = '{PRIVILEGED_ROLE_NAME}'") + assert cur.fetchone() is not None + + # check PRIVILEGED_ROLE_NAME role has the correct member roles + for role in member_roles: + cur.execute(f"SELECT pg_has_role('{PRIVILEGED_ROLE_NAME}', '{role}', 'member')") + assert cur.fetchone() == (True,), ( + f"Role {role} should be a member of {PRIVILEGED_ROLE_NAME}" + ) + + for role in non_member_roles: + cur.execute(f"SELECT pg_has_role('{PRIVILEGED_ROLE_NAME}', '{role}', 'member')") + assert cur.fetchone() == (False,), ( + f"Role {role} should not be a member of {PRIVILEGED_ROLE_NAME}" + ) + + # check PRIVILEGED_ROLE_NAME role has the correct role attributes + for attr, val in role_attributes.items(): + cur.execute(f"SELECT {attr} FROM pg_roles WHERE rolname = '{PRIVILEGED_ROLE_NAME}'") + curr_val = cur.fetchone() + assert curr_val == (val,), ( + f"Role attribute {attr} should be {val} instead of {curr_val}" + ) diff --git a/test_runner/regress/test_pageserver_layer_rolling.py b/test_runner/regress/test_pageserver_layer_rolling.py index 91c4ef521c..68f470d962 100644 --- a/test_runner/regress/test_pageserver_layer_rolling.py +++ b/test_runner/regress/test_pageserver_layer_rolling.py @@ -246,9 +246,9 @@ def test_total_size_limit(neon_env_builder: NeonEnvBuilder): system_memory = psutil.virtual_memory().total - # The smallest total size limit we can configure is 1/1024th of the system memory (e.g. 128MB on - # a system with 128GB of RAM). We will then write enough data to violate this limit. - max_dirty_data = 128 * 1024 * 1024 + # The smallest total size limit we can configure is 1/1024th of the system memory (e.g. 256MB on + # a system with 256GB of RAM). We will then write enough data to violate this limit. 
+ max_dirty_data = 256 * 1024 * 1024 ephemeral_bytes_per_memory_kb = (max_dirty_data * 1024) // system_memory assert ephemeral_bytes_per_memory_kb > 0 @@ -272,7 +272,7 @@ def test_total_size_limit(neon_env_builder: NeonEnvBuilder): timeline_count = 10 # This is about 2MiB of data per timeline - entries_per_timeline = 100_000 + entries_per_timeline = 200_000 last_flush_lsns = asyncio.run(workload(env, tenant_conf, timeline_count, entries_per_timeline)) wait_until_pageserver_is_caught_up(env, last_flush_lsns) diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index 728241b465..dd9c5437ad 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -3,6 +3,7 @@ # from __future__ import annotations +import time from concurrent.futures import ThreadPoolExecutor from typing import TYPE_CHECKING, Any, cast @@ -356,6 +357,81 @@ def test_sql_regress( post_checks(env, test_output_dir, DBNAME, endpoint) +def test_max_wal_rate(neon_simple_env: NeonEnv): + """ + Test the databricks.max_wal_mb_per_second GUC and how it affects WAL rate + limiting. + """ + env = neon_simple_env + + DBNAME = "regression" + superuser_name = "databricks_superuser" + + # Connect to postgres and create a database called "regression". + endpoint = env.endpoints.create_start( + "main", + config_lines=[ + # we need this option because default max_cluster_size < 0 will disable throttling completely + "neon.max_cluster_size=10GB", + ], + ) + + endpoint.safe_psql_many( + [ + f"CREATE ROLE {superuser_name}", + f"CREATE DATABASE {DBNAME}", + "CREATE EXTENSION neon", + ] + ) + + endpoint.safe_psql("CREATE TABLE usertable (YCSB_KEY INT, FIELD0 TEXT);", dbname=DBNAME) + + # Write ~1 MB data. + with endpoint.cursor(dbname=DBNAME) as cur: + for _ in range(0, 1000): + cur.execute("INSERT INTO usertable SELECT random(), repeat('a', 1000);") + + # No backpressure + tuples = endpoint.safe_psql("SELECT backpressure_throttling_time();") + assert tuples[0][0] == 0, "Backpressure throttling detected" + + # 0 MB/s max_wal_rate. WAL proposer can still push some WALs but will be super slow. + endpoint.safe_psql_many( + [ + "ALTER SYSTEM SET databricks.max_wal_mb_per_second = 0;", + "SELECT pg_reload_conf();", + ] + ) + + # Write ~10 KB data should hit backpressure. + with endpoint.cursor(dbname=DBNAME) as cur: + cur.execute("SET databricks.max_wal_mb_per_second = 0;") + for _ in range(0, 10): + cur.execute("INSERT INTO usertable SELECT random(), repeat('a', 1000);") + + tuples = endpoint.safe_psql("SELECT backpressure_throttling_time();") + assert tuples[0][0] > 0, "No backpressure throttling detected" + + # 1 MB/s max_wal_rate. + endpoint.safe_psql_many( + [ + "ALTER SYSTEM SET databricks.max_wal_mb_per_second = 1;", + "SELECT pg_reload_conf();", + ] + ) + + # Write 10 MB data. 
+ with endpoint.cursor(dbname=DBNAME) as cur: + start = int(time.time()) + for _ in range(0, 10000): + cur.execute("INSERT INTO usertable SELECT random(), repeat('a', 1000);") + + end = int(time.time()) + assert end - start >= 10, ( + "Throttling should cause the previous inserts to take greater than or equal to 10 seconds" + ) + + @skip_in_debug_build("only run with release build") @pytest.mark.parametrize("reldir_type", ["v1", "v2"]) def test_tx_abort_with_many_relations( diff --git a/test_runner/regress/test_rest_broker.py b/test_runner/regress/test_rest_broker.py new file mode 100644 index 0000000000..60b04655d3 --- /dev/null +++ b/test_runner/regress/test_rest_broker.py @@ -0,0 +1,137 @@ +import json +import signal +import time + +import requests +from fixtures.utils import skip_if_proxy_lacks_rest_broker +from jwcrypto import jwt + + +@skip_if_proxy_lacks_rest_broker() +def test_rest_broker_happy( + local_proxy_fixed_port, rest_broker_proxy, vanilla_pg, neon_authorize_jwk, httpserver +): + """Test REST API endpoint using local_proxy and rest_broker_proxy.""" + + # Use the fixed port local proxy + local_proxy = local_proxy_fixed_port + + # Create the required roles for PostgREST authentication + vanilla_pg.safe_psql("CREATE ROLE authenticator LOGIN") + vanilla_pg.safe_psql("CREATE ROLE authenticated") + vanilla_pg.safe_psql("CREATE ROLE anon") + vanilla_pg.safe_psql("GRANT authenticated TO authenticator") + vanilla_pg.safe_psql("GRANT anon TO authenticator") + + # Create the pgrst schema and configuration function required by the rest broker + vanilla_pg.safe_psql("CREATE SCHEMA IF NOT EXISTS pgrst") + vanilla_pg.safe_psql(""" + CREATE OR REPLACE FUNCTION pgrst.pre_config() + RETURNS VOID AS $$ + SELECT + set_config('pgrst.db_schemas', 'test', true) + , set_config('pgrst.db_aggregates_enabled', 'true', true) + , set_config('pgrst.db_anon_role', 'anon', true) + , set_config('pgrst.jwt_aud', '', true) + , set_config('pgrst.jwt_secret', '', true) + , set_config('pgrst.jwt_role_claim_key', '."role"', true) + + $$ LANGUAGE SQL; + """) + vanilla_pg.safe_psql("GRANT USAGE ON SCHEMA pgrst TO authenticator") + vanilla_pg.safe_psql("GRANT EXECUTE ON ALL FUNCTIONS IN SCHEMA pgrst TO authenticator") + + # Bootstrap the database with test data + vanilla_pg.safe_psql("CREATE SCHEMA IF NOT EXISTS test") + vanilla_pg.safe_psql(""" + CREATE TABLE IF NOT EXISTS test.items ( + id SERIAL PRIMARY KEY, + name TEXT NOT NULL + ) + """) + vanilla_pg.safe_psql("INSERT INTO test.items (name) VALUES ('test_item')") + + # Grant access to the test schema for the authenticated role + vanilla_pg.safe_psql("GRANT USAGE ON SCHEMA test TO authenticated") + vanilla_pg.safe_psql("GRANT SELECT ON ALL TABLES IN SCHEMA test TO authenticated") + + # Set up HTTP server to serve JWKS (like static_auth_broker) + # Generate public key from the JWK + public_key = neon_authorize_jwk.export_public(as_dict=True) + + # Set up the httpserver to serve the JWKS + httpserver.expect_request("/.well-known/jwks.json").respond_with_json({"keys": [public_key]}) + + # Create JWKS configuration for the rest broker proxy + jwks_config = { + "jwks": [ + { + "id": "1", + "role_names": ["authenticator", "authenticated", "anon"], + "jwks_url": httpserver.url_for("/.well-known/jwks.json"), + "provider_name": "foo", + "jwt_audience": None, + } + ] + } + + # Write the JWKS config to the config file that rest_broker_proxy expects + config_file = rest_broker_proxy.config_path + with open(config_file, "w") as f: + json.dump(jwks_config, f) + + # Write the 
same config to the local_proxy config file + local_config_file = local_proxy.config_path + with open(local_config_file, "w") as f: + json.dump(jwks_config, f) + + # Signal both proxies to reload their config + if rest_broker_proxy._popen is not None: + rest_broker_proxy._popen.send_signal(signal.SIGHUP) + if local_proxy._popen is not None: + local_proxy._popen.send_signal(signal.SIGHUP) + # Wait a bit for config to reload + time.sleep(0.5) + + # Generate a proper JWT token using the JWK (similar to test_auth_broker.py) + token = jwt.JWT( + header={"kid": neon_authorize_jwk.key_id, "alg": "RS256"}, + claims={ + "sub": "user", + "role": "authenticated", # role that's in role_names + "exp": 9999999999, # expires far in the future + "iat": 1000000000, # issued at + }, + ) + token.make_signed_token(neon_authorize_jwk) + + # Debug: Print the JWT claims and config for troubleshooting + print(f"JWT claims: {token.claims}") + print(f"JWT header: {token.header}") + print(f"Config file contains: {jwks_config}") + print(f"Public key kid: {public_key.get('kid')}") + + # Test REST API call - following SUBZERO.md pattern + # REST API is served on the WSS port with HTTPS and includes database name + # ep-purple-glitter-adqior4l-pooler.c-2.us-east-1.aws.neon.tech + url = f"https://foo.apirest.c-2.local.neon.build:{rest_broker_proxy.wss_port}/postgres/rest/v1/items" + + response = requests.get( + url, + headers={ + "Authorization": f"Bearer {token.serialize()}", + }, + params={"id": "eq.1", "select": "name"}, + verify=False, # Skip SSL verification for self-signed certs + ) + + print(f"Response status: {response.status_code}") + print(f"Response headers: {response.headers}") + print(f"Response body: {response.text}") + + # For now, let's just check that we get some response + # We can refine the assertions once we see what the actual response looks like + assert response.status_code in [200] # Any response means the proxies are working + + # check the response body + assert response.json() == [{"name": "test_item"}] diff --git a/test_runner/regress/test_safekeeper_migration.py b/test_runner/regress/test_safekeeper_migration.py index 170c1a3650..371bec0c62 100644 --- a/test_runner/regress/test_safekeeper_migration.py +++ b/test_runner/regress/test_safekeeper_migration.py @@ -3,11 +3,22 @@ from __future__ import annotations from typing import TYPE_CHECKING import pytest +import requests +from fixtures.log_helper import log from fixtures.neon_fixtures import StorageControllerApiException if TYPE_CHECKING: from fixtures.neon_fixtures import NeonEnvBuilder +# TODO(diko): pageserver spams with various errors during safekeeper migration. +# Fix the code so it handles the migration better. +ALLOWED_PAGESERVER_ERRORS = [ + ".*Timeline .* was cancelled and cannot be used anymore.*", + ".*Timeline .* has been deleted.*", + ".*Timeline .* was not found in global map.*", + ".*wal receiver task finished with an error.*", +] + def test_safekeeper_migration_simple(neon_env_builder: NeonEnvBuilder): """ @@ -24,16 +35,7 @@ def test_safekeeper_migration_simple(neon_env_builder: NeonEnvBuilder): "timeline_safekeeper_count": 1, } env = neon_env_builder.init_start() - # TODO(diko): pageserver spams with various errors during safekeeper migration. - # Fix the code so it handles the migration better. 
- env.pageserver.allowed_errors.extend( - [ - ".*Timeline .* was cancelled and cannot be used anymore.*", - ".*Timeline .* has been deleted.*", - ".*Timeline .* was not found in global map.*", - ".*wal receiver task finished with an error.*", - ] - ) + env.pageserver.allowed_errors.extend(ALLOWED_PAGESERVER_ERRORS) ep = env.endpoints.create("main", tenant_id=env.initial_tenant) @@ -42,15 +44,23 @@ def test_safekeeper_migration_simple(neon_env_builder: NeonEnvBuilder): assert len(mconf["sk_set"]) == 1 assert mconf["generation"] == 1 + current_sk = mconf["sk_set"][0] + ep.start(safekeeper_generation=1, safekeepers=mconf["sk_set"]) ep.safe_psql("CREATE EXTENSION neon_test_utils;") ep.safe_psql("CREATE TABLE t(a int)") + expected_gen = 1 + for active_sk in range(1, 4): env.storage_controller.migrate_safekeepers( env.initial_tenant, env.initial_timeline, [active_sk] ) + if active_sk != current_sk: + expected_gen += 2 + current_sk = active_sk + other_sks = [sk for sk in range(1, 4) if sk != active_sk] for sk in other_sks: @@ -65,9 +75,6 @@ def test_safekeeper_migration_simple(neon_env_builder: NeonEnvBuilder): assert ep.safe_psql("SELECT * FROM t") == [(i,) for i in range(1, 4)] - # 1 initial generation + 2 migrations on each loop iteration. - expected_gen = 1 + 2 * 3 - mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline) assert mconf["generation"] == expected_gen @@ -113,3 +120,79 @@ def test_new_sk_set_validation(neon_env_builder: NeonEnvBuilder): env.storage_controller.safekeeper_scheduling_policy(decom_sk, "Decomissioned") expect_fail([sk_set[0], decom_sk], "decomissioned") + + +def test_safekeeper_migration_common_set_failpoints(neon_env_builder: NeonEnvBuilder): + """ + Test that safekeeper migration handles failures well. + + Two main conditions are checked: + 1. safekeeper migration handler can be retried on different failures. + 2. writes do not stuck if sk_set and new_sk_set have a quorum in common. 
+ """ + neon_env_builder.num_safekeepers = 4 + neon_env_builder.storage_controller_config = { + "timelines_onto_safekeepers": True, + "timeline_safekeeper_count": 3, + } + env = neon_env_builder.init_start() + env.pageserver.allowed_errors.extend(ALLOWED_PAGESERVER_ERRORS) + + mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline) + assert len(mconf["sk_set"]) == 3 + assert mconf["generation"] == 1 + + ep = env.endpoints.create("main", tenant_id=env.initial_tenant) + ep.start(safekeeper_generation=1, safekeepers=mconf["sk_set"]) + ep.safe_psql("CREATE EXTENSION neon_test_utils;") + ep.safe_psql("CREATE TABLE t(a int)") + + excluded_sk = mconf["sk_set"][-1] + added_sk = [sk.id for sk in env.safekeepers if sk.id not in mconf["sk_set"]][0] + new_sk_set = mconf["sk_set"][:-1] + [added_sk] + log.info(f"migrating sk set from {mconf['sk_set']} to {new_sk_set}") + + failpoints = [ + "sk-migration-after-step-3", + "sk-migration-after-step-4", + "sk-migration-after-step-5", + "sk-migration-after-step-7", + "sk-migration-after-step-8", + "sk-migration-step-9-after-set-membership", + "sk-migration-step-9-mid-exclude", + "sk-migration-step-9-after-exclude", + "sk-migration-after-step-9", + ] + + for i, fp in enumerate(failpoints): + env.storage_controller.configure_failpoints((fp, "return(1)")) + + with pytest.raises(StorageControllerApiException, match=f"failpoint {fp}"): + env.storage_controller.migrate_safekeepers( + env.initial_tenant, env.initial_timeline, new_sk_set + ) + ep.safe_psql(f"INSERT INTO t VALUES ({i})") + + env.storage_controller.configure_failpoints((fp, "off")) + + # No failpoints, migration should succeed. + env.storage_controller.migrate_safekeepers(env.initial_tenant, env.initial_timeline, new_sk_set) + + mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline) + assert mconf["new_sk_set"] is None + assert mconf["sk_set"] == new_sk_set + assert mconf["generation"] == 3 + + ep.clear_buffers() + assert ep.safe_psql("SELECT * FROM t") == [(i,) for i in range(len(failpoints))] + assert ep.safe_psql("SHOW neon.safekeepers")[0][0].startswith("g#3:") + + # Check that we didn't forget to remove the timeline on the excluded safekeeper. + with pytest.raises(requests.exceptions.HTTPError) as exc: + env.safekeepers[excluded_sk - 1].http_client().timeline_status( + env.initial_tenant, env.initial_timeline + ) + assert exc.value.response.status_code == 404 + assert ( + f"timeline {env.initial_tenant}/{env.initial_timeline} deleted" in exc.value.response.text + ) diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 5549105188..2252c098c7 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -1810,6 +1810,8 @@ def test_sharding_backpressure(neon_env_builder: NeonEnvBuilder): "config_lines": [ # Tip: set to 100MB to make the test fail "max_replication_write_lag=1MB", + # Hadron: Need to set max_cluster_size to some value to enable any backpressure at all. + "neon.max_cluster_size=1GB", ], # We need `neon` extension for calling backpressure functions, # this flag instructs `compute_ctl` to pre-install it. 
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 10845ef02e..9986c1f24a 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -12,7 +12,7 @@ from typing import TYPE_CHECKING import fixtures.utils import pytest from fixtures.auth_tokens import TokenScope -from fixtures.common_types import TenantId, TenantShardId, TimelineId +from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( DEFAULT_AZ_ID, @@ -47,6 +47,7 @@ from fixtures.utils import ( wait_until, ) from fixtures.workload import Workload +from requests.adapters import HTTPAdapter from urllib3 import Retry from werkzeug.wrappers.response import Response @@ -72,6 +73,12 @@ def get_node_shard_counts(env: NeonEnv, tenant_ids): return counts +class DeletionAPIKind(Enum): + OLD = "old" + FORCE = "force" + GRACEFUL = "graceful" + + @pytest.mark.parametrize(**fixtures.utils.allpairs_versions()) def test_storage_controller_smoke( neon_env_builder: NeonEnvBuilder, compute_reconfigure_listener: ComputeReconfigure, combination @@ -990,7 +997,7 @@ def test_storage_controller_compute_hook_retry( @run_only_on_default_postgres("postgres behavior is not relevant") -def test_storage_controller_compute_hook_keep_failing( +def test_storage_controller_compute_hook_stuck_reconciles( httpserver: HTTPServer, neon_env_builder: NeonEnvBuilder, httpserver_listen_address: ListenAddress, @@ -1040,7 +1047,7 @@ def test_storage_controller_compute_hook_keep_failing( env.storage_controller.allowed_errors.append(NOTIFY_BLOCKED_LOG) env.storage_controller.allowed_errors.extend(NOTIFY_FAILURE_LOGS) env.storage_controller.allowed_errors.append(".*Keeping extra secondaries.*") - env.storage_controller.allowed_errors.append(".*Shard reconciliation is keep-failing.*") + env.storage_controller.allowed_errors.append(".*Shard reconciliation is stuck.*") env.storage_controller.node_configure(banned_tenant_ps.id, {"availability": "Offline"}) # Migrate all allowed tenant shards to the first alive pageserver @@ -1055,7 +1062,7 @@ def test_storage_controller_compute_hook_keep_failing( # Make some reconcile_all calls to trigger optimizations # RECONCILE_COUNT must be greater than storcon's MAX_CONSECUTIVE_RECONCILIATION_ERRORS - RECONCILE_COUNT = 12 + RECONCILE_COUNT = 20 for i in range(RECONCILE_COUNT): try: n = env.storage_controller.reconcile_all() @@ -1068,6 +1075,8 @@ def test_storage_controller_compute_hook_keep_failing( assert banned_descr["shards"][0]["is_pending_compute_notification"] is True time.sleep(2) + env.storage_controller.assert_log_contains(".*Shard reconciliation is stuck.*") + # Check that the allowed tenant shards are optimized due to affinity rules locations = alive_pageservers[0].http_client().tenant_list_locations()["tenant_shards"] not_optimized_shard_count = 0 @@ -2572,9 +2581,11 @@ def test_background_operation_cancellation(neon_env_builder: NeonEnvBuilder): @pytest.mark.parametrize("while_offline", [True, False]) +@pytest.mark.parametrize("deletion_api", [DeletionAPIKind.OLD, DeletionAPIKind.FORCE]) def test_storage_controller_node_deletion( neon_env_builder: NeonEnvBuilder, while_offline: bool, + deletion_api: DeletionAPIKind, ): """ Test that deleting a node works & properly reschedules everything that was on the node. 
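The node-deletion tests below are parametrized over `DeletionAPIKind`. A minimal sketch of how each variant maps onto the storage controller fixture helpers added earlier in this patch; the GRACEFUL arm reusing the new endpoint without `force` matches how the cancellation test builds its call:

```python
# Sketch: dispatching a DeletionAPIKind value (enum defined in this test module) onto the
# fixture helpers. node_delete() issues PUT /control/v1/node/{id}/delete and appends
# ?force=true when asked; node_delete_old() is the legacy deletion endpoint.
def delete_node(env, node_id: int, kind: DeletionAPIKind) -> None:
    if kind == DeletionAPIKind.OLD:
        env.storage_controller.node_delete_old(node_id)
    elif kind == DeletionAPIKind.FORCE:
        env.storage_controller.node_delete(node_id, force=True)
    elif kind == DeletionAPIKind.GRACEFUL:
        env.storage_controller.node_delete(node_id, force=False)
    else:
        raise AssertionError(f"Invalid deletion API: {kind}")
```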
@@ -2598,6 +2609,8 @@ def test_storage_controller_node_deletion( assert env.storage_controller.reconcile_all() == 0 victim = env.pageservers[-1] + if deletion_api == DeletionAPIKind.FORCE and not while_offline: + victim.allowed_errors.append(".*request was dropped before completing.*") # The procedure a human would follow is: # 1. Mark pageserver scheduling=pause @@ -2621,7 +2634,12 @@ def test_storage_controller_node_deletion( wait_until(assert_shards_migrated) log.info(f"Deleting pageserver {victim.id}") - env.storage_controller.node_delete_old(victim.id) + if deletion_api == DeletionAPIKind.FORCE: + env.storage_controller.node_delete(victim.id, force=True) + elif deletion_api == DeletionAPIKind.OLD: + env.storage_controller.node_delete_old(victim.id) + else: + raise AssertionError(f"Invalid deletion API: {deletion_api}") if not while_offline: @@ -2634,7 +2652,15 @@ def test_storage_controller_node_deletion( wait_until(assert_victim_evacuated) # The node should be gone from the list API - assert victim.id not in [n["id"] for n in env.storage_controller.node_list()] + def assert_node_is_gone(): + assert victim.id not in [n["id"] for n in env.storage_controller.node_list()] + + if deletion_api == DeletionAPIKind.FORCE: + wait_until(assert_node_is_gone) + elif deletion_api == DeletionAPIKind.OLD: + assert_node_is_gone() + else: + raise AssertionError(f"Invalid deletion API: {deletion_api}") # No tenants should refer to the node in their intent for tenant_id in tenant_ids: @@ -2656,7 +2682,11 @@ def test_storage_controller_node_deletion( env.storage_controller.consistency_check() -def test_storage_controller_node_delete_cancellation(neon_env_builder: NeonEnvBuilder): +@pytest.mark.parametrize("deletion_api", [DeletionAPIKind.FORCE, DeletionAPIKind.GRACEFUL]) +def test_storage_controller_node_delete_cancellation( + neon_env_builder: NeonEnvBuilder, + deletion_api: DeletionAPIKind, +): neon_env_builder.num_pageservers = 3 neon_env_builder.num_azs = 3 env = neon_env_builder.init_configs() @@ -2680,12 +2710,16 @@ def test_storage_controller_node_delete_cancellation(neon_env_builder: NeonEnvBu assert len(nodes) == 3 env.storage_controller.configure_failpoints(("sleepy-delete-loop", "return(10000)")) + env.storage_controller.configure_failpoints(("delete-node-after-reconciles-spawned", "pause")) ps_id_to_delete = env.pageservers[0].id env.storage_controller.warm_up_all_secondaries() + + assert deletion_api in [DeletionAPIKind.FORCE, DeletionAPIKind.GRACEFUL] + force = deletion_api == DeletionAPIKind.FORCE env.storage_controller.retryable_node_operation( - lambda ps_id: env.storage_controller.node_delete(ps_id), + lambda ps_id: env.storage_controller.node_delete(ps_id, force), ps_id_to_delete, max_attempts=3, backoff=2, @@ -2701,6 +2735,8 @@ def test_storage_controller_node_delete_cancellation(neon_env_builder: NeonEnvBu env.storage_controller.cancel_node_delete(ps_id_to_delete) + env.storage_controller.configure_failpoints(("delete-node-after-reconciles-spawned", "off")) + env.storage_controller.poll_node_status( ps_id_to_delete, PageserverAvailability.ACTIVE, @@ -3252,7 +3288,10 @@ def test_storage_controller_ps_restarted_during_drain(neon_env_builder: NeonEnvB wait_until(reconfigure_node_again) -def test_ps_unavailable_after_delete(neon_env_builder: NeonEnvBuilder): +@pytest.mark.parametrize("deletion_api", [DeletionAPIKind.OLD, DeletionAPIKind.FORCE]) +def test_ps_unavailable_after_delete( + neon_env_builder: NeonEnvBuilder, deletion_api: DeletionAPIKind +): neon_env_builder.num_pageservers = 3 
env = neon_env_builder.init_start() @@ -3265,10 +3304,16 @@ def test_ps_unavailable_after_delete(neon_env_builder: NeonEnvBuilder): assert_nodes_count(3) ps = env.pageservers[0] - env.storage_controller.node_delete_old(ps.id) - # After deletion, the node count must be reduced - assert_nodes_count(2) + if deletion_api == DeletionAPIKind.FORCE: + ps.allowed_errors.append(".*request was dropped before completing.*") + env.storage_controller.node_delete(ps.id, force=True) + wait_until(lambda: assert_nodes_count(2)) + elif deletion_api == DeletionAPIKind.OLD: + env.storage_controller.node_delete_old(ps.id) + assert_nodes_count(2) + else: + raise AssertionError(f"Invalid deletion API: {deletion_api}") # Running pageserver CLI init in a separate thread with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor: @@ -4814,3 +4859,103 @@ def test_storage_controller_migrate_with_pageserver_restart( "shards": [{"node_id": int(secondary.id), "shard_number": 0}], "preferred_az": DEFAULT_AZ_ID, } + + +@run_only_on_default_postgres("PG version is not important for this test") +def test_storage_controller_forward_404(neon_env_builder: NeonEnvBuilder): + """ + Ensures that the storage controller correctly forwards 404s and converts some of them + into 503s before forwarding to the client. + """ + neon_env_builder.num_pageservers = 2 + neon_env_builder.num_azs = 2 + + env = neon_env_builder.init_start() + env.storage_controller.allowed_errors.append(".*Reconcile error.*") + env.storage_controller.allowed_errors.append(".*Timed out.*") + + env.storage_controller.tenant_policy_update(env.initial_tenant, {"placement": {"Attached": 1}}) + env.storage_controller.reconcile_until_idle() + + # 404s on tenants and timelines are forwarded as-is when reconciler is not running. + + # Access a non-existing timeline -> 404 + with pytest.raises(PageserverApiException) as e: + env.storage_controller.pageserver_api().timeline_detail( + env.initial_tenant, TimelineId.generate() + ) + assert e.value.status_code == 404 + with pytest.raises(PageserverApiException) as e: + env.storage_controller.pageserver_api().timeline_lsn_lease( + env.initial_tenant, TimelineId.generate(), Lsn(0) + ) + assert e.value.status_code == 404 + + # Access a non-existing tenant when reconciler is not running -> 404 + with pytest.raises(PageserverApiException) as e: + env.storage_controller.pageserver_api().timeline_detail( + TenantId.generate(), env.initial_timeline + ) + assert e.value.status_code == 404 + with pytest.raises(PageserverApiException) as e: + env.storage_controller.pageserver_api().timeline_lsn_lease( + TenantId.generate(), env.initial_timeline, Lsn(0) + ) + assert e.value.status_code == 404 + + # Normal requests should succeed + detail = env.storage_controller.pageserver_api().timeline_detail( + env.initial_tenant, env.initial_timeline + ) + last_record_lsn = Lsn(detail["last_record_lsn"]) + env.storage_controller.pageserver_api().timeline_lsn_lease( + env.initial_tenant, env.initial_timeline, last_record_lsn + ) + + # Get into a situation where the intent state is not the same as the observed state. + describe = env.storage_controller.tenant_describe(env.initial_tenant)["shards"][0] + current_primary = describe["node_attached"] + current_secondary = describe["node_secondary"][0] + assert current_primary != current_secondary + + # Pause the reconciler so that the generation number won't be updated. 
+ env.storage_controller.configure_failpoints( + ("reconciler-live-migrate-post-generation-inc", "pause") + ) + + # Do the migration in another thread; the request will be dropped as we don't wait. + shard_zero = TenantShardId(env.initial_tenant, 0, 0) + concurrent.futures.ThreadPoolExecutor(max_workers=1).submit( + env.storage_controller.tenant_shard_migrate, + shard_zero, + current_secondary, + StorageControllerMigrationConfig(override_scheduler=True), + ) + # Not the best way to do this, we should wait until the migration gets started. + time.sleep(1) + placement = env.storage_controller.get_tenants_placement()[str(shard_zero)] + assert placement["observed"] != placement["intent"] + assert placement["observed"]["attached"] == current_primary + assert placement["intent"]["attached"] == current_secondary + + # Now we issue requests that would cause 404 again + retry_strategy = Retry(total=0) + adapter = HTTPAdapter(max_retries=retry_strategy) + + no_retry_api = env.storage_controller.pageserver_api() + no_retry_api.mount("http://", adapter) + no_retry_api.mount("https://", adapter) + + # As intent state != observed state, tenant not found error should return 503, + # so that the client can retry once we've successfully migrated. + with pytest.raises(PageserverApiException) as e: + no_retry_api.timeline_detail(env.initial_tenant, TimelineId.generate()) + assert e.value.status_code == 503, f"unexpected status code and error: {e.value}" + with pytest.raises(PageserverApiException) as e: + no_retry_api.timeline_lsn_lease(env.initial_tenant, TimelineId.generate(), Lsn(0)) + assert e.value.status_code == 503, f"unexpected status code and error: {e.value}" + + # Unblock reconcile operations + env.storage_controller.configure_failpoints( + ("reconciler-live-migrate-post-generation-inc", "off") + ) diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index c54dd8b38d..7f32f34d36 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -76,7 +76,6 @@ def test_tenants_normal_work(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() - """Tests tenants with and without wal acceptors""" tenant_1, _ = env.create_tenant() tenant_2, _ = env.create_tenant() diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 22e6d2e1c3..c691087259 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -2788,7 +2788,8 @@ def test_timeline_disk_usage_limit(neon_env_builder: NeonEnvBuilder): # Wait for the error message to appear in the compute log def error_logged(): - return endpoint.log_contains("WAL storage utilization exceeds configured limit") is not None + if endpoint.log_contains("WAL storage utilization exceeds configured limit") is None: + raise Exception("Expected error message not found in compute log yet") wait_until(error_logged) log.info("Found expected error message in compute log, resuming.") @@ -2822,3 +2823,87 @@ def test_timeline_disk_usage_limit(neon_env_builder: NeonEnvBuilder): cur.execute("select count(*) from t") # 2000 rows from first insert + 1000 from last insert assert cur.fetchone() == (3000,) + + +def test_global_disk_usage_limit(neon_env_builder: NeonEnvBuilder): + """ + Similar to `test_timeline_disk_usage_limit`, but test that the global disk usage circuit breaker + also works as expected. The test scenario: + 1. Create a timeline and endpoint. + 2. 
Mock high disk usage via a failpoint.
+    3. Write data to the timeline so that disk usage exceeds the limit.
+    4. Verify that the writes hang and the expected error message appears in the compute log.
+    5. Mock low disk usage via a failpoint.
+    6. Verify that the hanging writes unblock and we can continue to write as normal.
+    """
+    neon_env_builder.num_safekeepers = 1
+    remote_storage_kind = s3_storage()
+    neon_env_builder.enable_safekeeper_remote_storage(remote_storage_kind)
+
+    env = neon_env_builder.init_start()
+
+    env.create_branch("test_global_disk_usage_limit")
+    endpoint = env.endpoints.create_start("test_global_disk_usage_limit")
+
+    with closing(endpoint.connect()) as conn:
+        with conn.cursor() as cur:
+            cur.execute("create table t2(key int, value text)")
+
+    for sk in env.safekeepers:
+        sk.stop().start(
+            extra_opts=["--global-disk-check-interval=1s", "--max-global-disk-usage-ratio=0.8"]
+        )
+
+    # Set the failpoint to have the disk usage check return u64::MAX, which comfortably exceeds
+    # any practical limit in the test environment.
+    for sk in env.safekeepers:
+        sk.http_client().configure_failpoints(
+            [("sk-global-disk-usage", "return(18446744073709551615)")]
+        )
+
+    # Wait until the global disk usage limit watcher trips the circuit breaker.
+    def error_logged_in_sk():
+        for sk in env.safekeepers:
+            if sk.log_contains("Global disk usage exceeded limit") is None:
+                raise Exception("Expected error message not found in safekeeper log yet")
+
+    wait_until(error_logged_in_sk)
+
+    def run_hanging_insert_global():
+        with closing(endpoint.connect()) as bg_conn:
+            with bg_conn.cursor() as bg_cur:
+                # This should generate more than 1KiB of WAL
+                bg_cur.execute("insert into t2 select generate_series(1,2000), 'payload'")
+
+    bg_thread_global = threading.Thread(target=run_hanging_insert_global)
+    bg_thread_global.start()
+
+    def error_logged_in_compute():
+        if endpoint.log_contains("Global disk usage exceeded limit") is None:
+            raise Exception("Expected error message not found in compute log yet")
+
+    wait_until(error_logged_in_compute)
+    log.info("Found the expected error message in compute log, resuming.")
+
+    time.sleep(2)
+    assert bg_thread_global.is_alive(), "Global hanging insert unblocked prematurely!"
+
+    # Make the disk usage check always return 0 through the failpoint to simulate the disk pressure easing.
+    # The SKs should resume accepting WAL writes without restarting.
+    for sk in env.safekeepers:
+        sk.http_client().configure_failpoints([("sk-global-disk-usage", "return(0)")])
+
+    bg_thread_global.join(timeout=120)
+    assert not bg_thread_global.is_alive(), "Hanging global insert did not complete after disk pressure eased"
+    log.info("Global hanging insert unblocked.")
+
+    # Verify that we can continue to write as normal and we don't have obvious data corruption
+    # following the recovery.
+ with closing(endpoint.connect()) as conn: + with conn.cursor() as cur: + cur.execute("insert into t2 select generate_series(2001,3000), 'payload'") + + with closing(endpoint.connect()) as conn: + with conn.cursor() as cur: + cur.execute("select count(*) from t2") + assert cur.fetchone() == (3000,) diff --git a/test_runner/sql_regress/expected/neon-spgist.out b/test_runner/sql_regress/expected/neon-spgist.out new file mode 100644 index 0000000000..5982084109 --- /dev/null +++ b/test_runner/sql_regress/expected/neon-spgist.out @@ -0,0 +1,9 @@ +-- Test unlogged build of SPGIST index (no "Page evicted with zero LSN" error) +create table spgist_point_tbl(id int4, p point); +create index spgist_point_idx on spgist_point_tbl using spgist(p) with (fillfactor = 25); +insert into spgist_point_tbl (id, p) select g, point(g*10, g*10) from generate_series(1, 10000) g; +insert into spgist_point_tbl (id, p) select g, point(g*10, g*10) from generate_series(1, 10000) g; +insert into spgist_point_tbl (id, p) select g+100000, point(g*10+1, g*10+1) from generate_series(1, 10000) g; +vacuum spgist_point_tbl; +insert into spgist_point_tbl (id, p) select g+100000, point(g*10+1, g*10+1) from generate_series(1, 10000) g; +checkpoint; diff --git a/test_runner/sql_regress/parallel_schedule b/test_runner/sql_regress/parallel_schedule index 0ce9f0e28f..d724c750ff 100644 --- a/test_runner/sql_regress/parallel_schedule +++ b/test_runner/sql_regress/parallel_schedule @@ -9,5 +9,6 @@ test: neon-rel-truncate test: neon-clog test: neon-test-utils test: neon-vacuum-full -test: neon-event-triggers test: neon-subxacts +test: neon-spgist +test: neon-event-triggers diff --git a/test_runner/sql_regress/sql/neon-spgist.sql b/test_runner/sql_regress/sql/neon-spgist.sql new file mode 100644 index 0000000000..b26b692ff7 --- /dev/null +++ b/test_runner/sql_regress/sql/neon-spgist.sql @@ -0,0 +1,10 @@ +-- Test unlogged build of SPGIST index (no "Page evicted with zero LSN" error) +create table spgist_point_tbl(id int4, p point); +create index spgist_point_idx on spgist_point_tbl using spgist(p) with (fillfactor = 25); +insert into spgist_point_tbl (id, p) select g, point(g*10, g*10) from generate_series(1, 10000) g; +insert into spgist_point_tbl (id, p) select g, point(g*10, g*10) from generate_series(1, 10000) g; +insert into spgist_point_tbl (id, p) select g+100000, point(g*10+1, g*10+1) from generate_series(1, 10000) g; + +vacuum spgist_point_tbl; +insert into spgist_point_tbl (id, p) select g+100000, point(g*10+1, g*10+1) from generate_series(1, 10000) g; +checkpoint; diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 8ce1f52303..c9f9fdd011 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 8ce1f52303aec29e098309347b57c01a1962e221 +Subproject commit c9f9fdd0113b52c0bd535afdb09d3a543aeee25f diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index afd46987f3..aaaeff2550 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit afd46987f3da50c9146a8aa59380052df0862c06 +Subproject commit aaaeff2550d5deba58847f112af9b98fa3a58b00 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index e08c8d5f15..9b9cb4b3e3 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit e08c8d5f1576ca0487d14d154510499c5f12adfb +Subproject commit 9b9cb4b3e33347aea8f61e606bb6569979516de5 diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index 353c725b0c..fa1788475e 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ 
-Subproject commit 353c725b0c76cc82b15af21d8360d03391dc6814 +Subproject commit fa1788475e3146cc9c7c6a1b74f48fd296898fcd diff --git a/vendor/revisions.json b/vendor/revisions.json index 992aa405b1..7212c9f7c7 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ "17.5", - "353c725b0c76cc82b15af21d8360d03391dc6814" + "fa1788475e3146cc9c7c6a1b74f48fd296898fcd" ], "v16": [ "16.9", - "e08c8d5f1576ca0487d14d154510499c5f12adfb" + "9b9cb4b3e33347aea8f61e606bb6569979516de5" ], "v15": [ "15.13", - "afd46987f3da50c9146a8aa59380052df0862c06" + "aaaeff2550d5deba58847f112af9b98fa3a58b00" ], "v14": [ "14.18", - "8ce1f52303aec29e098309347b57c01a1962e221" + "c9f9fdd0113b52c0bd535afdb09d3a543aeee25f" ] } diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index c61598cdf6..f5984d3ac3 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -74,7 +74,7 @@ once_cell = { version = "1" } p256 = { version = "0.13", features = ["jwk"] } parquet = { version = "53", default-features = false, features = ["zstd"] } prost = { version = "0.13", features = ["no-recursion-limit", "prost-derive"] } -rand = { version = "0.8", features = ["small_rng"] } +rand = { version = "0.9" } regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } regex-syntax = { version = "0.8" } @@ -93,6 +93,7 @@ spki = { version = "0.7", default-features = false, features = ["pem", "std"] } stable_deref_trait = { version = "1" } subtle = { version = "2" } sync_wrapper = { version = "0.1", default-features = false, features = ["futures"] } +thiserror = { version = "2" } tikv-jemalloc-ctl = { version = "0.6", features = ["stats", "use_std"] } tikv-jemalloc-sys = { version = "0.6", features = ["profiling", "stats", "unprefixed_malloc_on_supported_platforms"] } time = { version = "0.3", features = ["macros", "serde-well-known"] } @@ -101,13 +102,13 @@ tokio-rustls = { version = "0.26", default-features = false, features = ["loggin tokio-stream = { version = "0.1", features = ["net", "sync"] } tokio-util = { version = "0.7", features = ["codec", "compat", "io-util", "rt"] } toml_edit = { version = "0.22", features = ["serde"] } +tonic = { version = "0.13", default-features = false, features = ["codegen", "gzip", "prost", "router", "server", "tls-native-roots", "tls-ring", "zstd"] } tower = { version = "0.5", default-features = false, features = ["balance", "buffer", "limit", "log"] } tracing = { version = "0.1", features = ["log"] } tracing-core = { version = "0.1" } tracing-log = { version = "0.2" } tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] } url = { version = "2", features = ["serde"] } -uuid = { version = "1", features = ["serde", "v4", "v7"] } zeroize = { version = "1", features = ["derive", "serde"] } zstd = { version = "0.13" } zstd-safe = { version = "7", default-features = false, features = ["arrays", "legacy", "std", "zdict_builder"] }
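The four postgres submodules and vendor/revisions.json above are bumped in lockstep, and the two can silently drift if only one side is updated. A minimal consistency check is sketched below; the script and its location are hypothetical (not part of this change), and it assumes the revisions.json layout shown above, where each entry maps a major version to [postgres_version, submodule_commit]:

    import json
    import subprocess


    def check_vendor_revisions(repo_root: str = ".") -> None:
        # Compare every revisions.json entry against the submodule commit recorded in HEAD.
        with open(f"{repo_root}/vendor/revisions.json") as f:
            revisions = json.load(f)
        for major, (_pg_version, commit) in revisions.items():
            actual = subprocess.check_output(
                ["git", "rev-parse", f"HEAD:vendor/postgres-{major}"],
                cwd=repo_root,
                text=True,
            ).strip()
            assert actual == commit, (
                f"vendor/postgres-{major} is at {actual}, but revisions.json says {commit}"
            )


    if __name__ == "__main__":
        check_vendor_revisions()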