diff --git a/.github/PULL_REQUEST_TEMPLATE/pull_request_template.md b/.github/PULL_REQUEST_TEMPLATE/pull_request_template.md new file mode 100644 index 0000000000..3f32b80ca8 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE/pull_request_template.md @@ -0,0 +1,10 @@ +## Describe your changes + +## Issue ticket number and link + +## Checklist before requesting a review +- [ ] I have performed a self-review of my code. +- [ ] If it is a core feature, I have added thorough tests. +- [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? +- [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. + diff --git a/.github/ansible/staging.us-east-2.hosts.yaml b/.github/ansible/staging.us-east-2.hosts.yaml index 11c7992444..1d1b8dbfa4 100644 --- a/.github/ansible/staging.us-east-2.hosts.yaml +++ b/.github/ansible/staging.us-east-2.hosts.yaml @@ -27,6 +27,8 @@ storage: ansible_host: i-0c3e70929edb5d691 pageserver-1.us-east-2.aws.neon.build: ansible_host: i-0565a8b4008aa3f40 + pageserver-2.us-east-2.aws.neon.build: + ansible_host: i-01e31cdf7e970586a safekeepers: hosts: diff --git a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml index ae9c1f2e40..08304503c5 100644 --- a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml +++ b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml @@ -9,6 +9,7 @@ settings: authEndpoint: "http://console-staging.local/management/api/v2" domain: "*.eu-west-1.aws.neon.build" sentryEnvironment: "development" + wssPort: 8443 # -- Additional labels for neon-proxy pods podLabels: @@ -23,6 +24,7 @@ exposedService: service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing external-dns.alpha.kubernetes.io/hostname: eu-west-1.aws.neon.build + httpsPort: 443 #metrics: # enabled: true 
diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml index a2f932e4fb..be0fc329c9 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml @@ -9,6 +9,7 @@ settings: authEndpoint: "http://console-staging.local/management/api/v2" domain: "*.cloud.stage.neon.tech" sentryEnvironment: "development" + wssPort: 8443 # -- Additional labels for neon-proxy pods podLabels: @@ -23,6 +24,7 @@ exposedService: service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing external-dns.alpha.kubernetes.io/hostname: neon-proxy-scram-legacy.beta.us-east-2.aws.neon.build + httpsPort: 443 #metrics: # enabled: true diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml index 1138536e94..b7f712585b 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml @@ -9,6 +9,7 @@ settings: authEndpoint: "http://console-staging.local/management/api/v2" domain: "*.us-east-2.aws.neon.build" sentryEnvironment: "development" + wssPort: 8443 # -- Additional labels for neon-proxy pods podLabels: @@ -23,6 +24,7 @@ exposedService: service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing external-dns.alpha.kubernetes.io/hostname: us-east-2.aws.neon.build + httpsPort: 443 #metrics: # enabled: true diff --git a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml index 4e4aff1f9e..e9e89aff7c 100644 --- a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml +++ 
b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml @@ -9,6 +9,7 @@ settings: authEndpoint: "http://console-release.local/management/api/v2" domain: "*.ap-southeast-1.aws.neon.tech" sentryEnvironment: "production" + wssPort: 8443 # -- Additional labels for neon-proxy pods podLabels: @@ -23,6 +24,7 @@ exposedService: service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing external-dns.alpha.kubernetes.io/hostname: ap-southeast-1.aws.neon.tech + httpsPort: 443 #metrics: # enabled: true diff --git a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml index 94290a87e1..5366ba4ae5 100644 --- a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml @@ -9,6 +9,7 @@ settings: authEndpoint: "http://console-release.local/management/api/v2" domain: "*.eu-central-1.aws.neon.tech" sentryEnvironment: "production" + wssPort: 8443 # -- Additional labels for neon-proxy pods podLabels: @@ -23,6 +24,7 @@ exposedService: service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing external-dns.alpha.kubernetes.io/hostname: eu-central-1.aws.neon.tech + httpsPort: 443 #metrics: # enabled: true diff --git a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml index 1a4023708b..e71e457f13 100644 --- a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml @@ -9,6 +9,7 @@ settings: authEndpoint: "http://console-release.local/management/api/v2" domain: "*.us-east-2.aws.neon.tech" sentryEnvironment: "production" + wssPort: 8443 # -- Additional labels for neon-proxy pods podLabels: @@ -23,6 +24,7 @@ 
exposedService: service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing external-dns.alpha.kubernetes.io/hostname: us-east-2.aws.neon.tech + httpsPort: 443 #metrics: # enabled: true diff --git a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml index 2942d6a2aa..9afe94edd1 100644 --- a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml @@ -9,6 +9,7 @@ settings: authEndpoint: "http://console-release.local/management/api/v2" domain: "*.us-west-2.aws.neon.tech" sentryEnvironment: "production" + wssPort: 8443 # -- Additional labels for neon-proxy pods podLabels: @@ -23,6 +24,7 @@ exposedService: service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing external-dns.alpha.kubernetes.io/hostname: us-west-2.aws.neon.tech + httpsPort: 443 #metrics: # enabled: true diff --git a/.github/helm-values/production.proxy-scram.yaml b/.github/helm-values/production.proxy-scram.yaml index c7143cd61a..8143f7e575 100644 --- a/.github/helm-values/production.proxy-scram.yaml +++ b/.github/helm-values/production.proxy-scram.yaml @@ -3,6 +3,7 @@ settings: authEndpoint: "http://console-release.local/management/api/v2" domain: "*.cloud.neon.tech" sentryEnvironment: "production" + wssPort: 8443 podLabels: zenith_service: proxy-scram @@ -16,6 +17,7 @@ exposedService: service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing external-dns.alpha.kubernetes.io/hostname: '*.cloud.neon.tech' + httpsPort: 443 metrics: enabled: true diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 17c698482c..1bbba8e3fd 100644 --- a/.github/workflows/build_and_test.yml +++ 
b/.github/workflows/build_and_test.yml @@ -111,6 +111,7 @@ jobs: # Some of our rust modules use FFI and need those to be checked - name: Get postgres headers run: make postgres-headers -j$(nproc) + - name: Run cargo clippy run: ./run_clippy.sh @@ -126,6 +127,11 @@ jobs: cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack + # https://github.com/EmbarkStudios/cargo-deny + - name: Check rust licenses/bans/advisories/sources + if: ${{ !cancelled() }} + run: cargo deny check + build-neon: runs-on: [ self-hosted, dev, x64 ] container: @@ -177,13 +183,12 @@ jobs: # corresponding Cargo.toml files for their descriptions. - name: Set env variables run: | + CARGO_FEATURES="--features testing" if [[ $BUILD_TYPE == "debug" ]]; then cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run" - CARGO_FEATURES="--features testing" CARGO_FLAGS="--locked $CARGO_FEATURES" elif [[ $BUILD_TYPE == "release" ]]; then cov_prefix="" - CARGO_FEATURES="--features testing,profiling" CARGO_FLAGS="--locked --release $CARGO_FEATURES" fi echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV @@ -789,6 +794,8 @@ jobs: strategy: matrix: include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}} + environment: + name: prod-old steps: - name: Checkout uses: actions/checkout@v3 @@ -834,7 +841,9 @@ jobs: shell: bash strategy: matrix: - target_region: [ us-east-2 ] + target_region: [ eu-west-1, us-east-2 ] + environment: + name: dev-${{ matrix.target_region }} steps: - name: Checkout uses: actions/checkout@v3 @@ -906,6 +915,8 @@ jobs: strategy: matrix: target_region: [ us-east-2, us-west-2, eu-central-1, ap-southeast-1 ] + environment: + name: prod-${{ matrix.target_region }} steps: - name: Checkout uses: actions/checkout@v3 @@ -945,6 +956,8 @@ jobs: strategy: matrix: include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}} + environment: + name: 
prod-old env: KUBECONFIG: .kubeconfig steps: @@ -970,8 +983,8 @@ jobs: - name: Re-deploy proxy run: | DOCKER_TAG=${{needs.tag.outputs.build-tag}} - helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s - helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s + helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace neon-proxy --install --atomic -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s + helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install --atomic -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s deploy-storage-broker: name: deploy storage broker on old staging and old prod @@ -988,6 +1001,8 @@ jobs: strategy: matrix: include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}} + environment: + name: prod-old env: KUBECONFIG: .kubeconfig steps: @@ -1036,6 +1051,8 @@ jobs: target_cluster: dev-eu-west-1-zeta deploy_link_proxy: false deploy_legacy_scram_proxy: false + environment: + name: dev-${{ matrix.target_region }} steps: - name: Checkout uses: actions/checkout@v3 @@ -1051,19 +1068,19 @@ jobs: - name: Re-deploy scram proxy run: | DOCKER_TAG=${{needs.tag.outputs.build-tag}} - helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster 
}}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s + helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s - name: Re-deploy link proxy if: matrix.deploy_link_proxy run: | DOCKER_TAG=${{needs.tag.outputs.build-tag}} - helm upgrade neon-proxy-link neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-link.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s + helm upgrade neon-proxy-link neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-link.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s - name: Re-deploy legacy scram proxy if: matrix.deploy_legacy_scram_proxy run: | DOCKER_TAG=${{needs.tag.outputs.build-tag}} - helm upgrade neon-proxy-scram-legacy neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram-legacy.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s + helm upgrade neon-proxy-scram-legacy neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram-legacy.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s deploy-storage-broker-dev-new: runs-on: [ self-hosted, dev, x64 ] @@ -1083,6 +1100,8 @@ jobs: target_cluster: 
dev-us-east-2-beta - target_region: eu-west-1 target_cluster: dev-eu-west-1-zeta + environment: + name: dev-${{ matrix.target_region }} steps: - name: Checkout uses: actions/checkout@v3 @@ -1121,6 +1140,8 @@ jobs: target_cluster: prod-eu-central-1-gamma - target_region: ap-southeast-1 target_cluster: prod-ap-southeast-1-epsilon + environment: + name: prod-${{ matrix.target_region }} steps: - name: Checkout uses: actions/checkout@v3 @@ -1136,7 +1157,7 @@ jobs: - name: Re-deploy proxy run: | DOCKER_TAG=${{needs.tag.outputs.build-tag}} - helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s + helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s deploy-storage-broker-prod-new: runs-on: prod @@ -1160,6 +1181,8 @@ jobs: target_cluster: prod-eu-central-1-gamma - target_region: ap-southeast-1 target_cluster: prod-ap-southeast-1-epsilon + environment: + name: prod-${{ matrix.target_region }} steps: - name: Checkout uses: actions/checkout@v3 diff --git a/Cargo.lock b/Cargo.lock index f1348eeace..45f3486e70 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -75,12 +75,6 @@ dependencies = [ "static_assertions", ] -[[package]] -name = "arrayvec" -version = "0.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8da52d66c7071e2e3fa2a1e5c6d088fec47b593032b254f5e980de8ea54454d6" - [[package]] name = "asn1-rs" version = "0.5.1" @@ -143,9 +137,9 @@ dependencies = [ [[package]] name = "async-trait" -version = "0.1.60" +version = "0.1.61" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "677d1d8ab452a3936018a687b20e6f7cf5363d713b732b8884001317b0e48aa3" +checksum = "705339e0e4a9690e2908d2b3d049d85682cf19fbd5782494498fbf7003a6a282" dependencies = [ "proc-macro2", "quote", @@ -507,9 +501,9 @@ dependencies = [ [[package]] name = "axum" -version = "0.6.1" +version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08b108ad2665fa3f6e6a517c3d80ec3e77d224c47d605167aefaa5d7ef97fa48" +checksum = "1304eab461cf02bd70b083ed8273388f9724c549b316ba3d1e213ce0e9e7fb7e" dependencies = [ "async-trait", "axum-core", @@ -536,9 +530,9 @@ dependencies = [ [[package]] name = "axum-core" -version = "0.3.0" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79b8558f5a0581152dc94dcd289132a1d377494bdeafcd41869b3258e3e2ad92" +checksum = "f487e40dc9daee24d8a1779df88522f159a54a980f99cfbe43db0be0bd3444a8" dependencies = [ "async-trait", "bytes", @@ -578,6 +572,12 @@ version = "0.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ea22880d78093b0cbe17c89f64a7d457941e65759157ec6cb31a31d652b05e5" +[[package]] +name = "base64" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4a4ddaa51a5bc52a6948f74c06d20aaaddb71924eab79b8c97a8c556e942d6a" + [[package]] name = "bincode" version = "1.3.3" @@ -642,12 +642,6 @@ version = "3.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "572f695136211188308f16ad2ca5c851a712c464060ae6974944458eb83880ba" -[[package]] -name = "bytemuck" -version = "1.12.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aaa3a8d9a1ca92e282c96a32d6511b695d7d994d1d102ba85d279f9b2756947f" - [[package]] name = "byteorder" version = "1.4.3" @@ -902,7 +896,7 @@ dependencies = [ "clap 4.0.32", "comfy-table", "git-version", - "nix 0.25.1", + "nix", "once_cell", "pageserver_api", 
"postgres", @@ -937,15 +931,6 @@ version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5827cebf4670468b8772dd191856768aedcb1b0278a04f989f7766351917b9dc" -[[package]] -name = "cpp_demangle" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eeaa953eaad386a53111e47172c2fedba671e5684c8dd601a5f474f4f118710f" -dependencies = [ - "cfg-if", -] - [[package]] name = "cpufeatures" version = "0.2.5" @@ -1069,7 +1054,7 @@ dependencies = [ "crossterm_winapi", "libc", "mio", - "parking_lot 0.12.1", + "parking_lot", "signal-hook", "signal-hook-mio", "winapi", @@ -1096,9 +1081,9 @@ dependencies = [ [[package]] name = "cxx" -version = "1.0.85" +version = "1.0.86" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5add3fc1717409d029b20c5b6903fc0c0b02fa6741d820054f4a2efa5e5816fd" +checksum = "51d1075c37807dcf850c379432f0df05ba52cc30f279c5cfc43cc221ce7f8579" dependencies = [ "cc", "cxxbridge-flags", @@ -1108,9 +1093,9 @@ dependencies = [ [[package]] name = "cxx-build" -version = "1.0.85" +version = "1.0.86" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4c87959ba14bc6fbc61df77c3fcfe180fc32b93538c4f1031dd802ccb5f2ff0" +checksum = "5044281f61b27bc598f2f6647d480aed48d2bf52d6eb0b627d84c0361b17aa70" dependencies = [ "cc", "codespan-reporting", @@ -1123,15 +1108,15 @@ dependencies = [ [[package]] name = "cxxbridge-flags" -version = "1.0.85" +version = "1.0.86" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69a3e162fde4e594ed2b07d0f83c6c67b745e7f28ce58c6df5e6b6bef99dfb59" +checksum = "61b50bc93ba22c27b0d31128d2d130a0a6b3d267ae27ef7e4fae2167dfe8781c" [[package]] name = "cxxbridge-macro" -version = "1.0.85" +version = "1.0.86" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3e7e2adeb6a0d4a282e581096b06e1791532b7d576dcde5ccd9382acf55db8e6" +checksum = 
"39e61fda7e62115119469c7b3591fd913ecca96fb766cfd3f2e2502ab7bc87a5" dependencies = [ "proc-macro2", "quote", @@ -1179,15 +1164,6 @@ version = "2.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "23d8666cb01533c39dde32bcbab8e227b4ed6679b2c925eba05feabea39508fb" -[[package]] -name = "debugid" -version = "0.7.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6ee87af31d84ef885378aebca32be3d682b0e0dc119d5b4860a2c5bb5046730" -dependencies = [ - "uuid 0.8.2", -] - [[package]] name = "debugid" version = "0.8.0" @@ -1195,7 +1171,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef552e6f588e446098f6ba40d89ac146c8c7b64aade83c051ee00bb5d2bc18d" dependencies = [ "serde", - "uuid 1.2.2", + "uuid", ] [[package]] @@ -1318,19 +1294,7 @@ dependencies = [ "cfg-if", "libc", "redox_syscall", - "windows-sys 0.42.0", -] - -[[package]] -name = "findshlibs" -version = "0.10.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40b9e59cd0f7e0806cca4be089683ecb6434e602038df21fe6bf6711b2f07f64" -dependencies = [ - "cc", - "lazy_static", - "libc", - "winapi", + "windows-sys", ] [[package]] @@ -1345,21 +1309,6 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" -[[package]] -name = "foreign-types" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" -dependencies = [ - "foreign-types-shared", -] - -[[package]] -name = "foreign-types-shared" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" - [[package]] name = "form_urlencoded" version = "1.1.0" @@ -1534,9 +1483,9 @@ dependencies = [ [[package]] name = "glob" -version = "0.3.0" +version = 
"0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574" +checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" [[package]] name = "h2" @@ -1761,16 +1710,16 @@ dependencies = [ ] [[package]] -name = "hyper-tls" -version = "0.5.0" +name = "hyper-tungstenite" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905" +checksum = "d62004bcd4f6f85d9e2aa4206f1466ee67031f5ededcb6c6e62d48f9306ad879" dependencies = [ - "bytes", "hyper", - "native-tls", + "pin-project", "tokio", - "tokio-native-tls", + "tokio-tungstenite", + "tungstenite", ] [[package]] @@ -1824,24 +1773,6 @@ dependencies = [ "serde", ] -[[package]] -name = "inferno" -version = "0.10.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de3886428c6400486522cf44b8626e7b94ad794c14390290f2a274dcf728a58f" -dependencies = [ - "ahash", - "atty", - "indexmap", - "itoa", - "lazy_static", - "log", - "num-format", - "quick-xml", - "rgb", - "str_stack", -] - [[package]] name = "inotify" version = "0.9.6" @@ -1878,14 +1809,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "46112a93252b123d31a119a8d1a1ac19deac4fac6e0e8b0df58f0d4e5870e63c" dependencies = [ "libc", - "windows-sys 0.42.0", + "windows-sys", ] [[package]] name = "ipnet" -version = "2.7.0" +version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11b0d96e660696543b251e58030cf9787df56da39dab19ad60eae7353040917e" +checksum = "30e22bd8629359895450b59ea7a776c850561b96a3b1d31321c1949d9e6c9146" [[package]] name = "is-terminal" @@ -1896,7 +1827,7 @@ dependencies = [ "hermit-abi 0.2.6", "io-lifetimes", "rustix", - "windows-sys 0.42.0", + "windows-sys", ] [[package]] @@ -2068,15 +1999,6 @@ version = "2.5.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" -[[package]] -name = "memmap2" -version = "0.5.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b182332558b18d807c4ce1ca8ca983b34c3ee32765e47b3f0f69b90355cc1dc" -dependencies = [ - "libc", -] - [[package]] name = "memoffset" version = "0.6.5" @@ -2135,7 +2057,7 @@ dependencies = [ "libc", "log", "wasi", - "windows-sys 0.42.0", + "windows-sys", ] [[package]] @@ -2144,37 +2066,6 @@ version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" -[[package]] -name = "native-tls" -version = "0.2.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07226173c32f2926027b63cce4bcd8076c3552846cbe7925f3aaffeac0a3b92e" -dependencies = [ - "lazy_static", - "libc", - "log", - "openssl", - "openssl-probe", - "openssl-sys", - "schannel", - "security-framework", - "security-framework-sys", - "tempfile", -] - -[[package]] -name = "nix" -version = "0.23.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f3790c00a0150112de0f4cd161e3d7fc4b2d8a5542ffc35f099a2562aecb35c" -dependencies = [ - "bitflags", - "cc", - "cfg-if", - "libc", - "memoffset 0.6.5", -] - [[package]] name = "nix" version = "0.25.1" @@ -2238,16 +2129,6 @@ dependencies = [ "num-traits", ] -[[package]] -name = "num-format" -version = "0.4.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a652d9771a63711fd3c3deb670acfbe5c30a4072e664d7a3bf5a9e1056ac72c3" -dependencies = [ - "arrayvec", - "itoa", -] - [[package]] name = "num-integer" version = "0.1.45" @@ -2280,9 +2161,9 @@ dependencies = [ [[package]] name = "object" -version = "0.30.0" +version = "0.30.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"239da7f290cfa979f43f85a8efeee9a8a76d0827c356d37f9d3d7254d6b537fb" +checksum = "8d864c91689fdc196779b98dba0aceac6118594c2df6ee5d943eb6a8df4d107a" dependencies = [ "memchr", ] @@ -2308,51 +2189,12 @@ version = "11.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" -[[package]] -name = "openssl" -version = "0.10.45" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b102428fd03bc5edf97f62620f7298614c45cedf287c271e7ed450bbaf83f2e1" -dependencies = [ - "bitflags", - "cfg-if", - "foreign-types", - "libc", - "once_cell", - "openssl-macros", - "openssl-sys", -] - -[[package]] -name = "openssl-macros" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b501e44f11665960c7e7fcf062c7d96a14ade4aa98116c004b2e37b5be7d736c" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "openssl-probe" version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" -[[package]] -name = "openssl-sys" -version = "0.9.80" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23bbbf7854cd45b83958ebe919f0e8e516793727652e27fda10a8384cfc790b7" -dependencies = [ - "autocfg", - "cc", - "libc", - "pkg-config", - "vcpkg", -] - [[package]] name = "os_info" version = "3.5.1" @@ -2403,7 +2245,7 @@ dependencies = [ "hyper", "itertools", "metrics", - "nix 0.25.1", + "nix", "num-traits", "once_cell", "pageserver_api", @@ -2413,7 +2255,6 @@ dependencies = [ "postgres-types", "postgres_connection", "postgres_ffi", - "pprof", "pq_proto", "rand", "regex", @@ -2428,12 +2269,12 @@ dependencies = [ "signal-hook", "storage_broker", "svg_fmt", - "tar", "tempfile", "tenant_size_model", "thiserror", "tokio", "tokio-postgres", + "tokio-tar", "tokio-util", "toml_edit", "tracing", @@ -2458,17 +2299,6 @@ 
dependencies = [ "workspace_hack", ] -[[package]] -name = "parking_lot" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d17b78036a60663b797adeaee46f5c9dfebb86948d1255007a1d6be0271ff99" -dependencies = [ - "instant", - "lock_api", - "parking_lot_core 0.8.6", -] - [[package]] name = "parking_lot" version = "0.12.1" @@ -2476,21 +2306,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" dependencies = [ "lock_api", - "parking_lot_core 0.9.5", -] - -[[package]] -name = "parking_lot_core" -version = "0.8.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60a2cfe6f0ad2bfc16aefa463b497d5c7a5ecd44a23efa72aa342d90177356dc" -dependencies = [ - "cfg-if", - "instant", - "libc", - "redox_syscall", - "smallvec", - "winapi", + "parking_lot_core", ] [[package]] @@ -2503,7 +2319,7 @@ dependencies = [ "libc", "redox_syscall", "smallvec", - "windows-sys 0.42.0", + "windows-sys", ] [[package]] @@ -2587,12 +2403,6 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" -[[package]] -name = "pkg-config" -version = "0.3.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160" - [[package]] name = "plotters" version = "0.3.4" @@ -2699,25 +2509,6 @@ dependencies = [ "workspace_hack", ] -[[package]] -name = "pprof" -version = "0.6.1" -source = "git+https://github.com/neondatabase/pprof-rs.git?branch=wallclock-profiling#4e011a87d22fb4d21d15cc38bce81ff1c75e4bc9" -dependencies = [ - "backtrace", - "cfg-if", - "findshlibs", - "inferno", - "lazy_static", - "libc", - "log", - "nix 0.23.2", - "parking_lot 0.11.2", - "symbolic-demangle", - "tempfile", - "thiserror", -] - [[package]] name = "ppv-lite86" version = "0.2.17" @@ 
-2734,6 +2525,7 @@ dependencies = [ "postgres-protocol", "rand", "serde", + "thiserror", "tokio", "tracing", "workspace_hack", @@ -2741,9 +2533,9 @@ dependencies = [ [[package]] name = "prettyplease" -version = "0.1.22" +version = "0.1.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c8992a85d8e93a28bdf76137db888d3874e3b230dee5ed8bebac4c9f7617773" +checksum = "e97e3215779627f01ee256d2fad52f3d95e8e1c11e9fc6fd08f7cd455d5d5c78" dependencies = [ "proc-macro2", "syn", @@ -2812,7 +2604,7 @@ dependencies = [ "lazy_static", "libc", "memchr", - "parking_lot 0.12.1", + "parking_lot", "procfs", "thiserror", ] @@ -2889,15 +2681,17 @@ dependencies = [ "hex", "hmac", "hyper", + "hyper-tungstenite", "itertools", "md5", "metrics", "once_cell", - "parking_lot 0.12.1", + "parking_lot", "pin-project-lite", "pq_proto", "rand", "rcgen", + "regex", "reqwest", "routerify", "rstest", @@ -2909,6 +2703,7 @@ dependencies = [ "sha2", "socket2", "thiserror", + "tls-listener", "tokio", "tokio-postgres", "tokio-postgres-rustls", @@ -2917,20 +2712,12 @@ dependencies = [ "tracing-subscriber", "url", "utils", - "uuid 1.2.2", + "uuid", + "webpki-roots", "workspace_hack", "x509-parser", ] -[[package]] -name = "quick-xml" -version = "0.22.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8533f14c8382aaad0d592c812ac3b826162128b65662331e1127b45c3d18536b" -dependencies = [ - "memchr", -] - [[package]] name = "quote" version = "1.0.23" @@ -3025,9 +2812,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.7.0" +version = "1.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e076559ef8e241f2ae3479e36f97bd5741c0330689e217ad51ce2c76808b868a" +checksum = "48aaa5748ba571fb95cd2c85c09f629215d3a6ece942baa100950af03a34f733" dependencies = [ "aho-corasick", "memchr", @@ -3098,12 +2885,10 @@ dependencies = [ "http-body", "hyper", "hyper-rustls", - "hyper-tls", "ipnet", "js-sys", "log", "mime", - "native-tls", 
"once_cell", "percent-encoding", "pin-project-lite", @@ -3113,7 +2898,6 @@ dependencies = [ "serde_json", "serde_urlencoded", "tokio", - "tokio-native-tls", "tokio-rustls", "tower-service", "url", @@ -3124,15 +2908,6 @@ dependencies = [ "winreg", ] -[[package]] -name = "rgb" -version = "0.8.34" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3603b7d71ca82644f79b5a06d1220e9a58ede60bd32255f698cb1af8838b8db3" -dependencies = [ - "bytemuck", -] - [[package]] name = "ring" version = "0.16.20" @@ -3247,7 +3022,7 @@ dependencies = [ "io-lifetimes", "libc", "linux-raw-sys", - "windows-sys 0.42.0", + "windows-sys", ] [[package]] @@ -3276,11 +3051,11 @@ dependencies = [ [[package]] name = "rustls-pemfile" -version = "1.0.1" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0864aeff53f8c05aa08d86e5ef839d3dfcf07aeba2db32f12db0ef716e87bd55" +checksum = "d194b56d58803a43635bdc398cd17e383d6f71f9182b9a192c127ca42494a59b" dependencies = [ - "base64 0.13.1", + "base64 0.21.0", ] [[package]] @@ -3322,9 +3097,9 @@ dependencies = [ "humantime", "hyper", "metrics", - "nix 0.25.1", + "nix", "once_cell", - "parking_lot 0.12.1", + "parking_lot", "postgres", "postgres-protocol", "postgres_ffi", @@ -3370,12 +3145,11 @@ dependencies = [ [[package]] name = "schannel" -version = "0.1.20" +version = "0.1.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88d6731146462ea25d9244b2ed5fd1d716d25c52e4d54aa4fb0f3c4e9854dbe2" +checksum = "713cfb06c7059f3588fb8044c0fad1d09e3c01d225e25b9220dbfdcf16dbb1b3" dependencies = [ - "lazy_static", - "windows-sys 0.36.1", + "windows-sys", ] [[package]] @@ -3436,14 +3210,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "17ad137b9df78294b98cab1a650bef237cc6c950e82e5ce164655e674d07c5cc" dependencies = [ "httpdate", - "native-tls", "reqwest", + "rustls", "sentry-backtrace", "sentry-contexts", "sentry-core", "sentry-panic", 
"tokio", "ureq", + "webpki-roots", ] [[package]] @@ -3501,7 +3276,7 @@ version = "0.29.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ccc95faa4078768a6bf8df45e2b894bbf372b3dbbfb364e9429c1c58ab7545c6" dependencies = [ - "debugid 0.8.0", + "debugid", "getrandom", "hex", "serde", @@ -3509,7 +3284,7 @@ dependencies = [ "thiserror", "time", "url", - "uuid 1.2.2", + "uuid", ] [[package]] @@ -3557,9 +3332,9 @@ dependencies = [ [[package]] name = "serde_with" -version = "2.1.0" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "25bf4a5a814902cd1014dbccfa4d4560fb8432c779471e96e035602519f82eef" +checksum = "30d904179146de381af4c93d3af6ca4984b3152db687dacb9c3c35e86f39809c" dependencies = [ "base64 0.13.1", "chrono", @@ -3573,9 +3348,9 @@ dependencies = [ [[package]] name = "serde_with_macros" -version = "2.1.0" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3452b4c0f6c1e357f73fdb87cd1efabaa12acf328c7a528e252893baeb3f4aa" +checksum = "a1966009f3c05f095697c537312f5415d1e3ed31ce0a56942bac4c771c5c335e" dependencies = [ "darling", "proc-macro2", @@ -3583,6 +3358,17 @@ dependencies = [ "syn", ] +[[package]] +name = "sha-1" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f5058ada175748e33390e40e872bd0fe59a19f265d0158daa551c5a88a76009c" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + [[package]] name = "sha1" version = "0.10.5" @@ -3737,7 +3523,7 @@ dependencies = [ "hyper", "metrics", "once_cell", - "parking_lot 0.12.1", + "parking_lot", "prost", "tokio", "tokio-stream", @@ -3748,12 +3534,6 @@ dependencies = [ "workspace_hack", ] -[[package]] -name = "str_stack" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9091b6114800a5f2141aee1d1b9d6ca3592ac062dc5decb3764ec5895a47b4eb" - [[package]] name = "stringprep" version = "0.1.2" @@ -3801,29 
+3581,6 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8fb1df15f412ee2e9dfc1c504260fa695c1c3f10fe9f4a6ee2d2184d7d6450e2" -[[package]] -name = "symbolic-common" -version = "8.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f551f902d5642e58039aee6a9021a61037926af96e071816361644983966f540" -dependencies = [ - "debugid 0.7.3", - "memmap2", - "stable_deref_trait", - "uuid 0.8.2", -] - -[[package]] -name = "symbolic-demangle" -version = "8.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4564ca7b4e6eb14105aa8bbbce26e080f6b5d9c4373e67167ab31f7b86443750" -dependencies = [ - "cpp_demangle", - "rustc-demangle", - "symbolic-common", -] - [[package]] name = "syn" version = "1.0.107" @@ -3982,10 +3739,24 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" [[package]] -name = "tokio" -version = "1.21.1" +name = "tls-listener" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0020c875007ad96677dcc890298f4b942882c5d4eb7cc8f439fc3bf813dc9c95" +checksum = "c9d4ff21187d434ac7709bfc7441ca88f63681247e5ad99f0f08c8c91ddc103d" +dependencies = [ + "futures-util", + "hyper", + "pin-project-lite", + "thiserror", + "tokio", + "tokio-rustls", +] + +[[package]] +name = "tokio" +version = "1.24.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d9f76183f91ecfb55e1d7d5602bd1d979e38a3a522fe900241cf195624d67ae" dependencies = [ "autocfg", "bytes", @@ -3993,12 +3764,11 @@ dependencies = [ "memchr", "mio", "num_cpus", - "once_cell", "pin-project-lite", "signal-hook-registry", "socket2", "tokio-macros", - "winapi", + "windows-sys", ] [[package]] @@ -4022,16 +3792,6 @@ dependencies = [ "syn", ] -[[package]] -name = "tokio-native-tls" -version = "0.3.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7d995660bd2b7f8c1568414c1126076c13fbb725c40112dc0120b78eb9b717b" -dependencies = [ - "native-tls", - "tokio", -] - [[package]] name = "tokio-postgres" version = "0.7.7" @@ -4044,7 +3804,7 @@ dependencies = [ "futures-channel", "futures-util", "log", - "parking_lot 0.12.1", + "parking_lot", "percent-encoding", "phf", "pin-project-lite", @@ -4091,6 +3851,32 @@ dependencies = [ "tokio", ] +[[package]] +name = "tokio-tar" +version = "0.3.0" +source = "git+https://github.com/neondatabase/tokio-tar.git?rev=404df61437de0feef49ba2ccdbdd94eb8ad6e142#404df61437de0feef49ba2ccdbdd94eb8ad6e142" +dependencies = [ + "filetime", + "futures-core", + "libc", + "redox_syscall", + "tokio", + "tokio-stream", + "xattr", +] + +[[package]] +name = "tokio-tungstenite" +version = "0.17.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f714dd15bead90401d77e04243611caec13726c2408afd5b31901dfcdcb3b181" +dependencies = [ + "futures-util", + "log", + "tokio", + "tungstenite", +] + [[package]] name = "tokio-util" version = "0.7.4" @@ -4313,9 +4099,28 @@ dependencies = [ [[package]] name = "try-lock" -version = "0.2.3" +version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59547bce71d9c38b83d9c0e92b6066c4253371f15005def0c30d9657f50c7642" +checksum = "3528ecfd12c466c6f163363caf2d02a71161dd5e1cc6ae7b34207ea2d42d81ed" + +[[package]] +name = "tungstenite" +version = "0.17.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e27992fd6a8c29ee7eef28fc78349aa244134e10ad447ce3b9f0ac0ed0fa4ce0" +dependencies = [ + "base64 0.13.1", + "byteorder", + "bytes", + "http", + "httparse", + "log", + "rand", + "sha-1", + "thiserror", + "url", + "utf-8", +] [[package]] name = "typenum" @@ -4379,9 +4184,11 @@ checksum = "733b5ad78377302af52c0dbcb2623d78fe50e4b3bf215948ff29e9ee031d8566" dependencies = [ "base64 0.13.1", "log", - "native-tls", 
"once_cell", + "rustls", "url", + "webpki", + "webpki-roots", ] [[package]] @@ -4402,6 +4209,12 @@ version = "2.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e8db7427f936968176eaa7cdf81b7f98b980b18495ec28f1b5791ac3bfe3eea9" +[[package]] +name = "utf-8" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" + [[package]] name = "utils" version = "0.1.0" @@ -4418,7 +4231,7 @@ dependencies = [ "hyper", "jsonwebtoken", "metrics", - "nix 0.25.1", + "nix", "once_cell", "pq_proto", "rand", @@ -4442,12 +4255,6 @@ dependencies = [ "workspace_hack", ] -[[package]] -name = "uuid" -version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc5cf98d8186244414c848017f0e2676b3fcb46807f6668a97dfe67359a3c4b7" - [[package]] name = "uuid" version = "1.2.2" @@ -4464,12 +4271,6 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" -[[package]] -name = "vcpkg" -version = "0.2.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" - [[package]] name = "version_check" version = "0.9.4" @@ -4655,19 +4456,6 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" -[[package]] -name = "windows-sys" -version = "0.36.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea04155a16a59f9eab786fe12a4a450e75cdb175f9e0d80da1e17db09f55b8d2" -dependencies = [ - "windows_aarch64_msvc 0.36.1", - "windows_i686_gnu 0.36.1", - "windows_i686_msvc 0.36.1", - "windows_x86_64_gnu 0.36.1", - "windows_x86_64_msvc 0.36.1", -] - [[package]] name = "windows-sys" version = "0.42.0" @@ -4675,12 +4463,12 
@@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7" dependencies = [ "windows_aarch64_gnullvm", - "windows_aarch64_msvc 0.42.0", - "windows_i686_gnu 0.42.0", - "windows_i686_msvc 0.42.0", - "windows_x86_64_gnu 0.42.0", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_msvc", + "windows_x86_64_gnu", "windows_x86_64_gnullvm", - "windows_x86_64_msvc 0.42.0", + "windows_x86_64_msvc", ] [[package]] @@ -4689,48 +4477,24 @@ version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "41d2aa71f6f0cbe00ae5167d90ef3cfe66527d6f613ca78ac8024c3ccab9a19e" -[[package]] -name = "windows_aarch64_msvc" -version = "0.36.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9bb8c3fd39ade2d67e9874ac4f3db21f0d710bee00fe7cab16949ec184eeaa47" - [[package]] name = "windows_aarch64_msvc" version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dd0f252f5a35cac83d6311b2e795981f5ee6e67eb1f9a7f64eb4500fbc4dcdb4" -[[package]] -name = "windows_i686_gnu" -version = "0.36.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "180e6ccf01daf4c426b846dfc66db1fc518f074baa793aa7d9b9aaeffad6a3b6" - [[package]] name = "windows_i686_gnu" version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fbeae19f6716841636c28d695375df17562ca208b2b7d0dc47635a50ae6c5de7" -[[package]] -name = "windows_i686_msvc" -version = "0.36.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2e7917148b2812d1eeafaeb22a97e4813dfa60a3f8f78ebe204bcc88f12f024" - [[package]] name = "windows_i686_msvc" version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "84c12f65daa39dd2babe6e442988fc329d6243fdce47d7d2d155b8d874862246" -[[package]] -name = "windows_x86_64_gnu" -version = "0.36.1" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "4dcd171b8776c41b97521e5da127a2d86ad280114807d0b2ab1e462bc764d9e1" - [[package]] name = "windows_x86_64_gnu" version = "0.42.0" @@ -4743,12 +4507,6 @@ version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09d525d2ba30eeb3297665bd434a54297e4170c7f1a44cad4ef58095b4cd2028" -[[package]] -name = "windows_x86_64_msvc" -version = "0.36.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c811ca4a8c853ef420abd8592ba53ddbbac90410fab6903b3e79972a631f7680" - [[package]] name = "windows_x86_64_msvc" version = "0.42.0" @@ -4768,7 +4526,6 @@ dependencies = [ name = "workspace_hack" version = "0.1.0" dependencies = [ - "ahash", "anyhow", "bytes", "chrono", @@ -4792,12 +4549,10 @@ dependencies = [ "rand", "regex", "regex-syntax", - "reqwest", "scopeguard", "serde", "serde_json", "socket2", - "stable_deref_trait", "syn", "tokio", "tokio-util", diff --git a/README.md b/README.md index 30bde949a9..7b629e71a5 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,8 @@ libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler * On Fedora, these packages are needed: ```bash dnf install flex bison readline-devel zlib-devel openssl-devel \ - libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler + libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \ + protobuf-devel ``` 2. [Install Rust](https://www.rust-lang.org/tools/install) @@ -117,11 +118,8 @@ Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (r # Later that would be responsibility of a package install script > ./target/debug/neon_local init Starting pageserver at '127.0.0.1:64000' in '.neon'. 
-pageserver started, pid: 2545906 -Successfully initialized timeline de200bd42b49cc1814412c7e592dd6e9 -Stopped pageserver 1 process with pid 2545906 -# start pageserver and safekeeper +# start pageserver, safekeeper, and broker for their intercommunication > ./target/debug/neon_local start Starting neon broker at 127.0.0.1:50051 storage_broker started, pid: 2918372 @@ -130,6 +128,12 @@ pageserver started, pid: 2918386 Starting safekeeper at '127.0.0.1:5454' in '.neon/safekeepers/sk1'. safekeeper 1 started, pid: 2918437 +# create initial tenant and use it as a default for every future neon_local invocation +> ./target/debug/neon_local tenant create --set-default +tenant 9ef87a5bf0d92544f6fafeeb3239695c successfully created on the pageserver +Created an initial timeline 'de200bd42b49cc1814412c7e592dd6e9' at Lsn 0/16B5A50 for tenant: 9ef87a5bf0d92544f6fafeeb3239695c +Setting tenant 9ef87a5bf0d92544f6fafeeb3239695c as a default one + # start postgres compute node > ./target/debug/neon_local pg start main Starting new postgres (v14) main on timeline de200bd42b49cc1814412c7e592dd6e9 ... 
diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index c40d870649..4c65649610 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -2,6 +2,7 @@ name = "compute_tools" version = "0.1.0" edition = "2021" +license = "Apache-2.0" [dependencies] anyhow = "1.0" diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs index 1588f5d62e..c871422e78 100644 --- a/compute_tools/src/monitor.rs +++ b/compute_tools/src/monitor.rs @@ -52,10 +52,16 @@ fn watch_compute_activity(compute: &ComputeNode) { let mut idle_backs: Vec> = vec![]; for b in backs.into_iter() { - let state: String = b.get("state"); - let change: String = b.get("state_change"); + let state: String = match b.try_get("state") { + Ok(state) => state, + Err(_) => continue, + }; if state == "idle" { + let change: String = match b.try_get("state_change") { + Ok(state_change) => state_change, + Err(_) => continue, + }; let change = DateTime::parse_from_rfc3339(&change); match change { Ok(t) => idle_backs.push(t.with_timezone(&Utc)), diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 58c94d74ae..81e01fe555 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -1,5 +1,6 @@ use std::path::Path; use std::str::FromStr; +use std::time::Instant; use anyhow::Result; use log::{info, log_enabled, warn, Level}; @@ -197,22 +198,18 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> { /// Reassign all dependent objects and delete requested roles. pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<()> { - let spec = &node.spec; - - // First, reassign all dependent objects to db owners. - if let Some(ops) = &spec.delta_operations { + if let Some(ops) = &node.spec.delta_operations { + // First, reassign all dependent objects to db owners. 
info!("reassigning dependent objects of to-be-deleted roles"); for op in ops { if op.action == "delete_role" { reassign_owned_objects(node, &op.name)?; } } - } - // Second, proceed with role deletions. - let mut xact = client.transaction()?; - if let Some(ops) = &spec.delta_operations { + // Second, proceed with role deletions. info!("processing role deletions"); + let mut xact = client.transaction()?; for op in ops { // We do not check either role exists or not, // Postgres will take care of it for us @@ -223,6 +220,7 @@ pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result< xact.execute(query.as_str(), &[])?; } } + xact.commit()?; } Ok(()) @@ -317,6 +315,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { // XXX: with a limited number of databases it is fine, but consider making it a HashMap let pg_db = existing_dbs.iter().find(|r| r.name == *name); + let start_time = Instant::now(); if let Some(r) = pg_db { // XXX: db owner name is returned as quoted string from Postgres, // when quoting is needed. 
@@ -335,6 +334,8 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { info_print!(" -> update"); client.execute(query.as_str(), &[])?; + let elapsed = start_time.elapsed().as_millis(); + info_print!(" ({} ms)", elapsed); } } else { let mut query: String = format!("CREATE DATABASE {} ", name.pg_quote()); @@ -342,6 +343,9 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { query.push_str(&db.to_pg_options()); client.execute(query.as_str(), &[])?; + + let elapsed = start_time.elapsed().as_millis(); + info_print!(" ({} ms)", elapsed); } info_print!("\n"); diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 180508a01a..1c6cd6d882 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -2,6 +2,7 @@ name = "control_plane" version = "0.1.0" edition = "2021" +license = "Apache-2.0" [dependencies] anyhow = "1.0" diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs index 8909e27c94..1f3f8f45ea 100644 --- a/control_plane/src/background_process.rs +++ b/control_plane/src/background_process.rs @@ -136,22 +136,6 @@ where anyhow::bail!("{process_name} did not start in {RETRY_UNTIL_SECS} seconds"); } -/// Send SIGTERM to child process -pub fn send_stop_child_process(child: &std::process::Child) -> anyhow::Result<()> { - let pid = child.id(); - match kill( - nix::unistd::Pid::from_raw(pid.try_into().unwrap()), - Signal::SIGTERM, - ) { - Ok(()) => Ok(()), - Err(Errno::ESRCH) => { - println!("child process with pid {pid} does not exist"); - Ok(()) - } - Err(e) => anyhow::bail!("Failed to send signal to child process with pid {pid}: {e}"), - } -} - /// Stops the process, using the pid file given. Returns Ok also if the process is already not running. 
pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> anyhow::Result<()> { let pid = match pid_file::read(pid_file) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 71de741640..4b2aa3c957 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -263,7 +263,7 @@ fn get_tenant_id(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::R } else if let Some(default_id) = env.default_tenant_id { Ok(default_id) } else { - bail!("No tenant id. Use --tenant-id, or set 'default_tenant_id' in the config file"); + anyhow::bail!("No tenant id. Use --tenant-id, or set a default tenant"); } } @@ -284,8 +284,6 @@ fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result anyhow::Result { - let initial_timeline_id_arg = parse_timeline_id(init_match)?; - // Create config file let toml_file: String = if let Some(config_path) = init_match.get_one::("config") { // load and parse the file @@ -309,30 +307,16 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result { LocalEnv::parse_config(&toml_file).context("Failed to create neon configuration")?; env.init(pg_version) .context("Failed to initialize neon repository")?; - let initial_tenant_id = env - .default_tenant_id - .expect("default_tenant_id should be generated by the `env.init()` call above"); // Initialize pageserver, create initial tenant and timeline. 
let pageserver = PageServerNode::from_env(&env); - let initial_timeline_id = pageserver - .initialize( - Some(initial_tenant_id), - initial_timeline_id_arg, - &pageserver_config_overrides(init_match), - pg_version, - ) + pageserver + .initialize(&pageserver_config_overrides(init_match)) .unwrap_or_else(|e| { eprintln!("pageserver init failed: {e:?}"); exit(1); }); - env.register_branch_mapping( - DEFAULT_BRANCH_NAME.to_owned(), - initial_tenant_id, - initial_timeline_id, - )?; - Ok(env) } @@ -388,6 +372,17 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an println!( "Created an initial timeline '{new_timeline_id}' at Lsn {last_record_lsn} for tenant: {new_tenant_id}", ); + + if create_match.get_flag("set-default") { + println!("Setting tenant {new_tenant_id} as a default one"); + env.default_tenant_id = Some(new_tenant_id); + } + } + Some(("set-default", set_default_match)) => { + let tenant_id = + parse_tenant_id(set_default_match)?.context("No tenant id specified")?; + println!("Setting tenant {tenant_id} as a default one"); + env.default_tenant_id = Some(tenant_id); } Some(("config", create_match)) => { let tenant_id = get_tenant_id(create_match, env)?; @@ -928,9 +923,8 @@ fn cli() -> Command { .version(GIT_VERSION) .subcommand( Command::new("init") - .about("Initialize a new Neon repository") + .about("Initialize a new Neon repository, preparing configs for services to start with") .arg(pageserver_config_args.clone()) - .arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline")) .arg( Arg::new("config") .long("config") @@ -992,11 +986,14 @@ fn cli() -> Command { .arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline")) .arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false)) .arg(pg_version_arg.clone()) + 
.arg(Arg::new("set-default").long("set-default").action(ArgAction::SetTrue).required(false) + .help("Use this tenant in future CLI commands where tenant_id is needed, but not specified")) ) + .subcommand(Command::new("set-default").arg(tenant_id_arg.clone().required(true)) + .about("Set a particular tenant as default in future CLI commands where tenant_id is needed, but not specified")) .subcommand(Command::new("config") .arg(tenant_id_arg.clone()) - .arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false)) - ) + .arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false))) ) .subcommand( Command::new("pageserver") diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index ea936640ec..003152c578 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -296,11 +296,6 @@ impl LocalEnv { env.neon_distrib_dir = env::current_exe()?.parent().unwrap().to_owned(); } - // If no initial tenant ID was given, generate it. - if env.default_tenant_id.is_none() { - env.default_tenant_id = Some(TenantId::generate()); - } - env.base_data_dir = base_path(); Ok(env) diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 68e94b2fdc..9cebe028e4 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -7,7 +7,7 @@ use std::path::PathBuf; use std::process::{Child, Command}; use std::{io, result}; -use anyhow::{bail, ensure, Context}; +use anyhow::{bail, Context}; use pageserver_api::models::{ TenantConfigRequest, TenantCreateRequest, TenantInfo, TimelineCreateRequest, TimelineInfo, }; @@ -130,83 +130,15 @@ impl PageServerNode { overrides } - /// Initializes a pageserver node by creating its config with the overrides provided, - /// and creating an initial tenant and timeline afterwards. 
- pub fn initialize( - &self, - create_tenant: Option, - initial_timeline_id: Option, - config_overrides: &[&str], - pg_version: u32, - ) -> anyhow::Result { + /// Initializes a pageserver node by creating its config with the overrides provided. + pub fn initialize(&self, config_overrides: &[&str]) -> anyhow::Result<()> { // First, run `pageserver --init` and wait for it to write a config into FS and exit. self.pageserver_init(config_overrides).with_context(|| { format!( "Failed to run init for pageserver node {}", self.env.pageserver.id, ) - })?; - - // Then, briefly start it fully to run HTTP commands on it, - // to create initial tenant and timeline. - // We disable the remote storage, since we stop pageserver right after the timeline creation, - // hence most of the uploads will either aborted or not started: no point to start them at all. - let disabled_remote_storage_override = "remote_storage={}"; - let mut pageserver_process = self - .start_node( - &[disabled_remote_storage_override], - // Previous overrides will be taken from the config created before, don't overwrite them. 
- false, - ) - .with_context(|| { - format!( - "Failed to start a process for pageserver node {}", - self.env.pageserver.id, - ) - })?; - - let init_result = self - .try_init_timeline(create_tenant, initial_timeline_id, pg_version) - .context("Failed to create initial tenant and timeline for pageserver"); - match &init_result { - Ok(initial_timeline_id) => { - println!("Successfully initialized timeline {initial_timeline_id}") - } - Err(e) => eprintln!("{e:#}"), - } - background_process::send_stop_child_process(&pageserver_process)?; - - let exit_code = pageserver_process.wait()?; - ensure!( - exit_code.success(), - format!( - "pageserver init failed with exit code {:?}", - exit_code.code() - ) - ); - println!( - "Stopped pageserver {} process with pid {}", - self.env.pageserver.id, - pageserver_process.id(), - ); - init_result - } - - fn try_init_timeline( - &self, - new_tenant_id: Option, - new_timeline_id: Option, - pg_version: u32, - ) -> anyhow::Result { - let initial_tenant_id = self.tenant_create(new_tenant_id, HashMap::new())?; - let initial_timeline_info = self.timeline_create( - initial_tenant_id, - new_timeline_id, - None, - None, - Some(pg_version), - )?; - Ok(initial_timeline_info.timeline_id) + }) } pub fn repo_path(&self) -> PathBuf { diff --git a/deny.toml b/deny.toml new file mode 100644 index 0000000000..3a0fe36f87 --- /dev/null +++ b/deny.toml @@ -0,0 +1,90 @@ +# This file was auto-generated using `cargo deny init`. +# cargo-deny is a cargo plugin that lets you lint your project's +# dependency graph to ensure all your dependencies conform +# to your expectations and requirements. 
+ +# Root options +targets = [] +all-features = false +no-default-features = false +feature-depth = 1 + +# This section is considered when running `cargo deny check advisories` +# More documentation for the advisories section can be found here: +# https://embarkstudios.github.io/cargo-deny/checks/advisories/cfg.html +[advisories] +db-urls = ["https://github.com/rustsec/advisory-db"] +vulnerability = "deny" +unmaintained = "warn" +yanked = "warn" +notice = "warn" +ignore = [] + +# This section is considered when running `cargo deny check licenses` +# More documentation for the licenses section can be found here: +# https://embarkstudios.github.io/cargo-deny/checks/licenses/cfg.html +[licenses] +unlicensed = "deny" +allow = [ + "Apache-2.0", + "Artistic-2.0", + "BSD-2-Clause", + "BSD-3-Clause", + "ISC", + "MIT", + "MPL-2.0", + "OpenSSL", + "Unicode-DFS-2016", +] +deny = [] +copyleft = "warn" +allow-osi-fsf-free = "neither" +default = "deny" +confidence-threshold = 0.8 +exceptions = [ + # Zlib license has some restrictions if we decide to change sth + { allow = ["Zlib"], name = "const_format_proc_macros", version = "*" }, + { allow = ["Zlib"], name = "const_format", version = "*" }, +] + +[[licenses.clarify]] +name = "ring" +version = "*" +expression = "MIT AND ISC AND OpenSSL" +license-files = [ + { path = "LICENSE", hash = 0xbd0eed23 }, +] + +[licenses.private] +ignore = true +registries = [] + +# This section is considered when running `cargo deny check bans`. +# More documentation about the 'bans' section can be found here: +# https://embarkstudios.github.io/cargo-deny/checks/bans/cfg.html +[bans] +multiple-versions = "warn" +wildcards = "allow" +highlight = "all" +workspace-default-features = "allow" +external-default-features = "allow" +allow = [] +deny = [] +skip = [] +skip-tree = [] + +# This section is considered when running `cargo deny check sources`. 
+# More documentation about the 'sources' section can be found here: +# https://embarkstudios.github.io/cargo-deny/checks/sources/cfg.html +[sources] +unknown-registry = "warn" +unknown-git = "warn" +allow-registry = ["https://github.com/rust-lang/crates.io-index"] +allow-git = [] + +[sources.allow-org] +github = [ + "neondatabase", +] +gitlab = [] +bitbucket = [] diff --git a/libs/metrics/Cargo.toml b/libs/metrics/Cargo.toml index d0cd46d2a9..d155f1e07d 100644 --- a/libs/metrics/Cargo.toml +++ b/libs/metrics/Cargo.toml @@ -2,6 +2,7 @@ name = "metrics" version = "0.1.0" edition = "2021" +license = "Apache-2.0" [dependencies] prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml index 2102ae5373..68d4c609f0 100644 --- a/libs/pageserver_api/Cargo.toml +++ b/libs/pageserver_api/Cargo.toml @@ -2,6 +2,7 @@ name = "pageserver_api" version = "0.1.0" edition = "2021" +license = "Apache-2.0" [dependencies] serde = { version = "1.0", features = ["derive"] } diff --git a/libs/postgres_connection/Cargo.toml b/libs/postgres_connection/Cargo.toml index 1924b260fa..12b7abcc93 100644 --- a/libs/postgres_connection/Cargo.toml +++ b/libs/postgres_connection/Cargo.toml @@ -2,6 +2,7 @@ name = "postgres_connection" version = "0.1.0" edition = "2021" +license = "Apache-2.0" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html diff --git a/libs/postgres_ffi/Cargo.toml b/libs/postgres_ffi/Cargo.toml index 59eec3de32..aa076b08d3 100644 --- a/libs/postgres_ffi/Cargo.toml +++ b/libs/postgres_ffi/Cargo.toml @@ -2,6 +2,7 @@ name = "postgres_ffi" version = "0.1.0" edition = "2021" +license = "Apache-2.0" [dependencies] rand = "0.8.3" diff --git a/libs/postgres_ffi/wal_craft/Cargo.toml b/libs/postgres_ffi/wal_craft/Cargo.toml index dd9f82a87a..abfc263550 100644 --- a/libs/postgres_ffi/wal_craft/Cargo.toml +++ 
b/libs/postgres_ffi/wal_craft/Cargo.toml @@ -2,7 +2,7 @@ name = "wal_craft" version = "0.1.0" edition = "2021" - +license = "Apache-2.0" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] diff --git a/libs/pq_proto/Cargo.toml b/libs/pq_proto/Cargo.toml index 76d8fbf28d..b9c6a1eab0 100644 --- a/libs/pq_proto/Cargo.toml +++ b/libs/pq_proto/Cargo.toml @@ -2,6 +2,7 @@ name = "pq_proto" version = "0.1.0" edition = "2021" +license = "Apache-2.0" [dependencies] anyhow = "1.0" @@ -12,5 +13,6 @@ rand = "0.8.3" serde = { version = "1.0", features = ["derive"] } tokio = { version = "1.17", features = ["macros"] } tracing = "0.1" +thiserror = "1.0" workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index 278f044c15..c5e4dbd1f0 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -5,7 +5,7 @@ // Tools for calling certain async methods in sync contexts. pub mod sync; -use anyhow::{bail, ensure, Context, Result}; +use anyhow::{ensure, Context, Result}; use bytes::{Buf, BufMut, Bytes, BytesMut}; use postgres_protocol::PG_EPOCH; use serde::{Deserialize, Serialize}; @@ -194,6 +194,35 @@ macro_rules! retry_read { }; } +/// An error occurred while the connection was open. +#[derive(thiserror::Error, Debug)] +pub enum ConnectionError { + /// IO error during writing to or reading from the connection socket. 
+ #[error("Socket IO error: {0}")] + Socket(std::io::Error), + /// Invalid packet was received from client + #[error("Protocol error: {0}")] + Protocol(String), + /// Failed to parse a protocol message + #[error("Message parse error: {0}")] + MessageParse(anyhow::Error), +} + +impl From for ConnectionError { + fn from(e: anyhow::Error) -> Self { + Self::MessageParse(e) + } +} + +impl ConnectionError { + pub fn into_io_error(self) -> io::Error { + match self { + ConnectionError::Socket(io) => io, + other => io::Error::new(io::ErrorKind::Other, other.to_string()), + } + } +} + impl FeMessage { /// Read one message from the stream. /// This function returns `Ok(None)` in case of EOF. @@ -216,7 +245,9 @@ impl FeMessage { /// } /// ``` #[inline(never)] - pub fn read(stream: &mut (impl io::Read + Unpin)) -> anyhow::Result> { + pub fn read( + stream: &mut (impl io::Read + Unpin), + ) -> Result, ConnectionError> { Self::read_fut(&mut AsyncishRead(stream)).wait() } @@ -224,7 +255,7 @@ impl FeMessage { /// See documentation for `Self::read`. pub fn read_fut( stream: &mut Reader, - ) -> SyncFuture>> + '_> + ) -> SyncFuture, ConnectionError>> + '_> where Reader: tokio::io::AsyncRead + Unpin, { @@ -238,17 +269,21 @@ impl FeMessage { let tag = match retry_read!(stream.read_u8().await) { Ok(b) => b, Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(None), - Err(e) => return Err(e.into()), + Err(e) => return Err(ConnectionError::Socket(e)), }; // The message length includes itself, so it better be at least 4. - let len = retry_read!(stream.read_u32().await)? + let len = retry_read!(stream.read_u32().await) + .map_err(ConnectionError::Socket)? 
.checked_sub(4) - .context("invalid message length")?; + .ok_or_else(|| ConnectionError::Protocol("invalid message length".to_string()))?; let body = { let mut buffer = vec![0u8; len as usize]; - stream.read_exact(&mut buffer).await?; + stream + .read_exact(&mut buffer) + .await + .map_err(ConnectionError::Socket)?; Bytes::from(buffer) }; @@ -265,7 +300,11 @@ impl FeMessage { b'c' => Ok(Some(FeMessage::CopyDone)), b'f' => Ok(Some(FeMessage::CopyFail)), b'p' => Ok(Some(FeMessage::PasswordMessage(body))), - tag => bail!("unknown message tag: {},'{:?}'", tag, body), + tag => { + return Err(ConnectionError::Protocol(format!( + "unknown message tag: {tag},'{body:?}'" + ))) + } } }) } @@ -275,7 +314,9 @@ impl FeStartupPacket { /// Read startup message from the stream. // XXX: It's tempting yet undesirable to accept `stream` by value, // since such a change will cause user-supplied &mut references to be consumed - pub fn read(stream: &mut (impl io::Read + Unpin)) -> anyhow::Result> { + pub fn read( + stream: &mut (impl io::Read + Unpin), + ) -> Result, ConnectionError> { Self::read_fut(&mut AsyncishRead(stream)).wait() } @@ -284,7 +325,7 @@ impl FeStartupPacket { // since such a change will cause user-supplied &mut references to be consumed pub fn read_fut( stream: &mut Reader, - ) -> SyncFuture>> + '_> + ) -> SyncFuture, ConnectionError>> + '_> where Reader: tokio::io::AsyncRead + Unpin, { @@ -302,31 +343,41 @@ impl FeStartupPacket { let len = match retry_read!(stream.read_u32().await) { Ok(len) => len as usize, Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(None), - Err(e) => return Err(e.into()), + Err(e) => return Err(ConnectionError::Socket(e)), }; #[allow(clippy::manual_range_contains)] if len < 4 || len > MAX_STARTUP_PACKET_LENGTH { - bail!("invalid message length"); + return Err(ConnectionError::Protocol(format!( + "invalid message length {len}" + ))); } - let request_code = retry_read!(stream.read_u32().await)?; + let request_code = + 
retry_read!(stream.read_u32().await).map_err(ConnectionError::Socket)?; // the rest of startup packet are params let params_len = len - 8; let mut params_bytes = vec![0u8; params_len]; - stream.read_exact(params_bytes.as_mut()).await?; + stream + .read_exact(params_bytes.as_mut()) + .await + .map_err(ConnectionError::Socket)?; // Parse params depending on request code let req_hi = request_code >> 16; let req_lo = request_code & ((1 << 16) - 1); let message = match (req_hi, req_lo) { (RESERVED_INVALID_MAJOR_VERSION, CANCEL_REQUEST_CODE) => { - ensure!(params_len == 8, "expected 8 bytes for CancelRequest params"); + if params_len != 8 { + return Err(ConnectionError::Protocol( + "expected 8 bytes for CancelRequest params".to_string(), + )); + } let mut cursor = Cursor::new(params_bytes); FeStartupPacket::CancelRequest(CancelKeyData { - backend_pid: cursor.read_i32().await?, - cancel_key: cursor.read_i32().await?, + backend_pid: cursor.read_i32().await.map_err(ConnectionError::Socket)?, + cancel_key: cursor.read_i32().await.map_err(ConnectionError::Socket)?, }) } (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_SSL_CODE) => { @@ -338,7 +389,9 @@ impl FeStartupPacket { FeStartupPacket::GssEncRequest } (RESERVED_INVALID_MAJOR_VERSION, unrecognized_code) => { - bail!("Unrecognized request code {}", unrecognized_code) + return Err(ConnectionError::Protocol(format!( + "Unrecognized request code {unrecognized_code}" + ))); } // TODO bail if protocol major_version is not 3? (major_version, minor_version) => { @@ -346,15 +399,21 @@ impl FeStartupPacket { // See `postgres: ProcessStartupPacket, build_startup_packet`. let mut tokens = str::from_utf8(&params_bytes) .context("StartupMessage params: invalid utf-8")? - .strip_suffix('\0') // drop packet's own null terminator - .context("StartupMessage params: missing null terminator")? 
+ .strip_suffix('\0') // drop packet's own null + .ok_or_else(|| { + ConnectionError::Protocol( + "StartupMessage params: missing null terminator".to_string(), + ) + })? .split_terminator('\0'); let mut params = HashMap::new(); while let Some(name) = tokens.next() { - let value = tokens - .next() - .context("StartupMessage params: key without value")?; + let value = tokens.next().ok_or_else(|| { + ConnectionError::Protocol( + "StartupMessage params: key without value".to_string(), + ) + })?; params.insert(name.to_owned(), value.to_owned()); } @@ -458,7 +517,7 @@ pub enum BeMessage<'a> { CloseComplete, // None means column is NULL DataRow(&'a [Option<&'a [u8]>]), - ErrorResponse(&'a str), + ErrorResponse(&'a str, Option<&'a [u8; 5]>), /// Single byte - used in response to SSLRequest/GSSENCRequest. EncryptionResponse(bool), NoData, @@ -606,7 +665,7 @@ fn write_body(buf: &mut BytesMut, f: impl FnOnce(&mut BytesMut) -> R) -> R { } /// Safe write of s into buf as cstring (String in the protocol). -fn write_cstr(s: impl AsRef<[u8]>, buf: &mut BytesMut) -> Result<(), io::Error> { +fn write_cstr(s: impl AsRef<[u8]>, buf: &mut BytesMut) -> io::Result<()> { let bytes = s.as_ref(); if bytes.contains(&0) { return Err(io::Error::new( @@ -626,6 +685,8 @@ fn read_cstr(buf: &mut Bytes) -> anyhow::Result { Ok(result) } +pub const SQLSTATE_INTERNAL_ERROR: &[u8; 5] = b"XX000"; + impl<'a> BeMessage<'a> { /// Write message to the given buf. // Unlike the reading side, we use BytesMut @@ -765,10 +826,7 @@ impl<'a> BeMessage<'a> { // First byte of each field represents type of this field. Set just enough fields // to satisfy rust-postgres client: 'S' -- severity, 'C' -- error, 'M' -- error // message text. - BeMessage::ErrorResponse(error_msg) => { - // For all the errors set Severity to Error and error code to - // 'internal error'. 
- + BeMessage::ErrorResponse(error_msg, pg_error_code) => { // 'E' signalizes ErrorResponse messages buf.put_u8(b'E'); write_body(buf, |buf| { @@ -776,7 +834,9 @@ impl<'a> BeMessage<'a> { buf.put_slice(b"ERROR\0"); buf.put_u8(b'C'); // SQLSTATE error code - buf.put_slice(b"CXX000\0"); + buf.put_slice(&terminate_code( + pg_error_code.unwrap_or(SQLSTATE_INTERNAL_ERROR), + )); buf.put_u8(b'M'); // the message write_cstr(error_msg, buf)?; @@ -799,7 +859,7 @@ impl<'a> BeMessage<'a> { buf.put_slice(b"NOTICE\0"); buf.put_u8(b'C'); // SQLSTATE error code - buf.put_slice(b"CXX000\0"); + buf.put_slice(&terminate_code(SQLSTATE_INTERNAL_ERROR)); buf.put_u8(b'M'); // the message write_cstr(error_msg.as_bytes(), buf)?; @@ -1087,3 +1147,12 @@ mod tests { let _ = FeStartupPacket::read_fut(stream).await; } } + +fn terminate_code(code: &[u8; 5]) -> [u8; 6] { + let mut terminated = [0; 6]; + for (i, &elem) in code.iter().enumerate() { + terminated[i] = elem; + } + + terminated +} diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index ebd30fc1eb..5a39f27209 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -2,6 +2,7 @@ name = "remote_storage" version = "0.1.0" edition = "2021" +license = "Apache-2.0" [dependencies] anyhow = { version = "1.0", features = ["backtrace"] } diff --git a/libs/safekeeper_api/Cargo.toml b/libs/safekeeper_api/Cargo.toml index 15bdecd71d..32cda78be4 100644 --- a/libs/safekeeper_api/Cargo.toml +++ b/libs/safekeeper_api/Cargo.toml @@ -2,6 +2,7 @@ name = "safekeeper_api" version = "0.1.0" edition = "2021" +license = "Apache-2.0" [dependencies] serde = { version = "1.0", features = ["derive"] } diff --git a/libs/tenant_size_model/Cargo.toml b/libs/tenant_size_model/Cargo.toml index 1aabf5a4f9..3a1a0f7915 100644 --- a/libs/tenant_size_model/Cargo.toml +++ b/libs/tenant_size_model/Cargo.toml @@ -3,6 +3,7 @@ name = "tenant_size_model" version = "0.1.0" edition = "2021" publish = false +license = 
"Apache-2.0" [dependencies] workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 47639e8205..9c7fcafe23 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -2,9 +2,10 @@ name = "utils" version = "0.1.0" edition = "2021" +license = "Apache-2.0" [dependencies] -sentry = "0.29.0" +sentry = { version = "0.29.0", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] } async-trait = "0.1" anyhow = "1.0" bincode = "1.3" diff --git a/libs/utils/src/postgres_backend.rs b/libs/utils/src/postgres_backend.rs index 5b34c7adfb..f3e3835bda 100644 --- a/libs/utils/src/postgres_backend.rs +++ b/libs/utils/src/postgres_backend.rs @@ -3,11 +3,11 @@ //! implementation determining how to process the queries. Currently its API //! is rather narrow, but we can extend it once required. +use crate::postgres_backend_async::{log_query_error, short_error, QueryError}; use crate::sock_split::{BidiStream, ReadStream, WriteStream}; -use anyhow::{bail, ensure, Context, Result}; +use anyhow::Context; use bytes::{Bytes, BytesMut}; use pq_proto::{BeMessage, FeMessage, FeStartupPacket}; -use rand::Rng; use serde::{Deserialize, Serialize}; use std::fmt; use std::io::{self, Write}; @@ -22,25 +22,32 @@ pub trait Handler { /// postgres_backend will issue ReadyForQuery after calling this (this /// might be not what we want after CopyData streaming, but currently we don't /// care). - fn process_query(&mut self, pgb: &mut PostgresBackend, query_string: &str) -> Result<()>; + fn process_query( + &mut self, + pgb: &mut PostgresBackend, + query_string: &str, + ) -> Result<(), QueryError>; /// Called on startup packet receival, allows to process params. /// /// If Ok(false) is returned postgres_backend will skip auth -- that is needed for new users /// creation is the proxy code. 
That is quite hacky and ad-hoc solution, may be we could allow /// to override whole init logic in implementations. - fn startup(&mut self, _pgb: &mut PostgresBackend, _sm: &FeStartupPacket) -> Result<()> { + fn startup( + &mut self, + _pgb: &mut PostgresBackend, + _sm: &FeStartupPacket, + ) -> Result<(), QueryError> { Ok(()) } - /// Check auth md5 - fn check_auth_md5(&mut self, _pgb: &mut PostgresBackend, _md5_response: &[u8]) -> Result<()> { - bail!("MD5 auth failed") - } - /// Check auth jwt - fn check_auth_jwt(&mut self, _pgb: &mut PostgresBackend, _jwt_response: &[u8]) -> Result<()> { - bail!("JWT auth failed") + fn check_auth_jwt( + &mut self, + _pgb: &mut PostgresBackend, + _jwt_response: &[u8], + ) -> Result<(), QueryError> { + Err(QueryError::Other(anyhow::anyhow!("JWT auth failed"))) } fn is_shutdown_requested(&self) -> bool { @@ -61,7 +68,6 @@ pub enum ProtoState { #[derive(Debug, PartialEq, Eq, Clone, Copy, Serialize, Deserialize)] pub enum AuthType { Trust, - MD5, // This mimics postgres's AuthenticationCleartextPassword but instead of password expects JWT NeonJWT, } @@ -72,9 +78,8 @@ impl FromStr for AuthType { fn from_str(s: &str) -> Result { match s { "Trust" => Ok(Self::Trust), - "MD5" => Ok(Self::MD5), "NeonJWT" => Ok(Self::NeonJWT), - _ => bail!("invalid value \"{s}\" for auth type"), + _ => anyhow::bail!("invalid value \"{s}\" for auth type"), } } } @@ -83,7 +88,6 @@ impl fmt::Display for AuthType { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.write_str(match self { AuthType::Trust => "Trust", - AuthType::MD5 => "MD5", AuthType::NeonJWT => "NeonJWT", }) } @@ -134,7 +138,6 @@ pub struct PostgresBackend { pub state: ProtoState, - md5_salt: [u8; 4], auth_type: AuthType, peer_addr: SocketAddr, @@ -164,7 +167,7 @@ pub fn is_socket_read_timed_out(error: &anyhow::Error) -> bool { } // Cast a byte slice to a string slice, dropping null terminator if there's one. 
-fn cstr_to_str(bytes: &[u8]) -> Result<&str> { +fn cstr_to_str(bytes: &[u8]) -> anyhow::Result<&str> { let without_null = bytes.strip_suffix(&[0]).unwrap_or(bytes); std::str::from_utf8(without_null).map_err(|e| e.into()) } @@ -187,7 +190,6 @@ impl PostgresBackend { stream: Some(Stream::Bidirectional(BidiStream::from_tcp(socket))), buf_out: BytesMut::with_capacity(10 * 1024), state: ProtoState::Initialization, - md5_salt: [0u8; 4], auth_type, tls_config, peer_addr, @@ -199,10 +201,10 @@ impl PostgresBackend { } /// Get direct reference (into the Option) to the read stream. - fn get_stream_in(&mut self) -> Result<&mut BidiStream> { + fn get_stream_in(&mut self) -> anyhow::Result<&mut BidiStream> { match &mut self.stream { Some(Stream::Bidirectional(stream)) => Ok(stream), - _ => bail!("reader taken"), + _ => anyhow::bail!("reader taken"), } } @@ -226,7 +228,7 @@ impl PostgresBackend { } /// Read full message or return None if connection is closed. - pub fn read_message(&mut self) -> Result> { + pub fn read_message(&mut self) -> Result, QueryError> { let (state, stream) = (self.state, self.get_stream_in()?); use ProtoState::*; @@ -234,6 +236,7 @@ impl PostgresBackend { Initialization | Encrypted => FeStartupPacket::read(stream), Authentication | Established => FeMessage::read(stream), } + .map_err(QueryError::from) } /// Write message into internal output buffer. 
@@ -257,7 +260,7 @@ impl PostgresBackend { } // Wrapper for run_message_loop() that shuts down socket when we are done - pub fn run(mut self, handler: &mut impl Handler) -> Result<()> { + pub fn run(mut self, handler: &mut impl Handler) -> Result<(), QueryError> { let ret = self.run_message_loop(handler); if let Some(stream) = self.stream.as_mut() { let _ = stream.shutdown(Shutdown::Both); @@ -265,7 +268,7 @@ impl PostgresBackend { ret } - fn run_message_loop(&mut self, handler: &mut impl Handler) -> Result<()> { + fn run_message_loop(&mut self, handler: &mut impl Handler) -> Result<(), QueryError> { trace!("postgres backend to {:?} started", self.peer_addr); let mut unnamed_query_string = Bytes::new(); @@ -274,7 +277,7 @@ impl PostgresBackend { match self.read_message() { Ok(message) => { if let Some(msg) = message { - trace!("got message {:?}", msg); + trace!("got message {msg:?}"); match self.process_message(handler, msg, &mut unnamed_query_string)? { ProcessMsgResult::Continue => continue, @@ -285,10 +288,12 @@ impl PostgresBackend { } } Err(e) => { - // If it is a timeout error, continue the loop - if !is_socket_read_timed_out(&e) { - return Err(e); + if let QueryError::Other(e) = &e { + if is_socket_read_timed_out(e) { + continue; + } } + return Err(e); } } } @@ -306,7 +311,7 @@ impl PostgresBackend { } stream => { self.stream = stream; - bail!("can't start TLs without bidi stream"); + anyhow::bail!("can't start TLs without bidi stream"); } } } @@ -316,17 +321,16 @@ impl PostgresBackend { handler: &mut impl Handler, msg: FeMessage, unnamed_query_string: &mut Bytes, - ) -> Result { + ) -> Result { // Allow only startup and password messages during auth. 
Otherwise client would be able to bypass auth // TODO: change that to proper top-level match of protocol state with separate message handling for each state - if self.state < ProtoState::Established { - ensure!( - matches!( - msg, - FeMessage::PasswordMessage(_) | FeMessage::StartupPacket(_) - ), - "protocol violation" - ); + if self.state < ProtoState::Established + && !matches!( + msg, + FeMessage::PasswordMessage(_) | FeMessage::StartupPacket(_) + ) + { + return Err(QueryError::Other(anyhow::anyhow!("protocol violation"))); } let have_tls = self.tls_config.is_some(); @@ -350,8 +354,13 @@ impl PostgresBackend { } FeStartupPacket::StartupMessage { .. } => { if have_tls && !matches!(self.state, ProtoState::Encrypted) { - self.write_message(&BeMessage::ErrorResponse("must connect with TLS"))?; - bail!("client did not connect with TLS"); + self.write_message(&BeMessage::ErrorResponse( + "must connect with TLS", + None, + ))?; + return Err(QueryError::Other(anyhow::anyhow!( + "client did not connect with TLS" + ))); } // NB: startup() may change self.auth_type -- we are using that in proxy code @@ -367,13 +376,6 @@ impl PostgresBackend { .write_message(&BeMessage::ReadyForQuery)?; self.state = ProtoState::Established; } - AuthType::MD5 => { - rand::thread_rng().fill(&mut self.md5_salt); - self.write_message(&BeMessage::AuthenticationMD5Password( - self.md5_salt, - ))?; - self.state = ProtoState::Authentication; - } AuthType::NeonJWT => { self.write_message(&BeMessage::AuthenticationCleartextPassword)?; self.state = ProtoState::Authentication; @@ -393,20 +395,15 @@ impl PostgresBackend { match self.auth_type { AuthType::Trust => unreachable!(), - AuthType::MD5 => { - let (_, md5_response) = m.split_last().context("protocol violation")?; - - if let Err(e) = handler.check_auth_md5(self, md5_response) { - self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?; - bail!("auth failed: {}", e); - } - } AuthType::NeonJWT => { let (_, jwt_response) = 
m.split_last().context("protocol violation")?; if let Err(e) = handler.check_auth_jwt(self, jwt_response) { - self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?; - bail!("auth failed: {}", e); + self.write_message(&BeMessage::ErrorResponse( + &e.to_string(), + Some(e.pg_error_code()), + ))?; + return Err(e); } } } @@ -420,33 +417,14 @@ impl PostgresBackend { // remove null terminator let query_string = cstr_to_str(&body)?; - trace!("got query {:?}", query_string); - // xxx distinguish fatal and recoverable errors? + trace!("got query {query_string:?}"); if let Err(e) = handler.process_query(self, query_string) { - // ":?" uses the alternate formatting style, which makes anyhow display the - // full cause of the error, not just the top-level context + its trace. - // We don't want to send that in the ErrorResponse though, - // because it's not relevant to the compute node logs. - // - // We also don't want to log full stacktrace when the error is primitive, - // such as usual connection closed. 
- let short_error = format!("{:#}", e); - let root_cause = e.root_cause().to_string(); - if root_cause.contains("connection closed unexpectedly") - || root_cause.contains("Broken pipe (os error 32)") - { - error!( - "query handler for '{}' failed: {}", - query_string, short_error - ); - } else { - error!("query handler for '{}' failed: {:?}", query_string, e); - } - self.write_message_noflush(&BeMessage::ErrorResponse(&short_error))?; - // TODO: untangle convoluted control flow - if e.to_string().contains("failed to run") { - return Ok(ProcessMsgResult::Break); - } + log_query_error(query_string, &e); + let short_error = short_error(&e); + self.write_message_noflush(&BeMessage::ErrorResponse( + &short_error, + Some(e.pg_error_code()), + ))?; } self.write_message(&BeMessage::ReadyForQuery)?; } @@ -471,11 +449,13 @@ impl PostgresBackend { FeMessage::Execute(_) => { let query_string = cstr_to_str(unnamed_query_string)?; - trace!("got execute {:?}", query_string); - // xxx distinguish fatal and recoverable errors? + trace!("got execute {query_string:?}"); if let Err(e) = handler.process_query(self, query_string) { - error!("query handler for '{}' failed: {:?}", query_string, e); - self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?; + log_query_error(query_string, &e); + self.write_message(&BeMessage::ErrorResponse( + &e.to_string(), + Some(e.pg_error_code()), + ))?; } // NOTE there is no ReadyForQuery message. 
This handler is used // for basebackup and it uses CopyOut which doesn't require @@ -494,7 +474,9 @@ impl PostgresBackend { // We prefer explicit pattern matching to wildcards, because // this helps us spot the places where new variants are missing FeMessage::CopyData(_) | FeMessage::CopyDone | FeMessage::CopyFail => { - bail!("unexpected message type: {:?}", msg); + return Err(QueryError::Other(anyhow::anyhow!( + "unexpected message type: {msg:?}" + ))); } } diff --git a/libs/utils/src/postgres_backend_async.rs b/libs/utils/src/postgres_backend_async.rs index a22774c69e..95b7b3fd15 100644 --- a/libs/utils/src/postgres_backend_async.rs +++ b/libs/utils/src/postgres_backend_async.rs @@ -4,45 +4,87 @@ //! is rather narrow, but we can extend it once required. use crate::postgres_backend::AuthType; -use anyhow::{bail, Context, Result}; -use bytes::{Bytes, BytesMut}; -use pq_proto::{BeMessage, FeMessage, FeStartupPacket}; -use rand::Rng; +use anyhow::Context; +use bytes::{Buf, Bytes, BytesMut}; +use pq_proto::{BeMessage, ConnectionError, FeMessage, FeStartupPacket, SQLSTATE_INTERNAL_ERROR}; use std::future::Future; +use std::io; use std::net::SocketAddr; use std::pin::Pin; use std::sync::Arc; use std::task::Poll; -use tracing::{debug, error, trace}; +use tracing::{debug, error, info, trace}; use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, BufReader}; use tokio_rustls::TlsAcceptor; +pub fn is_expected_io_error(e: &io::Error) -> bool { + use io::ErrorKind::*; + matches!( + e.kind(), + ConnectionRefused | ConnectionAborted | ConnectionReset + ) +} + +/// An error, occurred during query processing: +/// either during the connection ([`ConnectionError`]) or before/after it. +#[derive(thiserror::Error, Debug)] +pub enum QueryError { + /// The connection was lost while processing the query. 
+ #[error(transparent)] + Disconnected(#[from] ConnectionError), + /// Some other error + #[error(transparent)] + Other(#[from] anyhow::Error), +} + +impl From for QueryError { + fn from(e: io::Error) -> Self { + Self::Disconnected(ConnectionError::Socket(e)) + } +} + +impl QueryError { + pub fn pg_error_code(&self) -> &'static [u8; 5] { + match self { + Self::Disconnected(_) => b"08006", // connection failure + Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error + } + } +} + #[async_trait::async_trait] pub trait Handler { /// Handle single query. /// postgres_backend will issue ReadyForQuery after calling this (this /// might be not what we want after CopyData streaming, but currently we don't /// care). - async fn process_query(&mut self, pgb: &mut PostgresBackend, query_string: &str) -> Result<()>; + async fn process_query( + &mut self, + pgb: &mut PostgresBackend, + query_string: &str, + ) -> Result<(), QueryError>; /// Called on startup packet receival, allows to process params. /// /// If Ok(false) is returned postgres_backend will skip auth -- that is needed for new users /// creation is the proxy code. That is quite hacky and ad-hoc solution, may be we could allow /// to override whole init logic in implementations. 
- fn startup(&mut self, _pgb: &mut PostgresBackend, _sm: &FeStartupPacket) -> Result<()> { + fn startup( + &mut self, + _pgb: &mut PostgresBackend, + _sm: &FeStartupPacket, + ) -> Result<(), QueryError> { Ok(()) } - /// Check auth md5 - fn check_auth_md5(&mut self, _pgb: &mut PostgresBackend, _md5_response: &[u8]) -> Result<()> { - bail!("MD5 auth failed") - } - /// Check auth jwt - fn check_auth_jwt(&mut self, _pgb: &mut PostgresBackend, _jwt_response: &[u8]) -> Result<()> { - bail!("JWT auth failed") + fn check_auth_jwt( + &mut self, + _pgb: &mut PostgresBackend, + _jwt_response: &[u8], + ) -> Result<(), QueryError> { + Err(QueryError::Other(anyhow::anyhow!("JWT auth failed"))) } } @@ -76,17 +118,14 @@ impl AsyncWrite for Stream { self: Pin<&mut Self>, cx: &mut std::task::Context<'_>, buf: &[u8], - ) -> Poll> { + ) -> Poll> { match self.get_mut() { Self::Unencrypted(stream) => Pin::new(stream).poll_write(cx, buf), Self::Tls(stream) => Pin::new(stream).poll_write(cx, buf), Self::Broken => unreachable!(), } } - fn poll_flush( - self: Pin<&mut Self>, - cx: &mut std::task::Context<'_>, - ) -> Poll> { + fn poll_flush(self: Pin<&mut Self>, cx: &mut std::task::Context<'_>) -> Poll> { match self.get_mut() { Self::Unencrypted(stream) => Pin::new(stream).poll_flush(cx), Self::Tls(stream) => Pin::new(stream).poll_flush(cx), @@ -96,7 +135,7 @@ impl AsyncWrite for Stream { fn poll_shutdown( self: Pin<&mut Self>, cx: &mut std::task::Context<'_>, - ) -> Poll> { + ) -> Poll> { match self.get_mut() { Self::Unencrypted(stream) => Pin::new(stream).poll_shutdown(cx), Self::Tls(stream) => Pin::new(stream).poll_shutdown(cx), @@ -109,7 +148,7 @@ impl AsyncRead for Stream { self: Pin<&mut Self>, cx: &mut std::task::Context<'_>, buf: &mut tokio::io::ReadBuf<'_>, - ) -> Poll> { + ) -> Poll> { match self.get_mut() { Self::Unencrypted(stream) => Pin::new(stream).poll_read(cx, buf), Self::Tls(stream) => Pin::new(stream).poll_read(cx, buf), @@ -120,12 +159,14 @@ impl AsyncRead for Stream { 
pub struct PostgresBackend { stream: Stream, + // Output buffer. c.f. BeMessage::write why we are using BytesMut here. + // The data between 0 and "current position" as tracked by the bytes::Buf + // implementation of BytesMut, have already been written. buf_out: BytesMut, pub state: ProtoState, - md5_salt: [u8; 4], auth_type: AuthType, peer_addr: SocketAddr, @@ -143,7 +184,7 @@ pub fn query_from_cstring(query_string: Bytes) -> Vec { } // Cast a byte slice to a string slice, dropping null terminator if there's one. -fn cstr_to_str(bytes: &[u8]) -> Result<&str> { +fn cstr_to_str(bytes: &[u8]) -> anyhow::Result<&str> { let without_null = bytes.strip_suffix(&[0]).unwrap_or(bytes); std::str::from_utf8(without_null).map_err(|e| e.into()) } @@ -153,14 +194,13 @@ impl PostgresBackend { socket: tokio::net::TcpStream, auth_type: AuthType, tls_config: Option>, - ) -> std::io::Result { + ) -> io::Result { let peer_addr = socket.peer_addr()?; Ok(Self { stream: Stream::Unencrypted(BufReader::new(socket)), buf_out: BytesMut::with_capacity(10 * 1024), state: ProtoState::Initialization, - md5_salt: [0u8; 4], auth_type, tls_config, peer_addr, @@ -172,30 +212,68 @@ impl PostgresBackend { } /// Read full message or return None if connection is closed. - pub async fn read_message(&mut self) -> Result> { + pub async fn read_message(&mut self) -> Result, QueryError> { use ProtoState::*; match self.state { Initialization | Encrypted => FeStartupPacket::read_fut(&mut self.stream).await, Authentication | Established => FeMessage::read_fut(&mut self.stream).await, Closed => Ok(None), } + .map_err(QueryError::from) } /// Flush output buffer into the socket. 
- pub async fn flush(&mut self) -> std::io::Result<&mut Self> { - self.stream.write_all(&self.buf_out).await?; + pub async fn flush(&mut self) -> io::Result<()> { + while self.buf_out.has_remaining() { + let bytes_written = self.stream.write(self.buf_out.chunk()).await?; + self.buf_out.advance(bytes_written); + } self.buf_out.clear(); - Ok(self) + Ok(()) } /// Write message into internal output buffer. - pub fn write_message(&mut self, message: &BeMessage<'_>) -> Result<&mut Self, std::io::Error> { + pub fn write_message(&mut self, message: &BeMessage<'_>) -> io::Result<&mut Self> { BeMessage::write(&mut self.buf_out, message)?; Ok(self) } + /// Returns an AsyncWrite implementation that wraps all the data written + /// to it in CopyData messages, and writes them to the connection + /// + /// The caller is responsible for sending CopyOutResponse and CopyDone messages. + pub fn copyout_writer(&mut self) -> CopyDataWriter { + CopyDataWriter { pgb: self } + } + + /// A polling function that tries to write all the data from 'buf_out' to the + /// underlying stream. 
+ fn poll_write_buf( + &mut self, + cx: &mut std::task::Context<'_>, + ) -> Poll> { + while self.buf_out.has_remaining() { + match Pin::new(&mut self.stream).poll_write(cx, self.buf_out.chunk()) { + Poll::Ready(Ok(bytes_written)) => { + self.buf_out.advance(bytes_written); + } + Poll::Ready(Err(err)) => return Poll::Ready(Err(err)), + Poll::Pending => return Poll::Pending, + } + } + Poll::Ready(Ok(())) + } + + fn poll_flush(&mut self, cx: &mut std::task::Context<'_>) -> Poll> { + Pin::new(&mut self.stream).poll_flush(cx) + } + // Wrapper for run_message_loop() that shuts down socket when we are done - pub async fn run(mut self, handler: &mut impl Handler, shutdown_watcher: F) -> Result<()> + pub async fn run( + mut self, + handler: &mut impl Handler, + shutdown_watcher: F, + ) -> Result<(), QueryError> where F: Fn() -> S, S: Future, @@ -209,7 +287,7 @@ impl PostgresBackend { &mut self, handler: &mut impl Handler, shutdown_watcher: F, - ) -> Result<()> + ) -> Result<(), QueryError> where F: Fn() -> S, S: Future, @@ -245,7 +323,7 @@ impl PostgresBackend { return Ok(()); } } - Ok::<(), anyhow::Error>(()) + Ok::<(), QueryError>(()) } => { // Handshake complete. result?; @@ -290,14 +368,14 @@ impl PostgresBackend { self.stream = Stream::Tls(Box::new(tls_stream)); return Ok(()); }; - bail!("TLS already started"); + anyhow::bail!("TLS already started"); } async fn process_handshake_message( &mut self, handler: &mut impl Handler, msg: FeMessage, - ) -> Result { + ) -> Result { assert!(self.state < ProtoState::Established); let have_tls = self.tls_config.is_some(); match msg { @@ -320,8 +398,13 @@ impl PostgresBackend { } FeStartupPacket::StartupMessage { .. 
} => { if have_tls && !matches!(self.state, ProtoState::Encrypted) { - self.write_message(&BeMessage::ErrorResponse("must connect with TLS"))?; - bail!("client did not connect with TLS"); + self.write_message(&BeMessage::ErrorResponse( + "must connect with TLS", + None, + ))?; + return Err(QueryError::Other(anyhow::anyhow!( + "client did not connect with TLS" + ))); } // NB: startup() may change self.auth_type -- we are using that in proxy code @@ -337,13 +420,6 @@ impl PostgresBackend { .write_message(&BeMessage::ReadyForQuery)?; self.state = ProtoState::Established; } - AuthType::MD5 => { - rand::thread_rng().fill(&mut self.md5_salt); - self.write_message(&BeMessage::AuthenticationMD5Password( - self.md5_salt, - ))?; - self.state = ProtoState::Authentication; - } AuthType::NeonJWT => { self.write_message(&BeMessage::AuthenticationCleartextPassword)?; self.state = ProtoState::Authentication; @@ -364,20 +440,15 @@ impl PostgresBackend { match self.auth_type { AuthType::Trust => unreachable!(), - AuthType::MD5 => { - let (_, md5_response) = m.split_last().context("protocol violation")?; - - if let Err(e) = handler.check_auth_md5(self, md5_response) { - self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?; - bail!("auth failed: {}", e); - } - } AuthType::NeonJWT => { let (_, jwt_response) = m.split_last().context("protocol violation")?; if let Err(e) = handler.check_auth_jwt(self, jwt_response) { - self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?; - bail!("auth failed: {}", e); + self.write_message(&BeMessage::ErrorResponse( + &e.to_string(), + Some(e.pg_error_code()), + ))?; + return Err(e); } } } @@ -400,33 +471,28 @@ impl PostgresBackend { handler: &mut impl Handler, msg: FeMessage, unnamed_query_string: &mut Bytes, - ) -> Result { + ) -> Result { // Allow only startup and password messages during auth. 
Otherwise client would be able to bypass auth // TODO: change that to proper top-level match of protocol state with separate message handling for each state assert!(self.state == ProtoState::Established); match msg { FeMessage::StartupPacket(_) | FeMessage::PasswordMessage(_) => { - bail!("protocol violation"); + return Err(QueryError::Other(anyhow::anyhow!("protocol violation"))); } FeMessage::Query(body) => { // remove null terminator let query_string = cstr_to_str(&body)?; - trace!("got query {:?}", query_string); - // xxx distinguish fatal and recoverable errors? + trace!("got query {query_string:?}"); if let Err(e) = handler.process_query(self, query_string).await { - // ":?" uses the alternate formatting style, which makes anyhow display the - // full cause of the error, not just the top-level context + its trace. - // We don't want to send that in the ErrorResponse though, - // because it's not relevant to the compute node logs. - error!("query handler for '{}' failed: {:?}", query_string, e); - self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?; - // TODO: untangle convoluted control flow - if e.to_string().contains("failed to run") { - return Ok(ProcessMsgResult::Break); - } + log_query_error(query_string, &e); + let short_error = short_error(&e); + self.write_message(&BeMessage::ErrorResponse( + &short_error, + Some(e.pg_error_code()), + ))?; } self.write_message(&BeMessage::ReadyForQuery)?; } @@ -451,11 +517,13 @@ impl PostgresBackend { FeMessage::Execute(_) => { let query_string = cstr_to_str(unnamed_query_string)?; - trace!("got execute {:?}", query_string); - // xxx distinguish fatal and recoverable errors? 
+ trace!("got execute {query_string:?}"); if let Err(e) = handler.process_query(self, query_string).await { - error!("query handler for '{}' failed: {:?}", query_string, e); - self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?; + log_query_error(query_string, &e); + self.write_message(&BeMessage::ErrorResponse( + &e.to_string(), + Some(e.pg_error_code()), + ))?; } // NOTE there is no ReadyForQuery message. This handler is used // for basebackup and it uses CopyOut which doesn't require @@ -474,10 +542,99 @@ impl PostgresBackend { // We prefer explicit pattern matching to wildcards, because // this helps us spot the places where new variants are missing FeMessage::CopyData(_) | FeMessage::CopyDone | FeMessage::CopyFail => { - bail!("unexpected message type: {:?}", msg); + return Err(QueryError::Other(anyhow::anyhow!( + "unexpected message type: {:?}", + msg + ))); } } Ok(ProcessMsgResult::Continue) } } + +/// +/// A futures::AsyncWrite implementation that wraps all data written to it in CopyData +/// messages. +/// + +pub struct CopyDataWriter<'a> { + pgb: &'a mut PostgresBackend, +} + +impl<'a> AsyncWrite for CopyDataWriter<'a> { + fn poll_write( + self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + buf: &[u8], + ) -> Poll> { + let this = self.get_mut(); + + // It's not strictly required to flush between each message, but makes it easier + // to view in wireshark, and usually the messages that the callers write are + // decently-sized anyway. + match this.pgb.poll_write_buf(cx) { + Poll::Ready(Ok(())) => {} + Poll::Ready(Err(err)) => return Poll::Ready(Err(err)), + Poll::Pending => return Poll::Pending, + } + + // CopyData + // XXX: if the input is large, we should split it into multiple messages. + // Not sure what the threshold should be, but the ultimate hard limit is that + // the length cannot exceed u32. 
+ this.pgb.write_message(&BeMessage::CopyData(buf))?; + + Poll::Ready(Ok(buf.len())) + } + + fn poll_flush( + self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> Poll> { + let this = self.get_mut(); + match this.pgb.poll_write_buf(cx) { + Poll::Ready(Ok(())) => {} + Poll::Ready(Err(err)) => return Poll::Ready(Err(err)), + Poll::Pending => return Poll::Pending, + } + this.pgb.poll_flush(cx) + } + fn poll_shutdown( + self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> Poll> { + let this = self.get_mut(); + match this.pgb.poll_write_buf(cx) { + Poll::Ready(Ok(())) => {} + Poll::Ready(Err(err)) => return Poll::Ready(Err(err)), + Poll::Pending => return Poll::Pending, + } + this.pgb.poll_flush(cx) + } +} + +pub fn short_error(e: &QueryError) -> String { + match e { + QueryError::Disconnected(connection_error) => connection_error.to_string(), + QueryError::Other(e) => format!("{e:#}"), + } +} + +pub(super) fn log_query_error(query: &str, e: &QueryError) { + match e { + QueryError::Disconnected(ConnectionError::Socket(io_error)) => { + if is_expected_io_error(io_error) { + info!("query handler for '{query}' failed with expected io error: {io_error}"); + } else { + error!("query handler for '{query}' failed with io error: {io_error}"); + } + } + QueryError::Disconnected(other_connection_error) => { + error!("query handler for '{query}' failed with connection error: {other_connection_error:?}") + } + QueryError::Other(e) => { + error!("query handler for '{query}' failed: {e:?}"); + } + } +} diff --git a/libs/utils/tests/ssl_test.rs b/libs/utils/tests/ssl_test.rs index 248400c2c1..fae707f049 100644 --- a/libs/utils/tests/ssl_test.rs +++ b/libs/utils/tests/ssl_test.rs @@ -9,7 +9,10 @@ use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; use once_cell::sync::Lazy; -use utils::postgres_backend::{AuthType, Handler, PostgresBackend}; +use utils::{ + postgres_backend::{AuthType, Handler, PostgresBackend}, + 
postgres_backend_async::QueryError, +}; fn make_tcp_pair() -> (TcpStream, TcpStream) { let listener = TcpListener::bind("127.0.0.1:0").unwrap(); @@ -105,7 +108,7 @@ fn ssl() { &mut self, _pgb: &mut PostgresBackend, query_string: &str, - ) -> anyhow::Result<()> { + ) -> Result<(), QueryError> { self.got_query = query_string == QUERY; Ok(()) } @@ -152,7 +155,7 @@ fn no_ssl() { &mut self, _pgb: &mut PostgresBackend, _query_string: &str, - ) -> anyhow::Result<()> { + ) -> Result<(), QueryError> { panic!() } } @@ -212,7 +215,7 @@ fn server_forces_ssl() { &mut self, _pgb: &mut PostgresBackend, _query_string: &str, - ) -> anyhow::Result<()> { + ) -> Result<(), QueryError> { panic!() } } diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 12fe0705cf..395f450bb2 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -2,6 +2,7 @@ name = "pageserver" version = "0.1.0" edition = "2021" +license = "Apache-2.0" [features] default = [] @@ -9,8 +10,6 @@ default = [] # which adds some runtime cost to run tests on outage conditions testing = ["fail/failpoints"] -profiling = ["pprof"] - [dependencies] amplify_num = { git = "https://github.com/hlinnaka/rust-amplify.git", branch = "unsigned-int-perf" } anyhow = { version = "1.0", features = ["backtrace"] } @@ -39,7 +38,6 @@ pin-project-lite = "0.2.7" postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } -pprof = { git = "https://github.com/neondatabase/pprof-rs.git", branch = "wallclock-profiling", features = ["flamegraph"], optional = true } rand = "0.8.3" regex = "1.4.5" rstar = "0.9.3" @@ -49,7 +47,7 @@ serde_json = { version = "1.0", features = ["raw_value"] } serde_with = "2.0" signal-hook 
= "0.3.10" svg_fmt = "0.4.1" -tar = "0.4.33" +tokio-tar = { git = "https://github.com/neondatabase/tokio-tar.git", rev="404df61437de0feef49ba2ccdbdd94eb8ad6e142" } thiserror = "1.0" tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] } tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } @@ -70,7 +68,7 @@ tenant_size_model = { path = "../libs/tenant_size_model" } utils = { path = "../libs/utils" } workspace_hack = { version = "0.1", path = "../workspace_hack" } rpds = "0.12.0" -reqwest = "0.11.13" +reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] } [dev-dependencies] criterion = "0.4" diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 36664e119e..1978becf83 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -10,20 +10,24 @@ //! This module is responsible for creation of such tarball //! from data stored in object storage. //! -use anyhow::{anyhow, bail, ensure, Context, Result}; +use anyhow::{anyhow, bail, ensure, Context}; use bytes::{BufMut, BytesMut}; use fail::fail_point; -use itertools::Itertools; use std::fmt::Write as FmtWrite; -use std::io; -use std::io::Write; -use std::sync::Arc; use std::time::SystemTime; -use tar::{Builder, EntryType, Header}; +use tokio::io; +use tokio::io::AsyncWrite; use tracing::*; -use crate::task_mgr; -use crate::tenant::{with_ondemand_download, PageReconstructResult, Timeline}; +/// NB: This relies on a modified version of tokio_tar that does *not* write the +/// end-of-archive marker (1024 zero bytes), when the Builder struct is dropped +/// without explicitly calling 'finish' or 'into_inner'! 
+/// +/// See https://github.com/neondatabase/tokio-tar/pull/1 +/// +use tokio_tar::{Builder, EntryType, Header}; + +use crate::tenant::{with_ondemand_download, Timeline}; use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID}; @@ -34,116 +38,130 @@ use postgres_ffi::PG_TLI; use postgres_ffi::{BLCKSZ, RELSEG_SIZE, WAL_SEGMENT_SIZE}; use utils::lsn::Lsn; +/// Create basebackup with non-rel data in it. +/// Only include relational data if 'full_backup' is true. +/// +/// Currently we use empty 'req_lsn' in two cases: +/// * During the basebackup right after timeline creation +/// * When working without safekeepers. In this situation it is important to match the lsn +/// we are taking basebackup on with the lsn that is used in pageserver's walreceiver +/// to start the replication. +pub async fn send_basebackup_tarball<'a, W>( + write: &'a mut W, + timeline: &'a Timeline, + req_lsn: Option, + prev_lsn: Option, + full_backup: bool, +) -> anyhow::Result<()> +where + W: AsyncWrite + Send + Sync + Unpin, +{ + // Compute postgres doesn't have any previous WAL files, but the first + // record that it's going to write needs to include the LSN of the + // previous record (xl_prev). We include prev_record_lsn in the + // "zenith.signal" file, so that postgres can read it during startup. + // + // We don't keep full history of record boundaries in the page server, + // however, only the predecessor of the latest record on each + // timeline. So we can only provide prev_record_lsn when you take a + // base backup at the end of the timeline, i.e. at last_record_lsn. + // Even at the end of the timeline, we sometimes don't have a valid + // prev_lsn value; that happens if the timeline was just branched from + // an old LSN and it doesn't have any WAL of its own yet. We will set + // prev_lsn to Lsn(0) if we cannot provide the correct value. 
+ let (backup_prev, backup_lsn) = if let Some(req_lsn) = req_lsn { + // Backup was requested at a particular LSN. The caller should've + // already checked that it's a valid LSN. + + // If the requested point is the end of the timeline, we can + // provide prev_lsn. (get_last_record_rlsn() might return it as + // zero, though, if no WAL has been generated on this timeline + // yet.) + let end_of_timeline = timeline.get_last_record_rlsn(); + if req_lsn == end_of_timeline.last { + (end_of_timeline.prev, req_lsn) + } else { + (Lsn(0), req_lsn) + } + } else { + // Backup was requested at end of the timeline. + let end_of_timeline = timeline.get_last_record_rlsn(); + (end_of_timeline.prev, end_of_timeline.last) + }; + + // Consolidate the derived and the provided prev_lsn values + let prev_lsn = if let Some(provided_prev_lsn) = prev_lsn { + if backup_prev != Lsn(0) { + ensure!(backup_prev == provided_prev_lsn); + } + provided_prev_lsn + } else { + backup_prev + }; + + info!( + "taking basebackup lsn={}, prev_lsn={} (full_backup={})", + backup_lsn, prev_lsn, full_backup + ); + + let basebackup = Basebackup { + ar: Builder::new_non_terminated(write), + timeline, + lsn: backup_lsn, + prev_record_lsn: prev_lsn, + full_backup, + }; + basebackup + .send_tarball() + .instrument(info_span!("send_tarball", backup_lsn=%backup_lsn)) + .await +} + /// This is short-living object only for the time of tarball creation, /// created mostly to avoid passing a lot of parameters between various functions /// used for constructing tarball. -pub struct Basebackup<'a, W> +struct Basebackup<'a, W> where - W: Write, + W: AsyncWrite + Send + Sync + Unpin, { - ar: Builder>, - timeline: &'a Arc, - pub lsn: Lsn, + ar: Builder<&'a mut W>, + timeline: &'a Timeline, + lsn: Lsn, prev_record_lsn: Lsn, full_backup: bool, - finished: bool, } -// Create basebackup with non-rel data in it. -// Only include relational data if 'full_backup' is true. 
-// -// Currently we use empty lsn in two cases: -// * During the basebackup right after timeline creation -// * When working without safekeepers. In this situation it is important to match the lsn -// we are taking basebackup on with the lsn that is used in pageserver's walreceiver -// to start the replication. impl<'a, W> Basebackup<'a, W> where - W: Write, + W: AsyncWrite + Send + Sync + Unpin, { - pub fn new( - write: W, - timeline: &'a Arc, - req_lsn: Option, - prev_lsn: Option, - full_backup: bool, - ) -> Result> { - // Compute postgres doesn't have any previous WAL files, but the first - // record that it's going to write needs to include the LSN of the - // previous record (xl_prev). We include prev_record_lsn in the - // "zenith.signal" file, so that postgres can read it during startup. - // - // We don't keep full history of record boundaries in the page server, - // however, only the predecessor of the latest record on each - // timeline. So we can only provide prev_record_lsn when you take a - // base backup at the end of the timeline, i.e. at last_record_lsn. - // Even at the end of the timeline, we sometimes don't have a valid - // prev_lsn value; that happens if the timeline was just branched from - // an old LSN and it doesn't have any WAL of its own yet. We will set - // prev_lsn to Lsn(0) if we cannot provide the correct value. - let (backup_prev, backup_lsn) = if let Some(req_lsn) = req_lsn { - // Backup was requested at a particular LSN. The caller should've - // already checked that it's a valid LSN. - - // If the requested point is the end of the timeline, we can - // provide prev_lsn. (get_last_record_rlsn() might return it as - // zero, though, if no WAL has been generated on this timeline - // yet.) - let end_of_timeline = timeline.get_last_record_rlsn(); - if req_lsn == end_of_timeline.last { - (end_of_timeline.prev, req_lsn) - } else { - (Lsn(0), req_lsn) - } - } else { - // Backup was requested at end of the timeline. 
- let end_of_timeline = timeline.get_last_record_rlsn(); - (end_of_timeline.prev, end_of_timeline.last) - }; - - // Consolidate the derived and the provided prev_lsn values - let prev_lsn = if let Some(provided_prev_lsn) = prev_lsn { - if backup_prev != Lsn(0) { - ensure!(backup_prev == provided_prev_lsn) - } - provided_prev_lsn - } else { - backup_prev - }; - - info!( - "taking basebackup lsn={}, prev_lsn={} (full_backup={})", - backup_lsn, prev_lsn, full_backup - ); - - Ok(Basebackup { - ar: Builder::new(AbortableWrite::new(write)), - timeline, - lsn: backup_lsn, - prev_record_lsn: prev_lsn, - full_backup, - finished: false, - }) - } - - pub fn send_tarball(mut self) -> anyhow::Result<()> { + async fn send_tarball(mut self) -> anyhow::Result<()> { // TODO include checksum // Create pgdata subdirs structure for dir in PGDATA_SUBDIRS.iter() { let header = new_tar_header_dir(dir)?; - self.ar.append(&header, &mut io::empty())?; + self.ar + .append(&header, &mut io::empty()) + .await + .context("could not add directory to basebackup tarball")?; } - // Send empty config files. + // Send config files. for filepath in PGDATA_SPECIAL_FILES.iter() { if *filepath == "pg_hba.conf" { let data = PG_HBA.as_bytes(); let header = new_tar_header(filepath, data.len() as u64)?; - self.ar.append(&header, data)?; + self.ar + .append(&header, data) + .await + .context("could not add config file to basebackup tarball")?; } else { let header = new_tar_header(filepath, 0)?; - self.ar.append(&header, &mut io::empty())?; + self.ar + .append(&header, &mut io::empty()) + .await + .context("could not add config file to basebackup tarball")?; } } @@ -154,29 +172,30 @@ where SlruKind::MultiXactMembers, ] { for segno in - with_ondemand_download_sync(|| self.timeline.list_slru_segments(kind, self.lsn))? + with_ondemand_download(|| self.timeline.list_slru_segments(kind, self.lsn)).await? 
{ - self.add_slru_segment(kind, segno)?; + self.add_slru_segment(kind, segno).await?; } } // Create tablespace directories for ((spcnode, dbnode), has_relmap_file) in - with_ondemand_download_sync(|| self.timeline.list_dbdirs(self.lsn))? + with_ondemand_download(|| self.timeline.list_dbdirs(self.lsn)).await? { - self.add_dbdir(spcnode, dbnode, has_relmap_file)?; + self.add_dbdir(spcnode, dbnode, has_relmap_file).await?; // Gather and send relational files in each database if full backup is requested. if self.full_backup { - for rel in with_ondemand_download_sync(|| { - self.timeline.list_rels(spcnode, dbnode, self.lsn) - })? { - self.add_rel(rel)?; + for rel in + with_ondemand_download(|| self.timeline.list_rels(spcnode, dbnode, self.lsn)) + .await? + { + self.add_rel(rel).await?; } } } - for xid in with_ondemand_download_sync(|| self.timeline.list_twophase_files(self.lsn))? { - self.add_twophase_file(xid)?; + for xid in with_ondemand_download(|| self.timeline.list_twophase_files(self.lsn)).await? { + self.add_twophase_file(xid).await?; } fail_point!("basebackup-before-control-file", |_| { @@ -184,44 +203,46 @@ where }); // Generate pg_control and bootstrap WAL segment. 
- self.add_pgcontrol_file()?; - self.ar.finish()?; - self.finished = true; + self.add_pgcontrol_file().await?; + self.ar.finish().await?; debug!("all tarred up!"); Ok(()) } - fn add_rel(&mut self, tag: RelTag) -> anyhow::Result<()> { + async fn add_rel(&mut self, tag: RelTag) -> anyhow::Result<()> { let nblocks = - with_ondemand_download_sync(|| self.timeline.get_rel_size(tag, self.lsn, false))?; - - // Function that adds relation segment data to archive - let mut add_file = |segment_index, data: &Vec| -> anyhow::Result<()> { - let file_name = tag.to_segfile_name(segment_index as u32); - let header = new_tar_header(&file_name, data.len() as u64)?; - self.ar.append(&header, data.as_slice())?; - Ok(()) - }; + with_ondemand_download(|| self.timeline.get_rel_size(tag, self.lsn, false)).await?; // If the relation is empty, create an empty file if nblocks == 0 { - add_file(0, &vec![])?; + let file_name = tag.to_segfile_name(0); + let header = new_tar_header(&file_name, 0)?; + self.ar.append(&header, &mut io::empty()).await?; return Ok(()); } // Add a file for each chunk of blocks (aka segment) - let chunks = (0..nblocks).chunks(RELSEG_SIZE as usize); - for (seg, blocks) in chunks.into_iter().enumerate() { + let mut startblk = 0; + let mut seg = 0; + while startblk < nblocks { + let endblk = std::cmp::min(startblk + RELSEG_SIZE, nblocks); + let mut segment_data: Vec = vec![]; - for blknum in blocks { - let img = self - .timeline - .get_rel_page_at_lsn(tag, blknum, self.lsn, false) - .no_ondemand_download()?; + for blknum in startblk..endblk { + let img = with_ondemand_download(|| { + self.timeline + .get_rel_page_at_lsn(tag, blknum, self.lsn, false) + }) + .await?; segment_data.extend_from_slice(&img[..]); } - add_file(seg, &segment_data)?; + let file_name = tag.to_segfile_name(seg as u32); + let header = new_tar_header(&file_name, segment_data.len() as u64)?; + self.ar.append(&header, segment_data.as_slice()).await?; + + seg += 1; + startblk = endblk; } Ok(()) @@ -230,17 
+251,18 @@ where // // Generate SLRU segment files from repository. // - fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> { - let nblocks = with_ondemand_download_sync(|| { - self.timeline.get_slru_segment_size(slru, segno, self.lsn) - })?; + async fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> { + let nblocks = + with_ondemand_download(|| self.timeline.get_slru_segment_size(slru, segno, self.lsn)) + .await?; let mut slru_buf: Vec = Vec::with_capacity(nblocks as usize * BLCKSZ as usize); for blknum in 0..nblocks { - let img = with_ondemand_download_sync(|| { + let img = with_ondemand_download(|| { self.timeline .get_slru_page_at_lsn(slru, segno, blknum, self.lsn) - })?; + }) + .await?; if slru == SlruKind::Clog { ensure!(img.len() == BLCKSZ as usize || img.len() == BLCKSZ as usize + 8); @@ -253,7 +275,7 @@ where let segname = format!("{}/{:>04X}", slru.to_str(), segno); let header = new_tar_header(&segname, slru_buf.len() as u64)?; - self.ar.append(&header, slru_buf.as_slice())?; + self.ar.append(&header, slru_buf.as_slice()).await?; trace!("Added to basebackup slru {} relsize {}", segname, nblocks); Ok(()) @@ -265,16 +287,16 @@ where // Each directory contains a PG_VERSION file, and the default database // directories also contain pg_filenode.map files. 
// - fn add_dbdir( + async fn add_dbdir( &mut self, spcnode: u32, dbnode: u32, has_relmap_file: bool, ) -> anyhow::Result<()> { let relmap_img = if has_relmap_file { - let img = with_ondemand_download_sync(|| { - self.timeline.get_relmap_file(spcnode, dbnode, self.lsn) - })?; + let img = + with_ondemand_download(|| self.timeline.get_relmap_file(spcnode, dbnode, self.lsn)) + .await?; ensure!(img.len() == 512); Some(img) } else { @@ -284,14 +306,14 @@ where if spcnode == GLOBALTABLESPACE_OID { let pg_version_str = self.timeline.pg_version.to_string(); let header = new_tar_header("PG_VERSION", pg_version_str.len() as u64)?; - self.ar.append(&header, pg_version_str.as_bytes())?; + self.ar.append(&header, pg_version_str.as_bytes()).await?; info!("timeline.pg_version {}", self.timeline.pg_version); if let Some(img) = relmap_img { // filenode map for global tablespace let header = new_tar_header("global/pg_filenode.map", img.len() as u64)?; - self.ar.append(&header, &img[..])?; + self.ar.append(&header, &img[..]).await?; } else { warn!("global/pg_filenode.map is missing"); } @@ -307,10 +329,8 @@ where // XLOG_TBLSPC_DROP records. But we probably should just // throw an error on CREATE TABLESPACE in the first place. if !has_relmap_file - && self - .timeline - .list_rels(spcnode, dbnode, self.lsn) - .no_ondemand_download()? + && with_ondemand_download(|| self.timeline.list_rels(spcnode, dbnode, self.lsn)) + .await? 
.is_empty() { return Ok(()); @@ -321,18 +341,18 @@ where // Append dir path for each database let path = format!("base/{}", dbnode); let header = new_tar_header_dir(&path)?; - self.ar.append(&header, &mut io::empty())?; + self.ar.append(&header, &mut io::empty()).await?; if let Some(img) = relmap_img { let dst_path = format!("base/{}/PG_VERSION", dbnode); let pg_version_str = self.timeline.pg_version.to_string(); let header = new_tar_header(&dst_path, pg_version_str.len() as u64)?; - self.ar.append(&header, pg_version_str.as_bytes())?; + self.ar.append(&header, pg_version_str.as_bytes()).await?; let relmap_path = format!("base/{}/pg_filenode.map", dbnode); let header = new_tar_header(&relmap_path, img.len() as u64)?; - self.ar.append(&header, &img[..])?; + self.ar.append(&header, &img[..]).await?; } }; Ok(()) @@ -341,8 +361,8 @@ where // // Extract twophase state files // - fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> { - let img = with_ondemand_download_sync(|| self.timeline.get_twophase_file(xid, self.lsn))?; + async fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> { + let img = with_ondemand_download(|| self.timeline.get_twophase_file(xid, self.lsn)).await?; let mut buf = BytesMut::new(); buf.extend_from_slice(&img[..]); @@ -350,7 +370,7 @@ where buf.put_u32_le(crc); let path = format!("pg_twophase/{:>08X}", xid); let header = new_tar_header(&path, buf.len() as u64)?; - self.ar.append(&header, &buf[..])?; + self.ar.append(&header, &buf[..]).await?; Ok(()) } @@ -359,7 +379,7 @@ where // Add generated pg_control file and bootstrap WAL segment. // Also send zenith.signal file with extra bootstrap data. 
// - fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> { + async fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> { // add zenith.signal file let mut zenith_signal = String::new(); if self.prev_record_lsn == Lsn(0) { @@ -371,17 +391,19 @@ where } else { write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn)?; } - self.ar.append( - &new_tar_header("zenith.signal", zenith_signal.len() as u64)?, - zenith_signal.as_bytes(), - )?; + self.ar + .append( + &new_tar_header("zenith.signal", zenith_signal.len() as u64)?, + zenith_signal.as_bytes(), + ) + .await?; - let checkpoint_bytes = - with_ondemand_download_sync(|| self.timeline.get_checkpoint(self.lsn)) - .context("failed to get checkpoint bytes")?; - let pg_control_bytes = - with_ondemand_download_sync(|| self.timeline.get_control_file(self.lsn)) - .context("failed get control bytes")?; + let checkpoint_bytes = with_ondemand_download(|| self.timeline.get_checkpoint(self.lsn)) + .await + .context("failed to get checkpoint bytes")?; + let pg_control_bytes = with_ondemand_download(|| self.timeline.get_control_file(self.lsn)) + .await + .context("failed get control bytes")?; let (pg_control_bytes, system_identifier) = postgres_ffi::generate_pg_control( &pg_control_bytes, @@ -392,7 +414,7 @@ where //send pg_control let header = new_tar_header("global/pg_control", pg_control_bytes.len() as u64)?; - self.ar.append(&header, &pg_control_bytes[..])?; + self.ar.append(&header, &pg_control_bytes[..]).await?; //send wal segment let segno = self.lsn.segment_number(WAL_SEGMENT_SIZE); @@ -404,24 +426,11 @@ where postgres_ffi::generate_wal_segment(segno, system_identifier, self.timeline.pg_version) .map_err(|e| anyhow!(e).context("Failed generating wal segment"))?; ensure!(wal_seg.len() == WAL_SEGMENT_SIZE); - self.ar.append(&header, &wal_seg[..])?; + self.ar.append(&header, &wal_seg[..]).await?; Ok(()) } } -impl<'a, W> Drop for Basebackup<'a, W> -where - W: Write, -{ - /// If the basebackup was not finished, prevent 
the Archive::drop() from - /// writing the end-of-archive marker. - fn drop(&mut self) { - if !self.finished { - self.ar.get_mut().abort(); - } - } -} - // // Create new tarball entry header // @@ -457,57 +466,3 @@ fn new_tar_header_dir(path: &str) -> anyhow::Result
{ header.set_cksum(); Ok(header) } - -/// A wrapper that passes through all data to the underlying Write, -/// until abort() is called. -/// -/// tar::Builder has an annoying habit of finishing the archive with -/// a valid tar end-of-archive marker (two 512-byte sectors of zeros), -/// even if an error occurs and we don't finish building the archive. -/// We'd rather abort writing the tarball immediately than construct -/// a seemingly valid but incomplete archive. This wrapper allows us -/// to swallow the end-of-archive marker that Builder::drop() emits, -/// without writing it to the underlying sink. -/// -struct AbortableWrite { - w: W, - aborted: bool, -} - -impl AbortableWrite { - pub fn new(w: W) -> Self { - AbortableWrite { w, aborted: false } - } - - pub fn abort(&mut self) { - self.aborted = true; - } -} - -impl Write for AbortableWrite -where - W: Write, -{ - fn write(&mut self, data: &[u8]) -> io::Result { - if self.aborted { - Ok(data.len()) - } else { - self.w.write(data) - } - } - fn flush(&mut self) -> io::Result<()> { - if self.aborted { - Ok(()) - } else { - self.w.flush() - } - } -} - -fn with_ondemand_download_sync(f: F) -> anyhow::Result -where - F: Send + Fn() -> PageReconstructResult, - T: Send, -{ - task_mgr::COMPUTE_REQUEST_RUNTIME.block_on(with_ondemand_download(f)) -} diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index b3d9b0f809..18ec1ac68b 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -13,7 +13,7 @@ use tracing::*; use metrics::set_build_info_metric; use pageserver::{ config::{defaults::*, PageServerConf}, - http, page_cache, page_service, profiling, task_mgr, + http, page_cache, page_service, task_mgr, task_mgr::TaskKind, task_mgr::{ BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME, @@ -40,8 +40,6 @@ const FEATURES: &[&str] = &[ "testing", #[cfg(feature = "fail/failpoints")] "fail/failpoints", - #[cfg(feature = 
"profiling")] - "profiling", ]; fn version() -> String { @@ -247,15 +245,12 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> { // Install signal handlers let signals = signals::install_shutdown_handlers()?; - // Start profiler (if enabled) - let profiler_guard = profiling::init_profiler(conf); - // Launch broker client WALRECEIVER_RUNTIME.block_on(pageserver::walreceiver::init_broker_client(conf))?; // Initialize authentication for incoming connections let auth = match &conf.auth_type { - AuthType::Trust | AuthType::MD5 => None, + AuthType::Trust => None, AuthType::NeonJWT => { // unwrap is ok because check is performed when creating config, so path is set and file exists let key_path = conf.auth_validation_public_key_path.as_ref().unwrap(); @@ -372,7 +367,6 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> { "Got {}. Terminating in immediate shutdown mode", signal.name() ); - profiling::exit_profiler(conf, &profiler_guard); std::process::exit(111); } @@ -381,7 +375,6 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> { "Got {}. Terminating gracefully in fast shutdown mode", signal.name() ); - profiling::exit_profiler(conf, &profiler_guard); BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(0)); unreachable!() } diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index deb79531a4..7b99d98581 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -138,7 +138,6 @@ pub struct PageServerConf { pub auth_validation_public_key_path: Option, pub remote_storage_config: Option, - pub profiling: ProfilingConfig, pub default_tenant_conf: TenantConf, /// Storage broker endpoints to connect to. @@ -165,25 +164,6 @@ pub struct PageServerConf { /// startup code to the connection code through a dozen layers. 
pub static SAFEKEEPER_AUTH_TOKEN: OnceCell> = OnceCell::new(); -#[derive(Debug, Clone, PartialEq, Eq)] -pub enum ProfilingConfig { - Disabled, - PageRequests, -} - -impl FromStr for ProfilingConfig { - type Err = anyhow::Error; - - fn from_str(s: &str) -> Result { - let result = match s { - "disabled" => ProfilingConfig::Disabled, - "page_requests" => ProfilingConfig::PageRequests, - _ => bail!("invalid value \"{s}\" for profiling option, valid values are \"disabled\" and \"page_requests\""), - }; - Ok(result) - } -} - // use dedicated enum for builder to better indicate the intention // and avoid possible confusion with nested options pub enum BuilderValue { @@ -226,7 +206,6 @@ struct PageServerConfigBuilder { id: BuilderValue, - profiling: BuilderValue, broker_endpoint: BuilderValue, broker_keepalive_interval: BuilderValue, @@ -262,7 +241,6 @@ impl Default for PageServerConfigBuilder { auth_validation_public_key_path: Set(None), remote_storage_config: Set(None), id: NotSet, - profiling: Set(ProfilingConfig::Disabled), broker_endpoint: Set(storage_broker::DEFAULT_ENDPOINT .parse() .expect("failed to parse default broker endpoint")), @@ -348,10 +326,6 @@ impl PageServerConfigBuilder { self.id = BuilderValue::Set(node_id) } - pub fn profiling(&mut self, profiling: ProfilingConfig) { - self.profiling = BuilderValue::Set(profiling) - } - pub fn log_format(&mut self, log_format: LogFormat) { self.log_format = BuilderValue::Set(log_format) } @@ -405,7 +379,6 @@ impl PageServerConfigBuilder { .remote_storage_config .ok_or(anyhow!("missing remote_storage_config"))?, id: self.id.ok_or(anyhow!("missing id"))?, - profiling: self.profiling.ok_or(anyhow!("missing profiling"))?, // TenantConf is handled separately default_tenant_conf: TenantConf::default(), broker_endpoint: self @@ -588,7 +561,6 @@ impl PageServerConf { t_conf = Self::parse_toml_tenant_conf(item)?; } "id" => builder.id(NodeId(parse_toml_u64(key, item)?)), - "profiling" => 
builder.profiling(parse_toml_from_str(key, item)?), "broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?), "broker_keepalive_interval" => builder.broker_keepalive_interval(parse_toml_duration(key, item)?), "log_format" => builder.log_format( @@ -722,7 +694,6 @@ impl PageServerConf { auth_type: AuthType::Trust, auth_validation_public_key_path: None, remote_storage_config: None, - profiling: ProfilingConfig::Disabled, default_tenant_conf: TenantConf::default(), broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(), broker_keepalive_interval: Duration::from_secs(5000), @@ -898,7 +869,6 @@ log_format = 'json' auth_type: AuthType::Trust, auth_validation_public_key_path: None, remote_storage_config: None, - profiling: ProfilingConfig::Disabled, default_tenant_conf: TenantConf::default(), broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(), broker_keepalive_interval: humantime::parse_duration( @@ -949,7 +919,6 @@ log_format = 'json' auth_type: AuthType::Trust, auth_validation_public_key_path: None, remote_storage_config: None, - profiling: ProfilingConfig::Disabled, default_tenant_conf: TenantConf::default(), broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(), broker_keepalive_interval: Duration::from_secs(5), diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 4f4c397abe..1c5eacd362 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -738,17 +738,17 @@ async fn timeline_compact_handler(request: Request) -> Result = result_receiver + .await + .context("receive compaction result") + .map_err(ApiError::InternalServerError)?; + result.map_err(ApiError::InternalServerError)?; + json_response(StatusCode::OK, ()) } diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index 588b92c13f..ca1514dd00 100644 --- a/pageserver/src/import_datadir.rs +++ 
b/pageserver/src/import_datadir.rs @@ -2,12 +2,13 @@ //! Import data and WAL from a PostgreSQL data directory and WAL segments into //! a neon Timeline. //! -use std::fs::File; -use std::io::{Read, Seek, SeekFrom}; use std::path::{Path, PathBuf}; use anyhow::{bail, ensure, Context, Result}; use bytes::Bytes; +use futures::StreamExt; +use tokio::io::{AsyncRead, AsyncReadExt}; +use tokio_tar::Archive; use tracing::*; use walkdir::WalkDir; @@ -42,7 +43,7 @@ pub fn get_lsn_from_controlfile(path: &Path) -> Result { /// This is currently only used to import a cluster freshly created by initdb. /// The code that deals with the checkpoint would not work right if the /// cluster was not shut down cleanly. -pub fn import_timeline_from_postgres_datadir( +pub async fn import_timeline_from_postgres_datadir( tline: &Timeline, pgdata_path: &Path, pgdata_lsn: Lsn, @@ -65,9 +66,11 @@ pub fn import_timeline_from_postgres_datadir( let absolute_path = entry.path(); let relative_path = absolute_path.strip_prefix(pgdata_path)?; - let file = File::open(absolute_path)?; + let mut file = tokio::fs::File::open(absolute_path).await?; let len = metadata.len() as usize; - if let Some(control_file) = import_file(&mut modification, relative_path, file, len)? { + if let Some(control_file) = + import_file(&mut modification, relative_path, &mut file, len).await? + { pg_control = Some(control_file); } modification.flush()?; @@ -96,18 +99,19 @@ pub fn import_timeline_from_postgres_datadir( tline, Lsn(pg_control.checkPointCopy.redo), pgdata_lsn, - )?; + ) + .await?; Ok(()) } // subroutine of import_timeline_from_postgres_datadir(), to load one relation file. -fn import_rel( - modification: &mut DatadirModification, +async fn import_rel( + modification: &mut DatadirModification<'_>, path: &Path, spcoid: Oid, dboid: Oid, - mut reader: Reader, + reader: &mut (impl AsyncRead + Send + Sync + Unpin), len: usize, ) -> anyhow::Result<()> { // Does it look like a relation file? 
@@ -148,7 +152,7 @@ fn import_rel( } loop { - let r = reader.read_exact(&mut buf); + let r = reader.read_exact(&mut buf).await; match r { Ok(_) => { modification.put_rel_page_image(rel, blknum, Bytes::copy_from_slice(&buf))?; @@ -181,11 +185,11 @@ fn import_rel( /// Import an SLRU segment file /// -fn import_slru( - modification: &mut DatadirModification, +async fn import_slru( + modification: &mut DatadirModification<'_>, slru: SlruKind, path: &Path, - mut reader: Reader, + reader: &mut (impl AsyncRead + Send + Sync + Unpin), len: usize, ) -> anyhow::Result<()> { info!("importing slru file {path:?}"); @@ -206,7 +210,7 @@ fn import_slru( let mut rpageno = 0; loop { - let r = reader.read_exact(&mut buf); + let r = reader.read_exact(&mut buf).await; match r { Ok(_) => { modification.put_slru_page_image( @@ -237,19 +241,20 @@ fn import_slru( /// Scan PostgreSQL WAL files in given directory and load all records between /// 'startpoint' and 'endpoint' into the repository. -fn import_wal( +async fn import_wal( walpath: &Path, tline: &Timeline, startpoint: Lsn, endpoint: Lsn, ) -> anyhow::Result<()> { + use std::io::Read; let mut waldecoder = WalStreamDecoder::new(startpoint, tline.pg_version); let mut segno = startpoint.segment_number(WAL_SEGMENT_SIZE); let mut offset = startpoint.segment_offset(WAL_SEGMENT_SIZE); let mut last_lsn = startpoint; - let mut walingest = WalIngest::new(tline, startpoint).no_ondemand_download()?; + let mut walingest = WalIngest::new(tline, startpoint).await?; while last_lsn <= endpoint { // FIXME: assume postgresql tli 1 for now @@ -265,10 +270,11 @@ fn import_wal( } // Slurp the WAL file - let mut file = File::open(&path)?; + let mut file = std::fs::File::open(&path)?; if offset > 0 { - file.seek(SeekFrom::Start(offset as u64))?; + use std::io::Seek; + file.seek(std::io::SeekFrom::Start(offset as u64))?; } let nread = file.read_to_end(&mut buf)?; @@ -286,7 +292,7 @@ fn import_wal( if let Some((lsn, recdata)) = waldecoder.poll_decode()? 
{ walingest .ingest_record(recdata, lsn, &mut modification, &mut decoded) - .no_ondemand_download()?; + .await?; last_lsn = lsn; nrecords += 1; @@ -310,9 +316,9 @@ fn import_wal( Ok(()) } -pub fn import_basebackup_from_tar( +pub async fn import_basebackup_from_tar( tline: &Timeline, - reader: Reader, + reader: &mut (impl AsyncRead + Send + Sync + Unpin), base_lsn: Lsn, ) -> Result<()> { info!("importing base at {base_lsn}"); @@ -322,21 +328,24 @@ pub fn import_basebackup_from_tar( let mut pg_control: Option = None; // Import base - for base_tar_entry in tar::Archive::new(reader).entries()? { - let entry = base_tar_entry?; + let mut entries = Archive::new(reader).entries()?; + while let Some(base_tar_entry) = entries.next().await { + let mut entry = base_tar_entry?; let header = entry.header(); let len = header.entry_size()? as usize; let file_path = header.path()?.into_owned(); match header.entry_type() { - tar::EntryType::Regular => { - if let Some(res) = import_file(&mut modification, file_path.as_ref(), entry, len)? { + tokio_tar::EntryType::Regular => { + if let Some(res) = + import_file(&mut modification, file_path.as_ref(), &mut entry, len).await? + { // We found the pg_control file. 
pg_control = Some(res); } modification.flush()?; } - tar::EntryType::Directory => { + tokio_tar::EntryType::Directory => { debug!("directory {:?}", file_path); } _ => { @@ -356,9 +365,9 @@ pub fn import_basebackup_from_tar( Ok(()) } -pub fn import_wal_from_tar( +pub async fn import_wal_from_tar( tline: &Timeline, - reader: Reader, + reader: &mut (impl AsyncRead + Send + Sync + Unpin), start_lsn: Lsn, end_lsn: Lsn, ) -> Result<()> { @@ -367,20 +376,23 @@ pub fn import_wal_from_tar( let mut segno = start_lsn.segment_number(WAL_SEGMENT_SIZE); let mut offset = start_lsn.segment_offset(WAL_SEGMENT_SIZE); let mut last_lsn = start_lsn; - let mut walingest = WalIngest::new(tline, start_lsn).no_ondemand_download()?; + let mut walingest = WalIngest::new(tline, start_lsn).await?; // Ingest wal until end_lsn info!("importing wal until {}", end_lsn); - let mut pg_wal_tar = tar::Archive::new(reader); - let mut pg_wal_entries_iter = pg_wal_tar.entries()?; + let mut pg_wal_tar = Archive::new(reader); + let mut pg_wal_entries = pg_wal_tar.entries()?; while last_lsn <= end_lsn { let bytes = { - let entry = pg_wal_entries_iter.next().expect("expected more wal")?; + let mut entry = pg_wal_entries + .next() + .await + .ok_or_else(|| anyhow::anyhow!("expected more wal"))??; let header = entry.header(); let file_path = header.path()?.into_owned(); match header.entry_type() { - tar::EntryType::Regular => { + tokio_tar::EntryType::Regular => { // FIXME: assume postgresql tli 1 for now let expected_filename = XLogFileName(1, segno, WAL_SEGMENT_SIZE); let file_name = file_path @@ -390,9 +402,9 @@ pub fn import_wal_from_tar( ensure!(expected_filename == file_name); debug!("processing wal file {:?}", file_path); - read_all_bytes(entry)? + read_all_bytes(&mut entry).await? 
} - tar::EntryType::Directory => { + tokio_tar::EntryType::Directory => { debug!("directory {:?}", file_path); continue; } @@ -414,7 +426,7 @@ pub fn import_wal_from_tar( if let Some((lsn, recdata)) = waldecoder.poll_decode()? { walingest .ingest_record(recdata, lsn, &mut modification, &mut decoded) - .no_ondemand_download()?; + .await?; last_lsn = lsn; debug!("imported record at {} (end {})", lsn, end_lsn); @@ -433,7 +445,7 @@ pub fn import_wal_from_tar( } // Log any extra unused files - for e in &mut pg_wal_entries_iter { + while let Some(e) = pg_wal_entries.next().await { let entry = e?; let header = entry.header(); let file_path = header.path()?.into_owned(); @@ -443,10 +455,10 @@ pub fn import_wal_from_tar( Ok(()) } -fn import_file( - modification: &mut DatadirModification, +async fn import_file( + modification: &mut DatadirModification<'_>, file_path: &Path, - reader: Reader, + reader: &mut (impl AsyncRead + Send + Sync + Unpin), len: usize, ) -> Result> { let file_name = match file_path.file_name() { @@ -466,7 +478,7 @@ fn import_file( match file_name.as_ref() { "pg_control" => { - let bytes = read_all_bytes(reader)?; + let bytes = read_all_bytes(reader).await?; // Extract the checkpoint record and import it separately. 
let pg_control = ControlFileData::decode(&bytes[..])?; @@ -479,7 +491,7 @@ fn import_file( return Ok(Some(pg_control)); } "pg_filenode.map" => { - let bytes = read_all_bytes(reader)?; + let bytes = read_all_bytes(reader).await?; modification.put_relmap_file(spcnode, dbnode, bytes)?; debug!("imported relmap file") } @@ -487,7 +499,7 @@ fn import_file( debug!("ignored PG_VERSION file"); } _ => { - import_rel(modification, file_path, spcnode, dbnode, reader, len)?; + import_rel(modification, file_path, spcnode, dbnode, reader, len).await?; debug!("imported rel creation"); } } @@ -502,7 +514,7 @@ fn import_file( match file_name.as_ref() { "pg_filenode.map" => { - let bytes = read_all_bytes(reader)?; + let bytes = read_all_bytes(reader).await?; modification.put_relmap_file(spcnode, dbnode, bytes)?; debug!("imported relmap file") } @@ -510,36 +522,36 @@ fn import_file( debug!("ignored PG_VERSION file"); } _ => { - import_rel(modification, file_path, spcnode, dbnode, reader, len)?; + import_rel(modification, file_path, spcnode, dbnode, reader, len).await?; debug!("imported rel creation"); } } } else if file_path.starts_with("pg_xact") { let slru = SlruKind::Clog; - import_slru(modification, slru, file_path, reader, len)?; + import_slru(modification, slru, file_path, reader, len).await?; debug!("imported clog slru"); } else if file_path.starts_with("pg_multixact/offsets") { let slru = SlruKind::MultiXactOffsets; - import_slru(modification, slru, file_path, reader, len)?; + import_slru(modification, slru, file_path, reader, len).await?; debug!("imported multixact offsets slru"); } else if file_path.starts_with("pg_multixact/members") { let slru = SlruKind::MultiXactMembers; - import_slru(modification, slru, file_path, reader, len)?; + import_slru(modification, slru, file_path, reader, len).await?; debug!("imported multixact members slru"); } else if file_path.starts_with("pg_twophase") { let xid = u32::from_str_radix(file_name.as_ref(), 16)?; - let bytes = 
read_all_bytes(reader)?; + let bytes = read_all_bytes(reader).await?; modification.put_twophase_file(xid, Bytes::copy_from_slice(&bytes[..]))?; debug!("imported twophase file"); } else if file_path.starts_with("pg_wal") { debug!("found wal file in base section. ignore it"); } else if file_path.starts_with("zenith.signal") { // Parse zenith signal file to set correct previous LSN - let bytes = read_all_bytes(reader)?; + let bytes = read_all_bytes(reader).await?; // zenith.signal format is "PREV LSN: prev_lsn" // TODO write serialization and deserialization in the same place. let zenith_signal = std::str::from_utf8(&bytes)?.trim(); @@ -576,8 +588,8 @@ fn import_file( Ok(None) } -fn read_all_bytes(mut reader: Reader) -> Result { +async fn read_all_bytes(reader: &mut (impl AsyncRead + Send + Sync + Unpin)) -> Result { let mut buf: Vec = vec![]; - reader.read_to_end(&mut buf)?; + reader.read_to_end(&mut buf).await?; Ok(Bytes::copy_from_slice(&buf[..])) } diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 2f78c199b9..91cde477ad 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -9,7 +9,6 @@ pub(crate) mod metrics; pub mod page_cache; pub mod page_service; pub mod pgdatadir_mapping; -pub mod profiling; pub mod repository; pub mod task_mgr; pub mod tenant; diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 205ee0ffad..b61e64048b 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -209,15 +209,34 @@ pub static NUM_ONDISK_LAYERS: Lazy = Lazy::new(|| { // remote storage metrics -static REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS: Lazy = Lazy::new(|| { +/// NB: increment _after_ recording the current value into [`REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST`]. 
+static REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE: Lazy = Lazy::new(|| { register_int_gauge_vec!( - "pageserver_remote_upload_queue_unfinished_tasks", - "Number of tasks in the upload queue that are not finished yet.", + "pageserver_remote_timeline_client_calls_unfinished", + "Number of ongoing calls to remote timeline client. \ + Used to populate pageserver_remote_timeline_client_calls_started. \ + This metric is not useful for sampling from Prometheus, but useful in tests.", &["tenant_id", "timeline_id", "file_kind", "op_kind"], ) .expect("failed to define a metric") }); +static REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST: Lazy = Lazy::new(|| { + register_histogram_vec!( + "pageserver_remote_timeline_client_calls_started", + "When calling a remote timeline client method, we record the current value \ + of the calls_unfinished gauge in this histogram. Plot the histogram \ + over time in a heatmap to visualize how many operations were ongoing \ + at a given instant. It gives you a better idea of the queue depth \ + than plotting the gauge directly, since operations may complete faster \ + than the sampling interval.", + &["tenant_id", "timeline_id", "file_kind", "op_kind"], + // The calls_unfinished gauge is an integer gauge, hence we have integer buckets. + vec![0.0, 1.0, 2.0, 4.0, 6.0, 8.0, 10.0, 15.0, 20.0, 40.0, 60.0, 80.0, 100.0, 500.0], + ) + .expect("failed to define a metric") +}); + #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum RemoteOpKind { Upload, @@ -248,15 +267,12 @@ impl RemoteOpFileKind { } } -pub static REMOTE_OPERATION_KINDS: &[&str] = &["upload", "download", "delete"]; -pub static REMOTE_OPERATION_FILE_KINDS: &[&str] = &["layer", "index"]; -pub static REMOTE_OPERATION_STATUSES: &[&str] = &["success", "failure"]; - pub static REMOTE_OPERATION_TIME: Lazy = Lazy::new(|| { register_histogram_vec!( "pageserver_remote_operation_seconds", "Time spent on remote storage operations. 
\ - Grouped by tenant, timeline, operation_kind and status", + Grouped by tenant, timeline, operation_kind and status. \ + Does not account for time spent waiting in remote timeline client's queues.", &["tenant_id", "timeline_id", "file_kind", "op_kind", "status"] ) .expect("failed to define a metric") @@ -475,21 +491,6 @@ impl Drop for TimelineMetrics { for op in SMGR_QUERY_TIME_OPERATIONS { let _ = SMGR_QUERY_TIME.remove_label_values(&[op, tenant_id, timeline_id]); } - - let _ = REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS.remove_label_values(&[tenant_id, timeline_id]); - for file_kind in REMOTE_OPERATION_FILE_KINDS { - for op in REMOTE_OPERATION_KINDS { - for status in REMOTE_OPERATION_STATUSES { - let _ = REMOTE_OPERATION_TIME.remove_label_values(&[ - tenant_id, - timeline_id, - file_kind, - op, - status, - ]); - } - } - } } } @@ -510,7 +511,8 @@ pub struct RemoteTimelineClientMetrics { timeline_id: String, remote_physical_size_gauge: Mutex>, remote_operation_time: Mutex>, - unfinished_tasks: Mutex>, + calls_unfinished_gauge: Mutex>, + calls_started_hist: Mutex>, } impl RemoteTimelineClientMetrics { @@ -519,7 +521,8 @@ impl RemoteTimelineClientMetrics { tenant_id: tenant_id.to_string(), timeline_id: timeline_id.to_string(), remote_operation_time: Mutex::new(HashMap::default()), - unfinished_tasks: Mutex::new(HashMap::default()), + calls_unfinished_gauge: Mutex::new(HashMap::default()), + calls_started_hist: Mutex::new(HashMap::default()), remote_physical_size_gauge: Mutex::new(None), } } @@ -558,16 +561,37 @@ impl RemoteTimelineClientMetrics { }); metric.clone() } - pub fn unfinished_tasks( + fn calls_unfinished_gauge( &self, file_kind: &RemoteOpFileKind, op_kind: &RemoteOpKind, ) -> IntGauge { // XXX would be nice to have an upgradable RwLock - let mut guard = self.unfinished_tasks.lock().unwrap(); + let mut guard = self.calls_unfinished_gauge.lock().unwrap(); let key = (file_kind.as_str(), op_kind.as_str()); let metric = guard.entry(key).or_insert_with(move || { - 
REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS + REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE + .get_metric_with_label_values(&[ + &self.tenant_id.to_string(), + &self.timeline_id.to_string(), + key.0, + key.1, + ]) + .unwrap() + }); + metric.clone() + } + + fn calls_started_hist( + &self, + file_kind: &RemoteOpFileKind, + op_kind: &RemoteOpKind, + ) -> Histogram { + // XXX would be nice to have an upgradable RwLock + let mut guard = self.calls_started_hist.lock().unwrap(); + let key = (file_kind.as_str(), op_kind.as_str()); + let metric = guard.entry(key).or_insert_with(move || { + REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST .get_metric_with_label_values(&[ &self.tenant_id.to_string(), &self.timeline_id.to_string(), @@ -580,6 +604,58 @@ impl RemoteTimelineClientMetrics { } } +/// See [`RemoteTimelineClientMetrics::call_begin`]. +#[must_use] +pub(crate) struct RemoteTimelineClientCallMetricGuard(Option); + +impl RemoteTimelineClientCallMetricGuard { + /// Consume this guard object without decrementing the metric. + /// The caller vouches to do this manually, so that the prior increment of the gauge will cancel out. + pub fn will_decrement_manually(mut self) { + self.0 = None; // prevent drop() from decrementing + } +} + +impl Drop for RemoteTimelineClientCallMetricGuard { + fn drop(&mut self) { + if let RemoteTimelineClientCallMetricGuard(Some(guard)) = self { + guard.dec(); + } + } +} + +impl RemoteTimelineClientMetrics { + /// Increment the metrics that track ongoing calls to the remote timeline client instance. + /// + /// Drop the returned guard object once the operation is finished to decrement the values. + /// Or, use [`RemoteTimelineClientCallMetricGuard::will_decrement_manually`] and [`call_end`] if that + /// is more suitable. + /// Never do both. 
+ pub(crate) fn call_begin( + &self, + file_kind: &RemoteOpFileKind, + op_kind: &RemoteOpKind, + ) -> RemoteTimelineClientCallMetricGuard { + let unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind); + self.calls_started_hist(file_kind, op_kind) + .observe(unfinished_metric.get() as f64); + unfinished_metric.inc(); + RemoteTimelineClientCallMetricGuard(Some(unfinished_metric)) + } + + /// Manually decrement the metric instead of using the guard object. + /// Using the guard object is generally preferable. + /// See [`call_begin`] for more context. + pub(crate) fn call_end(&self, file_kind: &RemoteOpFileKind, op_kind: &RemoteOpKind) { + let unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind); + debug_assert!( + unfinished_metric.get() > 0, + "begin and end should cancel out" + ); + unfinished_metric.dec(); + } +} + impl Drop for RemoteTimelineClientMetrics { fn drop(&mut self) { let RemoteTimelineClientMetrics { @@ -587,13 +663,22 @@ impl Drop for RemoteTimelineClientMetrics { timeline_id, remote_physical_size_gauge, remote_operation_time, - unfinished_tasks, + calls_unfinished_gauge, + calls_started_hist, } = self; for ((a, b, c), _) in remote_operation_time.get_mut().unwrap().drain() { let _ = REMOTE_OPERATION_TIME.remove_label_values(&[tenant_id, timeline_id, a, b, c]); } - for ((a, b), _) in unfinished_tasks.get_mut().unwrap().drain() { - let _ = REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS.remove_label_values(&[ + for ((a, b), _) in calls_unfinished_gauge.get_mut().unwrap().drain() { + let _ = REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE.remove_label_values(&[ + tenant_id, + timeline_id, + a, + b, + ]); + } + for ((a, b), _) in calls_started_hist.get_mut().unwrap().drain() { + let _ = REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST.remove_label_values(&[ tenant_id, timeline_id, a, diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index b84b2694f4..b266a07337 100644 --- a/pageserver/src/page_service.rs +++ 
b/pageserver/src/page_service.rs @@ -9,7 +9,7 @@ // custom protocol. // -use anyhow::{bail, ensure, Context, Result}; +use anyhow::Context; use bytes::Buf; use bytes::Bytes; use futures::{Stream, StreamExt}; @@ -19,6 +19,8 @@ use pageserver_api::models::{ PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse, PagestreamNblocksRequest, PagestreamNblocksResponse, }; +use pq_proto::ConnectionError; +use pq_proto::FeStartupPacket; use pq_proto::{BeMessage, FeMessage, RowDescriptor}; use std::io; use std::net::TcpListener; @@ -26,11 +28,9 @@ use std::str; use std::str::FromStr; use std::sync::Arc; use std::time::Duration; -use tokio::pin; -use tokio_util::io::StreamReader; -use tokio_util::io::SyncIoBridge; use tracing::*; use utils::id::ConnectionId; +use utils::postgres_backend_async::QueryError; use utils::{ auth::{Claims, JwtAuth, Scope}, id::{TenantId, TimelineId}, @@ -42,10 +42,9 @@ use utils::{ use crate::auth::check_permission; use crate::basebackup; -use crate::config::{PageServerConf, ProfilingConfig}; +use crate::config::PageServerConf; use crate::import_datadir::import_wal_from_tar; use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME}; -use crate::profiling::profpoint_start; use crate::task_mgr; use crate::task_mgr::TaskKind; use crate::tenant::mgr; @@ -64,8 +63,8 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream { // We were requested to shut down. 
let msg = format!("pageserver is shutting down"); - let _ = pgb.write_message(&BeMessage::ErrorResponse(&msg)); - Err(anyhow::anyhow!(msg)) + let _ = pgb.write_message(&BeMessage::ErrorResponse(&msg, None)); + Err(QueryError::Other(anyhow::anyhow!(msg))) } msg = pgb.read_message() => { msg } @@ -78,14 +77,15 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream { break }, FeMessage::Sync => continue, FeMessage::Terminate => { - let msg = format!("client terminated connection with Terminate message during COPY"); - pgb.write_message(&BeMessage::ErrorResponse(&msg))?; + let msg = "client terminated connection with Terminate message during COPY"; + let query_error_error = QueryError::Disconnected(ConnectionError::Socket(io::Error::new(io::ErrorKind::ConnectionReset, msg))); + pgb.write_message(&BeMessage::ErrorResponse(msg, Some(query_error_error.pg_error_code())))?; Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?; break; } m => { - let msg = format!("unexpected message {:?}", m); - pgb.write_message(&BeMessage::ErrorResponse(&msg))?; + let msg = format!("unexpected message {m:?}"); + pgb.write_message(&BeMessage::ErrorResponse(&msg, None))?; Err(io::Error::new(io::ErrorKind::Other, msg))?; break; } @@ -95,12 +95,16 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream { let msg = "client closed connection during COPY"; - pgb.write_message(&BeMessage::ErrorResponse(msg))?; + let query_error_error = QueryError::Disconnected(ConnectionError::Socket(io::Error::new(io::ErrorKind::ConnectionReset, msg))); + pgb.write_message(&BeMessage::ErrorResponse(msg, Some(query_error_error.pg_error_code())))?; pgb.flush().await?; Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?; } - Err(e) => { - Err(io::Error::new(io::ErrorKind::Other, e))?; + Err(QueryError::Disconnected(ConnectionError::Socket(io_error))) => { + Err(io_error)?; + } + Err(other) => { + Err(io::Error::new(io::ErrorKind::Other, other))?; } }; } @@ -198,23 +202,19 @@ async fn 
page_service_conn_main( // we've been requested to shut down Ok(()) } - Err(err) => { - let root_cause_io_err_kind = err - .root_cause() - .downcast_ref::() - .map(|e| e.kind()); - + Err(QueryError::Disconnected(ConnectionError::Socket(io_error))) => { // `ConnectionReset` error happens when the Postgres client closes the connection. // As this disconnection happens quite often and is expected, // we decided to downgrade the logging level to `INFO`. // See: https://github.com/neondatabase/neon/issues/1683. - if root_cause_io_err_kind == Some(io::ErrorKind::ConnectionReset) { + if io_error.kind() == io::ErrorKind::ConnectionReset { info!("Postgres client disconnected"); Ok(()) } else { - Err(err) + Err(io_error).context("Postgres connection error") } } + other => other.context("Postgres query error"), } } @@ -253,7 +253,7 @@ impl PageRequestMetrics { #[derive(Debug)] struct PageServerHandler { - conf: &'static PageServerConf, + _conf: &'static PageServerConf, auth: Option>, claims: Option, } @@ -261,7 +261,7 @@ struct PageServerHandler { impl PageServerHandler { pub fn new(conf: &'static PageServerConf, auth: Option>) -> Self { PageServerHandler { - conf, + _conf: conf, auth, claims: None, } @@ -316,7 +316,7 @@ impl PageServerHandler { Some(FeMessage::CopyData(bytes)) => bytes, Some(FeMessage::Terminate) => break, Some(m) => { - bail!("unexpected message: {m:?} during COPY"); + anyhow::bail!("unexpected message: {m:?} during COPY"); } None => break, // client disconnected }; @@ -373,7 +373,7 @@ impl PageServerHandler { base_lsn: Lsn, _end_lsn: Lsn, pg_version: u32, - ) -> anyhow::Result<()> { + ) -> Result<(), QueryError> { task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); // Create empty timeline info!("creating new timeline"); @@ -395,9 +395,7 @@ impl PageServerHandler { pgb.write_message(&BeMessage::CopyInResponse)?; pgb.flush().await?; - let copyin_stream = copyin_stream(pgb); - pin!(copyin_stream); - + let mut copyin_stream = 
Box::pin(copyin_stream(pgb)); timeline .import_basebackup_from_tar(&mut copyin_stream, base_lsn) .await?; @@ -429,11 +427,16 @@ impl PageServerHandler { timeline_id: TimelineId, start_lsn: Lsn, end_lsn: Lsn, - ) -> anyhow::Result<()> { + ) -> Result<(), QueryError> { task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); let timeline = get_active_timeline_with_timeout(tenant_id, timeline_id).await?; - ensure!(timeline.get_last_record_lsn() == start_lsn); + let last_record_lsn = timeline.get_last_record_lsn(); + if last_record_lsn != start_lsn { + return Err(QueryError::Other( + anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}")) + ); + } // TODO leave clean state on error. For now you can use detach to clean // up broken state from a failed import. @@ -443,8 +446,8 @@ impl PageServerHandler { pgb.write_message(&BeMessage::CopyInResponse)?; pgb.flush().await?; let mut copyin_stream = Box::pin(copyin_stream(pgb)); - let reader = SyncIoBridge::new(StreamReader::new(&mut copyin_stream)); - tokio::task::block_in_place(|| import_wal_from_tar(&timeline, reader, start_lsn, end_lsn))?; + let mut reader = tokio_util::io::StreamReader::new(&mut copyin_stream); + import_wal_from_tar(&timeline, &mut reader, start_lsn, end_lsn).await?; info!("wal import complete"); // Drain the rest of the Copy data @@ -457,7 +460,11 @@ impl PageServerHandler { } // TODO Does it make sense to overshoot? - ensure!(timeline.get_last_record_lsn() >= end_lsn); + if timeline.get_last_record_lsn() < end_lsn { + return Err(QueryError::Other( + anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}")) + ); + } // Flush data to disk, then upload to s3. No need for a forced checkpoint. 
// We only want to persist the data, and it doesn't matter if it's in the @@ -486,7 +493,7 @@ impl PageServerHandler { mut lsn: Lsn, latest: bool, latest_gc_cutoff_lsn: &RcuReadGuard, - ) -> Result { + ) -> anyhow::Result { if latest { // Latest page version was requested. If LSN is given, it is a hint // to the page server that there have been no modifications to the @@ -517,11 +524,11 @@ impl PageServerHandler { } } else { if lsn == Lsn(0) { - bail!("invalid LSN(0) in request"); + anyhow::bail!("invalid LSN(0) in request"); } timeline.wait_lsn(lsn).await?; } - ensure!( + anyhow::ensure!( lsn >= **latest_gc_cutoff_lsn, "tried to request a page version that was garbage collected. requested at {} gc cutoff {}", lsn, **latest_gc_cutoff_lsn @@ -534,7 +541,7 @@ impl PageServerHandler { &self, timeline: &Timeline, req: &PagestreamExistsRequest, - ) -> Result { + ) -> anyhow::Result { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) .await?; @@ -554,7 +561,7 @@ impl PageServerHandler { &self, timeline: &Timeline, req: &PagestreamNblocksRequest, - ) -> Result { + ) -> anyhow::Result { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) .await?; @@ -574,7 +581,7 @@ impl PageServerHandler { &self, timeline: &Timeline, req: &PagestreamDbSizeRequest, - ) -> Result { + ) -> anyhow::Result { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) .await?; @@ -595,7 +602,7 @@ impl PageServerHandler { &self, timeline: &Timeline, req: &PagestreamGetPageRequest, - ) -> Result { + ) -> anyhow::Result { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) .await?; @@ -609,10 +616,6 
@@ impl PageServerHandler { */ let page = crate::tenant::with_ondemand_download(|| { - // FIXME: this profiling now happens at different place than it used to. The - // current profiling is based on a thread-local variable, so it doesn't work - // across awaits - let _profiling_guard = profpoint_start(self.conf, ProfilingConfig::PageRequests); timeline.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest) }) .await?; @@ -649,16 +652,12 @@ impl PageServerHandler { pgb.flush().await?; /* Send a tarball of the latest layer on the timeline */ - let mut writer = CopyDataSink { - pgb, - rt: tokio::runtime::Handle::current(), - }; - tokio::task::block_in_place(|| { - let basebackup = - basebackup::Basebackup::new(&mut writer, &timeline, lsn, prev_lsn, full_backup)?; - tracing::Span::current().record("lsn", basebackup.lsn.to_string().as_str()); - basebackup.send_tarball() - })?; + { + let mut writer = pgb.copyout_writer(); + basebackup::send_basebackup_tarball(&mut writer, &timeline, lsn, prev_lsn, full_backup) + .await?; + } + pgb.write_message(&BeMessage::CopyDone)?; pgb.flush().await?; info!("basebackup complete"); @@ -668,7 +667,7 @@ impl PageServerHandler { // when accessing management api supply None as an argument // when using to authorize tenant pass corresponding tenant id - fn check_permission(&self, tenant_id: Option) -> Result<()> { + fn check_permission(&self, tenant_id: Option) -> anyhow::Result<()> { if self.auth.is_none() { // auth is set to Trust, nothing to check so just return ok return Ok(()); @@ -690,20 +689,19 @@ impl postgres_backend_async::Handler for PageServerHandler { &mut self, _pgb: &mut PostgresBackend, jwt_response: &[u8], - ) -> anyhow::Result<()> { + ) -> Result<(), QueryError> { // this unwrap is never triggered, because check_auth_jwt only called when auth_type is NeonJWT // which requires auth to be present let data = self .auth .as_ref() .unwrap() - .decode(str::from_utf8(jwt_response)?)?; + 
.decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)?; - if matches!(data.claims.scope, Scope::Tenant) { - ensure!( - data.claims.tenant_id.is_some(), + if matches!(data.claims.scope, Scope::Tenant) && data.claims.tenant_id.is_none() { + return Err(QueryError::Other(anyhow::anyhow!( "jwt token scope is Tenant, but tenant id is missing" - ) + ))); } info!( @@ -715,22 +713,33 @@ impl postgres_backend_async::Handler for PageServerHandler { Ok(()) } + fn startup( + &mut self, + _pgb: &mut PostgresBackend, + _sm: &FeStartupPacket, + ) -> Result<(), QueryError> { + Ok(()) + } + async fn process_query( &mut self, pgb: &mut PostgresBackend, query_string: &str, - ) -> anyhow::Result<()> { - debug!("process query {:?}", query_string); + ) -> Result<(), QueryError> { + debug!("process query {query_string:?}"); if query_string.starts_with("pagestream ") { let (_, params_raw) = query_string.split_at("pagestream ".len()); let params = params_raw.split(' ').collect::>(); - ensure!( - params.len() == 2, - "invalid param number for pagestream command" - ); - let tenant_id = TenantId::from_str(params[0])?; - let timeline_id = TimelineId::from_str(params[1])?; + if params.len() != 2 { + return Err(QueryError::Other(anyhow::anyhow!( + "invalid param number for pagestream command" + ))); + } + let tenant_id = TenantId::from_str(params[0]) + .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?; + let timeline_id = TimelineId::from_str(params[1]) + .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?; self.check_permission(Some(tenant_id))?; @@ -740,18 +749,24 @@ impl postgres_backend_async::Handler for PageServerHandler { let (_, params_raw) = query_string.split_at("basebackup ".len()); let params = params_raw.split_whitespace().collect::>(); - ensure!( - params.len() >= 2, - "invalid param number for basebackup command" - ); + if params.len() < 2 { + return Err(QueryError::Other(anyhow::anyhow!( + "invalid param 
number for basebackup command" + ))); + } - let tenant_id = TenantId::from_str(params[0])?; - let timeline_id = TimelineId::from_str(params[1])?; + let tenant_id = TenantId::from_str(params[0]) + .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?; + let timeline_id = TimelineId::from_str(params[1]) + .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?; self.check_permission(Some(tenant_id))?; let lsn = if params.len() == 3 { - Some(Lsn::from_str(params[2])?) + Some( + Lsn::from_str(params[2]) + .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?, + ) } else { None }; @@ -766,13 +781,16 @@ impl postgres_backend_async::Handler for PageServerHandler { let (_, params_raw) = query_string.split_at("get_last_record_rlsn ".len()); let params = params_raw.split_whitespace().collect::>(); - ensure!( - params.len() == 2, - "invalid param number for get_last_record_rlsn command" - ); + if params.len() != 2 { + return Err(QueryError::Other(anyhow::anyhow!( + "invalid param number for get_last_record_rlsn command" + ))); + } - let tenant_id = TenantId::from_str(params[0])?; - let timeline_id = TimelineId::from_str(params[1])?; + let tenant_id = TenantId::from_str(params[0]) + .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?; + let timeline_id = TimelineId::from_str(params[1]) + .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?; self.check_permission(Some(tenant_id))?; let timeline = get_active_timeline_with_timeout(tenant_id, timeline_id).await?; @@ -794,22 +812,31 @@ impl postgres_backend_async::Handler for PageServerHandler { let (_, params_raw) = query_string.split_at("fullbackup ".len()); let params = params_raw.split_whitespace().collect::>(); - ensure!( - params.len() >= 2, - "invalid param number for fullbackup command" - ); + if params.len() < 2 { + return Err(QueryError::Other(anyhow::anyhow!( + "invalid param number for fullbackup command" + ))); + } - 
let tenant_id = TenantId::from_str(params[0])?; - let timeline_id = TimelineId::from_str(params[1])?; + let tenant_id = TenantId::from_str(params[0]) + .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?; + let timeline_id = TimelineId::from_str(params[1]) + .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?; // The caller is responsible for providing correct lsn and prev_lsn. let lsn = if params.len() > 2 { - Some(Lsn::from_str(params[2])?) + Some( + Lsn::from_str(params[2]) + .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?, + ) } else { None }; let prev_lsn = if params.len() > 3 { - Some(Lsn::from_str(params[3])?) + Some( + Lsn::from_str(params[3]) + .with_context(|| format!("Failed to parse Lsn from {}", params[3]))?, + ) } else { None }; @@ -834,12 +861,21 @@ impl postgres_backend_async::Handler for PageServerHandler { // -c "import basebackup $TENANT $TIMELINE $START_LSN $END_LSN $PG_VERSION" let (_, params_raw) = query_string.split_at("import basebackup ".len()); let params = params_raw.split_whitespace().collect::>(); - ensure!(params.len() == 5); - let tenant_id = TenantId::from_str(params[0])?; - let timeline_id = TimelineId::from_str(params[1])?; - let base_lsn = Lsn::from_str(params[2])?; - let end_lsn = Lsn::from_str(params[3])?; - let pg_version = u32::from_str(params[4])?; + if params.len() != 5 { + return Err(QueryError::Other(anyhow::anyhow!( + "invalid param number for import basebackup command" + ))); + } + let tenant_id = TenantId::from_str(params[0]) + .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?; + let timeline_id = TimelineId::from_str(params[1]) + .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?; + let base_lsn = Lsn::from_str(params[2]) + .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?; + let end_lsn = Lsn::from_str(params[3]) + .with_context(|| format!("Failed to parse Lsn from {}", 
params[3]))?; + let pg_version = u32::from_str(params[4]) + .with_context(|| format!("Failed to parse pg_version from {}", params[4]))?; self.check_permission(Some(tenant_id))?; @@ -857,7 +893,10 @@ impl postgres_backend_async::Handler for PageServerHandler { Ok(()) => pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?, Err(e) => { error!("error importing base backup between {base_lsn} and {end_lsn}: {e:?}"); - pgb.write_message(&BeMessage::ErrorResponse(&e.to_string()))? + pgb.write_message(&BeMessage::ErrorResponse( + &e.to_string(), + Some(e.pg_error_code()), + ))? } }; } else if query_string.starts_with("import wal ") { @@ -867,11 +906,19 @@ impl postgres_backend_async::Handler for PageServerHandler { // caller should poll the http api to check when that is done. let (_, params_raw) = query_string.split_at("import wal ".len()); let params = params_raw.split_whitespace().collect::>(); - ensure!(params.len() == 4); - let tenant_id = TenantId::from_str(params[0])?; - let timeline_id = TimelineId::from_str(params[1])?; - let start_lsn = Lsn::from_str(params[2])?; - let end_lsn = Lsn::from_str(params[3])?; + if params.len() != 4 { + return Err(QueryError::Other(anyhow::anyhow!( + "invalid param number for import wal command" + ))); + } + let tenant_id = TenantId::from_str(params[0]) + .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?; + let timeline_id = TimelineId::from_str(params[1]) + .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?; + let start_lsn = Lsn::from_str(params[2]) + .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?; + let end_lsn = Lsn::from_str(params[3]) + .with_context(|| format!("Failed to parse Lsn from {}", params[3]))?; self.check_permission(Some(tenant_id))?; @@ -882,7 +929,10 @@ impl postgres_backend_async::Handler for PageServerHandler { Ok(()) => pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?, Err(e) => { error!("error importing WAL between 
{start_lsn} and {end_lsn}: {e:?}"); - pgb.write_message(&BeMessage::ErrorResponse(&e.to_string()))? + pgb.write_message(&BeMessage::ErrorResponse( + &e.to_string(), + Some(e.pg_error_code()), + ))? } }; } else if query_string.to_ascii_lowercase().starts_with("set ") { @@ -893,8 +943,13 @@ impl postgres_backend_async::Handler for PageServerHandler { // show let (_, params_raw) = query_string.split_at("show ".len()); let params = params_raw.split(' ').collect::>(); - ensure!(params.len() == 1, "invalid param number for config command"); - let tenant_id = TenantId::from_str(params[0])?; + if params.len() != 1 { + return Err(QueryError::Other(anyhow::anyhow!( + "invalid param number for config command" + ))); + } + let tenant_id = TenantId::from_str(params[0]) + .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?; self.check_permission(Some(tenant_id))?; @@ -935,7 +990,9 @@ impl postgres_backend_async::Handler for PageServerHandler { ]))? .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; } else { - bail!("unknown command"); + return Err(QueryError::Other(anyhow::anyhow!( + "unknown command {query_string}" + ))); } Ok(()) @@ -947,7 +1004,7 @@ impl postgres_backend_async::Handler for PageServerHandler { /// If the tenant is Loading, waits for it to become Active, for up to 30 s. That /// ensures that queries don't fail immediately after pageserver startup, because /// all tenants are still loading. 
-async fn get_active_tenant_with_timeout(tenant_id: TenantId) -> Result> { +async fn get_active_tenant_with_timeout(tenant_id: TenantId) -> anyhow::Result> { let tenant = mgr::get_tenant(tenant_id, false).await?; match tokio::time::timeout(Duration::from_secs(30), tenant.wait_to_become_active()).await { Ok(wait_result) => wait_result @@ -961,37 +1018,8 @@ async fn get_active_tenant_with_timeout(tenant_id: TenantId) -> Result Result> { +) -> anyhow::Result> { get_active_tenant_with_timeout(tenant_id) .await .and_then(|tenant| tenant.get_timeline(timeline_id, true)) } - -/// -/// A std::io::Write implementation that wraps all data written to it in CopyData -/// messages. -/// -struct CopyDataSink<'a> { - pgb: &'a mut PostgresBackend, - rt: tokio::runtime::Handle, -} - -impl<'a> io::Write for CopyDataSink<'a> { - fn write(&mut self, data: &[u8]) -> io::Result { - // CopyData - // FIXME: if the input is large, we should split it into multiple messages. - // Not sure what the threshold should be, but the ultimate hard limit is that - // the length cannot exceed u32. - // FIXME: flush isn't really required, but makes it easier - // to view in wireshark - self.pgb.write_message(&BeMessage::CopyData(data))?; - self.rt.block_on(self.pgb.flush())?; - trace!("CopyData sent for {} bytes!", data.len()); - - Ok(data.len()) - } - fn flush(&mut self) -> io::Result<()> { - // no-op - Ok(()) - } -} diff --git a/pageserver/src/profiling.rs b/pageserver/src/profiling.rs deleted file mode 100644 index ad896cfa30..0000000000 --- a/pageserver/src/profiling.rs +++ /dev/null @@ -1,107 +0,0 @@ -//! -//! Support for profiling -//! -//! This relies on a modified version of the 'pprof-rs' crate. That's not very -//! nice, so to avoid a hard dependency on that, this is an optional feature. -//! -use crate::config::{PageServerConf, ProfilingConfig}; - -/// The actual implementation is in the `profiling_impl` submodule. 
If the profiling -/// feature is not enabled, it's just a dummy implementation that panics if you -/// try to enabled profiling in the configuration. -pub use profiling_impl::*; - -#[cfg(feature = "profiling")] -mod profiling_impl { - use super::*; - use pprof; - use std::marker::PhantomData; - - /// Start profiling the current thread. Returns a guard object; - /// the profiling continues until the guard is dropped. - /// - /// Note: profiling is not re-entrant. If you call 'profpoint_start' while - /// profiling is already started, nothing happens, and the profiling will be - /// stopped when either guard object is dropped. - #[inline] - pub fn profpoint_start( - conf: &crate::config::PageServerConf, - point: ProfilingConfig, - ) -> Option { - if conf.profiling == point { - pprof::start_profiling(); - Some(ProfilingGuard(PhantomData)) - } else { - None - } - } - - /// A hack to remove Send and Sync from the ProfilingGuard. Because the - /// profiling is attached to current thread. - //// - /// See comments in https://github.com/rust-lang/rust/issues/68318 - type PhantomUnsend = std::marker::PhantomData<*mut u8>; - - pub struct ProfilingGuard(PhantomUnsend); - - impl Drop for ProfilingGuard { - fn drop(&mut self) { - pprof::stop_profiling(); - } - } - - /// Initialize the profiler. This must be called before any 'profpoint_start' calls. - pub fn init_profiler(conf: &PageServerConf) -> Option { - if conf.profiling != ProfilingConfig::Disabled { - Some(pprof::ProfilerGuardBuilder::default().build().unwrap()) - } else { - None - } - } - - /// Exit the profiler. Writes the flamegraph to current workdir. 
- pub fn exit_profiler(_conf: &PageServerConf, profiler_guard: &Option) { - // Write out the flamegraph - if let Some(profiler_guard) = profiler_guard { - if let Ok(report) = profiler_guard.report().build() { - // this gets written under the workdir - let file = std::fs::File::create("flamegraph.svg").unwrap(); - let mut options = pprof::flamegraph::Options::default(); - options.image_width = Some(2500); - report.flamegraph_with_options(file, &mut options).unwrap(); - } - } - } -} - -/// Dummy implementation when compiling without profiling feature or for non-linux OSes. -#[cfg(not(feature = "profiling"))] -mod profiling_impl { - use super::*; - - pub struct DummyProfilerGuard; - - impl Drop for DummyProfilerGuard { - fn drop(&mut self) { - // do nothing, this exists to calm Clippy down - } - } - - pub fn profpoint_start( - _conf: &PageServerConf, - _point: ProfilingConfig, - ) -> Option { - None - } - - pub fn init_profiler(conf: &PageServerConf) -> Option { - if conf.profiling != ProfilingConfig::Disabled { - // shouldn't happen, we don't allow profiling in the config if the support - // for it is disabled. 
- panic!("profiling enabled but the binary was compiled without profiling support"); - } - None - } - - pub fn exit_profiler(_conf: &PageServerConf, _guard: &Option) {} -} diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index c4ee795b07..269f7354bf 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -13,13 +13,13 @@ use anyhow::{bail, Context}; use bytes::Bytes; +use futures::FutureExt; use futures::Stream; use pageserver_api::models::TimelineState; use remote_storage::DownloadError; use remote_storage::GenericRemoteStorage; use tokio::sync::watch; -use tokio_util::io::StreamReader; -use tokio_util::io::SyncIoBridge; +use tokio::task::JoinSet; use tracing::*; use utils::crashsafe::path_with_suffix_extension; @@ -36,7 +36,6 @@ use std::io::Write; use std::ops::Bound::Included; use std::path::Path; use std::path::PathBuf; -use std::pin::Pin; use std::process::Command; use std::process::Stdio; use std::sync::Arc; @@ -239,21 +238,15 @@ impl UninitializedTimeline<'_> { /// Prepares timeline data by loading it from the basebackup archive. pub async fn import_basebackup_from_tar( self, - mut copyin_stream: &mut Pin<&mut impl Stream>>, + copyin_stream: &mut (impl Stream> + Sync + Send + Unpin), base_lsn: Lsn, ) -> anyhow::Result> { let raw_timeline = self.raw_timeline()?; - // import_basebackup_from_tar() is not async, mainly because the Tar crate - // it uses is not async. 
So we need to jump through some hoops: - // - convert the input from client connection to a synchronous Read - // - use block_in_place() - let reader = SyncIoBridge::new(StreamReader::new(&mut copyin_stream)); - - tokio::task::block_in_place(|| { - import_datadir::import_basebackup_from_tar(raw_timeline, reader, base_lsn) - .context("Failed to import basebackup") - })?; + let mut reader = tokio_util::io::StreamReader::new(copyin_stream); + import_datadir::import_basebackup_from_tar(raw_timeline, &mut reader, base_lsn) + .await + .context("Failed to import basebackup")?; // Flush loop needs to be spawned in order to be able to flush. // We want to run proper checkpoint before we mark timeline as available to outside world @@ -606,7 +599,7 @@ impl Tenant { match tenant_clone.attach().await { Ok(_) => {} Err(e) => { - tenant_clone.set_broken(); + tenant_clone.set_broken(&e.to_string()); error!("error attaching tenant: {:?}", e); } } @@ -651,26 +644,62 @@ impl Tenant { .as_ref() .ok_or_else(|| anyhow::anyhow!("cannot attach without remote storage"))?; - let remote_timelines = remote_timeline_client::list_remote_timelines( + let remote_timeline_ids = remote_timeline_client::list_remote_timelines( remote_storage, self.conf, self.tenant_id, ) .await?; - info!("found {} timelines", remote_timelines.len()); + info!("found {} timelines", remote_timeline_ids.len()); - let mut timeline_ancestors: HashMap = HashMap::new(); - let mut index_parts: HashMap = HashMap::new(); - for (timeline_id, index_part) in remote_timelines { - let remote_metadata = index_part.parse_metadata().with_context(|| { - format!( - "Failed to parse metadata file from remote storage for tenant {} timeline {}", - self.tenant_id, timeline_id - ) - })?; + // Download & parse index parts + let mut part_downloads = JoinSet::new(); + for timeline_id in remote_timeline_ids { + let client = RemoteTimelineClient::new( + remote_storage.clone(), + self.conf, + self.tenant_id, + timeline_id, + ); + 
part_downloads.spawn( + async move { + debug!("starting index part download"); + + let index_part = client + .download_index_file() + .await + .context("download index file")?; + + let remote_metadata = index_part.parse_metadata().context("parse metadata")?; + + debug!("finished index part download"); + + Result::<_, anyhow::Error>::Ok(( + timeline_id, + client, + index_part, + remote_metadata, + )) + } + .map(move |res| { + res.with_context(|| format!("download index part for timeline {timeline_id}")) + }) + .instrument(info_span!("download_index_part", timeline=%timeline_id)), + ); + } + // Wait for all the download tasks to complete & collect results. + let mut remote_clients = HashMap::new(); + let mut index_parts = HashMap::new(); + let mut timeline_ancestors = HashMap::new(); + while let Some(result) = part_downloads.join_next().await { + // NB: we already added timeline_id as context to the error + let result: Result<_, anyhow::Error> = result.context("joinset task join")?; + let (timeline_id, client, index_part, remote_metadata) = result?; + debug!("successfully downloaded index part for timeline {timeline_id}"); timeline_ancestors.insert(timeline_id, remote_metadata); index_parts.insert(timeline_id, index_part); + remote_clients.insert(timeline_id, client); } // For every timeline, download the metadata file, scan the local directory, @@ -683,7 +712,7 @@ impl Tenant { timeline_id, index_parts.remove(&timeline_id).unwrap(), remote_metadata, - remote_storage.clone(), + remote_clients.remove(&timeline_id).unwrap(), ) .await .with_context(|| { @@ -726,22 +755,19 @@ impl Tenant { Ok(size) } - #[instrument(skip(self, index_part, remote_metadata, remote_storage), fields(timeline_id=%timeline_id))] + #[instrument(skip_all, fields(timeline_id=%timeline_id))] async fn load_remote_timeline( &self, timeline_id: TimelineId, index_part: IndexPart, remote_metadata: TimelineMetadata, - remote_storage: GenericRemoteStorage, + remote_client: RemoteTimelineClient, ) -> 
anyhow::Result<()> { info!("downloading index file for timeline {}", timeline_id); tokio::fs::create_dir_all(self.conf.timeline_path(&timeline_id, &self.tenant_id)) .await .context("Failed to create new timeline directory")?; - let remote_client = - RemoteTimelineClient::new(remote_storage, self.conf, self.tenant_id, timeline_id)?; - let ancestor = if let Some(ancestor_id) = remote_metadata.ancestor_timeline() { let timelines = self.timelines.lock().unwrap(); Some(Arc::clone(timelines.get(&ancestor_id).ok_or_else( @@ -837,7 +863,7 @@ impl Tenant { match tenant_clone.load().await { Ok(()) => {} Err(err) => { - tenant_clone.set_broken(); + tenant_clone.set_broken(&err.to_string()); error!("could not load tenant {tenant_id}: {err:?}"); } } @@ -998,18 +1024,14 @@ impl Tenant { None }; - let remote_client = self - .remote_storage - .as_ref() - .map(|remote_storage| { - RemoteTimelineClient::new( - remote_storage.clone(), - self.conf, - self.tenant_id, - timeline_id, - ) - }) - .transpose()?; + let remote_client = self.remote_storage.as_ref().map(|remote_storage| { + RemoteTimelineClient::new( + remote_storage.clone(), + self.conf, + self.tenant_id, + timeline_id, + ) + }); let remote_startup_data = match &remote_client { Some(remote_client) => match remote_client.download_index_file().await { @@ -1477,7 +1499,7 @@ impl Tenant { }); } - pub fn set_broken(&self) { + pub fn set_broken(&self, reason: &str) { self.state.send_modify(|current_state| { match *current_state { TenantState::Active => { @@ -1486,18 +1508,22 @@ impl Tenant { // activated should never be marked as broken. We cope with it the best // we can, but it shouldn't happen. 
*current_state = TenantState::Broken; - warn!("Changing Active tenant to Broken state"); + warn!("Changing Active tenant to Broken state, reason: {}", reason); } TenantState::Broken => { // This shouldn't happen either - warn!("Tenant is already broken"); + warn!("Tenant is already in Broken state"); } TenantState::Stopping => { // This shouldn't happen either *current_state = TenantState::Broken; - warn!("Marking Stopping tenant as Broken"); + warn!( + "Marking Stopping tenant as Broken state, reason: {}", + reason + ); } TenantState::Loading | TenantState::Attaching => { + info!("Setting tenant as Broken state, reason: {}", reason); *current_state = TenantState::Broken; } } @@ -1851,7 +1877,12 @@ impl Tenant { utils::failpoint_sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines"); - info!("starting on {} timelines", gc_timelines.len()); + // If there is nothing to GC, we don't want any messages in the INFO log. + if !gc_timelines.is_empty() { + info!("{} timelines need GC", gc_timelines.len()); + } else { + debug!("{} timelines need GC", gc_timelines.len()); + } // Perform GC for each timeline. 
// @@ -2142,13 +2173,12 @@ impl Tenant { let tenant_id = raw_timeline.owning_tenant.tenant_id; let unfinished_timeline = raw_timeline.raw_timeline()?; - tokio::task::block_in_place(|| { - import_datadir::import_timeline_from_postgres_datadir( - unfinished_timeline, - pgdata_path, - pgdata_lsn, - ) - }) + import_datadir::import_timeline_from_postgres_datadir( + unfinished_timeline, + pgdata_path, + pgdata_lsn, + ) + .await .with_context(|| { format!("Failed to import pgdatadir for timeline {tenant_id}/{timeline_id}") })?; @@ -2204,7 +2234,7 @@ impl Tenant { self.conf, tenant_id, new_timeline_id, - )?; + ); remote_client.init_upload_queue_for_empty_remote(&new_metadata)?; Some(remote_client) } else { diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 8569c70217..c95a98fbc7 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -30,7 +30,7 @@ pub mod defaults { pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024; pub const DEFAULT_GC_PERIOD: &str = "100 s"; pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3; - pub const DEFAULT_PITR_INTERVAL: &str = "30 days"; + pub const DEFAULT_PITR_INTERVAL: &str = "7 days"; pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "2 seconds"; pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "3 seconds"; pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024; diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 50653820fd..0fa2344f75 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -87,6 +87,9 @@ where /// contain the version, even if it's missing from the returned /// layer. /// + /// NOTE: This only searches the 'historic' layers, *not* the + /// 'open' and 'frozen' layers! 
+ /// pub fn search(&self, key: Key, end_lsn: Lsn) -> Option> { match self.index.query(key.to_i128(), end_lsn.0 - 1) { (None, None) => None, diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 44849de735..dce7cd8bae 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -430,7 +430,7 @@ where Err(e) => { let tenants_accessor = TENANTS.read().await; match tenants_accessor.get(&tenant_id) { - Some(tenant) => tenant.set_broken(), + Some(tenant) => tenant.set_broken(&e.to_string()), None => warn!("Tenant {tenant_id} got removed from memory"), } Err(e) @@ -492,3 +492,53 @@ pub async fn immediate_gc( Ok(wait_task_done) } + +#[cfg(feature = "testing")] +pub async fn immediate_compact( + tenant_id: TenantId, + timeline_id: TimelineId, +) -> Result>, ApiError> { + let guard = TENANTS.read().await; + + let tenant = guard + .get(&tenant_id) + .map(Arc::clone) + .with_context(|| format!("Tenant {tenant_id} not found")) + .map_err(ApiError::NotFound)?; + + let timeline = tenant + .get_timeline(timeline_id, true) + .map_err(ApiError::NotFound)?; + + // Run in task_mgr to avoid race with detach operation + let (task_done, wait_task_done) = tokio::sync::oneshot::channel(); + task_mgr::spawn( + &tokio::runtime::Handle::current(), + TaskKind::Compaction, + Some(tenant_id), + Some(timeline_id), + &format!( + "timeline_compact_handler compaction run for tenant {tenant_id} timeline {timeline_id}" + ), + false, + async move { + let result = timeline + .compact() + .instrument( + info_span!("manual_compact", tenant = %tenant_id, timeline = %timeline_id), + ) + .await; + + match task_done.send(result) { + Ok(_) => (), + Err(result) => error!("failed to send compaction result: {result:?}"), + } + Ok(()) + }, + ); + + // drop the guard until after we've spawned the task so that timeline shutdown will wait for the task + drop(guard); + + Ok(wait_task_done) +} diff --git a/pageserver/src/tenant/remote_timeline_client.rs 
b/pageserver/src/tenant/remote_timeline_client.rs index 45988ff47a..1db69d8b73 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -298,8 +298,8 @@ impl RemoteTimelineClient { conf: &'static PageServerConf, tenant_id: TenantId, timeline_id: TimelineId, - ) -> anyhow::Result { - Ok(RemoteTimelineClient { + ) -> RemoteTimelineClient { + RemoteTimelineClient { conf, runtime: &BACKGROUND_RUNTIME, tenant_id, @@ -307,7 +307,7 @@ impl RemoteTimelineClient { storage_impl: remote_storage, upload_queue: Mutex::new(UploadQueue::Uninitialized), metrics: Arc::new(RemoteTimelineClientMetrics::new(&tenant_id, &timeline_id)), - }) + } } /// Initialize the upload queue for a remote storage that already received @@ -367,6 +367,10 @@ impl RemoteTimelineClient { /// Download index file pub async fn download_index_file(&self) -> Result { + let _unfinished_gauge_guard = self + .metrics + .call_begin(&RemoteOpFileKind::Index, &RemoteOpKind::Download); + download::download_index_part( self.conf, &self.storage_impl, @@ -393,22 +397,27 @@ impl RemoteTimelineClient { layer_file_name: &LayerFileName, layer_metadata: &LayerFileMetadata, ) -> anyhow::Result { - let downloaded_size = download::download_layer_file( - self.conf, - &self.storage_impl, - self.tenant_id, - self.timeline_id, - layer_file_name, - layer_metadata, - ) - .measure_remote_op( - self.tenant_id, - self.timeline_id, - RemoteOpFileKind::Layer, - RemoteOpKind::Download, - Arc::clone(&self.metrics), - ) - .await?; + let downloaded_size = { + let _unfinished_gauge_guard = self + .metrics + .call_begin(&RemoteOpFileKind::Layer, &RemoteOpKind::Download); + download::download_layer_file( + self.conf, + &self.storage_impl, + self.tenant_id, + self.timeline_id, + layer_file_name, + layer_metadata, + ) + .measure_remote_op( + self.tenant_id, + self.timeline_id, + RemoteOpFileKind::Layer, + RemoteOpKind::Download, + Arc::clone(&self.metrics), + ) + .await? 
+ }; // Update the metadata for given layer file. The remote index file // might be missing some information for the file; this allows us @@ -517,7 +526,7 @@ impl RemoteTimelineClient { metadata_bytes, ); let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn); - self.update_upload_queue_unfinished_metric(1, &op); + self.calls_unfinished_metric_begin(&op); upload_queue.queued_operations.push_back(op); upload_queue.latest_files_changes_since_metadata_upload_scheduled = 0; @@ -549,7 +558,7 @@ impl RemoteTimelineClient { upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1; let op = UploadOp::UploadLayer(layer_file_name.clone(), layer_metadata.clone()); - self.update_upload_queue_unfinished_metric(1, &op); + self.calls_unfinished_metric_begin(&op); upload_queue.queued_operations.push_back(op); info!( @@ -601,7 +610,7 @@ impl RemoteTimelineClient { // schedule the actual deletions for name in names { let op = UploadOp::Delete(RemoteOpFileKind::Layer, name.clone()); - self.update_upload_queue_unfinished_metric(1, &op); + self.calls_unfinished_metric_begin(&op); upload_queue.queued_operations.push_back(op); info!("scheduled layer file deletion {}", name.file_name()); } @@ -753,7 +762,7 @@ impl RemoteTimelineClient { // upload finishes or times out soon enough. if task_mgr::is_shutdown_requested() { info!("upload task cancelled by shutdown request"); - self.update_upload_queue_unfinished_metric(-1, &task.op); + self.calls_unfinished_metric_end(&task.op); self.stop(); return; } @@ -901,22 +910,40 @@ impl RemoteTimelineClient { // Launch any queued tasks that were unblocked by this one. 
self.launch_queued_tasks(upload_queue); } - self.update_upload_queue_unfinished_metric(-1, &task.op); + self.calls_unfinished_metric_end(&task.op); } - fn update_upload_queue_unfinished_metric(&self, delta: i64, op: &UploadOp) { - let (file_kind, op_kind) = match op { + fn calls_unfinished_metric_impl( + &self, + op: &UploadOp, + ) -> Option<(RemoteOpFileKind, RemoteOpKind)> { + let res = match op { UploadOp::UploadLayer(_, _) => (RemoteOpFileKind::Layer, RemoteOpKind::Upload), UploadOp::UploadMetadata(_, _) => (RemoteOpFileKind::Index, RemoteOpKind::Upload), UploadOp::Delete(file_kind, _) => (*file_kind, RemoteOpKind::Delete), UploadOp::Barrier(_) => { // we do not account these - return; + return None; } }; - self.metrics - .unfinished_tasks(&file_kind, &op_kind) - .add(delta) + Some(res) + } + + fn calls_unfinished_metric_begin(&self, op: &UploadOp) { + let (file_kind, op_kind) = match self.calls_unfinished_metric_impl(op) { + Some(x) => x, + None => return, + }; + let guard = self.metrics.call_begin(&file_kind, &op_kind); + guard.will_decrement_manually(); // in unfinished_ops_metric_end() + } + + fn calls_unfinished_metric_end(&self, op: &UploadOp) { + let (file_kind, op_kind) = match self.calls_unfinished_metric_impl(op) { + Some(x) => x, + None => return, + }; + self.metrics.call_end(&file_kind, &op_kind); } fn stop(&self) { @@ -967,7 +994,7 @@ impl RemoteTimelineClient { // Tear down queued ops for op in qi.queued_operations.into_iter() { - self.update_upload_queue_unfinished_metric(-1, &op); + self.calls_unfinished_metric_end(&op); // Dropping UploadOp::Barrier() here will make wait_completion() return with an Err() // which is exactly what we want to happen. 
drop(op); diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 422728d1f3..2e79698087 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -8,10 +8,9 @@ use std::future::Future; use std::path::Path; use anyhow::{anyhow, Context}; -use futures::stream::{FuturesUnordered, StreamExt}; use tokio::fs; use tokio::io::AsyncWriteExt; -use tracing::{debug, error, info, info_span, warn, Instrument}; +use tracing::{error, info, warn}; use crate::config::PageServerConf; use crate::tenant::storage_layer::LayerFileName; @@ -175,7 +174,7 @@ pub async fn list_remote_timelines<'a>( storage: &'a GenericRemoteStorage, conf: &'static PageServerConf, tenant_id: TenantId, -) -> anyhow::Result> { +) -> anyhow::Result> { let tenant_path = conf.timelines_path(&tenant_id); let tenant_storage_path = conf.remote_path(&tenant_path)?; @@ -194,7 +193,6 @@ pub async fn list_remote_timelines<'a>( } let mut timeline_ids = HashSet::new(); - let mut part_downloads = FuturesUnordered::new(); for timeline_remote_storage_key in timelines { let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| { @@ -205,35 +203,22 @@ pub async fn list_remote_timelines<'a>( format!("failed to parse object name into timeline id '{object_name}'") })?; - // list_prefixes returns all files with the prefix. If we haven't seen this timeline ID - // yet, launch a download task for it. - if !timeline_ids.contains(&timeline_id) { - timeline_ids.insert(timeline_id); - let storage_clone = storage.clone(); - part_downloads.push(async move { - ( - timeline_id, - download_index_part(conf, &storage_clone, tenant_id, timeline_id) - .instrument(info_span!("download_index_part", timeline=%timeline_id)) - .await, - ) - }); - } + // list_prefixes is assumed to return unique names. Ensure this here. 
+ // NB: it's safer to bail out than warn-log this because the pageserver + // needs to absolutely know about _all_ timelines that exist, so that + // GC knows all the branchpoints. If we skipped over a timeline instead, + // GC could delete a layer that's still needed by that timeline. + anyhow::ensure!( + !timeline_ids.contains(&timeline_id), + "list_prefixes contains duplicate timeline id {timeline_id}" + ); + timeline_ids.insert(timeline_id); } - // Wait for all the download tasks to complete. - let mut timeline_parts = Vec::new(); - while let Some((timeline_id, part_upload_result)) = part_downloads.next().await { - let index_part = part_upload_result - .with_context(|| format!("Failed to fetch index part for timeline {timeline_id}"))?; - - debug!("Successfully fetched index part for timeline {timeline_id}"); - timeline_parts.push((timeline_id, index_part)); - } - Ok(timeline_parts) + Ok(timeline_ids) } -pub async fn download_index_part( +pub(super) async fn download_index_part( conf: &'static PageServerConf, storage: &GenericRemoteStorage, tenant_id: TenantId, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 651c8116f5..c2a89cd6bb 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1655,8 +1655,7 @@ impl Timeline { // For debugging purposes, collect the path of layers that we traversed // through. It's included in the error message if we fail to find the key. 
- let mut traversal_path = - Vec::<(ValueReconstructResult, Lsn, Box)>::new(); + let mut traversal_path = Vec::::new(); let cached_lsn = if let Some((cached_lsn, _)) = &reconstruct_state.img { *cached_lsn @@ -1721,82 +1720,132 @@ impl Timeline { timeline_owned = ancestor; timeline = &*timeline_owned; prev_lsn = Lsn(u64::MAX); - continue; + continue 'outer; } - let layers = timeline.layers.read().unwrap(); + #[allow(clippy::never_loop)] // see comment at bottom of this loop + '_layer_map_search: loop { + let remote_layer = { + let layers = timeline.layers.read().unwrap(); - // Check the open and frozen in-memory layers first, in order from newest - // to oldest. - if let Some(open_layer) = &layers.open_layer { - let start_lsn = open_layer.get_lsn_range().start; - if cont_lsn > start_lsn { - //info!("CHECKING for {} at {} on open layer {}", key, cont_lsn, open_layer.filename().display()); - // Get all the data needed to reconstruct the page version from this layer. - // But if we have an older cached page image, no need to go past that. 
- let lsn_floor = max(cached_lsn + 1, start_lsn); - result = match open_layer.get_value_reconstruct_data( - key, - lsn_floor..cont_lsn, - reconstruct_state, - ) { - Ok(result) => result, - Err(e) => return PageReconstructResult::from(e), - }; - cont_lsn = lsn_floor; - traversal_path.push((result, cont_lsn, Box::new(open_layer.clone()))); - continue; - } - } - for frozen_layer in layers.frozen_layers.iter().rev() { - let start_lsn = frozen_layer.get_lsn_range().start; - if cont_lsn > start_lsn { - //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display()); - let lsn_floor = max(cached_lsn + 1, start_lsn); - result = match frozen_layer.get_value_reconstruct_data( - key, - lsn_floor..cont_lsn, - reconstruct_state, - ) { - Ok(result) => result, - Err(e) => return PageReconstructResult::from(e), - }; - cont_lsn = lsn_floor; - traversal_path.push((result, cont_lsn, Box::new(frozen_layer.clone()))); - continue 'outer; - } - } + // Check the open and frozen in-memory layers first, in order from newest + // to oldest. + if let Some(open_layer) = &layers.open_layer { + let start_lsn = open_layer.get_lsn_range().start; + if cont_lsn > start_lsn { + //info!("CHECKING for {} at {} on open layer {}", key, cont_lsn, open_layer.filename().display()); + // Get all the data needed to reconstruct the page version from this layer. + // But if we have an older cached page image, no need to go past that. 
+ let lsn_floor = max(cached_lsn + 1, start_lsn); + result = match open_layer.get_value_reconstruct_data( + key, + lsn_floor..cont_lsn, + reconstruct_state, + ) { + Ok(result) => result, + Err(e) => return PageReconstructResult::from(e), + }; + cont_lsn = lsn_floor; + traversal_path.push(( + result, + cont_lsn, + Box::new({ + let open_layer = Arc::clone(open_layer); + move || open_layer.traversal_id() + }), + )); + continue 'outer; + } + } + for frozen_layer in layers.frozen_layers.iter().rev() { + let start_lsn = frozen_layer.get_lsn_range().start; + if cont_lsn > start_lsn { + //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display()); + let lsn_floor = max(cached_lsn + 1, start_lsn); + result = match frozen_layer.get_value_reconstruct_data( + key, + lsn_floor..cont_lsn, + reconstruct_state, + ) { + Ok(result) => result, + Err(e) => return PageReconstructResult::from(e), + }; + cont_lsn = lsn_floor; + traversal_path.push(( + result, + cont_lsn, + Box::new({ + let frozen_layer = Arc::clone(frozen_layer); + move || frozen_layer.traversal_id() + }), + )); + continue 'outer; + } + } - if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) { - //info!("CHECKING for {} at {} on historic layer {}", key, cont_lsn, layer.filename().display()); - - // If it's a remote layer, the caller can do the download and retry. 
- if let Some(remote_layer) = super::storage_layer::downcast_remote_layer(&layer) { - info!("need remote layer {}", layer.traversal_id()); - return PageReconstructResult::NeedsDownload( - Weak::clone(&timeline.myself), - Arc::downgrade(&remote_layer), - ); - } - - let lsn_floor = max(cached_lsn + 1, lsn_floor); - result = match layer.get_value_reconstruct_data( - key, - lsn_floor..cont_lsn, - reconstruct_state, - ) { - Ok(result) => result, - Err(e) => return PageReconstructResult::from(e), + if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) { + // If it's a remote layer, download it and retry. + if let Some(remote_layer) = + super::storage_layer::downcast_remote_layer(&layer) + { + // TODO: push a breadcrumb to 'traversal_path' to record the fact that + // we downloaded / would need to download this layer. + remote_layer // download happens outside the scope of `layers` guard object + } else { + // Get all the data needed to reconstruct the page version from this layer. + // But if we have an older cached page image, no need to go past that. + let lsn_floor = max(cached_lsn + 1, lsn_floor); + result = match layer.get_value_reconstruct_data( + key, + lsn_floor..cont_lsn, + reconstruct_state, + ) { + Ok(result) => result, + Err(e) => return PageReconstructResult::from(e), + }; + cont_lsn = lsn_floor; + traversal_path.push(( + result, + cont_lsn, + Box::new({ + let layer = Arc::clone(&layer); + move || layer.traversal_id() + }), + )); + continue 'outer; + } + } else if timeline.ancestor_timeline.is_some() { + // Nothing on this timeline. Traverse to parent + result = ValueReconstructResult::Continue; + cont_lsn = Lsn(timeline.ancestor_lsn.0 + 1); + continue 'outer; + } else { + // Nothing found + result = ValueReconstructResult::Missing; + continue 'outer; + } }; - cont_lsn = lsn_floor; - traversal_path.push((result, cont_lsn, Box::new(layer.clone()))); - } else if timeline.ancestor_timeline.is_some() { - // Nothing on this timeline. 
Traverse to parent - result = ValueReconstructResult::Continue; - cont_lsn = Lsn(timeline.ancestor_lsn.0 + 1); - } else { - // Nothing found - result = ValueReconstructResult::Missing; + // Indicate to the caller that we need remote_layer replaced with a downloaded + // layer in the layer map. The control flow could be a lot simpler, but the point + // of this commit is to prepare this function to + // 1. become async + // 2. do the download right here, using + // ``` + // download_remote_layer().await?; + // continue 'layer_map_search; + // ``` + // For (2), current rustc requires that the layers lock guard is not in scope. + // Hence, the complicated control flow. + let remote_layer_as_persistent: Arc = + Arc::clone(&remote_layer) as Arc; + info!( + "need remote layer {}", + remote_layer_as_persistent.traversal_id() + ); + return PageReconstructResult::NeedsDownload( + Weak::clone(&timeline.myself), + Arc::downgrade(&remote_layer), + ); } } } @@ -3376,22 +3425,25 @@ where } } +type TraversalPathItem = ( + ValueReconstructResult, + Lsn, + Box TraversalId>, +); + /// Helper function for get_reconstruct_data() to add the path of layers traversed /// to an error, as anyhow context information. -fn layer_traversal_error( - msg: String, - path: Vec<(ValueReconstructResult, Lsn, Box)>, -) -> PageReconstructResult<()> { +fn layer_traversal_error(msg: String, path: Vec) -> PageReconstructResult<()> { // We want the original 'msg' to be the outermost context. The outermost context // is the most high-level information, which also gets propagated to the client. let mut msg_iter = path - .iter() + .into_iter() .map(|(r, c, l)| { format!( "layer traversal: result {:?}, cont_lsn {}, layer: {}", r, c, - l.traversal_id(), + l(), ) }) .chain(std::iter::once(msg)); diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 031b80a6e0..1c974f7e2a 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -21,7 +21,6 @@ //! 
redo Postgres process, but some records it can handle directly with //! bespoken Rust code. -use anyhow::Context; use postgres_ffi::v14::nonrelfile_utils::clogpage_precedes; use postgres_ffi::v14::nonrelfile_utils::slru_may_delete_clogsegment; use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn}; @@ -31,12 +30,10 @@ use bytes::{Buf, Bytes, BytesMut}; use tracing::*; use crate::pgdatadir_mapping::*; -use crate::tenant::PageReconstructResult; use crate::tenant::Timeline; -use crate::try_page_reconstruct_result as try_prr; +use crate::tenant::{with_ondemand_download, PageReconstructError}; use crate::walrecord::*; use crate::ZERO_PAGE; -use crate::{try_no_ondemand_download, try_page_reconstruct_result}; use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::{FSM_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM}; @@ -55,16 +52,15 @@ pub struct WalIngest<'a> { } impl<'a> WalIngest<'a> { - pub fn new(timeline: &Timeline, startpoint: Lsn) -> PageReconstructResult { + pub async fn new(timeline: &Timeline, startpoint: Lsn) -> anyhow::Result { // Fetch the latest checkpoint into memory, so that we can compare with it // quickly in `ingest_record` and update it when it changes. - let checkpoint_bytes = try_no_ondemand_download!(timeline.get_checkpoint(startpoint)); - let checkpoint = try_page_reconstruct_result!( - CheckPoint::decode(&checkpoint_bytes).context("Failed to decode checkpoint bytes") - ); + let checkpoint_bytes = + with_ondemand_download(|| timeline.get_checkpoint(startpoint)).await?; + let checkpoint = CheckPoint::decode(&checkpoint_bytes)?; trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value); - PageReconstructResult::Success(WalIngest { + Ok(WalIngest { timeline, checkpoint, checkpoint_modified: false, @@ -79,18 +75,15 @@ impl<'a> WalIngest<'a> { /// Helper function to parse a WAL record and call the Timeline's PUT functions for all the /// relations/pages that the record affects. 
/// - pub fn ingest_record( + pub async fn ingest_record( &mut self, recdata: Bytes, lsn: Lsn, - modification: &mut DatadirModification, + modification: &mut DatadirModification<'_>, decoded: &mut DecodedWALRecord, - ) -> PageReconstructResult<()> { + ) -> anyhow::Result<()> { modification.lsn = lsn; - try_prr!( - decode_wal_record(recdata, decoded, self.timeline.pg_version) - .context("failed decoding wal record") - ); + decode_wal_record(recdata, decoded, self.timeline.pg_version)?; let mut buf = decoded.record.clone(); buf.advance(decoded.main_data_offset); @@ -105,7 +98,8 @@ impl<'a> WalIngest<'a> { if decoded.xl_rmid == pg_constants::RM_HEAP_ID || decoded.xl_rmid == pg_constants::RM_HEAP2_ID { - try_prr!(self.ingest_heapam_record(&mut buf, modification, decoded)); + self.ingest_heapam_record(&mut buf, modification, decoded) + .await?; } // Handle other special record types if decoded.xl_rmid == pg_constants::RM_SMGR_ID @@ -113,13 +107,14 @@ impl<'a> WalIngest<'a> { == pg_constants::XLOG_SMGR_CREATE { let create = XlSmgrCreate::decode(&mut buf); - try_prr!(self.ingest_xlog_smgr_create(modification, &create)); + self.ingest_xlog_smgr_create(modification, &create)?; } else if decoded.xl_rmid == pg_constants::RM_SMGR_ID && (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == pg_constants::XLOG_SMGR_TRUNCATE { let truncate = XlSmgrTruncate::decode(&mut buf); - try_prr!(self.ingest_xlog_smgr_truncate(modification, &truncate)); + self.ingest_xlog_smgr_truncate(modification, &truncate) + .await?; } else if decoded.xl_rmid == pg_constants::RM_DBASE_ID { debug!( "handle RM_DBASE_ID for Postgres version {:?}", @@ -132,14 +127,15 @@ impl<'a> WalIngest<'a> { let createdb = XlCreateDatabase::decode(&mut buf); debug!("XLOG_DBASE_CREATE v14"); - try_prr!(self.ingest_xlog_dbase_create(modification, &createdb)); + self.ingest_xlog_dbase_create(modification, &createdb) + .await?; } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == 
postgres_ffi::v14::bindings::XLOG_DBASE_DROP { let dropdb = XlDropDatabase::decode(&mut buf); for tablespace_id in dropdb.tablespace_ids { trace!("Drop db {}, {}", tablespace_id, dropdb.db_id); - try_prr!(modification.drop_dbdir(tablespace_id, dropdb.db_id)); + modification.drop_dbdir(tablespace_id, dropdb.db_id)?; } } } else if self.timeline.pg_version == 15 { @@ -155,14 +151,15 @@ impl<'a> WalIngest<'a> { // So we can reuse XlCreateDatabase here. debug!("XLOG_DBASE_CREATE_FILE_COPY"); let createdb = XlCreateDatabase::decode(&mut buf); - try_prr!(self.ingest_xlog_dbase_create(modification, &createdb)); + self.ingest_xlog_dbase_create(modification, &createdb) + .await?; } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == postgres_ffi::v15::bindings::XLOG_DBASE_DROP { let dropdb = XlDropDatabase::decode(&mut buf); for tablespace_id in dropdb.tablespace_ids { trace!("Drop db {}, {}", tablespace_id, dropdb.db_id); - try_prr!(modification.drop_dbdir(tablespace_id, dropdb.db_id)); + modification.drop_dbdir(tablespace_id, dropdb.db_id)?; } } } @@ -174,38 +171,42 @@ impl<'a> WalIngest<'a> { let pageno = buf.get_u32_le(); let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; - try_prr!(self.put_slru_page_image( + self.put_slru_page_image( modification, SlruKind::Clog, segno, rpageno, ZERO_PAGE.clone(), - )); + ) + .await?; } else { assert!(info == pg_constants::CLOG_TRUNCATE); let xlrec = XlClogTruncate::decode(&mut buf); - try_prr!(self.ingest_clog_truncate_record(modification, &xlrec)); + self.ingest_clog_truncate_record(modification, &xlrec) + .await?; } } else if decoded.xl_rmid == pg_constants::RM_XACT_ID { let info = decoded.xl_info & pg_constants::XLOG_XACT_OPMASK; if info == pg_constants::XLOG_XACT_COMMIT || info == pg_constants::XLOG_XACT_ABORT { let parsed_xact = XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info); - try_prr!(self.ingest_xact_record( + 
self.ingest_xact_record( modification, &parsed_xact, info == pg_constants::XLOG_XACT_COMMIT, - )); + ) + .await?; } else if info == pg_constants::XLOG_XACT_COMMIT_PREPARED || info == pg_constants::XLOG_XACT_ABORT_PREPARED { let parsed_xact = XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info); - try_prr!(self.ingest_xact_record( + self.ingest_xact_record( modification, &parsed_xact, info == pg_constants::XLOG_XACT_COMMIT_PREPARED, - )); + ) + .await?; // Remove twophase file. see RemoveTwoPhaseFile() in postgres code trace!( "Drop twophaseFile for xid {} parsed_xact.xid {} here at {}", @@ -213,10 +214,9 @@ impl<'a> WalIngest<'a> { parsed_xact.xid, lsn, ); - try_prr!(modification.drop_twophase_file(parsed_xact.xid)); + modification.drop_twophase_file(parsed_xact.xid)?; } else if info == pg_constants::XLOG_XACT_PREPARE { - try_prr!(modification - .put_twophase_file(decoded.xl_xid, Bytes::copy_from_slice(&buf[..]))); + modification.put_twophase_file(decoded.xl_xid, Bytes::copy_from_slice(&buf[..]))?; } } else if decoded.xl_rmid == pg_constants::RM_MULTIXACT_ID { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; @@ -225,34 +225,36 @@ impl<'a> WalIngest<'a> { let pageno = buf.get_u32_le(); let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; - try_prr!(self.put_slru_page_image( + self.put_slru_page_image( modification, SlruKind::MultiXactOffsets, segno, rpageno, ZERO_PAGE.clone(), - )); + ) + .await?; } else if info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE { let pageno = buf.get_u32_le(); let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; - try_prr!(self.put_slru_page_image( + self.put_slru_page_image( modification, SlruKind::MultiXactMembers, segno, rpageno, ZERO_PAGE.clone(), - )); + ) + .await?; } else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID { let xlrec = XlMultiXactCreate::decode(&mut 
buf); - try_prr!(self.ingest_multixact_create_record(modification, &xlrec)); + self.ingest_multixact_create_record(modification, &xlrec)?; } else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID { let xlrec = XlMultiXactTruncate::decode(&mut buf); - try_prr!(self.ingest_multixact_truncate_record(modification, &xlrec)); + self.ingest_multixact_truncate_record(modification, &xlrec)?; } } else if decoded.xl_rmid == pg_constants::RM_RELMAP_ID { let xlrec = XlRelmapUpdate::decode(&mut buf); - try_prr!(self.ingest_relmap_page(modification, &xlrec, decoded)); + self.ingest_relmap_page(modification, &xlrec, decoded)?; } else if decoded.xl_rmid == pg_constants::RM_XLOG_ID { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; if info == pg_constants::XLOG_NEXTOID { @@ -266,9 +268,7 @@ impl<'a> WalIngest<'a> { { let mut checkpoint_bytes = [0u8; SIZEOF_CHECKPOINT]; buf.copy_to_slice(&mut checkpoint_bytes); - let xlog_checkpoint = try_prr!( - CheckPoint::decode(&checkpoint_bytes).context("deserialize CheckPoint") - ); + let xlog_checkpoint = CheckPoint::decode(&checkpoint_bytes)?; trace!( "xlog_checkpoint.oldestXid={}, checkpoint.oldestXid={}", xlog_checkpoint.oldestXid, @@ -289,32 +289,32 @@ impl<'a> WalIngest<'a> { // Iterate through all the blocks that the record modifies, and // "put" a separate copy of the record for each block. 
for blk in decoded.blocks.iter() { - try_no_ondemand_download!(self.ingest_decoded_block(modification, lsn, decoded, blk)); + self.ingest_decoded_block(modification, lsn, decoded, blk) + .await?; } // If checkpoint data was updated, store the new version in the repository if self.checkpoint_modified { - let new_checkpoint_bytes = - try_prr!(self.checkpoint.encode().context("encode checkpoint")); + let new_checkpoint_bytes = self.checkpoint.encode()?; - try_prr!(modification.put_checkpoint(new_checkpoint_bytes)); + modification.put_checkpoint(new_checkpoint_bytes)?; self.checkpoint_modified = false; } // Now that this record has been fully handled, including updating the // checkpoint data, let the repository know that it is up-to-date to this LSN - try_prr!(modification.commit()); + modification.commit()?; - PageReconstructResult::Success(()) + Ok(()) } - fn ingest_decoded_block( + async fn ingest_decoded_block( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification<'_>, lsn: Lsn, decoded: &DecodedWALRecord, blk: &DecodedBkpBlock, - ) -> PageReconstructResult<()> { + ) -> Result<(), PageReconstructError> { let rel = RelTag { spcnode: blk.rnode_spcnode, dbnode: blk.rnode_dbnode, @@ -334,7 +334,7 @@ impl<'a> WalIngest<'a> { && (decoded.xl_info == pg_constants::XLOG_FPI || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT) // compression of WAL is not yet supported: fall back to storing the original WAL record - && !try_prr!(postgres_ffi::bkpimage_is_compressed(blk.bimg_info, self.timeline.pg_version)) + && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, self.timeline.pg_version)? 
{ // Extract page image from FPI record let img_len = blk.bimg_len as usize; @@ -356,28 +356,25 @@ impl<'a> WalIngest<'a> { page_set_lsn(&mut image, lsn) } assert_eq!(image.len(), BLCKSZ as usize); - try_no_ondemand_download!(self.put_rel_page_image( - modification, - rel, - blk.blkno, - image.freeze() - )); + self.put_rel_page_image(modification, rel, blk.blkno, image.freeze()) + .await?; } else { let rec = NeonWalRecord::Postgres { will_init: blk.will_init || blk.apply_image, rec: decoded.record.clone(), }; - try_prr!(self.put_rel_wal_record(modification, rel, blk.blkno, rec)); + self.put_rel_wal_record(modification, rel, blk.blkno, rec) + .await?; } - PageReconstructResult::Success(()) + Ok(()) } - fn ingest_heapam_record( + async fn ingest_heapam_record( &mut self, buf: &mut Bytes, - modification: &mut DatadirModification, + modification: &mut DatadirModification<'_>, decoded: &mut DecodedWALRecord, - ) -> Result<()> { + ) -> anyhow::Result<()> { // Handle VM bit updates that are implicitly part of heap records. // First, look at the record to determine which VM bits need @@ -456,7 +453,7 @@ impl<'a> WalIngest<'a> { // replaying it would fail to find the previous image of the page, because // it doesn't exist. So check if the VM page(s) exist, and skip the WAL // record if it doesn't. - let vm_size = self.get_relsize(vm_rel, modification.lsn)?; + let vm_size = self.get_relsize(vm_rel, modification.lsn).await?; if let Some(blknum) = new_vm_blk { if blknum >= vm_size { new_vm_blk = None; @@ -481,7 +478,8 @@ impl<'a> WalIngest<'a> { old_heap_blkno, flags: pg_constants::VISIBILITYMAP_VALID_BITS, }, - )?; + ) + .await?; } else { // Clear VM bits for one heap page, or for two pages that reside on // different VM pages. 
@@ -495,7 +493,8 @@ impl<'a> WalIngest<'a> { old_heap_blkno: None, flags: pg_constants::VISIBILITYMAP_VALID_BITS, }, - )?; + ) + .await?; } if let Some(old_vm_blk) = old_vm_blk { self.put_rel_wal_record( @@ -507,7 +506,8 @@ impl<'a> WalIngest<'a> { old_heap_blkno, flags: pg_constants::VISIBILITYMAP_VALID_BITS, }, - )?; + ) + .await?; } } } @@ -517,9 +517,9 @@ impl<'a> WalIngest<'a> { } /// Subroutine of ingest_record(), to handle an XLOG_DBASE_CREATE record. - fn ingest_xlog_dbase_create( + async fn ingest_xlog_dbase_create( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification<'_>, rec: &XlCreateDatabase, ) -> anyhow::Result<()> { let db_id = rec.db_id; @@ -534,18 +534,22 @@ impl<'a> WalIngest<'a> { // get calls instead. let req_lsn = modification.tline.get_last_record_lsn(); - let rels = modification - .tline - .list_rels(src_tablespace_id, src_db_id, req_lsn) - .no_ondemand_download()?; + let rels = with_ondemand_download(|| { + modification + .tline + .list_rels(src_tablespace_id, src_db_id, req_lsn) + }) + .await?; debug!("ingest_xlog_dbase_create: {} rels", rels.len()); // Copy relfilemap - let filemap = modification - .tline - .get_relmap_file(src_tablespace_id, src_db_id, req_lsn) - .no_ondemand_download()?; + let filemap = with_ondemand_download(|| { + modification + .tline + .get_relmap_file(src_tablespace_id, src_db_id, req_lsn) + }) + .await?; modification.put_relmap_file(tablespace_id, db_id, filemap)?; let mut num_rels_copied = 0; @@ -554,10 +558,9 @@ impl<'a> WalIngest<'a> { assert_eq!(src_rel.spcnode, src_tablespace_id); assert_eq!(src_rel.dbnode, src_db_id); - let nblocks = modification - .tline - .get_rel_size(src_rel, req_lsn, true) - .no_ondemand_download()?; + let nblocks = + with_ondemand_download(|| modification.tline.get_rel_size(src_rel, req_lsn, true)) + .await?; let dst_rel = RelTag { spcnode: tablespace_id, dbnode: db_id, @@ -572,10 +575,12 @@ impl<'a> WalIngest<'a> { for blknum in 0..nblocks { 
debug!("copying block {} from {} to {}", blknum, src_rel, dst_rel); - let content = modification - .tline - .get_rel_page_at_lsn(src_rel, blknum, req_lsn, true) - .no_ondemand_download()?; + let content = with_ondemand_download(|| { + modification + .tline + .get_rel_page_at_lsn(src_rel, blknum, req_lsn, true) + }) + .await?; modification.put_rel_page_image(dst_rel, blknum, content)?; num_blocks_copied += 1; } @@ -594,7 +599,7 @@ impl<'a> WalIngest<'a> { &mut self, modification: &mut DatadirModification, rec: &XlSmgrCreate, - ) -> Result<()> { + ) -> anyhow::Result<()> { let rel = RelTag { spcnode: rec.rnode.spcnode, dbnode: rec.rnode.dbnode, @@ -608,11 +613,11 @@ impl<'a> WalIngest<'a> { /// Subroutine of ingest_record(), to handle an XLOG_SMGR_TRUNCATE record. /// /// This is the same logic as in PostgreSQL's smgr_redo() function. - fn ingest_xlog_smgr_truncate( + async fn ingest_xlog_smgr_truncate( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification<'_>, rec: &XlSmgrTruncate, - ) -> Result<()> { + ) -> anyhow::Result<()> { let spcnode = rec.rnode.spcnode; let dbnode = rec.rnode.dbnode; let relnode = rec.rnode.relnode; @@ -642,7 +647,7 @@ impl<'a> WalIngest<'a> { modification.put_rel_page_image(rel, fsm_physical_page_no, ZERO_PAGE.clone())?; fsm_physical_page_no += 1; } - let nblocks = self.get_relsize(rel, modification.lsn)?; + let nblocks = self.get_relsize(rel, modification.lsn).await?; if nblocks > fsm_physical_page_no { // check if something to do: FSM is larger than truncate position self.put_rel_truncation(modification, rel, fsm_physical_page_no)?; @@ -663,7 +668,7 @@ impl<'a> WalIngest<'a> { modification.put_rel_page_image(rel, vm_page_no, ZERO_PAGE.clone())?; vm_page_no += 1; } - let nblocks = self.get_relsize(rel, modification.lsn)?; + let nblocks = self.get_relsize(rel, modification.lsn).await?; if nblocks > vm_page_no { // check if something to do: VM is larger than truncate position 
self.put_rel_truncation(modification, rel, vm_page_no)?; @@ -674,9 +679,9 @@ impl<'a> WalIngest<'a> { /// Subroutine of ingest_record(), to handle an XLOG_XACT_* records. /// - fn ingest_xact_record( + async fn ingest_xact_record( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification<'_>, parsed: &XlXactParsedRecord, is_commit: bool, ) -> anyhow::Result<()> { @@ -735,10 +740,8 @@ impl<'a> WalIngest<'a> { relnode: xnode.relnode, }; let last_lsn = self.timeline.get_last_record_lsn(); - if modification - .tline - .get_rel_exists(rel, last_lsn, true) - .no_ondemand_download()? + if with_ondemand_download(|| modification.tline.get_rel_exists(rel, last_lsn, true)) + .await? { self.put_rel_drop(modification, rel)?; } @@ -747,9 +750,9 @@ impl<'a> WalIngest<'a> { Ok(()) } - fn ingest_clog_truncate_record( + async fn ingest_clog_truncate_record( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification<'_>, xlrec: &XlClogTruncate, ) -> anyhow::Result<()> { info!( @@ -791,11 +794,14 @@ impl<'a> WalIngest<'a> { // it. So we use the previous record's LSN in the get calls // instead. let req_lsn = modification.tline.get_last_record_lsn(); - for segno in modification - .tline - .list_slru_segments(SlruKind::Clog, req_lsn) - .no_ondemand_download()? 
- { + + let slru_segments = with_ondemand_download(|| { + modification + .tline + .list_slru_segments(SlruKind::Clog, req_lsn) + }) + .await?; + for segno in slru_segments { let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT; if slru_may_delete_clogsegment(segpage, xlrec.pageno) { modification.drop_slru_segment(SlruKind::Clog, segno)?; @@ -944,27 +950,26 @@ impl<'a> WalIngest<'a> { Ok(()) } - fn put_rel_page_image( + async fn put_rel_page_image( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification<'_>, rel: RelTag, blknum: BlockNumber, img: Bytes, - ) -> PageReconstructResult<()> { - try_no_ondemand_download!(self.handle_rel_extend(modification, rel, blknum)); - try_prr!(modification.put_rel_page_image(rel, blknum, img)); - PageReconstructResult::Success(()) + ) -> anyhow::Result<()> { + self.handle_rel_extend(modification, rel, blknum).await?; + modification.put_rel_page_image(rel, blknum, img)?; + Ok(()) } - fn put_rel_wal_record( + async fn put_rel_wal_record( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification<'_>, rel: RelTag, blknum: BlockNumber, rec: NeonWalRecord, - ) -> Result<()> { - self.handle_rel_extend(modification, rel, blknum) - .no_ondemand_download()?; + ) -> anyhow::Result<()> { + self.handle_rel_extend(modification, rel, blknum).await?; modification.put_rel_wal_record(rel, blknum, rec)?; Ok(()) } @@ -984,69 +989,67 @@ impl<'a> WalIngest<'a> { Ok(()) } - fn get_relsize(&mut self, rel: RelTag, lsn: Lsn) -> anyhow::Result { - let nblocks = if !self - .timeline - .get_rel_exists(rel, lsn, true) - .no_ondemand_download()? - { + async fn get_relsize(&mut self, rel: RelTag, lsn: Lsn) -> anyhow::Result { + let exists = + with_ondemand_download(|| self.timeline.get_rel_exists(rel, lsn, true)).await?; + let nblocks = if !exists { 0 } else { - self.timeline - .get_rel_size(rel, lsn, true) - .no_ondemand_download()? 
+ with_ondemand_download(|| self.timeline.get_rel_size(rel, lsn, true)).await? }; Ok(nblocks) } - fn handle_rel_extend( + async fn handle_rel_extend( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification<'_>, rel: RelTag, blknum: BlockNumber, - ) -> PageReconstructResult<()> { + ) -> anyhow::Result<()> { let new_nblocks = blknum + 1; // Check if the relation exists. We implicitly create relations on first // record. // TODO: would be nice if to be more explicit about it let last_lsn = modification.lsn; let old_nblocks = - if !try_no_ondemand_download!(self.timeline.get_rel_exists(rel, last_lsn, true)) { + if !with_ondemand_download(|| self.timeline.get_rel_exists(rel, last_lsn, true)).await? + { // create it with 0 size initially, the logic below will extend it - try_prr!(modification.put_rel_creation(rel, 0)); + modification.put_rel_creation(rel, 0)?; 0 } else { - try_no_ondemand_download!(self.timeline.get_rel_size(rel, last_lsn, true)) + with_ondemand_download(|| self.timeline.get_rel_size(rel, last_lsn, true)).await? 
}; if new_nblocks > old_nblocks { //info!("extending {} {} to {}", rel, old_nblocks, new_nblocks); - try_prr!(modification.put_rel_extend(rel, new_nblocks)); + modification.put_rel_extend(rel, new_nblocks)?; // fill the gap with zeros for gap_blknum in old_nblocks..blknum { - try_prr!(modification.put_rel_page_image(rel, gap_blknum, ZERO_PAGE.clone())); + modification.put_rel_page_image(rel, gap_blknum, ZERO_PAGE.clone())?; } } - PageReconstructResult::Success(()) + Ok(()) } - fn put_slru_page_image( + async fn put_slru_page_image( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification<'_>, kind: SlruKind, segno: u32, blknum: BlockNumber, img: Bytes, - ) -> Result<()> { - self.handle_slru_extend(modification, kind, segno, blknum)?; + ) -> anyhow::Result<()> { + self.handle_slru_extend(modification, kind, segno, blknum) + .await?; modification.put_slru_page_image(kind, segno, blknum, img)?; Ok(()) } - fn handle_slru_extend( + async fn handle_slru_extend( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification<'_>, kind: SlruKind, segno: u32, blknum: BlockNumber, @@ -1060,18 +1063,17 @@ impl<'a> WalIngest<'a> { // record. // TODO: would be nice if to be more explicit about it let last_lsn = self.timeline.get_last_record_lsn(); - let old_nblocks = if !self - .timeline - .get_slru_segment_exists(kind, segno, last_lsn) - .no_ondemand_download()? + let old_nblocks = if !with_ondemand_download(|| { + self.timeline.get_slru_segment_exists(kind, segno, last_lsn) + }) + .await? { // create it with 0 size initially, the logic below will extend it modification.put_slru_segment_creation(kind, segno, 0)?; 0 } else { - self.timeline - .get_slru_segment_size(kind, segno, last_lsn) - .no_ondemand_download()? + with_ondemand_download(|| self.timeline.get_slru_segment_size(kind, segno, last_lsn)) + .await? 
}; if new_nblocks > old_nblocks { @@ -1119,12 +1121,12 @@ mod tests { static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]); - fn init_walingest_test(tline: &Timeline) -> Result { + async fn init_walingest_test(tline: &Timeline) -> Result { let mut m = tline.begin_modification(Lsn(0x10)); m.put_checkpoint(ZERO_CHECKPOINT.clone())?; m.put_relmap_file(0, 111, Bytes::from(""))?; // dummy relmapper file m.commit()?; - let walingest = WalIngest::new(tline, Lsn(0x10)).no_ondemand_download()?; + let walingest = WalIngest::new(tline, Lsn(0x10)).await?; Ok(walingest) } @@ -1133,28 +1135,28 @@ mod tests { async fn test_relsize() -> Result<()> { let tenant = TenantHarness::create("test_relsize")?.load().await; let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; - let mut walingest = init_walingest_test(&tline)?; + let mut walingest = init_walingest_test(&tline).await?; let mut m = tline.begin_modification(Lsn(0x20)); walingest.put_rel_creation(&mut m, TESTREL_A)?; walingest .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2")) - .no_ondemand_download()?; + .await?; m.commit()?; let mut m = tline.begin_modification(Lsn(0x30)); walingest .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3")) - .no_ondemand_download()?; + .await?; m.commit()?; let mut m = tline.begin_modification(Lsn(0x40)); walingest .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4")) - .no_ondemand_download()?; + .await?; m.commit()?; let mut m = tline.begin_modification(Lsn(0x50)); walingest .put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5")) - .no_ondemand_download()?; + .await?; m.commit()?; assert_current_logical_size(&tline, Lsn(0x50)); @@ -1292,7 +1294,7 @@ mod tests { let mut m = tline.begin_modification(Lsn(0x70)); walingest .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1")) - .no_ondemand_download()?; + .await?; m.commit()?; assert_eq!( tline @@ -1317,7 +1319,7 @@ mod tests 
{ let mut m = tline.begin_modification(Lsn(0x80)); walingest .put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500")) - .no_ondemand_download()?; + .await?; m.commit()?; assert_eq!( tline @@ -1349,12 +1351,12 @@ mod tests { async fn test_drop_extend() -> Result<()> { let tenant = TenantHarness::create("test_drop_extend")?.load().await; let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; - let mut walingest = init_walingest_test(&tline)?; + let mut walingest = init_walingest_test(&tline).await?; let mut m = tline.begin_modification(Lsn(0x20)); walingest .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2")) - .no_ondemand_download()?; + .await?; m.commit()?; // Check that rel exists and size is correct @@ -1391,7 +1393,7 @@ mod tests { let mut m = tline.begin_modification(Lsn(0x40)); walingest .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4")) - .no_ondemand_download()?; + .await?; m.commit()?; // Check that rel exists and size is correct @@ -1418,7 +1420,7 @@ mod tests { async fn test_truncate_extend() -> Result<()> { let tenant = TenantHarness::create("test_truncate_extend")?.load().await; let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; - let mut walingest = init_walingest_test(&tline)?; + let mut walingest = init_walingest_test(&tline).await?; // Create a 20 MB relation (the size is arbitrary) let relsize = 20 * 1024 * 1024 / 8192; @@ -1427,7 +1429,7 @@ mod tests { let data = format!("foo blk {} at {}", blkno, Lsn(0x20)); walingest .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data)) - .no_ondemand_download()?; + .await?; } m.commit()?; @@ -1519,7 +1521,7 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); walingest .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data)) - .no_ondemand_download()?; + .await?; } m.commit()?; @@ -1556,7 +1558,7 @@ mod tests { async fn test_large_rel() -> Result<()> { let tenant = 
TenantHarness::create("test_large_rel")?.load().await; let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; - let mut walingest = init_walingest_test(&tline)?; + let mut walingest = init_walingest_test(&tline).await?; let mut lsn = 0x10; for blknum in 0..RELSEG_SIZE + 1 { @@ -1565,7 +1567,7 @@ mod tests { let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn))); walingest .put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img) - .no_ondemand_download()?; + .await?; m.commit()?; } diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index 3753807327..aca5e8e019 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -1,6 +1,7 @@ //! Actual Postgres connection handler to stream WAL to the server. use std::{ + error::Error, str::FromStr, sync::Arc, time::{Duration, SystemTime}, @@ -11,7 +12,7 @@ use bytes::BytesMut; use chrono::{NaiveDateTime, Utc}; use fail::fail_point; use futures::StreamExt; -use postgres::{SimpleQueryMessage, SimpleQueryRow}; +use postgres::{error::SqlState, SimpleQueryMessage, SimpleQueryRow}; use postgres_ffi::v14::xlog_utils::normalize_lsn; use postgres_ffi::WAL_SEGMENT_SIZE; use postgres_protocol::message::backend::ReplicationMessage; @@ -20,9 +21,7 @@ use tokio::{pin, select, sync::watch, time}; use tokio_postgres::{replication::ReplicationStream, Client}; use tracing::{debug, error, info, trace, warn}; -use crate::{ - metrics::LIVE_CONNECTIONS_COUNT, tenant::with_ondemand_download, walreceiver::TaskStateUpdate, -}; +use crate::{metrics::LIVE_CONNECTIONS_COUNT, walreceiver::TaskStateUpdate}; use crate::{ task_mgr, task_mgr::TaskKind, @@ -34,7 +33,7 @@ use crate::{ use postgres_connection::PgConnectionConfig; use postgres_ffi::waldecoder::WalStreamDecoder; use pq_proto::ReplicationFeedback; -use utils::lsn::Lsn; +use utils::{lsn::Lsn, 
postgres_backend_async::is_expected_io_error}; /// Status of the connection. #[derive(Debug, Clone, Copy)] @@ -70,10 +69,17 @@ pub async fn handle_walreceiver_connection( let mut config = wal_source_connconf.to_tokio_postgres_config(); config.application_name("pageserver"); config.replication_mode(tokio_postgres::config::ReplicationMode::Physical); - time::timeout(connect_timeout, config.connect(postgres::NoTls)) - .await - .context("Timed out while waiting for walreceiver connection to open")? - .context("Failed to open walreceiver connection")? + match time::timeout(connect_timeout, config.connect(postgres::NoTls)).await { + Ok(Ok(client_and_conn)) => client_and_conn, + Ok(Err(conn_err)) => { + let expected_error = ignore_expected_errors(conn_err)?; + info!("DB connection stream finished: {expected_error}"); + return Ok(()); + } + Err(elapsed) => anyhow::bail!( + "Timed out while waiting {elapsed} for walreceiver connection to open" + ), + } }; info!("connected!"); @@ -105,10 +111,8 @@ pub async fn handle_walreceiver_connection( connection_result = connection => match connection_result{ Ok(()) => info!("Walreceiver db connection closed"), Err(connection_error) => { - if connection_error.is_closed() { - info!("Connection closed regularly: {connection_error}") - } else { - warn!("Connection aborted: {connection_error}") + if let Err(e) = ignore_expected_errors(connection_error) { + warn!("Connection aborted: {e:#}") } } }, @@ -175,8 +179,7 @@ pub async fn handle_walreceiver_connection( let mut waldecoder = WalStreamDecoder::new(startpoint, timeline.pg_version); - let mut walingest = - with_ondemand_download(|| WalIngest::new(timeline.as_ref(), startpoint)).await?; + let mut walingest = WalIngest::new(timeline.as_ref(), startpoint).await?; while let Some(replication_message) = { select! 
{ @@ -190,14 +193,9 @@ pub async fn handle_walreceiver_connection( let replication_message = match replication_message { Ok(message) => message, Err(replication_error) => { - if replication_error.is_closed() { - info!("Replication stream got closed"); - return Ok(()); - } else { - return Err( - anyhow::Error::new(replication_error).context("replication stream error") - ); - } + let expected_error = ignore_expected_errors(replication_error)?; + info!("Replication stream finished: {expected_error}"); + return Ok(()); } }; @@ -251,16 +249,10 @@ pub async fn handle_walreceiver_connection( // at risk of hitting a deadlock. ensure!(lsn.is_aligned()); - with_ondemand_download(|| { - walingest.ingest_record( - recdata.clone(), - lsn, - &mut modification, - &mut decoded, - ) - }) - .await - .with_context(|| format!("could not ingest record at {lsn}"))?; + walingest + .ingest_record(recdata.clone(), lsn, &mut modification, &mut decoded) + .await + .with_context(|| format!("could not ingest record at {lsn}"))?; fail_point!("walreceiver-after-ingest"); @@ -409,3 +401,32 @@ async fn identify_system(client: &mut Client) -> anyhow::Result Err(IdentifyError.into()) } } + +/// We don't want to report connectivity problems as real errors towards connection manager because +/// 1. they happen frequently enough to make server logs hard to read and +/// 2. the connection manager can retry other safekeeper. +/// +/// If this function returns `Ok(pg_error)`, it's such an error. +/// The caller should log it at info level and then report to connection manager that we're done handling this connection. +/// Connection manager will then handle reconnections. +/// +/// If this function returns an `Err()`, the caller can bubble it up using `?`. +/// The connection manager will log the error at ERROR level. 
+fn ignore_expected_errors(pg_error: postgres::Error) -> anyhow::Result { + if pg_error.is_closed() + || pg_error + .source() + .and_then(|source| source.downcast_ref::()) + .map(is_expected_io_error) + .unwrap_or(false) + { + return Ok(pg_error); + } else if let Some(db_error) = pg_error.as_db_error() { + if db_error.code() == &SqlState::CONNECTION_FAILURE + && db_error.message().contains("end streaming") + { + return Ok(pg_error); + } + } + Err(pg_error).context("connection error") +} diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index c6199dddc0..88e3a12d96 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -111,6 +111,7 @@ pageserver_connect() PQfinish(pageserver_conn); pageserver_conn = NULL; FreeWaitEventSet(pageserver_conn_wes); + pageserver_conn_wes = NULL; neon_log(ERROR, "could not complete handshake with pageserver: %s", msg); @@ -179,7 +180,10 @@ pageserver_disconnect(void) prefetch_on_ps_disconnect(); } if (pageserver_conn_wes != NULL) + { FreeWaitEventSet(pageserver_conn_wes); + pageserver_conn_wes = NULL; + } } static void @@ -206,7 +210,7 @@ pageserver_send(NeonRequest * request) */ if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0) { - char *msg = PQerrorMessage(pageserver_conn); + char *msg = pchomp(PQerrorMessage(pageserver_conn)); pageserver_disconnect(); neon_log(ERROR, "failed to send page request: %s", msg); @@ -239,29 +243,33 @@ pageserver_receive(void) PG_TRY(); { /* read response */ - resp_buff.len = call_PQgetCopyData(&resp_buff.data); - resp_buff.cursor = 0; + int rc; - if (resp_buff.len < 0) + rc = call_PQgetCopyData(&resp_buff.data); + if (rc >= 0) { - if (resp_buff.len == -1) + resp_buff.len = rc; + resp_buff.cursor = 0; + resp = nm_unpack_response(&resp_buff); + PQfreemem(resp_buff.data); + + if (message_level_is_interesting(PageStoreTrace)) { - pageserver_disconnect(); - return NULL; + char *msg = nm_to_string((NeonMessage *) resp); + + neon_log(PageStoreTrace, "got 
response: %s", msg); + pfree(msg); } - else if (resp_buff.len == -2) - neon_log(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn)); } - resp = nm_unpack_response(&resp_buff); - PQfreemem(resp_buff.data); - - if (message_level_is_interesting(PageStoreTrace)) + else if (rc == -1) { - char *msg = nm_to_string((NeonMessage *) resp); - - neon_log(PageStoreTrace, "got response: %s", msg); - pfree(msg); + pageserver_disconnect(); + resp = NULL; } + else if (rc == -2) + neon_log(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn)); + else + neon_log(ERROR, "unexpected PQgetCopyData return value: %d", rc); } PG_CATCH(); { diff --git a/poetry.lock b/poetry.lock index 1b04230cef..edbcddd576 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1418,7 +1418,7 @@ pbr = "*" [[package]] name = "setuptools" -version = "65.5.0" +version = "65.5.1" description = "Easily download, build, install, upgrade, and uninstall Python packages" category = "main" optional = false @@ -1426,7 +1426,7 @@ python-versions = ">=3.7" [package.extras] docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-hoverxref (<2)", "sphinx-inline-tabs", "sphinx-notfound-page (==0.8.3)", "sphinx-reredirects", "sphinxcontrib-towncrier"] -testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8 (<5)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mock", "pip (>=19.1)", "pip-run (>=8.8)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] +testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8 (<5)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pip-run (>=8.8)", "pytest 
(>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"] [[package]] @@ -2283,8 +2283,8 @@ sarif-om = [ {file = "sarif_om-1.0.4.tar.gz", hash = "sha256:cd5f416b3083e00d402a92e449a7ff67af46f11241073eea0461802a3b5aef98"}, ] setuptools = [ - {file = "setuptools-65.5.0-py3-none-any.whl", hash = "sha256:f62ea9da9ed6289bfe868cd6845968a2c854d1427f8548d52cae02a42b4f0356"}, - {file = "setuptools-65.5.0.tar.gz", hash = "sha256:512e5536220e38146176efb833d4a62aa726b7bbff82cfbc8ba9eaa3996e0b17"}, + {file = "setuptools-65.5.1-py3-none-any.whl", hash = "sha256:d0b9a8433464d5800cbe05094acf5c6d52a91bfac9b52bcfc4d41382be5d5d31"}, + {file = "setuptools-65.5.1.tar.gz", hash = "sha256:e197a19aa8ec9722928f2206f8de752def0e4c9fc6953527360d1c36d94ddb2f"}, ] six = [ {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index e630b2758d..cbc067093e 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -2,6 +2,7 @@ name = "proxy" version = "0.1.0" edition = "2021" +license = "Apache-2.0" [dependencies] anyhow = "1.0" @@ -16,12 +17,14 @@ hashbrown = "0.12" hex = "0.4.3" hmac = "0.12.1" hyper = "0.14" +hyper-tungstenite = "0.8.1" itertools = "0.10.3" md5 = "0.7.0" once_cell = "1.13.0" parking_lot = "0.12" pin-project-lite = "0.2.7" rand = "0.8.3" +regex = "1.4.5" reqwest = { version = "0.11", default-features = false, features = [ "json", "rustls-tls" ] } routerify = "3" rustls = "0.20.0" @@ -35,10 +38,12 @@ thiserror = "1.0.30" tokio = { version = "1.17", features = ["macros"] } 
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } tokio-rustls = "0.23.0" +tls-listener = { version = "0.5.1", features = ["rustls", "hyper-h1"] } tracing = "0.1.36" tracing-subscriber = { version = "0.3", features = ["env-filter"] } url = "2.2.2" uuid = { version = "1.2", features = ["v4", "serde"] } +webpki-roots = "0.22.5" x509-parser = "0.14" metrics = { path = "../libs/metrics" } diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 4adf0ed940..e6a179a040 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -149,7 +149,7 @@ impl BackendType<'_, ClientCredentials<'_>> { // If there's no project so far, that entails that client doesn't // support SNI or other means of passing the project name. // We now expect to see a very specific payload in the place of password. - let fetch_magic_payload = async { + let fetch_magic_payload = |client| async { warn!("project name not specified, resorting to the password hack auth flow"); let payload = AuthFlow::new(client) .begin(auth::PasswordHack) @@ -161,10 +161,26 @@ impl BackendType<'_, ClientCredentials<'_>> { auth::Result::Ok(payload) }; + // If we want to use cleartext password flow, we can read the password + // from the client and pretend that it's a magic payload (PasswordHack hack). + let fetch_plaintext_password = |client| async { + info!("using cleartext password flow"); + let payload = AuthFlow::new(client) + .begin(auth::CleartextPassword) + .await? + .authenticate() + .await?; + + auth::Result::Ok(auth::password_hack::PasswordHackPayload { + project: String::new(), + password: payload, + }) + }; + // TODO: find a proper way to merge those very similar blocks. 
let (mut node, payload) = match self { Console(endpoint, creds) if creds.project.is_none() => { - let payload = fetch_magic_payload.await?; + let payload = fetch_magic_payload(client).await?; let mut creds = creds.as_ref(); creds.project = Some(payload.project.as_str().into()); @@ -174,8 +190,18 @@ impl BackendType<'_, ClientCredentials<'_>> { (node, payload) } + Console(endpoint, creds) if creds.use_cleartext_password_flow => { + // This is a hack to allow cleartext password in secure connections (wss). + let payload = fetch_plaintext_password(client).await?; + let creds = creds.as_ref(); + let node = console::Api::new(endpoint, extra, &creds) + .wake_compute() + .await?; + + (node, payload) + } Postgres(endpoint, creds) if creds.project.is_none() => { - let payload = fetch_magic_payload.await?; + let payload = fetch_magic_payload(client).await?; let mut creds = creds.as_ref(); creds.project = Some(payload.project.as_str().into()); diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index 0a3b84bb52..3b71bef9aa 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -34,6 +34,9 @@ pub struct ClientCredentials<'a> { pub user: &'a str, pub dbname: &'a str, pub project: Option>, + /// If `True`, we'll use the old cleartext password flow. This is used for + /// websocket connections, which want to minimize the number of round trips. 
+ pub use_cleartext_password_flow: bool, } impl ClientCredentials<'_> { @@ -50,6 +53,7 @@ impl<'a> ClientCredentials<'a> { user: self.user, dbname: self.dbname, project: self.project().map(Cow::Borrowed), + use_cleartext_password_flow: self.use_cleartext_password_flow, } } } @@ -59,6 +63,7 @@ impl<'a> ClientCredentials<'a> { params: &'a StartupMessageParams, sni: Option<&str>, common_name: Option<&str>, + use_cleartext_password_flow: bool, ) -> Result { use ClientCredsParseError::*; @@ -108,6 +113,7 @@ impl<'a> ClientCredentials<'a> { user = user, dbname = dbname, project = project.as_deref(), + use_cleartext_password_flow = use_cleartext_password_flow, "credentials" ); @@ -115,6 +121,7 @@ impl<'a> ClientCredentials<'a> { user, dbname, project, + use_cleartext_password_flow, }) } } @@ -141,7 +148,7 @@ mod tests { let options = StartupMessageParams::new([("user", "john_doe")]); // TODO: check that `creds.dbname` is None. - let creds = ClientCredentials::parse(&options, None, None)?; + let creds = ClientCredentials::parse(&options, None, None, false)?; assert_eq!(creds.user, "john_doe"); Ok(()) @@ -151,7 +158,7 @@ mod tests { fn parse_missing_project() -> anyhow::Result<()> { let options = StartupMessageParams::new([("user", "john_doe"), ("database", "world")]); - let creds = ClientCredentials::parse(&options, None, None)?; + let creds = ClientCredentials::parse(&options, None, None, false)?; assert_eq!(creds.user, "john_doe"); assert_eq!(creds.dbname, "world"); assert_eq!(creds.project, None); @@ -166,7 +173,7 @@ mod tests { let sni = Some("foo.localhost"); let common_name = Some("localhost"); - let creds = ClientCredentials::parse(&options, sni, common_name)?; + let creds = ClientCredentials::parse(&options, sni, common_name, false)?; assert_eq!(creds.user, "john_doe"); assert_eq!(creds.dbname, "world"); assert_eq!(creds.project.as_deref(), Some("foo")); @@ -182,7 +189,7 @@ mod tests { ("options", "-ckey=1 project=bar -c geqo=off"), ]); - let creds = 
ClientCredentials::parse(&options, None, None)?; + let creds = ClientCredentials::parse(&options, None, None, false)?; assert_eq!(creds.user, "john_doe"); assert_eq!(creds.dbname, "world"); assert_eq!(creds.project.as_deref(), Some("bar")); @@ -201,7 +208,7 @@ mod tests { let sni = Some("baz.localhost"); let common_name = Some("localhost"); - let creds = ClientCredentials::parse(&options, sni, common_name)?; + let creds = ClientCredentials::parse(&options, sni, common_name, false)?; assert_eq!(creds.user, "john_doe"); assert_eq!(creds.dbname, "world"); assert_eq!(creds.project.as_deref(), Some("baz")); @@ -220,7 +227,8 @@ mod tests { let sni = Some("second.localhost"); let common_name = Some("localhost"); - let err = ClientCredentials::parse(&options, sni, common_name).expect_err("should fail"); + let err = + ClientCredentials::parse(&options, sni, common_name, false).expect_err("should fail"); match err { InconsistentProjectNames { domain, option } => { assert_eq!(option, "first"); @@ -237,7 +245,8 @@ mod tests { let sni = Some("project.localhost"); let common_name = Some("example.com"); - let err = ClientCredentials::parse(&options, sni, common_name).expect_err("should fail"); + let err = + ClientCredentials::parse(&options, sni, common_name, false).expect_err("should fail"); match err { InconsistentSni { sni, cn } => { assert_eq!(sni, "project.localhost"); diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index d9ee50894d..4b982c0c5e 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -37,6 +37,17 @@ impl AuthMethod for PasswordHack { } } +/// Use clear-text password auth called `password` in docs +/// +pub struct CleartextPassword; + +impl AuthMethod for CleartextPassword { + #[inline(always)] + fn first_message(&self) -> BeMessage<'_> { + Be::AuthenticationCleartextPassword + } +} + /// This wrapper for [`PqStream`] performs client authentication. 
#[must_use] pub struct AuthFlow<'a, Stream, State> { @@ -86,6 +97,18 @@ impl AuthFlow<'_, S, PasswordHack> { } } +impl AuthFlow<'_, S, CleartextPassword> { + /// Perform user authentication. Raise an error in case authentication failed. + pub async fn authenticate(self) -> super::Result> { + let msg = self.stream.read_password_message().await?; + let password = msg + .strip_suffix(&[0]) + .ok_or(AuthErrorImpl::MalformedPassword("missing terminator"))?; + + Ok(password.to_vec()) + } +} + /// Stream wrapper for handling [SCRAM](crate::scram) auth. impl AuthFlow<'_, S, Scram<'_>> { /// Perform user authentication. Raise an error in case authentication failed. diff --git a/proxy/src/http.rs b/proxy/src/http.rs index 096a33d73d..e847edc8bd 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -1,4 +1,5 @@ pub mod server; +pub mod websocket; use crate::url::ApiUrl; diff --git a/proxy/src/http/websocket.rs b/proxy/src/http/websocket.rs new file mode 100644 index 0000000000..33c2752307 --- /dev/null +++ b/proxy/src/http/websocket.rs @@ -0,0 +1,263 @@ +use bytes::{Buf, Bytes}; +use futures::{Sink, Stream, StreamExt}; +use hyper::server::accept::{self}; +use hyper::server::conn::AddrIncoming; +use hyper::upgrade::Upgraded; +use hyper::{Body, Request, Response, StatusCode}; +use hyper_tungstenite::{tungstenite, WebSocketStream}; +use hyper_tungstenite::{tungstenite::Message, HyperWebsocket}; +use pin_project_lite::pin_project; +use tokio::net::TcpListener; + +use std::convert::Infallible; +use std::future::ready; +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; +use tls_listener::TlsListener; + +use tokio::io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf}; + +use tracing::{error, info, info_span, warn, Instrument}; +use utils::http::{error::ApiError, json::json_response}; + +use crate::cancellation::CancelMap; +use crate::config::ProxyConfig; +use crate::proxy::handle_ws_client; + +pin_project! 
{ + /// This is a wrapper around a WebSocketStream that implements AsyncRead and AsyncWrite. + pub struct WebSocketRW { + #[pin] + stream: WebSocketStream, + chunk: Option, + } +} + +// FIXME: explain why this is safe or try to remove `unsafe impl`. +unsafe impl Sync for WebSocketRW {} + +impl WebSocketRW { + pub fn new(stream: WebSocketStream) -> Self { + Self { + stream, + chunk: None, + } + } + + fn has_chunk(&self) -> bool { + if let Some(ref chunk) = self.chunk { + chunk.remaining() > 0 + } else { + false + } + } +} + +fn ws_err_into(e: tungstenite::Error) -> io::Error { + io::Error::new(io::ErrorKind::Other, e.to_string()) +} + +impl AsyncWrite for WebSocketRW { + fn poll_write( + self: Pin<&mut Self>, + cx: &mut Context<'_>, + buf: &[u8], + ) -> Poll> { + let mut this = self.project(); + match this.stream.as_mut().poll_ready(cx) { + Poll::Ready(Ok(())) => { + if let Err(e) = this + .stream + .as_mut() + .start_send(Message::Binary(buf.to_vec())) + { + Poll::Ready(Err(ws_err_into(e))) + } else { + Poll::Ready(Ok(buf.len())) + } + } + Poll::Ready(Err(e)) => Poll::Ready(Err(ws_err_into(e))), + Poll::Pending => { + cx.waker().wake_by_ref(); + Poll::Pending + } + } + } + + fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + self.project().stream.poll_flush(cx).map_err(ws_err_into) + } + + fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + self.project().stream.poll_close(cx).map_err(ws_err_into) + } +} + +impl AsyncRead for WebSocketRW { + fn poll_read( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + buf: &mut ReadBuf<'_>, + ) -> Poll> { + if buf.remaining() == 0 { + return Poll::Ready(Ok(())); + } + + let inner_buf = match self.as_mut().poll_fill_buf(cx) { + Poll::Ready(Ok(buf)) => buf, + Poll::Ready(Err(err)) => return Poll::Ready(Err(err)), + Poll::Pending => return Poll::Pending, + }; + let len = std::cmp::min(inner_buf.len(), buf.remaining()); + buf.put_slice(&inner_buf[..len]); + + self.consume(len); + 
Poll::Ready(Ok(())) + } +} + +impl AsyncBufRead for WebSocketRW { + fn poll_fill_buf(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + loop { + if self.as_mut().has_chunk() { + let buf = self.project().chunk.as_ref().unwrap().chunk(); + return Poll::Ready(Ok(buf)); + } else { + match self.as_mut().project().stream.poll_next(cx) { + Poll::Ready(Some(Ok(message))) => match message { + Message::Text(_) => {} + Message::Binary(chunk) => { + *self.as_mut().project().chunk = Some(Bytes::from(chunk)); + } + Message::Ping(_) => { + // No need to send a reply: tungstenite takes care of this for you. + } + Message::Pong(_) => {} + Message::Close(_) => { + // No need to send a reply: tungstenite takes care of this for you. + return Poll::Ready(Ok(&[])); + } + Message::Frame(_) => { + unreachable!(); + } + }, + Poll::Ready(Some(Err(err))) => return Poll::Ready(Err(ws_err_into(err))), + Poll::Ready(None) => return Poll::Ready(Ok(&[])), + Poll::Pending => return Poll::Pending, + } + } + } + } + + fn consume(self: Pin<&mut Self>, amt: usize) { + if amt > 0 { + self.project() + .chunk + .as_mut() + .expect("No chunk present") + .advance(amt); + } + } +} + +async fn serve_websocket( + websocket: HyperWebsocket, + config: &ProxyConfig, + cancel_map: &CancelMap, + session_id: uuid::Uuid, + hostname: Option, +) -> anyhow::Result<()> { + let websocket = websocket.await?; + handle_ws_client( + config, + cancel_map, + session_id, + WebSocketRW::new(websocket), + hostname, + ) + .await?; + Ok(()) +} + +async fn ws_handler( + mut request: Request, + config: &'static ProxyConfig, + cancel_map: Arc, + session_id: uuid::Uuid, +) -> Result, ApiError> { + let host = request + .headers() + .get("host") + .and_then(|h| h.to_str().ok()) + .and_then(|h| h.split(':').next()) + .map(|s| s.to_string()); + + // Check if the request is a websocket upgrade request. 
+ if hyper_tungstenite::is_upgrade_request(&request) { + let (response, websocket) = hyper_tungstenite::upgrade(&mut request, None) + .map_err(|e| ApiError::BadRequest(e.into()))?; + + tokio::spawn(async move { + if let Err(e) = serve_websocket(websocket, config, &cancel_map, session_id, host).await + { + error!("error in websocket connection: {:?}", e); + } + }); + + // Return the response so the spawned future can continue. + Ok(response) + } else { + json_response(StatusCode::OK, "Connect with a websocket client") + } +} + +pub async fn task_main( + ws_listener: TcpListener, + config: &'static ProxyConfig, +) -> anyhow::Result<()> { + scopeguard::defer! { + info!("websocket server has shut down"); + } + + let tls_config = config.tls_config.as_ref().map(|cfg| cfg.to_server_config()); + let tls_acceptor: tokio_rustls::TlsAcceptor = match tls_config { + Some(config) => config.into(), + None => { + warn!("TLS config is missing, WebSocket Secure server will not be started"); + return Ok(()); + } + }; + + let addr_incoming = AddrIncoming::from_listener(ws_listener)?; + + let tls_listener = TlsListener::new(tls_acceptor, addr_incoming).filter(|conn| { + if let Err(err) = conn { + error!("failed to accept TLS connection for websockets: {:?}", err); + ready(false) + } else { + ready(true) + } + }); + + let make_svc = hyper::service::make_service_fn(|_stream| async move { + Ok::<_, Infallible>(hyper::service::service_fn( + move |req: Request| async move { + let cancel_map = Arc::new(CancelMap::default()); + let session_id = uuid::Uuid::new_v4(); + ws_handler(req, config, cancel_map, session_id) + .instrument(info_span!( + "ws-client", + session = format_args!("{session_id}") + )) + .await + }, + )) + }); + + hyper::Server::builder(accept::from_stream(tls_listener)) + .serve(make_svc) + .await?; + + Ok(()) +} diff --git a/proxy/src/main.rs b/proxy/src/main.rs index 89ea9142a9..aa6766c102 100644 --- a/proxy/src/main.rs +++ b/proxy/src/main.rs @@ -110,12 +110,23 @@ async fn 
main() -> anyhow::Result<()> { info!("Starting proxy on {proxy_address}"); let proxy_listener = TcpListener::bind(proxy_address).await?; - let tasks = [ + let mut tasks = vec![ tokio::spawn(http::server::task_main(http_listener)), tokio::spawn(proxy::task_main(config, proxy_listener)), tokio::task::spawn_blocking(move || mgmt::thread_main(mgmt_listener)), - ] - .map(flatten_err); + ]; + + if let Some(wss_address) = arg_matches.get_one::("wss") { + let wss_address: SocketAddr = wss_address.parse()?; + info!("Starting wss on {}", wss_address); + let wss_listener = TcpListener::bind(wss_address).await?; + tasks.push(tokio::spawn(http::websocket::task_main( + wss_listener, + config, + ))); + } + + let tasks = tasks.into_iter().map(flatten_err); set_build_info_metric(GIT_VERSION); // This will block until all tasks have completed. @@ -155,6 +166,11 @@ fn cli() -> clap::Command { .help("listen for incoming http connections (metrics, etc) on ip:port") .default_value("127.0.0.1:7001"), ) + .arg( + Arg::new("wss") + .long("wss") + .help("listen for incoming wss connections on ip:port"), + ) .arg( Arg::new("uri") .short('u') diff --git a/proxy/src/mgmt.rs b/proxy/src/mgmt.rs index 2e0a502e7f..cf83b48ae0 100644 --- a/proxy/src/mgmt.rs +++ b/proxy/src/mgmt.rs @@ -9,7 +9,10 @@ use std::{ thread, }; use tracing::{error, info, info_span}; -use utils::postgres_backend::{self, AuthType, PostgresBackend}; +use utils::{ + postgres_backend::{self, AuthType, PostgresBackend}, + postgres_backend_async::QueryError, +}; /// Console management API listener thread. /// It spawns console response handlers needed for the link auth. 
@@ -47,7 +50,7 @@ pub fn thread_main(listener: TcpListener) -> anyhow::Result<()> { } } -fn handle_connection(socket: TcpStream) -> anyhow::Result<()> { +fn handle_connection(socket: TcpStream) -> Result<(), QueryError> { let pgbackend = PostgresBackend::new(socket, AuthType::Trust, None, true)?; pgbackend.run(&mut MgmtHandler) } @@ -58,7 +61,7 @@ pub type ComputeReady = Result; // TODO: replace with an http-based protocol. struct MgmtHandler; impl postgres_backend::Handler for MgmtHandler { - fn process_query(&mut self, pgb: &mut PostgresBackend, query: &str) -> anyhow::Result<()> { + fn process_query(&mut self, pgb: &mut PostgresBackend, query: &str) -> Result<(), QueryError> { try_process_query(pgb, query).map_err(|e| { error!("failed to process response: {e:?}"); e @@ -66,8 +69,8 @@ impl postgres_backend::Handler for MgmtHandler { } } -fn try_process_query(pgb: &mut PostgresBackend, query: &str) -> anyhow::Result<()> { - let resp: KickSession = serde_json::from_str(query)?; +fn try_process_query(pgb: &mut PostgresBackend, query: &str) -> Result<(), QueryError> { + let resp: KickSession = serde_json::from_str(query).context("Failed to parse query as json")?; let span = info_span!("event", session_id = resp.session_id); let _enter = span.enter(); @@ -81,7 +84,7 @@ fn try_process_query(pgb: &mut PostgresBackend, query: &str) -> anyhow::Result<( } Err(e) => { error!("failed to deliver response to per-client task"); - pgb.write_message(&BeMessage::ErrorResponse(&e.to_string()))?; + pgb.write_message(&BeMessage::ErrorResponse(&e.to_string(), None))?; } } diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 382f7cd918..63573d49c0 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -82,6 +82,47 @@ pub async fn task_main( } } +pub async fn handle_ws_client( + config: &ProxyConfig, + cancel_map: &CancelMap, + session_id: uuid::Uuid, + stream: impl AsyncRead + AsyncWrite + Unpin + Send, + hostname: Option, +) -> anyhow::Result<()> { + // The `closed` counter 
will increase when this future is destroyed. + NUM_CONNECTIONS_ACCEPTED_COUNTER.inc(); + scopeguard::defer! { + NUM_CONNECTIONS_CLOSED_COUNTER.inc(); + } + + let tls = config.tls_config.as_ref(); + let hostname = hostname.as_deref(); + + // TLS is None here, because the connection is already encrypted. + let do_handshake = handshake(stream, None, cancel_map).instrument(info_span!("handshake")); + let (mut stream, params) = match do_handshake.await? { + Some(x) => x, + None => return Ok(()), // it's a cancellation request + }; + + // Extract credentials which we're going to use for auth. + let creds = { + let common_name = tls.and_then(|tls| tls.common_name.as_deref()); + let result = config + .auth_backend + .as_ref() + .map(|_| auth::ClientCredentials::parse(¶ms, hostname, common_name, true)) + .transpose(); + + async { result }.or_else(|e| stream.throw_error(e)).await? + }; + + let client = Client::new(stream, creds, ¶ms, session_id); + cancel_map + .with_session(|session| client.connect_to_db(session)) + .await +} + async fn handle_client( config: &ProxyConfig, cancel_map: &CancelMap, @@ -108,7 +149,7 @@ async fn handle_client( let result = config .auth_backend .as_ref() - .map(|_| auth::ClientCredentials::parse(¶ms, sni, common_name)) + .map(|_| auth::ClientCredentials::parse(¶ms, sni, common_name, false)) .transpose(); async { result }.or_else(|e| stream.throw_error(e)).await? 
diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index 19e1479068..02a0fabe9a 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -2,7 +2,7 @@ use crate::error::UserFacingError; use anyhow::bail; use bytes::BytesMut; use pin_project_lite::pin_project; -use pq_proto::{BeMessage, FeMessage, FeStartupPacket}; +use pq_proto::{BeMessage, ConnectionError, FeMessage, FeStartupPacket}; use rustls::ServerConfig; use std::pin::Pin; use std::sync::Arc; @@ -47,18 +47,13 @@ fn err_connection() -> io::Error { io::Error::new(io::ErrorKind::ConnectionAborted, "connection is lost") } -// TODO: change error type of `FeMessage::read_fut` -fn from_anyhow(e: anyhow::Error) -> io::Error { - io::Error::new(io::ErrorKind::Other, e.to_string()) -} - impl PqStream { /// Receive [`FeStartupPacket`], which is a first packet sent by a client. pub async fn read_startup_packet(&mut self) -> io::Result { // TODO: `FeStartupPacket::read_fut` should return `FeStartupPacket` let msg = FeStartupPacket::read_fut(&mut self.stream) .await - .map_err(from_anyhow)? + .map_err(ConnectionError::into_io_error)? .ok_or_else(err_connection)?; match msg { @@ -80,7 +75,7 @@ impl PqStream { async fn read_message(&mut self) -> io::Result { FeMessage::read_fut(&mut self.stream) .await - .map_err(from_anyhow)? + .map_err(ConnectionError::into_io_error)? .ok_or_else(err_connection) } } @@ -112,7 +107,8 @@ impl PqStream { /// This method exists due to `&str` not implementing `Into`. 
pub async fn throw_error_str(&mut self, error: &'static str) -> anyhow::Result { tracing::info!("forwarding error to user: {error}"); - self.write_message(&BeMessage::ErrorResponse(error)).await?; + self.write_message(&BeMessage::ErrorResponse(error, None)) + .await?; bail!(error) } @@ -124,7 +120,8 @@ impl PqStream { { let msg = error.to_string_client(); tracing::info!("forwarding error to user: {msg}"); - self.write_message(&BeMessage::ErrorResponse(&msg)).await?; + self.write_message(&BeMessage::ErrorResponse(&msg, None)) + .await?; bail!(error) } } diff --git a/run_clippy.sh b/run_clippy.sh index bf770432d0..fe0e745d7d 100755 --- a/run_clippy.sh +++ b/run_clippy.sh @@ -9,8 +9,8 @@ # In vscode, this setting is Rust-analyzer>Check On Save:Command -# Not every feature is supported in macOS builds, e.g. `profiling`, -# avoid running regular linting script that checks every feature. +# Not every feature is supported in macOS builds. Avoid running regular linting +# script that checks every feature. 
if [[ "$OSTYPE" == "darwin"* ]]; then # no extra features to test currently, add more here when needed cargo clippy --locked --all --all-targets --features testing -- -A unknown_lints -D warnings diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index fbcb3f34f7..d0c804fe4e 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -2,6 +2,7 @@ name = "safekeeper" version = "0.1.0" edition = "2021" +license = "Apache-2.0" [dependencies] async-stream = "0.3" diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 5ad88276e8..b130ea86bd 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -143,6 +143,19 @@ fn main() -> anyhow::Result<()> { return Ok(()); } + let auth = match args.auth_validation_public_key_path.as_ref() { + None => { + info!("auth is disabled"); + None + } + Some(path) => { + info!("loading JWT auth key from {}", path.display()); + Some(Arc::new( + JwtAuth::from_key_path(path).context("failed to load the auth key")?, + )) + } + }; + let conf = SafeKeeperConf { workdir, my_id: id, @@ -156,7 +169,7 @@ fn main() -> anyhow::Result<()> { max_offloader_lag_bytes: args.max_offloader_lag, backup_runtime_threads: args.wal_backup_threads, wal_backup_enabled: !args.disable_wal_backup, - auth_validation_public_key_path: args.auth_validation_public_key_path, + auth, }; // initialize sentry if SENTRY_DSN is provided @@ -186,19 +199,6 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { e })?; - let auth = match conf.auth_validation_public_key_path.as_ref() { - None => { - info!("auth is disabled"); - None - } - Some(path) => { - info!("loading JWT auth key from {}", path.display()); - Some(Arc::new( - JwtAuth::from_key_path(path).context("failed to load the auth key")?, - )) - } - }; - // Register metrics collector for active timelines. It's important to do this // after daemonizing, otherwise process collector will be upset. 
let timeline_collector = safekeeper::metrics::TimelineCollector::new(); @@ -212,12 +212,11 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { GlobalTimelines::init(conf.clone(), wal_backup_launcher_tx)?; let conf_ = conf.clone(); - let auth_ = auth.clone(); threads.push( thread::Builder::new() .name("http_endpoint_thread".into()) .spawn(|| { - let router = http::make_router(conf_, auth_); + let router = http::make_router(conf_); endpoint::serve_thread_main( router, http_listener, @@ -230,11 +229,7 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { let conf_cloned = conf.clone(); let safekeeper_thread = thread::Builder::new() .name("safekeeper thread".into()) - .spawn(|| { - if let Err(e) = wal_service::thread_main(conf_cloned, pg_listener, auth) { - info!("safekeeper thread terminated: {e}"); - } - }) + .spawn(|| wal_service::thread_main(conf_cloned, pg_listener)) .unwrap(); threads.push(safekeeper_thread); @@ -244,7 +239,6 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { thread::Builder::new() .name("broker thread".into()) .spawn(|| { - // TODO: add auth? 
broker::thread_main(conf_); })?, ); diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index 05527303ca..60df5dd372 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -8,16 +8,16 @@ use crate::receive_wal::ReceiveWalConn; use crate::send_wal::ReplicationConn; use crate::{GlobalTimelines, SafeKeeperConf}; -use anyhow::{bail, ensure, Context, Result}; +use anyhow::Context; use postgres_ffi::PG_TLI; use regex::Regex; use pq_proto::{BeMessage, FeStartupPacket, RowDescriptor, INT4_OID, TEXT_OID}; use std::str; -use std::sync::Arc; use tracing::info; -use utils::auth::{Claims, JwtAuth, Scope}; +use utils::auth::{Claims, Scope}; +use utils::postgres_backend_async::QueryError; use utils::{ id::{TenantId, TenantTimelineId, TimelineId}, lsn::Lsn, @@ -32,7 +32,6 @@ pub struct SafekeeperPostgresHandler { pub tenant_id: Option, pub timeline_id: Option, pub ttid: TenantTimelineId, - auth: Option>, claims: Option, } @@ -44,7 +43,7 @@ enum SafekeeperPostgresCommand { JSONCtrl { cmd: AppendLogicalMessage }, } -fn parse_cmd(cmd: &str) -> Result { +fn parse_cmd(cmd: &str) -> anyhow::Result { if cmd.starts_with("START_WAL_PUSH") { Ok(SafekeeperPostgresCommand::StartWalPush) } else if cmd.starts_with("START_REPLICATION") { @@ -64,13 +63,17 @@ fn parse_cmd(cmd: &str) -> Result { cmd: serde_json::from_str(cmd)?, }) } else { - bail!("unsupported command {}", cmd); + anyhow::bail!("unsupported command {cmd}"); } } impl postgres_backend::Handler for SafekeeperPostgresHandler { // tenant_id and timeline_id are passed in connection string params - fn startup(&mut self, _pgb: &mut PostgresBackend, sm: &FeStartupPacket) -> Result<()> { + fn startup( + &mut self, + _pgb: &mut PostgresBackend, + sm: &FeStartupPacket, + ) -> Result<(), QueryError> { if let FeStartupPacket::StartupMessage { params, .. 
} = sm { if let Some(options) = params.options_raw() { for opt in options { @@ -79,10 +82,14 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler { // https://github.com/neondatabase/neon/pull/2433#discussion_r970005064 match opt.split_once('=') { Some(("ztenantid", value)) | Some(("tenant_id", value)) => { - self.tenant_id = Some(value.parse()?); + self.tenant_id = Some(value.parse().with_context(|| { + format!("Failed to parse {value} as tenant id") + })?); } Some(("ztimelineid", value)) | Some(("timeline_id", value)) => { - self.timeline_id = Some(value.parse()?); + self.timeline_id = Some(value.parse().with_context(|| { + format!("Failed to parse {value} as timeline id") + })?); } _ => continue, } @@ -95,7 +102,9 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler { Ok(()) } else { - bail!("Safekeeper received unexpected initial message: {:?}", sm); + Err(QueryError::Other(anyhow::anyhow!( + "Safekeeper received unexpected initial message: {sm:?}" + ))) } } @@ -103,20 +112,20 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler { &mut self, _pgb: &mut PostgresBackend, jwt_response: &[u8], - ) -> anyhow::Result<()> { + ) -> Result<(), QueryError> { // this unwrap is never triggered, because check_auth_jwt only called when auth_type is NeonJWT // which requires auth to be present let data = self + .conf .auth .as_ref() .unwrap() - .decode(str::from_utf8(jwt_response)?)?; + .decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)?; - if matches!(data.claims.scope, Scope::Tenant) { - ensure!( - data.claims.tenant_id.is_some(), + if matches!(data.claims.scope, Scope::Tenant) && data.claims.tenant_id.is_none() { + return Err(QueryError::Other(anyhow::anyhow!( "jwt token scope is Tenant, but tenant id is missing" - ) + ))); } info!( @@ -128,7 +137,11 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler { Ok(()) } - fn process_query(&mut self, pgb: &mut PostgresBackend, query_string: &str) -> Result<()> { 
+ fn process_query( + &mut self, + pgb: &mut PostgresBackend, + query_string: &str, + ) -> Result<(), QueryError> { if query_string .to_ascii_lowercase() .starts_with("set datestyle to ") @@ -149,39 +162,45 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler { self.check_permission(Some(tenant_id))?; self.ttid = TenantTimelineId::new(tenant_id, timeline_id); - match cmd { + let res = match cmd { SafekeeperPostgresCommand::StartWalPush => ReceiveWalConn::new(pgb).run(self), SafekeeperPostgresCommand::StartReplication { start_lsn } => { ReplicationConn::new(pgb).run(self, pgb, start_lsn) } SafekeeperPostgresCommand::IdentifySystem => self.handle_identify_system(pgb), SafekeeperPostgresCommand::JSONCtrl { ref cmd } => handle_json_ctrl(self, pgb, cmd), - } - .context(format!( - "Failed to process query for timeline {timeline_id}" - ))?; + }; - Ok(()) + match res { + Ok(()) => Ok(()), + Err(QueryError::Disconnected(connection_error)) => { + info!("Timeline {tenant_id}/{timeline_id} query failed with connection error: {connection_error}"); + Err(QueryError::Disconnected(connection_error)) + } + Err(QueryError::Other(e)) => Err(QueryError::Other(e.context(format!( + "Failed to process query for timeline {}", + self.ttid + )))), + } } } impl SafekeeperPostgresHandler { - pub fn new(conf: SafeKeeperConf, auth: Option>) -> Self { + pub fn new(conf: SafeKeeperConf) -> Self { SafekeeperPostgresHandler { conf, appname: None, tenant_id: None, timeline_id: None, ttid: TenantTimelineId::empty(), - auth, claims: None, } } // when accessing management api supply None as an argument // when using to authorize tenant pass corresponding tenant id - fn check_permission(&self, tenant_id: Option) -> Result<()> { - if self.auth.is_none() { + fn check_permission(&self, tenant_id: Option) -> anyhow::Result<()> { + if self.conf.auth.is_none() { // auth is set to Trust, nothing to check so just return ok return Ok(()); } @@ -198,7 +217,7 @@ impl SafekeeperPostgresHandler { /// /// 
Handle IDENTIFY_SYSTEM replication command /// - fn handle_identify_system(&mut self, pgb: &mut PostgresBackend) -> Result<()> { + fn handle_identify_system(&mut self, pgb: &mut PostgresBackend) -> Result<(), QueryError> { let tli = GlobalTimelines::get(self.ttid)?; let lsn = if self.is_walproposer_recovery() { diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index a9a9eb3388..a917d61678 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -277,12 +277,9 @@ async fn record_safekeeper_info(mut request: Request) -> Result>, -) -> RouterBuilder { +pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder { let mut router = endpoint::make_router(); - if auth.is_some() { + if conf.auth.is_some() { router = router.middleware(auth_middleware(|request| { #[allow(clippy::mutable_key_type)] static ALLOWLIST_ROUTES: Lazy> = @@ -298,6 +295,7 @@ pub fn make_router( // NB: on any changes do not forget to update the OpenAPI spec // located nearby (/safekeeper/src/http/openapi_spec.yaml). 
+ let auth = conf.auth.clone(); router .data(Arc::new(conf)) .data(auth) diff --git a/safekeeper/src/json_ctrl.rs b/safekeeper/src/json_ctrl.rs index 746b4461b7..32a24a4978 100644 --- a/safekeeper/src/json_ctrl.rs +++ b/safekeeper/src/json_ctrl.rs @@ -8,11 +8,12 @@ use std::sync::Arc; -use anyhow::Result; +use anyhow::Context; use bytes::Bytes; use serde::{Deserialize, Serialize}; use tracing::*; use utils::id::TenantTimelineId; +use utils::postgres_backend_async::QueryError; use crate::handler::SafekeeperPostgresHandler; use crate::safekeeper::{AcceptorProposerMessage, AppendResponse, ServerInfo}; @@ -47,7 +48,7 @@ pub struct AppendLogicalMessage { pg_version: u32, } -#[derive(Serialize, Deserialize)] +#[derive(Debug, Serialize, Deserialize)] struct AppendResult { // safekeeper state after append state: SafeKeeperState, @@ -62,8 +63,8 @@ pub fn handle_json_ctrl( spg: &SafekeeperPostgresHandler, pgb: &mut PostgresBackend, append_request: &AppendLogicalMessage, -) -> Result<()> { - info!("JSON_CTRL request: {:?}", append_request); +) -> Result<(), QueryError> { + info!("JSON_CTRL request: {append_request:?}"); // need to init safekeeper state before AppendRequest let tli = prepare_safekeeper(spg.ttid, append_request.pg_version)?; @@ -78,7 +79,8 @@ pub fn handle_json_ctrl( state: tli.get_state().1, inserted_wal, }; - let response_data = serde_json::to_vec(&response)?; + let response_data = serde_json::to_vec(&response) + .with_context(|| format!("Response {response:?} is not a json array"))?; pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor { name: b"json", @@ -93,7 +95,7 @@ pub fn handle_json_ctrl( /// Prepare safekeeper to process append requests without crashes, /// by sending ProposerGreeting with default server.wal_seg_size. 
-fn prepare_safekeeper(ttid: TenantTimelineId, pg_version: u32) -> Result> { +fn prepare_safekeeper(ttid: TenantTimelineId, pg_version: u32) -> anyhow::Result> { GlobalTimelines::create( ttid, ServerInfo { @@ -106,7 +108,7 @@ fn prepare_safekeeper(ttid: TenantTimelineId, pg_version: u32) -> Result, term: Term, lsn: Lsn) -> Result<()> { +fn send_proposer_elected(tli: &Arc, term: Term, lsn: Lsn) -> anyhow::Result<()> { // add new term to existing history let history = tli.get_state().1.acceptor_state.term_history; let history = history.up_to(lsn.checked_sub(1u64).unwrap()); @@ -125,7 +127,7 @@ fn send_proposer_elected(tli: &Arc, term: Term, lsn: Lsn) -> Result<() Ok(()) } -#[derive(Serialize, Deserialize)] +#[derive(Debug, Serialize, Deserialize)] struct InsertedWAL { begin_lsn: Lsn, end_lsn: Lsn, @@ -134,7 +136,10 @@ struct InsertedWAL { /// Extend local WAL with new LogicalMessage record. To do that, /// create AppendRequest with new WAL and pass it to safekeeper. -fn append_logical_message(tli: &Arc, msg: &AppendLogicalMessage) -> Result { +fn append_logical_message( + tli: &Arc, + msg: &AppendLogicalMessage, +) -> anyhow::Result { let wal_data = encode_logical_message(&msg.lm_prefix, &msg.lm_message); let sk_state = tli.get_state().1; diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 5decfe64de..891d73533f 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -24,7 +24,9 @@ pub mod wal_service; pub mod wal_storage; mod timelines_global_map; +use std::sync::Arc; pub use timelines_global_map::GlobalTimelines; +use utils::auth::JwtAuth; pub mod defaults { pub use safekeeper_api::{ @@ -57,7 +59,7 @@ pub struct SafeKeeperConf { pub max_offloader_lag_bytes: u64, pub backup_runtime_threads: Option, pub wal_backup_enabled: bool, - pub auth_validation_public_key_path: Option, + pub auth: Option>, } impl SafeKeeperConf { @@ -87,7 +89,7 @@ impl SafeKeeperConf { broker_keepalive_interval: Duration::from_secs(5), backup_runtime_threads: None, 
wal_backup_enabled: true, - auth_validation_public_key_path: None, + auth: None, heartbeat_timeout: Duration::new(5, 0), max_offloader_lag_bytes: defaults::DEFAULT_MAX_OFFLOADER_LAG_BYTES, } diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index be7f071abb..671e5470a0 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -2,11 +2,13 @@ //! Gets messages from the network, passes them down to consensus module and //! sends replies back. -use anyhow::{anyhow, bail, Result}; +use anyhow::anyhow; +use anyhow::Context; use bytes::BytesMut; use tracing::*; use utils::lsn::Lsn; +use utils::postgres_backend_async::QueryError; use crate::safekeeper::ServerInfo; use crate::timeline::Timeline; @@ -43,7 +45,7 @@ impl<'pg> ReceiveWalConn<'pg> { } // Send message to the postgres - fn write_msg(&mut self, msg: &AcceptorProposerMessage) -> Result<()> { + fn write_msg(&mut self, msg: &AcceptorProposerMessage) -> anyhow::Result<()> { let mut buf = BytesMut::with_capacity(128); msg.serialize(&mut buf)?; self.pg_backend.write_message(&BeMessage::CopyData(&buf))?; @@ -51,7 +53,7 @@ impl<'pg> ReceiveWalConn<'pg> { } /// Receive WAL from wal_proposer - pub fn run(&mut self, spg: &mut SafekeeperPostgresHandler) -> Result<()> { + pub fn run(&mut self, spg: &mut SafekeeperPostgresHandler) -> Result<(), QueryError> { let _enter = info_span!("WAL acceptor", ttid = %spg.ttid).entered(); // Notify the libpq client that it's allowed to send `CopyData` messages @@ -79,7 +81,11 @@ impl<'pg> ReceiveWalConn<'pg> { }; GlobalTimelines::create(spg.ttid, server_info, Lsn::INVALID, Lsn::INVALID)? 
} - _ => bail!("unexpected message {:?} instead of greeting", next_msg), + _ => { + return Err(QueryError::Other(anyhow::anyhow!( + "unexpected message {next_msg:?} instead of greeting" + ))) + } }; let mut next_msg = Some(next_msg); @@ -134,25 +140,32 @@ impl<'pg> ReceiveWalConn<'pg> { struct ProposerPollStream { msg_rx: Receiver, - read_thread: Option>>, + read_thread: Option>>, } impl ProposerPollStream { - fn new(mut r: ReadStream) -> Result { + fn new(mut r: ReadStream) -> anyhow::Result { let (msg_tx, msg_rx) = channel(); let read_thread = thread::Builder::new() .name("Read WAL thread".into()) - .spawn(move || -> Result<()> { + .spawn(move || -> Result<(), QueryError> { loop { let copy_data = match FeMessage::read(&mut r)? { - Some(FeMessage::CopyData(bytes)) => bytes, - Some(msg) => bail!("expected `CopyData` message, found {:?}", msg), - None => bail!("connection closed unexpectedly"), - }; + Some(FeMessage::CopyData(bytes)) => Ok(bytes), + Some(msg) => Err(QueryError::Other(anyhow::anyhow!( + "expected `CopyData` message, found {msg:?}" + ))), + None => Err(QueryError::from(std::io::Error::new( + std::io::ErrorKind::ConnectionAborted, + "walproposer closed the connection", + ))), + }?; let msg = ProposerAcceptorMessage::parse(copy_data)?; - msg_tx.send(msg)?; + msg_tx + .send(msg) + .context("Failed to send the proposer message")?; } // msg_tx will be dropped here, this will also close msg_rx })?; @@ -163,17 +176,19 @@ impl ProposerPollStream { }) } - fn recv_msg(&mut self) -> Result { + fn recv_msg(&mut self) -> Result { self.msg_rx.recv().map_err(|_| { // return error from the read thread let res = match self.read_thread.take() { Some(thread) => thread.join(), - None => return anyhow!("read thread is gone"), + None => return QueryError::Other(anyhow::anyhow!("read thread is gone")), }; match res { - Ok(Ok(())) => anyhow!("unexpected result from read thread"), - Err(err) => anyhow!("read thread panicked: {:?}", err), + Ok(Ok(())) => { + 
QueryError::Other(anyhow::anyhow!("unexpected result from read thread")) + } + Err(err) => QueryError::Other(anyhow::anyhow!("read thread panicked: {err:?}")), Ok(Err(err)) => err, } }) diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index a054b8fe14..20600ab694 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -5,7 +5,7 @@ use crate::handler::SafekeeperPostgresHandler; use crate::timeline::{ReplicaState, Timeline}; use crate::wal_storage::WalReader; use crate::GlobalTimelines; -use anyhow::{bail, Context, Result}; +use anyhow::Context; use bytes::Bytes; use postgres_ffi::get_current_timestamp; @@ -15,7 +15,8 @@ use std::cmp::min; use std::net::Shutdown; use std::sync::Arc; use std::time::Duration; -use std::{str, thread}; +use std::{io, str, thread}; +use utils::postgres_backend_async::QueryError; use pq_proto::{BeMessage, FeMessage, ReplicationFeedback, WalSndKeepAlive, XLogDataBody}; use tokio::sync::watch::Receiver; @@ -91,7 +92,7 @@ impl ReplicationConn { fn background_thread( mut stream_in: ReadStream, replica_guard: Arc, - ) -> Result<()> { + ) -> anyhow::Result<()> { let replica_id = replica_guard.replica; let timeline = &replica_guard.timeline; @@ -140,7 +141,7 @@ impl ReplicationConn { // Shutdown the connection, because rust-postgres client cannot be dropped // when connection is alive. let _ = stream_in.shutdown(Shutdown::Both); - bail!("Copy failed"); + anyhow::bail!("Copy failed"); } _ => { // We only handle `CopyData`, 'Sync', 'CopyFail' messages. Anything else is ignored. @@ -160,7 +161,7 @@ impl ReplicationConn { spg: &mut SafekeeperPostgresHandler, pgb: &mut PostgresBackend, mut start_pos: Lsn, - ) -> Result<()> { + ) -> Result<(), QueryError> { let _enter = info_span!("WAL sender", ttid = %spg.ttid).entered(); let tli = GlobalTimelines::get(spg.ttid)?; @@ -256,8 +257,10 @@ impl ReplicationConn { // to right pageserver. if tli.should_walsender_stop(replica_id) { // Shut down, timeline is suspended. 
- // TODO create proper error type for this - bail!("end streaming to {:?}", spg.appname); + return Err(QueryError::from(io::Error::new( + io::ErrorKind::ConnectionAborted, + format!("end streaming to {:?}", spg.appname), + ))); } // timeout expired: request pageserver status @@ -265,8 +268,7 @@ impl ReplicationConn { sent_ptr: end_pos.0, timestamp: get_current_timestamp(), request_reply: true, - })) - .context("Failed to send KeepAlive message")?; + }))?; continue; } } @@ -301,7 +303,7 @@ impl ReplicationConn { const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(1); // Wait until we have commit_lsn > lsn or timeout expires. Returns latest commit_lsn. -async fn wait_for_lsn(rx: &mut Receiver, lsn: Lsn) -> Result> { +async fn wait_for_lsn(rx: &mut Receiver, lsn: Lsn) -> anyhow::Result> { let commit_lsn: Lsn = *rx.borrow(); if commit_lsn > lsn { return Ok(Some(commit_lsn)); diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs index fd8f9d9dcf..3ca651d060 100644 --- a/safekeeper/src/wal_service.rs +++ b/safekeeper/src/wal_service.rs @@ -2,35 +2,28 @@ //! WAL service listens for client connections and //! receive WAL from wal_proposer and send it to WAL receivers //! -use anyhow::Result; use regex::Regex; use std::net::{TcpListener, TcpStream}; -use std::sync::Arc; use std::thread; use tracing::*; -use utils::auth::JwtAuth; +use utils::postgres_backend_async::QueryError; use crate::handler::SafekeeperPostgresHandler; use crate::SafeKeeperConf; use utils::postgres_backend::{AuthType, PostgresBackend}; /// Accept incoming TCP connections and spawn them into a background thread. -pub fn thread_main( - conf: SafeKeeperConf, - listener: TcpListener, - auth: Option>, -) -> Result<()> { +pub fn thread_main(conf: SafeKeeperConf, listener: TcpListener) -> ! 
{ loop { match listener.accept() { Ok((socket, peer_addr)) => { debug!("accepted connection from {}", peer_addr); let conf = conf.clone(); - let auth = auth.clone(); let _ = thread::Builder::new() .name("WAL service thread".into()) .spawn(move || { - if let Err(err) = handle_socket(socket, conf, auth) { + if let Err(err) = handle_socket(socket, conf) { error!("connection handler exited: {}", err); } }) @@ -51,25 +44,17 @@ fn get_tid() -> u64 { /// This is run by `thread_main` above, inside a background thread. /// -fn handle_socket( - socket: TcpStream, - conf: SafeKeeperConf, - auth: Option>, -) -> Result<()> { +fn handle_socket(socket: TcpStream, conf: SafeKeeperConf) -> Result<(), QueryError> { let _enter = info_span!("", tid = ?get_tid()).entered(); socket.set_nodelay(true)?; - let mut conn_handler = SafekeeperPostgresHandler::new(conf, auth.clone()); - let pgbackend = PostgresBackend::new( - socket, - match auth { - None => AuthType::Trust, - Some(_) => AuthType::NeonJWT, - }, - None, - false, - )?; + let auth_type = match conf.auth { + None => AuthType::Trust, + Some(_) => AuthType::NeonJWT, + }; + let mut conn_handler = SafekeeperPostgresHandler::new(conf); + let pgbackend = PostgresBackend::new(socket, auth_type, None, false)?; // libpq replication protocol between safekeeper and replicas/pagers pgbackend.run(&mut conn_handler)?; diff --git a/storage_broker/Cargo.toml b/storage_broker/Cargo.toml index 7aa33a5234..180c506254 100644 --- a/storage_broker/Cargo.toml +++ b/storage_broker/Cargo.toml @@ -2,6 +2,7 @@ name = "storage_broker" version = "0.1.0" edition = "2021" +license = "Apache-2.0" [features] bench = [] diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 9236137d19..8b78e06c22 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -40,10 +40,9 @@ def parse_metrics(text: str, name: str = "") -> Metrics: PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS: Tuple[str, ...] 
= ( - "pageserver_remote_upload_queue_unfinished_tasks", - "pageserver_remote_operation_seconds_bucket", - "pageserver_remote_operation_seconds_count", - "pageserver_remote_operation_seconds_sum", + "pageserver_remote_timeline_client_calls_unfinished", + *[f"pageserver_remote_timeline_client_calls_started_{x}" for x in ["bucket", "count", "sum"]], + *[f"pageserver_remote_operation_seconds_{x}" for x in ["bucket", "count", "sum"]], "pageserver_remote_physical_size", ) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 5b00ebdea7..f284be8753 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -18,6 +18,7 @@ from contextlib import closing, contextmanager from dataclasses import dataclass, field from enum import Flag, auto from functools import cached_property +from itertools import chain, product from pathlib import Path from types import TracebackType from typing import Any, Dict, Iterator, List, Optional, Tuple, Type, Union, cast @@ -34,6 +35,7 @@ from _pytest.config import Config from _pytest.config.argparsing import Parser from _pytest.fixtures import FixtureRequest from fixtures.log_helper import log +from fixtures.metrics import parse_metrics from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import ( ATTACHMENT_NAME_REGEX, @@ -595,6 +597,7 @@ class NeonEnvBuilder: rust_log_override: Optional[str] = None, default_branch_name: str = DEFAULT_BRANCH_NAME, preserve_database_files: bool = False, + initial_tenant: Optional[TenantId] = None, ): self.repo_dir = repo_dir self.rust_log_override = rust_log_override @@ -617,8 +620,9 @@ class NeonEnvBuilder: self.pg_distrib_dir = pg_distrib_dir self.pg_version = pg_version self.preserve_database_files = preserve_database_files + self.initial_tenant = initial_tenant or TenantId.generate() - def init(self) -> NeonEnv: + def init_configs(self) -> NeonEnv: # Cannot create more than one environment from one builder 
assert self.env is None, "environment already initialized" self.env = NeonEnv(self) @@ -629,8 +633,17 @@ class NeonEnvBuilder: self.env.start() def init_start(self) -> NeonEnv: - env = self.init() + env = self.init_configs() self.start() + + # Prepare the default branch to start the postgres on later. + # Pageserver itself does not create tenants and timelines, until started first and asked via HTTP API. + log.info( + f"Services started, creating initial tenant {env.initial_tenant} and its initial timeline" + ) + initial_tenant, initial_timeline = env.neon_cli.create_tenant(tenant_id=env.initial_tenant) + log.info(f"Initial timeline {initial_tenant}/{initial_timeline} created successfully") + return env def enable_remote_storage( @@ -889,12 +902,12 @@ class NeonEnv: # generate initial tenant ID here instead of letting 'neon init' generate it, # so that we don't need to dig it out of the config file afterwards. - self.initial_tenant = TenantId.generate() + self.initial_tenant = config.initial_tenant # Create a config file corresponding to the options toml = textwrap.dedent( f""" - default_tenant_id = '{self.initial_tenant}' + default_tenant_id = '{config.initial_tenant}' """ ) @@ -1409,6 +1422,33 @@ class PageserverHttpClient(requests.Session): ] return sample.value + def get_remote_timeline_client_metric( + self, + metric_name: str, + tenant_id: TenantId, + timeline_id: TimelineId, + file_kind: str, + op_kind: str, + ) -> Optional[float]: + metrics = parse_metrics(self.get_metrics(), "pageserver") + matches = metrics.query_all( + name=metric_name, + filter={ + "tenant_id": str(tenant_id), + "timeline_id": str(timeline_id), + "file_kind": str(file_kind), + "op_kind": str(op_kind), + }, + ) + if len(matches) == 0: + value = None + elif len(matches) == 1: + value = matches[0].value + assert value is not None + else: + assert len(matches) < 2, "above filter should uniquely identify metric" + return value + def get_metric_value(self, name: str) -> Optional[str]: metrics 
= self.get_metrics() relevant = [line for line in metrics.splitlines() if line.startswith(name)] @@ -1528,6 +1568,7 @@ class NeonCli(AbstractNeonCli): tenant_id: Optional[TenantId] = None, timeline_id: Optional[TimelineId] = None, conf: Optional[Dict[str, str]] = None, + set_default: bool = False, ) -> Tuple[TenantId, TimelineId]: """ Creates a new tenant, returns its id and its initial timeline's id. @@ -1536,47 +1577,51 @@ class NeonCli(AbstractNeonCli): tenant_id = TenantId.generate() if timeline_id is None: timeline_id = TimelineId.generate() - if conf is None: - res = self.raw_cli( - [ - "tenant", - "create", - "--tenant-id", - str(tenant_id), - "--timeline-id", - str(timeline_id), - "--pg-version", - self.env.pg_version, - ] - ) - else: - res = self.raw_cli( - [ - "tenant", - "create", - "--tenant-id", - str(tenant_id), - "--timeline-id", - str(timeline_id), - "--pg-version", - self.env.pg_version, - ] - + sum(list(map(lambda kv: (["-c", kv[0] + ":" + kv[1]]), conf.items())), []) + + args = [ + "tenant", + "create", + "--tenant-id", + str(tenant_id), + "--timeline-id", + str(timeline_id), + "--pg-version", + self.env.pg_version, + ] + if conf is not None: + args.extend( + chain.from_iterable( + product(["-c"], (f"{key}:{value}" for key, value in conf.items())) + ) ) + if set_default: + args.append("--set-default") + + res = self.raw_cli(args) res.check_returncode() return tenant_id, timeline_id + def set_default(self, tenant_id: TenantId): + """ + Update default tenant for future operations that require tenant_id. + """ + res = self.raw_cli(["tenant", "set-default", "--tenant-id", str(tenant_id)]) + res.check_returncode() + def config_tenant(self, tenant_id: TenantId, conf: Dict[str, str]): """ Update tenant config. 
""" - if conf is None: - res = self.raw_cli(["tenant", "config", "--tenant-id", str(tenant_id)]) - else: - res = self.raw_cli( - ["tenant", "config", "--tenant-id", str(tenant_id)] - + sum(list(map(lambda kv: (["-c", kv[0] + ":" + kv[1]]), conf.items())), []) + + args = ["tenant", "config", "--tenant-id", str(tenant_id)] + if conf is not None: + args.extend( + chain.from_iterable( + product(["-c"], (f"{key}:{value}" for key, value in conf.items())) + ) ) + + res = self.raw_cli(args) res.check_returncode() def list_tenants(self) -> "subprocess.CompletedProcess[str]": @@ -1611,36 +1656,6 @@ class NeonCli(AbstractNeonCli): return TimelineId(str(created_timeline_id)) - def create_root_branch( - self, - branch_name: str, - tenant_id: Optional[TenantId] = None, - ): - cmd = [ - "timeline", - "create", - "--branch-name", - branch_name, - "--tenant-id", - str(tenant_id or self.env.initial_tenant), - "--pg-version", - self.env.pg_version, - ] - - res = self.raw_cli(cmd) - res.check_returncode() - - matches = CREATE_TIMELINE_ID_EXTRACTOR.search(res.stdout) - - created_timeline_id = None - if matches is not None: - created_timeline_id = matches.group("timeline_id") - - if created_timeline_id is None: - raise Exception("could not find timeline id after `neon timeline create` invocation") - else: - return TimelineId(created_timeline_id) - def create_branch( self, new_branch_name: str = DEFAULT_BRANCH_NAME, @@ -1696,17 +1711,12 @@ class NeonCli(AbstractNeonCli): def init( self, config_toml: str, - initial_timeline_id: Optional[TimelineId] = None, ) -> "subprocess.CompletedProcess[str]": with tempfile.NamedTemporaryFile(mode="w+") as tmp: tmp.write(config_toml) tmp.flush() - cmd = ["init", f"--config={tmp.name}"] - if initial_timeline_id: - cmd.extend(["--timeline-id", str(initial_timeline_id)]) - - cmd.extend(["--pg-version", self.env.pg_version]) + cmd = ["init", f"--config={tmp.name}", "--pg-version", self.env.pg_version] append_pageserver_param_overrides( 
params_to_update=cmd, @@ -1903,14 +1913,17 @@ class NeonPageserver(PgProtocol): ".*wal receiver task finished with an error: walreceiver connection handling failure.*", ".*Shutdown task error: walreceiver connection handling failure.*", ".*wal_connection_manager.*tcp connect error: Connection refused.*", - ".*query handler for .* failed: Connection reset by peer.*", - ".*serving compute connection task.*exited with error: Broken pipe.*", - ".*Connection aborted: error communicating with the server: Broken pipe.*", - ".*Connection aborted: error communicating with the server: Transport endpoint is not connected.*", - ".*Connection aborted: error communicating with the server: Connection reset by peer.*", + ".*query handler for .* failed: Socket IO error: Connection reset by peer.*", + ".*serving compute connection task.*exited with error: Postgres connection error.*", + ".*serving compute connection task.*exited with error: Connection reset by peer.*", + ".*serving compute connection task.*exited with error: Postgres query error.*", + ".*Connection aborted: connection error: error communicating with the server: Broken pipe.*", + ".*Connection aborted: connection error: error communicating with the server: Transport endpoint is not connected.*", + ".*Connection aborted: connection error: error communicating with the server: Connection reset by peer.*", ".*kill_and_wait_impl.*: wait successful.*", - ".*end streaming to Some.*", + ".*Replication stream finished: db error: ERROR: Socket IO error: end streaming to Some.*", ".*query handler for 'pagestream.*failed: Broken pipe.*", # pageserver notices compute shut down + ".*query handler for 'pagestream.*failed: Connection reset by peer.*", # pageserver notices compute shut down # safekeeper connection can fail with this, in the window between timeline creation # and streaming start ".*Failed to process query for timeline .*: state uninitialized, no data to read.*", @@ -1980,10 +1993,6 @@ class NeonPageserver(PgProtocol): 
if '"testing"' not in self.version: pytest.skip("pageserver was built without 'testing' feature") - def is_profiling_enabled_or_skip(self): - if '"profiling"' not in self.version: - pytest.skip("pageserver was built without 'profiling' feature") - def http_client(self, auth_token: Optional[str] = None) -> PageserverHttpClient: return PageserverHttpClient( port=self.service_port.http, diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 1fb9eb72e6..df83fc6377 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -148,7 +148,7 @@ def get_scale_for_db(size_mb: int) -> int: ATTACHMENT_NAME_REGEX: re.Pattern = re.compile( # type: ignore[type-arg] - r"flamegraph\.svg|regression\.diffs|.+\.(?:log|stderr|stdout|filediff|metrics|html)" + r"regression\.diffs|.+\.(?:log|stderr|stdout|filediff|metrics|html)" ) diff --git a/test_runner/performance/README.md b/test_runner/performance/README.md index a32ce87c33..c1a57fb28b 100644 --- a/test_runner/performance/README.md +++ b/test_runner/performance/README.md @@ -1,12 +1,8 @@ # Running locally -First make a release build. The profiling flag is optional, used only for tests that -generate flame graphs. The `-s` flag just silences a lot of output, and makes it +First make a release build. The `-s` flag silences a lot of output, and makes it easier to see if you have compile errors without scrolling up. -`BUILD_TYPE=release CARGO_BUILD_FLAGS="--features=testing,profiling" make -s -j8` - -NOTE: the `profiling` flag only works on linux because we use linux-specific -libc APIs like `libc::timer_t`. 
+`BUILD_TYPE=release CARGO_BUILD_FLAGS="--features=testing" make -s -j8` Then run the tests `NEON_BIN=./target/release poetry run pytest test_runner/performance"` diff --git a/test_runner/performance/test_perf_pgbench.py b/test_runner/performance/test_perf_pgbench.py index 50e5366c1e..2b8760dff2 100644 --- a/test_runner/performance/test_perf_pgbench.py +++ b/test_runner/performance/test_perf_pgbench.py @@ -8,7 +8,7 @@ from typing import Dict, List import pytest from fixtures.benchmark_fixture import MetricReport, PgBenchInitResult, PgBenchRunResult -from fixtures.compare_fixtures import NeonCompare, PgCompare +from fixtures.compare_fixtures import PgCompare from fixtures.utils import get_scale_for_db @@ -176,28 +176,6 @@ def test_pgbench(neon_with_baseline: PgCompare, scale: int, duration: int): run_test_pgbench(neon_with_baseline, scale, duration, PgBenchLoadType.SELECT_ONLY) -# Run the pgbench tests, and generate a flamegraph from it -# This requires that the pageserver was built with the 'profiling' feature. -# -# TODO: If the profiling is cheap enough, there's no need to run the same test -# twice, with and without profiling. But for now, run it separately, so that we -# can see how much overhead the profiling adds. 
-@pytest.mark.parametrize("scale", get_scales_matrix()) -@pytest.mark.parametrize("duration", get_durations_matrix()) -def test_pgbench_flamegraph(zenbenchmark, pg_bin, neon_env_builder, scale: int, duration: int): - neon_env_builder.pageserver_config_override = """ -profiling="page_requests" -""" - env = neon_env_builder.init_start() - env.pageserver.is_profiling_enabled_or_skip() - env.neon_cli.create_branch("empty", "main") - - neon_compare = NeonCompare(zenbenchmark, env, pg_bin, "pgbench") - run_test_pgbench(neon_compare, scale, duration, PgBenchLoadType.INIT) - run_test_pgbench(neon_compare, scale, duration, PgBenchLoadType.SIMPLE_UPDATE) - run_test_pgbench(neon_compare, scale, duration, PgBenchLoadType.SELECT_ONLY) - - # The following 3 tests run on an existing database as it was set up by previous tests, # and leaves the database in a state that would be used in the next tests. # Modifying the definition order of these functions or adding other remote tests in between will alter results. 
diff --git a/test_runner/regress/test_config.py b/test_runner/regress/test_config.py old mode 100644 new mode 100755 diff --git a/test_runner/regress/test_metric_collection.py b/test_runner/regress/test_metric_collection.py index ac9f163801..d1fcab7a62 100644 --- a/test_runner/regress/test_metric_collection.py +++ b/test_runner/regress/test_metric_collection.py @@ -1,3 +1,5 @@ +import time + import pytest from fixtures.log_helper import log from fixtures.metrics import parse_metrics @@ -20,9 +22,19 @@ def httpserver_listen_address(port_distributor: PortDistributor): return ("localhost", port) -num_metrics_received = 0 +initial_tenant = TenantId.generate() remote_uploaded = 0 -first_request = True +checks = { + "written_size": lambda value: value > 0, + "resident_size": lambda value: value >= 0, + # >= 0 check here is to avoid race condition when we receive metrics before + # remote_uploaded is updated + "remote_storage_size": lambda value: value > 0 if remote_uploaded > 0 else value >= 0, + # logical size may lag behind the actual size, so allow 0 here + "timeline_logical_size": lambda value: value >= 0, +} + +metric_kinds_checked = set([]) # @@ -36,38 +48,19 @@ def metrics_handler(request: Request) -> Response: log.info("received events:") log.info(events) - checks = { - "written_size": lambda value: value > 0, - "resident_size": lambda value: value >= 0, - # >= 0 check here is to avoid race condition when we receive metrics before - # remote_uploaded is updated - "remote_storage_size": lambda value: value > 0 if remote_uploaded > 0 else value >= 0, - # logical size may lag behind the actual size, so allow 0 here - "timeline_logical_size": lambda value: value >= 0, - } - - events_received = 0 for event in events: - check = checks.get(event["metric"]) + assert event["tenant_id"] == str( + initial_tenant + ), "Expecting metrics only from the initial tenant" + metric_name = event["metric"] + + check = checks.get(metric_name) # calm down mypy if check is not None: - 
assert check(event["value"]), f"{event['metric']} isn't valid" - events_received += 1 + assert check(event["value"]), f"{metric_name} isn't valid" + global metric_kinds_checked + metric_kinds_checked.add(metric_name) - global first_request - # check that all checks were sent - # but only on the first request, because we don't send non-changed metrics - if first_request: - # we may receive more metrics than we check, - # because there are two timelines - # and we may receive per-timeline metrics from both - # if the test was slow enough for these metrics to be collected - # -1 because that is ok to not receive timeline_logical_size - assert events_received >= len(checks) - 1 - first_request = False - - global num_metrics_received - num_metrics_received += 1 return Response(status=200) @@ -83,11 +76,14 @@ def test_metric_collection( (host, port) = httpserver_listen_address metric_collection_endpoint = f"http://{host}:{port}/billing/api/v1/usage_events" + # Require collecting metrics frequently, since we change + # the timeline and want something to be logged about it. 
+ # # Disable time-based pitr, we will use the manual GC calls # to trigger remote storage operations in a controlled way neon_env_builder.pageserver_config_override = ( f""" - metric_collection_interval="60s" + metric_collection_interval="1s" metric_collection_endpoint="{metric_collection_endpoint}" """ + "tenant_config={pitr_interval = '0 sec'}" @@ -100,6 +96,9 @@ def test_metric_collection( log.info(f"test_metric_collection endpoint is {metric_collection_endpoint}") + # Set initial tenant of the test, that we expect the logs from + global initial_tenant + initial_tenant = neon_env_builder.initial_tenant # mock http server that returns OK for the metrics httpserver.expect_request("/billing/api/v1/usage_events", method="POST").respond_with_handler( metrics_handler @@ -107,6 +106,9 @@ def test_metric_collection( # spin up neon, after http server is ready env = neon_env_builder.init_start() + # Order of fixtures shutdown is not specified, and if http server gets down + # before pageserver, pageserver log might contain such errors in the end. 
+ env.pageserver.allowed_errors.append(".*metrics endpoint refused the sent metrics*") env.neon_cli.create_branch("test_metric_collection") pg = env.postgres.create_start("test_metric_collection") @@ -151,7 +153,11 @@ def test_metric_collection( remote_uploaded = get_num_remote_ops("index", "upload") assert remote_uploaded > 0 - # check that all requests are served + # wait longer than collecting interval and check that all requests are served + time.sleep(3) httpserver.check() - global num_metrics_received - assert num_metrics_received > 0, "no metrics were received" + global metric_kinds_checked, checks + expected_checks = set(checks.keys()) + assert len(metric_kinds_checked) == len( + checks + ), f"Expected to receive and check all kind of metrics, but {expected_checks - metric_kinds_checked} got uncovered" diff --git a/test_runner/regress/test_neon_local_cli.py b/test_runner/regress/test_neon_local_cli.py index 6c7cdb6f7f..bd0f550ba5 100644 --- a/test_runner/regress/test_neon_local_cli.py +++ b/test_runner/regress/test_neon_local_cli.py @@ -1,10 +1,17 @@ -from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.neon_fixtures import NeonEnvBuilder, PortDistributor # Test that neon cli is able to start and stop all processes with the user defaults. 
-# def test_neon_cli_basics(neon_simple_env: NeonEnv): -def test_neon_cli_basics(neon_env_builder: NeonEnvBuilder): - env = neon_env_builder.init() +# Repeats the example from README.md as close as it can +def test_neon_cli_basics(neon_env_builder: NeonEnvBuilder, port_distributor: PortDistributor): + env = neon_env_builder.init_configs() + # Skipping the init step that creates a local tenant in Pytest tests + try: + env.neon_cli.start() + env.neon_cli.create_tenant(tenant_id=env.initial_tenant, set_default=True) + env.neon_cli.pg_start(node_name="main", port=port_distributor.get_port()) - env.neon_cli.start() - env.neon_cli.stop() + env.neon_cli.create_branch(new_branch_name="migration_check") + env.neon_cli.pg_start(node_name="migration_check", port=port_distributor.get_port()) + finally: + env.neon_cli.stop() diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index 352ae4b95c..184dc13888 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -120,7 +120,7 @@ def test_ondemand_download_large_rel( # -# If you have a relation with a long history of updates,the pageserver downloads the layer +# If you have a relation with a long history of updates, the pageserver downloads the layer # files containing the history as needed by timetravel queries. 
# @pytest.mark.parametrize("remote_storage_kind", available_remote_storages()) @@ -189,13 +189,10 @@ def test_ondemand_download_timetravel( # run checkpoint manually to be sure that data landed in remote storage client.timeline_checkpoint(tenant_id, timeline_id) - # wait until pageserver successfully uploaded a checkpoint to remote storage - wait_for_upload(client, tenant_id, timeline_id, current_lsn) - log.info("uploads have finished") - ##### Stop the first pageserver instance, erase all its data env.postgres.stop_all() + # wait until pageserver has successfully uploaded all the data to remote storage wait_for_sk_commit_lsn_to_reach_remote_storage( tenant_id, timeline_id, env.safekeepers, env.pageserver ) @@ -227,11 +224,15 @@ def test_ondemand_download_timetravel( wait_until(10, 0.2, lambda: assert_tenant_status(client, tenant_id, "Active")) - # current_physical_size reports sum of layer file sizes, regardless of local or remote + # The current_physical_size reports the sum of layers loaded in the layer + # map, regardless of where the layer files are located. So even though we + # just removed the local files, they still count towards + # current_physical_size because they are loaded as `RemoteLayer`s. 
assert filled_current_physical == get_api_current_physical_size() + # Run queries at different points in time num_layers_downloaded = [0] - physical_size = [get_resident_physical_size()] + resident_size = [get_resident_physical_size()] for (checkpoint_number, lsn) in lsns: pg_old = env.postgres.create_start( branch_name="main", node_name=f"test_old_lsn_{checkpoint_number}", lsn=lsn @@ -268,13 +269,15 @@ def test_ondemand_download_timetravel( if len(num_layers_downloaded) > 4: assert after_downloads > num_layers_downloaded[len(num_layers_downloaded) - 4] - # Likewise, assert that the physical_size metric grows as layers are downloaded - physical_size.append(get_resident_physical_size()) - log.info(f"physical_size[-1]={physical_size[-1]}") - if len(physical_size) > 4: - assert physical_size[-1] > physical_size[len(physical_size) - 4] + # Likewise, assert that the resident_physical_size metric grows as layers are downloaded + resident_size.append(get_resident_physical_size()) + log.info(f"resident_size[-1]={resident_size[-1]}") + if len(resident_size) > 4: + assert resident_size[-1] > resident_size[len(resident_size) - 4] - # current_physical_size reports sum of layer file sizes, regardless of local or remote + # current_physical_size reports the total size of all layer files, whether + # they are present only in the remote storage, only locally, or both. + # It should not change. 
assert filled_current_physical == get_api_current_physical_size() diff --git a/test_runner/regress/test_recovery.py b/test_runner/regress/test_recovery.py index 1e93958e98..09644eaaa1 100644 --- a/test_runner/regress/test_recovery.py +++ b/test_runner/regress/test_recovery.py @@ -12,11 +12,9 @@ def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder): # Override default checkpointer settings to run it more often neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance = 1048576}" - env = neon_env_builder.init() + env = neon_env_builder.init_start() env.pageserver.is_testing_enabled_or_skip() - neon_env_builder.start() - # These warnings are expected, when the pageserver is restarted abruptly env.pageserver.allowed_errors.append(".*found future delta layer.*") env.pageserver.allowed_errors.append(".*found future image layer.*") diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 32c25b2e8c..82bf741a8f 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -2,11 +2,11 @@ # env NEON_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/neon_zzz/'}" poetry ...... 
import os -import re import shutil import threading import time from pathlib import Path +from typing import Dict, List, Tuple import pytest from fixtures.log_helper import log @@ -271,14 +271,15 @@ def test_remote_storage_upload_queue_retries( wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) def get_queued_count(file_kind, op_kind): - metrics = client.get_metrics() - matches = re.search( - f'^pageserver_remote_upload_queue_unfinished_tasks{{file_kind="{file_kind}",op_kind="{op_kind}",tenant_id="{tenant_id}",timeline_id="{timeline_id}"}} (\\S+)$', - metrics, - re.MULTILINE, + val = client.get_remote_timeline_client_metric( + "pageserver_remote_timeline_client_calls_unfinished", + tenant_id, + timeline_id, + file_kind, + op_kind, ) - assert matches - return int(matches[1]) + assert val is not None, "expecting metric to be present" + return int(val) # create some layers & wait for uploads to finish overwrite_data_and_wait_for_it_to_arrive_at_pageserver("a") @@ -368,6 +369,168 @@ def test_remote_storage_upload_queue_retries( assert query_scalar(cur, "SELECT COUNT(*) FROM foo WHERE val = 'd'") == 10000 +@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS]) +def test_remote_timeline_client_calls_started_metric( + neon_env_builder: NeonEnvBuilder, + remote_storage_kind: RemoteStorageKind, +): + + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_remote_timeline_client_metrics", + ) + + env = neon_env_builder.init_start() + + # create tenant with config that will deterministically allow + # compaction and gc + tenant_id, timeline_id = env.neon_cli.create_tenant( + conf={ + # small checkpointing and compaction targets to ensure we generate many upload operations + "checkpoint_distance": f"{128 * 1024}", + "compaction_threshold": "1", + "compaction_target_size": f"{128 * 1024}", + # no PITR horizon, we specify the horizon when we request on-demand GC + "pitr_interval": "0s", + # disable 
background compaction and GC. We invoke it manually when we want it to happen. + "gc_period": "0s", + "compaction_period": "0s", + # don't create image layers, that causes just noise + "image_creation_threshold": "10000", + } + ) + + client = env.pageserver.http_client() + + pg = env.postgres.create_start("main", tenant_id=tenant_id) + + pg.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)") + + def overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data): + # create initial set of layers & upload them with failpoints configured + pg.safe_psql_many( + [ + f""" + INSERT INTO foo (id, val) + SELECT g, '{data}' + FROM generate_series(1, 10000) g + ON CONFLICT (id) DO UPDATE + SET val = EXCLUDED.val + """, + # to ensure that GC can actually remove some layers + "VACUUM foo", + ] + ) + wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + + def get_queued_count(file_kind, op_kind): + val = client.get_remote_timeline_client_metric( + "pageserver_remote_timeline_client_calls_unfinished", + tenant_id, + timeline_id, + file_kind, + op_kind, + ) + if val is None: + return val + return int(val) + + def wait_upload_queue_empty(): + wait_until(2, 1, lambda: get_queued_count(file_kind="layer", op_kind="upload") == 0) + wait_until(2, 1, lambda: get_queued_count(file_kind="index", op_kind="upload") == 0) + wait_until(2, 1, lambda: get_queued_count(file_kind="layer", op_kind="delete") == 0) + + calls_started: Dict[Tuple[str, str], List[int]] = { + ("layer", "upload"): [0], + ("index", "upload"): [0], + ("layer", "delete"): [0], + } + + def fetch_calls_started(): + for (file_kind, op_kind), observations in calls_started.items(): + val = client.get_remote_timeline_client_metric( + "pageserver_remote_timeline_client_calls_started_count", + tenant_id, + timeline_id, + file_kind, + op_kind, + ) + assert val is not None, f"expecting metric to be present: {file_kind} {op_kind}" + val = int(val) + observations.append(val) + + def ensure_calls_started_grew(): + for 
(file_kind, op_kind), observations in calls_started.items(): + log.info(f"ensure_calls_started_grew: {file_kind} {op_kind}: {observations}") + assert all( + x < y for x, y in zip(observations, observations[1:]) + ), f"observations for {file_kind} {op_kind} did not grow monotonically: {observations}" + + def churn(data_pass1, data_pass2): + overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass1) + client.timeline_checkpoint(tenant_id, timeline_id) + client.timeline_compact(tenant_id, timeline_id) + overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass2) + client.timeline_checkpoint(tenant_id, timeline_id) + client.timeline_compact(tenant_id, timeline_id) + gc_result = client.timeline_gc(tenant_id, timeline_id, 0) + print_gc_result(gc_result) + assert gc_result["layers_removed"] > 0 + + # create some layers & wait for uploads to finish + churn("a", "b") + + wait_upload_queue_empty() + + # ensure that we updated the calls_started metric + fetch_calls_started() + ensure_calls_started_grew() + + # more churn to cause more operations + churn("c", "d") + + # ensure that the calls_started metric continued to be updated + fetch_calls_started() + ensure_calls_started_grew() + + ### now we exercise the download path + calls_started.clear() + calls_started.update( + { + ("index", "download"): [0], + ("layer", "download"): [0], + } + ) + + env.pageserver.stop(immediate=True) + env.postgres.stop_all() + + dir_to_clear = Path(env.repo_dir) / "tenants" + shutil.rmtree(dir_to_clear) + os.mkdir(dir_to_clear) + + env.pageserver.start() + client = env.pageserver.http_client() + + client.tenant_attach(tenant_id) + + def tenant_active(): + all_states = client.tenant_list() + [tenant] = [t for t in all_states if TenantId(t["id"]) == tenant_id] + assert tenant["state"] == "Active" + + wait_until(30, 1, tenant_active) + + log.info("restarting postgres to validate") + pg = env.postgres.create_start("main", tenant_id=tenant_id) + with pg.cursor() as cur: + assert 
query_scalar(cur, "SELECT COUNT(*) FROM foo WHERE val = 'd'") == 10000 + + # ensure that we updated the calls_started download metric + fetch_calls_started() + ensure_calls_started_grew() + + # Test that we correctly handle timeline with layers stuck in upload queue @pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS]) def test_timeline_deletion_with_files_stuck_in_upload_queue( @@ -401,15 +564,14 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( client = env.pageserver.http_client() def get_queued_count(file_kind, op_kind): - metrics = client.get_metrics() - matches = re.search( - f'^pageserver_remote_upload_queue_unfinished_tasks{{file_kind="{file_kind}",op_kind="{op_kind}",tenant_id="{tenant_id}",timeline_id="{timeline_id}"}} (\\S+)$', - metrics, - re.MULTILINE, + val = client.get_remote_timeline_client_metric( + "pageserver_remote_timeline_client_calls_unfinished", + tenant_id, + timeline_id, + file_kind, + op_kind, ) - if matches is None: - return None - return int(matches[1]) + return int(val) if val is not None else val pg = env.postgres.create_start("main", tenant_id=tenant_id) diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index 6d621fbb77..29cdcb18ce 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -59,7 +59,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" "gc_horizon": 67108864, "gc_period": 100, "image_creation_threshold": 3, - "pitr_interval": 2592000, + "pitr_interval": 604800, # 7 days }.items() ) @@ -79,7 +79,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" "gc_horizon": 67108864, "gc_period": 30, "image_creation_threshold": 3, - "pitr_interval": 2592000, + "pitr_interval": 604800, }.items() ) @@ -107,7 +107,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" "gc_horizon": 67108864, "gc_period": 80, 
"image_creation_threshold": 3, - "pitr_interval": 2592000, + "pitr_interval": 604800, }.items() ) @@ -130,7 +130,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" "gc_horizon": 67108864, "gc_period": 80, "image_creation_threshold": 3, - "pitr_interval": 2592000, + "pitr_interval": 604800, }.items() ) diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 6963a57542..db5bb679f2 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -1,9 +1,13 @@ +import asyncio +import random import time from threading import Thread +import asyncpg import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import ( + NeonEnv, NeonEnvBuilder, PageserverApiException, PageserverHttpClient, @@ -12,6 +16,7 @@ from fixtures.neon_fixtures import ( available_remote_storages, wait_for_last_record_lsn, wait_for_upload, + wait_until, wait_until_tenant_state, ) from fixtures.types import Lsn, TenantId, TimelineId @@ -84,6 +89,150 @@ def test_tenant_reattach( assert env.pageserver.log_contains(".*download.*failed, will retry.*") +num_connections = 10 +num_rows = 100000 +updates_to_perform = 0 + +updates_started = 0 +updates_finished = 0 + + +# Run random UPDATEs on test table. On failure, try again. +async def update_table(pg_conn: asyncpg.Connection): + global updates_started, updates_finished, updates_to_perform + + while updates_started < updates_to_perform or updates_to_perform == 0: + updates_started += 1 + id = random.randrange(1, num_rows) + + # Loop to retry until the UPDATE succeeds + while True: + try: + await pg_conn.fetchrow(f"UPDATE t SET counter = counter + 1 WHERE id = {id}") + updates_finished += 1 + if updates_finished % 1000 == 0: + log.info(f"update {updates_finished} / {updates_to_perform}") + break + except asyncpg.PostgresError as e: + # Received error from Postgres. 
Log it, sleep a little, and continue + log.info(f"UPDATE error: {e}") + await asyncio.sleep(0.1) + + +async def sleep_and_reattach(pageserver_http: PageserverHttpClient, tenant_id: TenantId): + global updates_started, updates_finished, updates_to_perform + + # Wait until we have performed some updates + wait_until(20, 0.5, lambda: updates_finished > 500) + + log.info("Detaching tenant") + pageserver_http.tenant_detach(tenant_id) + await asyncio.sleep(1) + log.info("Re-attaching tenant") + pageserver_http.tenant_attach(tenant_id) + log.info("Re-attach finished") + + # Continue with 5000 more updates + updates_to_perform = updates_started + 5000 + + +# async guts of test_tenant_reattach_while_busy test +async def reattach_while_busy( + env: NeonEnv, pg: Postgres, pageserver_http: PageserverHttpClient, tenant_id: TenantId +): + workers = [] + for worker_id in range(num_connections): + pg_conn = await pg.connect_async() + workers.append(asyncio.create_task(update_table(pg_conn))) + + workers.append(asyncio.create_task(sleep_and_reattach(pageserver_http, tenant_id))) + await asyncio.gather(*workers) + + assert updates_finished == updates_to_perform + + +# Detach and re-attach tenant, while compute is busy running queries. +# +# Some of the queries may fail, in the window that the tenant has been +# detached but not yet re-attached. But Postgres itself should keep +# running, and when we retry the queries, they should start working +# after the attach has finished. + +# FIXME: +# +# This is pretty unstable at the moment. 
I've seen it fail with a warning like this: +# +# AssertionError: assert not ['2023-01-05T13:09:40.708303Z WARN remote_upload{tenant=c3fc41f6cf29a7626b90316e3518cd4b timeline=7978246f85faa71ab03...1282b/000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001716699-0000000001736681"\n'] +# +# (https://neon-github-public-dev.s3.amazonaws.com/reports/pr-3232/debug/3846817847/index.html#suites/f9eba3cfdb71aa6e2b54f6466222829b/470fc62b5db7d7d7/) +# I believe that failure happened because there is a race condition +# between detach and starting remote upload tasks: +# +# 1. detach_timeline calls task_mgr::shutdown_tasks(), sending shutdown +# signal to all in-progress tasks associated with the tenant. +# 2. Just after shutdown_tasks() has collected the list of tasks, +# a new remote-upload task is spawned. +# +# See https://github.com/neondatabase/neon/issues/3273 +# +# +# I also saw this failure: +# +# test_runner/regress/test_tenant_detach.py:194: in test_tenant_reattach_while_busy +# asyncio.run(reattach_while_busy(env, pg, pageserver_http, tenant_id)) +# /home/nonroot/.pyenv/versions/3.9.2/lib/python3.9/asyncio/runners.py:44: in run +# return loop.run_until_complete(main) +# /home/nonroot/.pyenv/versions/3.9.2/lib/python3.9/asyncio/base_events.py:642: in run_until_complete +# return future.result() +# test_runner/regress/test_tenant_detach.py:151: in reattach_while_busy +# assert updates_finished == updates_to_perform +# E assert 5010 == 10010 +# E +5010 +# E -10010 +# +# I don't know what's causing that... 
+@pytest.mark.skip(reason="fixme") +@pytest.mark.parametrize("remote_storage_kind", available_remote_storages()) +def test_tenant_reattach_while_busy( + neon_env_builder: NeonEnvBuilder, + remote_storage_kind: RemoteStorageKind, +): + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_tenant_reattach_while_busy", + ) + env = neon_env_builder.init_start() + + # Attempts to connect from compute to pageserver while the tenant is + # temporarily detached produces these errors in the pageserver log. + env.pageserver.allowed_errors.append(".*Tenant .* not found in the local state.*") + env.pageserver.allowed_errors.append( + ".*Tenant .* will not become active\\. Current state: Stopping.*" + ) + + pageserver_http = env.pageserver.http_client() + + # create new tenant + tenant_id, timeline_id = env.neon_cli.create_tenant( + # Create layers aggressively + conf={"checkpoint_distance": "100000"} + ) + + pg = env.postgres.create_start("main", tenant_id=tenant_id) + + cur = pg.connect().cursor() + + cur.execute("CREATE TABLE t(id int primary key, counter int)") + cur.execute(f"INSERT INTO t SELECT generate_series(1,{num_rows}), 0") + + # Run the test + asyncio.run(reattach_while_busy(env, pg, pageserver_http, tenant_id)) + + # Verify table contents + assert query_scalar(cur, "SELECT count(*) FROM t") == num_rows + assert query_scalar(cur, "SELECT sum(counter) FROM t") == updates_to_perform + + def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 77ec33f8b0..72d27c3aba 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -1105,7 +1105,6 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): env.pageserver.allowed_errors.extend( [ ".*Failed to process query for 
timeline .*: Timeline .* was not found in global map.*", - ".*end streaming to Some.*", ] ) diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 6c81756fe1..3aff839b81 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -13,7 +13,6 @@ publish = false ### BEGIN HAKARI SECTION [dependencies] -ahash = { version = "0.7", features = ["std"] } anyhow = { version = "1", features = ["backtrace", "std"] } bytes = { version = "1", features = ["serde", "std"] } chrono = { version = "0.4", default-features = false, features = ["clock", "iana-time-zone", "serde", "std", "winapi"] } @@ -37,13 +36,11 @@ prost = { version = "0.11", features = ["prost-derive", "std"] } rand = { version = "0.8", features = ["alloc", "getrandom", "libc", "rand_chacha", "rand_hc", "small_rng", "std", "std_rng"] } regex = { version = "1", features = ["aho-corasick", "memchr", "perf", "perf-cache", "perf-dfa", "perf-inline", "perf-literal", "std", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } regex-syntax = { version = "0.6", features = ["unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } -reqwest = { version = "0.11", features = ["__rustls", "__tls", "blocking", "default-tls", "hyper-rustls", "hyper-tls", "json", "native-tls-crate", "rustls", "rustls-pemfile", "rustls-tls", "rustls-tls-webpki-roots", "serde_json", "tokio-native-tls", "tokio-rustls", "webpki-roots"] } scopeguard = { version = "1", features = ["use_std"] } serde = { version = "1", features = ["alloc", "derive", "serde_derive", "std"] } serde_json = { version = "1", features = ["raw_value", "std"] } socket2 = { version = "0.4", default-features = false, features = ["all"] } -stable_deref_trait = { version = "1", features = ["alloc", "std"] } -tokio = { version = "1", features = ["bytes", "fs", "io-std", "io-util", "libc", "macros", 
"memchr", "mio", "net", "num_cpus", "once_cell", "process", "rt", "rt-multi-thread", "signal-hook-registry", "socket2", "sync", "time", "tokio-macros"] } +tokio = { version = "1", features = ["bytes", "fs", "io-std", "io-util", "libc", "macros", "memchr", "mio", "net", "num_cpus", "process", "rt", "rt-multi-thread", "signal-hook-registry", "socket2", "sync", "time", "tokio-macros"] } tokio-util = { version = "0.7", features = ["codec", "io", "io-util", "tracing"] } tower = { version = "0.4", features = ["__common", "balance", "buffer", "discover", "futures-core", "futures-util", "indexmap", "limit", "load", "log", "make", "pin-project", "pin-project-lite", "rand", "ready-cache", "retry", "slab", "timeout", "tokio", "tokio-util", "tracing", "util"] } tracing = { version = "0.1", features = ["attributes", "log", "std", "tracing-attributes"] } @@ -51,7 +48,6 @@ tracing-core = { version = "0.1", features = ["once_cell", "std"] } url = { version = "2", features = ["serde"] } [build-dependencies] -ahash = { version = "0.7", features = ["std"] } anyhow = { version = "1", features = ["backtrace", "std"] } bytes = { version = "1", features = ["serde", "std"] } either = { version = "1", features = ["use_std"] }