From eb36403e71210b1be7e2482fc385b8da8c149d5f Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Tue, 31 Jan 2023 14:06:35 +0100 Subject: [PATCH] Release 2023 01 31 (#3497) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Kirill Bulatov Co-authored-by: Heikki Linnakangas Co-authored-by: Anastasia Lubennikova Co-authored-by: bojanserafimov Co-authored-by: Christian Schwarz Co-authored-by: Alexey Kondratov Co-authored-by: Joonas Koivunen Co-authored-by: Konstantin Knizhnik Co-authored-by: Shany Pozin Co-authored-by: Sergey Melnikov Co-authored-by: Dmitry Rodionov Co-authored-by: Rory de Zoete <33318916+zoete@users.noreply.github.com> Co-authored-by: Rory de Zoete Co-authored-by: Rory de Zoete Co-authored-by: Lassi Pölönen --- .../actions/run-python-test-set/action.yml | 4 +- .github/ansible/deploy.yaml | 6 +- .../dev-us-east-2-beta.neon-proxy-link.yaml | 1 + ...prod-us-east-2-delta.neon-proxy-link.yaml} | 24 +- ...us-west-2-eta.neon-proxy-scram-legacy.yaml | 61 ++ .github/workflows/build_and_test.yml | 572 +++--------- .github/workflows/deploy-dev.yml | 179 ++++ .github/workflows/deploy-prod.yml | 277 ++++++ .github/workflows/release.yml | 33 + Cargo.lock | 328 ++++--- Cargo.toml | 10 +- ...ompute-node-v14 => Dockerfile.compute-node | 34 +- Dockerfile.compute-node-v15 | 220 ----- compute_tools/Cargo.toml | 3 + compute_tools/src/bin/compute_ctl.rs | 32 +- compute_tools/src/http/api.rs | 27 +- compute_tools/src/logger.rs | 24 +- compute_tools/src/params.rs | 8 +- compute_tools/src/spec.rs | 23 +- libs/metrics/src/lib.rs | 1 + libs/pageserver_api/src/models.rs | 37 +- libs/tracing-utils/Cargo.toml | 17 + libs/tracing-utils/src/http.rs | 96 ++ libs/tracing-utils/src/lib.rs | 168 ++++ libs/utils/Cargo.toml | 1 + libs/utils/src/http/error.rs | 17 +- libs/utils/src/logging.rs | 2 +- pageserver/Cargo.toml | 3 +- pageserver/benches/bench_layer_map.rs | 224 ++--- pageserver/src/basebackup.rs | 50 +- pageserver/src/bin/pageserver.rs | 57 +- pageserver/src/broker_client.rs | 48 + pageserver/src/config.rs | 28 + pageserver/src/consumption_metrics.rs | 24 +- pageserver/src/context.rs | 199 ++++ pageserver/src/http/openapi_spec.yml | 10 +- pageserver/src/http/routes.rs | 170 ++-- pageserver/src/import_datadir.rs | 52 +- pageserver/src/lib.rs | 3 +- pageserver/src/metrics.rs | 166 +++- pageserver/src/page_service.rs | 223 +++-- pageserver/src/pgdatadir_mapping.rs | 169 ++-- pageserver/src/repository.rs | 11 + pageserver/src/task_mgr.rs | 45 +- pageserver/src/tenant.rs | 576 ++++++------ pageserver/src/tenant/config.rs | 7 +- pageserver/src/tenant/layer_map.rs | 877 +++++++++--------- .../layer_map/historic_layer_coverage.rs | 583 ++++++++++++ .../src/tenant/layer_map/layer_coverage.rs | 154 +++ pageserver/src/tenant/mgr.rs | 236 +++-- .../src/tenant/remote_timeline_client.rs | 25 +- pageserver/src/tenant/size.rs | 16 +- pageserver/src/tenant/storage_layer.rs | 47 + pageserver/src/tenant/tasks.rs | 13 +- pageserver/src/tenant/timeline.rs | 327 +++++-- .../src/{ => tenant/timeline}/walreceiver.rs | 44 - .../walreceiver/connection_manager.rs | 55 +- .../walreceiver/walreceiver_connection.rs | 29 +- pageserver/src/walingest.rs | 418 ++++++--- pageserver/src/walredo.rs | 329 +++++-- poetry.lock | 247 ++++- proxy/src/main.rs | 4 +- pyproject.toml | 1 + safekeeper/src/bin/safekeeper.rs | 7 +- scripts/force_layer_download.py | 324 +++++++ storage_broker/src/bin/storage_broker.rs | 4 +- test_runner/fixtures/metrics.py | 12 +- 
test_runner/regress/test_tenant_conf.py | 55 +- test_runner/regress/test_tenant_detach.py | 46 +- test_runner/regress/test_tenants.py | 56 +- workspace_hack/Cargo.toml | 8 +- 71 files changed, 5779 insertions(+), 2408 deletions(-) rename .github/helm-values/{production.proxy.yaml => prod-us-east-2-delta.neon-proxy-link.yaml} (80%) create mode 100644 .github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml create mode 100644 .github/workflows/deploy-dev.yml create mode 100644 .github/workflows/deploy-prod.yml create mode 100644 .github/workflows/release.yml rename Dockerfile.compute-node-v14 => Dockerfile.compute-node (86%) delete mode 100644 Dockerfile.compute-node-v15 create mode 100644 libs/tracing-utils/Cargo.toml create mode 100644 libs/tracing-utils/src/http.rs create mode 100644 libs/tracing-utils/src/lib.rs create mode 100644 pageserver/src/broker_client.rs create mode 100644 pageserver/src/context.rs create mode 100644 pageserver/src/tenant/layer_map/historic_layer_coverage.rs create mode 100644 pageserver/src/tenant/layer_map/layer_coverage.rs rename pageserver/src/{ => tenant/timeline}/walreceiver.rs (83%) rename pageserver/src/{ => tenant/timeline}/walreceiver/connection_manager.rs (96%) rename pageserver/src/{ => tenant/timeline}/walreceiver/walreceiver_connection.rs (94%) create mode 100644 scripts/force_layer_download.py diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 990c7e25a9..29b04a3478 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -123,8 +123,8 @@ runs: exit 1 fi if [[ "${{ inputs.run_in_parallel }}" == "true" ]]; then - # -n4 uses four processes to run tests via pytest-xdist - EXTRA_PARAMS="-n4 $EXTRA_PARAMS" + # -n16 uses sixteen processes to run tests via pytest-xdist + EXTRA_PARAMS="-n16 $EXTRA_PARAMS" # --dist=loadgroup points tests marked with @pytest.mark.xdist_group # to the same worker to make @pytest.mark.order work with xdist diff --git a/.github/ansible/deploy.yaml b/.github/ansible/deploy.yaml index 4adc685684..a17dc9c78f 100644 --- a/.github/ansible/deploy.yaml +++ b/.github/ansible/deploy.yaml @@ -117,7 +117,8 @@ shell: cmd: | INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id) - curl -sfS -d '{"version": {{ current_version }} }' -X PATCH {{ console_mgmt_base_url }}/api/v1/pageservers/$INSTANCE_ID + curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/pageservers/$INSTANCE_ID | jq '.version = {{ current_version }}' > /tmp/new_version + curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" -X POST -d@/tmp/new_version {{ console_mgmt_base_url }}/management/api/v2/pageservers tags: - pageserver @@ -186,6 +187,7 @@ shell: cmd: | INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id) - curl -sfS -d '{"version": {{ current_version }} }' -X PATCH {{ console_mgmt_base_url }}/api/v1/safekeepers/$INSTANCE_ID + curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/safekeepers/$INSTANCE_ID | jq '.version = {{ current_version }}' > /tmp/new_version + curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" -X POST -d@/tmp/new_version {{ console_mgmt_base_url }}/management/api/v2/safekeepers tags: - safekeeper diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml index 
cb062f705d..157ae66ed1 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml @@ -8,6 +8,7 @@ settings: authBackend: "link" authEndpoint: "https://console.stage.neon.tech/authenticate_proxy_request/" uri: "https://console.stage.neon.tech/psql_session/" + domain: "pg.neon.build" sentryEnvironment: "staging" metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events" metricCollectionInterval: "1min" diff --git a/.github/helm-values/production.proxy.yaml b/.github/helm-values/prod-us-east-2-delta.neon-proxy-link.yaml similarity index 80% rename from .github/helm-values/production.proxy.yaml rename to .github/helm-values/prod-us-east-2-delta.neon-proxy-link.yaml index dbaf3cd096..eff24302bb 100644 --- a/.github/helm-values/production.proxy.yaml +++ b/.github/helm-values/prod-us-east-2-delta.neon-proxy-link.yaml @@ -1,37 +1,37 @@ +# Helm chart values for neon-proxy-link. +# This is a YAML-formatted file. + +image: + repository: neondatabase/neon + settings: authBackend: "link" authEndpoint: "https://console.neon.tech/authenticate_proxy_request/" uri: "https://console.neon.tech/psql_session/" + domain: "pg.neon.tech" sentryEnvironment: "production" # -- Additional labels for zenith-proxy pods podLabels: zenith_service: proxy zenith_env: production - zenith_region: us-west-2 - zenith_region_slug: oregon + zenith_region: us-east-2 + zenith_region_slug: us-east-2 service: + type: LoadBalancer annotations: service.beta.kubernetes.io/aws-load-balancer-type: external service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip service.beta.kubernetes.io/aws-load-balancer-scheme: internal - external-dns.alpha.kubernetes.io/hostname: proxy-release.local - type: LoadBalancer + external-dns.alpha.kubernetes.io/hostname: neon-proxy-link-mgmt.delta.us-east-2.aws.neon.tech exposedService: annotations: service.beta.kubernetes.io/aws-load-balancer-type: external service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing - external-dns.alpha.kubernetes.io/hostname: connect.neon.tech,pg.neon.tech - -metrics: - enabled: true - serviceMonitor: - enabled: true - selector: - release: kube-prometheus-stack + external-dns.alpha.kubernetes.io/hostname: neon-proxy-link.delta.us-east-2.aws.neon.tech extraManifests: - apiVersion: operator.victoriametrics.com/v1beta1 diff --git a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml new file mode 100644 index 0000000000..3a5cde4b01 --- /dev/null +++ b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml @@ -0,0 +1,61 @@ +# Helm chart values for neon-proxy-scram. +# This is a YAML-formatted file. 
+ +image: + repository: neondatabase/neon + +settings: + authBackend: "console" + authEndpoint: "http://console-release.local/management/api/v2" + domain: "*.cloud.neon.tech" + sentryEnvironment: "production" + wssPort: 8443 + metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events" + metricCollectionInterval: "10min" + +# -- Additional labels for neon-proxy pods +podLabels: + zenith_service: proxy-scram + zenith_env: prod + zenith_region: us-west-2 + zenith_region_slug: us-west-2 + +exposedService: + annotations: + service.beta.kubernetes.io/aws-load-balancer-type: external + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing + external-dns.alpha.kubernetes.io/hostname: neon-proxy-scram-legacy.eta.us-west-2.aws.neon.tech + httpsPort: 443 + +#metrics: +# enabled: true +# serviceMonitor: +# enabled: true +# selector: +# release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 918e568e27..89e12360f9 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1,4 +1,4 @@ -name: Test and Deploy +name: Build and Test on: push: @@ -19,10 +19,12 @@ concurrency: env: RUST_BACKTRACE: 1 COPT: '-Werror' + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} jobs: tag: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, small ] container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned outputs: build-tag: ${{steps.build-tag.outputs.tag}} @@ -50,7 +52,7 @@ jobs: id: build-tag check-codestyle-python: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, small ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cloud:pinned options: --init @@ -85,7 +87,7 @@ jobs: run: poetry run mypy . 
check-codestyle-rust: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, large ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init @@ -97,16 +99,16 @@ jobs: submodules: true fetch-depth: 1 - - name: Restore cargo deps cache - id: cache_cargo - uses: actions/cache@v3 - with: - path: | - ~/.cargo/registry/ - !~/.cargo/registry/src - ~/.cargo/git/ - target/ - key: v1-${{ runner.os }}-cargo-clippy-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} +# Disabled for now +# - name: Restore cargo deps cache +# id: cache_cargo +# uses: actions/cache@v3 +# with: +# path: | +# !~/.cargo/registry/src +# ~/.cargo/git/ +# target/ +# key: v1-${{ runner.os }}-cargo-clippy-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} # Some of our rust modules use FFI and need those to be checked - name: Get postgres headers @@ -133,7 +135,7 @@ jobs: run: cargo deny check build-neon: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, large ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init @@ -141,7 +143,6 @@ jobs: fail-fast: false matrix: build_type: [ debug, release ] - env: BUILD_TYPE: ${{ matrix.build_type }} GIT_VERSION: ${{ github.sha }} @@ -194,24 +195,26 @@ jobs: echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV echo "CARGO_FEATURES=${CARGO_FEATURES}" >> $GITHUB_ENV echo "CARGO_FLAGS=${CARGO_FLAGS}" >> $GITHUB_ENV + echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo" >> $GITHUB_ENV + # Disabled for now # Don't include the ~/.cargo/registry/src directory. It contains just # uncompressed versions of the crates in ~/.cargo/registry/cache # directory, and it's faster to let 'cargo' to rebuild it from the # compressed crates. 
- - name: Cache cargo deps - id: cache_cargo - uses: actions/cache@v3 - with: - path: | - ~/.cargo/registry/ - !~/.cargo/registry/src - ~/.cargo/git/ - target/ - # Fall back to older versions of the key, if no cache for current Cargo.lock was found - key: | - v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} - v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}- +# - name: Cache cargo deps +# id: cache_cargo +# uses: actions/cache@v3 +# with: +# path: | +# ~/.cargo/registry/ +# !~/.cargo/registry/src +# ~/.cargo/git/ +# target/ +# # Fall back to older versions of the key, if no cache for current Cargo.lock was found +# key: | +# v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} +# v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}- - name: Cache postgres v14 build id: cache_pg_14 @@ -301,7 +304,7 @@ jobs: uses: ./.github/actions/save-coverage-data regress-tests: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, large ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init @@ -334,7 +337,7 @@ jobs: uses: ./.github/actions/save-coverage-data benchmarks: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, small ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init @@ -365,7 +368,7 @@ jobs: # while coverage is currently collected for the debug ones merge-allure-report: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, small ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init @@ -402,7 +405,7 @@ jobs: DATABASE_URL="$TEST_RESULT_CONNSTR" poetry run python3 scripts/ingest_regress_test_result.py --revision ${SHA} --reference ${GITHUB_REF} --build-type ${BUILD_TYPE} --ingest suites.json coverage-report: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, small ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init @@ -418,16 +421,17 @@ jobs: submodules: true fetch-depth: 1 - - name: Restore cargo deps cache - id: cache_cargo - uses: actions/cache@v3 - with: - path: | - ~/.cargo/registry/ - !~/.cargo/registry/src - ~/.cargo/git/ - target/ - key: v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} +# Disabled for now +# - name: Restore cargo deps cache +# id: cache_cargo +# uses: actions/cache@v3 +# with: +# path: | +# ~/.cargo/registry/ +# !~/.cargo/registry/src +# ~/.cargo/git/ +# target/ +# key: v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} - name: Get Neon artifact uses: ./.github/actions/download @@ -477,7 +481,7 @@ jobs: }" trigger-e2e-tests: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, small ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned options: --init @@ -522,9 +526,10 @@ jobs: }" neon-image: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, large ] needs: [ tag ] - container: gcr.io/kaniko-project/executor:v1.9.0-debug + # https://github.com/GoogleContainerTools/kaniko/issues/2005 + container: gcr.io/kaniko-project/executor:v1.7.0-debug defaults: run: shell: sh -eu {0} @@ -540,12 +545,16 @@ jobs: run: echo 
"{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json - name: Kaniko build neon - run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} + run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} + + # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied + - name: Cleanup ECR folder + run: rm -rf ~/.ecr compute-tools-image: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, large ] needs: [ tag ] - container: gcr.io/kaniko-project/executor:v1.9.0-debug + container: gcr.io/kaniko-project/executor:v1.7.0-debug defaults: run: shell: sh -eu {0} @@ -558,11 +567,14 @@ jobs: run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json - name: Kaniko build compute tools - run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} + run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} + + - name: Cleanup ECR folder + run: rm -rf ~/.ecr compute-node-image: - runs-on: [ self-hosted, dev, x64 ] - container: gcr.io/kaniko-project/executor:v1.9.0-debug + runs-on: [ self-hosted, gen3, large ] + container: gcr.io/kaniko-project/executor:v1.7.0-debug needs: [ tag ] strategy: fail-fast: false @@ -583,10 +595,13 @@ jobs: run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json - name: Kaniko build compute node with extensions - run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-${{ matrix.version }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} + run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . 
--build-arg GIT_VERSION=${{ github.sha }} --build-arg PG_VERSION=${{ matrix.version }} --dockerfile Dockerfile.compute-node --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} + + - name: Cleanup ECR folder + run: rm -rf ~/.ecr vm-compute-node-image: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, large ] needs: [ tag, compute-node-image ] strategy: fail-fast: false @@ -631,7 +646,7 @@ jobs: test-images: needs: [ tag, neon-image, compute-node-image, compute-tools-image ] - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, small ] steps: - name: Checkout @@ -673,20 +688,39 @@ jobs: docker compose -f ./docker-compose/docker-compose.yml down promote-images: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, small ] needs: [ tag, test-images, vm-compute-node-image ] + container: golang:1.19-bullseye if: github.event_name != 'workflow_dispatch' - container: amazon/aws-cli - strategy: - fail-fast: false - matrix: - name: [ neon, compute-node-v14, vm-compute-node-v14, compute-node-v15, vm-compute-node-v15, compute-tools] steps: - - name: Promote image to latest + - name: Install Crane & ECR helper + if: | + (github.ref_name == 'main' || github.ref_name == 'release') && + github.event_name != 'workflow_dispatch' run: | - export MANIFEST=$(aws ecr batch-get-image --repository-name ${{ matrix.name }} --image-ids imageTag=${{needs.tag.outputs.build-tag}} --query 'images[].imageManifest' --output text) - aws ecr put-image --repository-name ${{ matrix.name }} --image-tag latest --image-manifest "$MANIFEST" + go install github.com/google/go-containerregistry/cmd/crane@31786c6cbb82d6ec4fb8eb79cd9387905130534e # v0.11.0 + go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0 + + - name: Configure ECR login + run: | + mkdir /github/home/.docker/ + echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json + + - name: Add latest tag to images + if: | + (github.ref_name == 'main' || github.ref_name == 'release') && + github.event_name != 'workflow_dispatch' + run: | + crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} latest + crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest + crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} latest + crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest + crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest + crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest + + - name: Cleanup ECR folder + run: rm -rf ~/.ecr push-docker-hub: runs-on: [ self-hosted, dev, x64 ] @@ -776,114 +810,11 @@ jobs: crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest - calculate-deploy-targets: - runs-on: [ self-hosted, dev, x64 ] - if: | - github.ref_name == 'release' && - github.event_name != 'workflow_dispatch' - outputs: - matrix-include: ${{ steps.set-matrix.outputs.include }} - steps: - - id: set-matrix - run: | - if [[ "$GITHUB_REF_NAME" == "release" ]]; then - PRODUCTION='{"env_name": 
"production", "proxy_job": "neon-proxy", "proxy_config": "production.proxy", "storage_broker_ns": "neon-storage-broker", "storage_broker_config": "production.neon-storage-broker", "kubeconfig_secret": "PRODUCTION_KUBECONFIG_DATA", "console_api_key_secret": "NEON_PRODUCTION_API_KEY"}' - echo "include=[$PRODUCTION]" >> $GITHUB_OUTPUT - else - echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to 'release'" - exit 1 - fi - - deploy: - runs-on: [ self-hosted, dev, x64 ] - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned - # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version. - # If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly - needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ] - if: | - github.ref_name == 'release' && - github.event_name != 'workflow_dispatch' - defaults: - run: - shell: bash - strategy: - matrix: - include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}} - environment: - name: prod-old - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 0 - - - name: Redeploy - run: | - export DOCKER_TAG=${{needs.tag.outputs.build-tag}} - cd "$(pwd)/.github/ansible" - - if [[ "$GITHUB_REF_NAME" == "main" ]]; then - ./get_binaries.sh - elif [[ "$GITHUB_REF_NAME" == "release" ]]; then - RELEASE=true ./get_binaries.sh - else - echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" - exit 1 - fi - - eval $(ssh-agent) - echo "${{ secrets.TELEPORT_SSH_KEY }}" | tr -d '\n'| base64 --decode >ssh-key - echo "${{ secrets.TELEPORT_SSH_CERT }}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub - chmod 0600 ssh-key - ssh-add ssh-key - rm -f ssh-key ssh-key-cert.pub - ANSIBLE_CONFIG=./ansible.cfg ansible-galaxy collection install sivel.toiletwater - ANSIBLE_CONFIG=./ansible.cfg ansible-playbook deploy.yaml -i ${{ matrix.env_name }}.hosts.yaml -e CONSOLE_API_TOKEN=${{ secrets[matrix.console_api_key_secret] }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} - rm -f neon_install.tar.gz .neon_current_version - - deploy-new: - runs-on: [ self-hosted, dev, x64 ] - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned - # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version. - # If it notices a fresh storage it may bump the compute version. 
And if compute image failed to build it may break things badly - needs: [ push-docker-hub, tag, regress-tests ] - if: | - (github.ref_name == 'main') && - github.event_name != 'workflow_dispatch' - defaults: - run: - shell: bash - strategy: - matrix: - target_region: [ eu-west-1, us-east-2 ] - environment: - name: dev-${{ matrix.target_region }} - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 0 - - - name: Redeploy - run: | - export DOCKER_TAG=${{needs.tag.outputs.build-tag}} - cd "$(pwd)/.github/ansible" - if [[ "$GITHUB_REF_NAME" == "main" ]]; then - ./get_binaries.sh - elif [[ "$GITHUB_REF_NAME" == "release" ]]; then - RELEASE=true ./get_binaries.sh - else - echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" - exit 1 - fi - ansible-galaxy collection install sivel.toiletwater - ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_STAGING_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} - rm -f neon_install.tar.gz .neon_current_version + - name: Cleanup ECR folder + run: rm -rf ~/.ecr deploy-pr-test-new: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, small ] container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version. # If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly @@ -915,311 +846,40 @@ jobs: ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_STAGING_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} rm -f neon_install.tar.gz .neon_current_version - deploy-prod-new: - runs-on: prod - container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest - # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version. - # If it notices a fresh storage it may bump the compute version. 
And if compute image failed to build it may break things badly + - name: Cleanup ansible folder + run: rm -rf ~/.ansible + + deploy: + runs-on: [ self-hosted, gen3, small ] + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest needs: [ push-docker-hub, tag, regress-tests ] - if: | - (github.ref_name == 'release') && - github.event_name != 'workflow_dispatch' - defaults: - run: - shell: bash - strategy: - matrix: - target_region: [ us-east-2, us-west-2, eu-central-1, ap-southeast-1 ] - environment: - name: prod-${{ matrix.target_region }} + if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch' steps: - name: Checkout uses: actions/checkout@v3 with: - submodules: true + submodules: false fetch-depth: 0 - - name: Redeploy + - name: Trigger deploy workflow + env: + GH_TOKEN: ${{ github.token }} run: | - export DOCKER_TAG=${{needs.tag.outputs.build-tag}} - cd "$(pwd)/.github/ansible" - if [[ "$GITHUB_REF_NAME" == "main" ]]; then - ./get_binaries.sh + gh workflow run deploy-dev.yml --ref main -f branch=${{ github.sha }} -f dockerTag=${{needs.tag.outputs.build-tag}} elif [[ "$GITHUB_REF_NAME" == "release" ]]; then - RELEASE=true ./get_binaries.sh + gh workflow run deploy-prod.yml --ref release -f branch=${{ github.sha }} -f dockerTag=${{needs.tag.outputs.build-tag}} else echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" exit 1 fi - ansible-galaxy collection install sivel.toiletwater - ansible-playbook deploy.yaml -i prod.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_PRODUCTION_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} - rm -f neon_install.tar.gz .neon_current_version - - deploy-proxy: - runs-on: [ self-hosted, dev, x64 ] - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned - # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. 
- needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ] - if: | - github.ref_name == 'release' && - github.event_name != 'workflow_dispatch' - defaults: - run: - shell: bash - strategy: - matrix: - include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}} - environment: - name: prod-old - env: - KUBECONFIG: .kubeconfig - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 0 - - - name: Add curl - run: apt update && apt install curl -y - - - name: Store kubeconfig file - run: | - echo "${{ secrets[matrix.kubeconfig_secret] }}" | base64 --decode > ${KUBECONFIG} - chmod 0600 ${KUBECONFIG} - - - name: Setup helm v3 - run: | - curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash - helm repo add neondatabase https://neondatabase.github.io/helm-charts - - - name: Re-deploy proxy - run: | - DOCKER_TAG=${{needs.tag.outputs.build-tag}} - helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace neon-proxy --install --atomic -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s - helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install --atomic -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s - - deploy-storage-broker: - name: deploy storage broker on old staging and old prod - runs-on: [ self-hosted, dev, x64 ] - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned - # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. - needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ] - if: | - github.ref_name == 'release' && - github.event_name != 'workflow_dispatch' - defaults: - run: - shell: bash - strategy: - matrix: - include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}} - environment: - name: prod-old - env: - KUBECONFIG: .kubeconfig - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 0 - - - name: Add curl - run: apt update && apt install curl -y - - - name: Store kubeconfig file - run: | - echo "${{ secrets[matrix.kubeconfig_secret] }}" | base64 --decode > ${KUBECONFIG} - chmod 0600 ${KUBECONFIG} - - - name: Setup helm v3 - run: | - curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash - helm repo add neondatabase https://neondatabase.github.io/helm-charts - - - name: Deploy storage-broker - run: - helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace ${{ matrix.storage_broker_ns }} --create-namespace --install --atomic -f .github/helm-values/${{ matrix.storage_broker_config }}.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s - - deploy-proxy-new: - runs-on: [ self-hosted, dev, x64 ] - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned - # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. 
- needs: [ push-docker-hub, tag, regress-tests ] - if: | - (github.ref_name == 'main') && - github.event_name != 'workflow_dispatch' - defaults: - run: - shell: bash - strategy: - matrix: - include: - - target_region: us-east-2 - target_cluster: dev-us-east-2-beta - deploy_link_proxy: true - deploy_legacy_scram_proxy: true - - target_region: eu-west-1 - target_cluster: dev-eu-west-1-zeta - deploy_link_proxy: false - deploy_legacy_scram_proxy: false - environment: - name: dev-${{ matrix.target_region }} - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 0 - - - name: Configure environment - run: | - helm repo add neondatabase https://neondatabase.github.io/helm-charts - aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }} - - - name: Re-deploy scram proxy - run: | - DOCKER_TAG=${{needs.tag.outputs.build-tag}} - helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s - - - name: Re-deploy link proxy - if: matrix.deploy_link_proxy - run: | - DOCKER_TAG=${{needs.tag.outputs.build-tag}} - helm upgrade neon-proxy-link neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-link.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s - - - name: Re-deploy legacy scram proxy - if: matrix.deploy_legacy_scram_proxy - run: | - DOCKER_TAG=${{needs.tag.outputs.build-tag}} - helm upgrade neon-proxy-scram-legacy neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram-legacy.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s - - deploy-storage-broker-dev-new: - runs-on: [ self-hosted, dev, x64 ] - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned - # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. 
- needs: [ push-docker-hub, tag, regress-tests ] - if: | - (github.ref_name == 'main') && - github.event_name != 'workflow_dispatch' - defaults: - run: - shell: bash - strategy: - matrix: - include: - - target_region: us-east-2 - target_cluster: dev-us-east-2-beta - - target_region: eu-west-1 - target_cluster: dev-eu-west-1-zeta - environment: - name: dev-${{ matrix.target_region }} - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 0 - - - name: Configure environment - run: | - helm repo add neondatabase https://neondatabase.github.io/helm-charts - aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }} - - - name: Deploy storage-broker - run: - helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s - - deploy-proxy-prod-new: - runs-on: prod - container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest - # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. - needs: [ push-docker-hub, tag, regress-tests ] - if: | - (github.ref_name == 'release') && - github.event_name != 'workflow_dispatch' - defaults: - run: - shell: bash - strategy: - matrix: - include: - - target_region: us-east-2 - target_cluster: prod-us-east-2-delta - - target_region: us-west-2 - target_cluster: prod-us-west-2-eta - - target_region: eu-central-1 - target_cluster: prod-eu-central-1-gamma - - target_region: ap-southeast-1 - target_cluster: prod-ap-southeast-1-epsilon - environment: - name: prod-${{ matrix.target_region }} - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 0 - - - name: Configure environment - run: | - helm repo add neondatabase https://neondatabase.github.io/helm-charts - aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }} - - - name: Re-deploy proxy - run: | - DOCKER_TAG=${{needs.tag.outputs.build-tag}} - helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s - - deploy-storage-broker-prod-new: - runs-on: prod - container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest - # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. 
- needs: [ push-docker-hub, tag, regress-tests ] - if: | - (github.ref_name == 'release') && - github.event_name != 'workflow_dispatch' - defaults: - run: - shell: bash - strategy: - matrix: - include: - - target_region: us-east-2 - target_cluster: prod-us-east-2-delta - - target_region: us-west-2 - target_cluster: prod-us-west-2-eta - - target_region: eu-central-1 - target_cluster: prod-eu-central-1-gamma - - target_region: ap-southeast-1 - target_cluster: prod-ap-southeast-1-epsilon - environment: - name: prod-${{ matrix.target_region }} - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 0 - - - name: Configure environment - run: | - helm repo add neondatabase https://neondatabase.github.io/helm-charts - aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }} - - - name: Deploy storage-broker - run: - helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s - promote-compatibility-data: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, small ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init - needs: [ deploy, deploy-proxy ] + needs: [ push-docker-hub, tag, regress-tests ] if: github.ref_name == 'release' && github.event_name != 'workflow_dispatch' steps: - name: Promote compatibility snapshot for the release diff --git a/.github/workflows/deploy-dev.yml b/.github/workflows/deploy-dev.yml new file mode 100644 index 0000000000..409517bf63 --- /dev/null +++ b/.github/workflows/deploy-dev.yml @@ -0,0 +1,179 @@ +name: Neon Deploy dev + +on: + workflow_dispatch: + inputs: + dockerTag: + description: 'Docker tag to deploy' + required: true + type: string + branch: + description: 'Branch or commit used for deploy scripts and configs' + required: true + type: string + default: 'main' + deployStorage: + description: 'Deploy storage' + required: true + type: boolean + default: true + deployProxy: + description: 'Deploy proxy' + required: true + type: boolean + default: true + deployStorageBroker: + description: 'Deploy storage-broker' + required: true + type: boolean + default: true + +env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} + +concurrency: + group: deploy-dev + cancel-in-progress: false + +jobs: + deploy-storage-new: + runs-on: [ self-hosted, gen3, small ] + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned + options: --user root --privileged + if: inputs.deployStorage + defaults: + run: + shell: bash + strategy: + matrix: + target_region: [ eu-west-1, us-east-2 ] + environment: + name: dev-${{ matrix.target_region }} + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + ref: ${{ inputs.branch }} + + - name: Redeploy + run: | + export DOCKER_TAG=${{ inputs.dockerTag }} + cd "$(pwd)/.github/ansible" + + ./get_binaries.sh + + ansible-galaxy collection install sivel.toiletwater + ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_STAGING_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e 
SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} + rm -f neon_install.tar.gz .neon_current_version + + - name: Cleanup ansible folder + run: rm -rf ~/.ansible + + deploy-proxy-new: + runs-on: [ self-hosted, gen3, small ] + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned + if: inputs.deployProxy + defaults: + run: + shell: bash + strategy: + matrix: + include: + - target_region: us-east-2 + target_cluster: dev-us-east-2-beta + deploy_link_proxy: true + deploy_legacy_scram_proxy: true + - target_region: eu-west-1 + target_cluster: dev-eu-west-1-zeta + deploy_link_proxy: false + deploy_legacy_scram_proxy: false + environment: + name: dev-${{ matrix.target_region }} + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + ref: ${{ inputs.branch }} + + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v1-node16 + with: + role-to-assume: arn:aws:iam::369495373322:role/github-runner + aws-region: eu-central-1 + role-skip-session-tagging: true + role-duration-seconds: 1800 + + - name: Configure environment + run: | + helm repo add neondatabase https://neondatabase.github.io/helm-charts + aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }} + + - name: Re-deploy scram proxy + run: | + DOCKER_TAG=${{ inputs.dockerTag }} + helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s + + - name: Re-deploy link proxy + if: matrix.deploy_link_proxy + run: | + DOCKER_TAG=${{ inputs.dockerTag }} + helm upgrade neon-proxy-link neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-link.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s + + - name: Re-deploy legacy scram proxy + if: matrix.deploy_legacy_scram_proxy + run: | + DOCKER_TAG=${{ inputs.dockerTag }} + helm upgrade neon-proxy-scram-legacy neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram-legacy.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s + + - name: Cleanup helm folder + run: rm -rf ~/.cache + + deploy-storage-broker-new: + runs-on: [ self-hosted, gen3, small ] + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned + if: inputs.deployStorageBroker + defaults: + run: + shell: bash + strategy: + matrix: + include: + - target_region: us-east-2 + target_cluster: dev-us-east-2-beta + - target_region: eu-west-1 + target_cluster: dev-eu-west-1-zeta + environment: + name: dev-${{ matrix.target_region }} + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + ref: ${{ inputs.branch }} + + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v1-node16 + with: + role-to-assume: arn:aws:iam::369495373322:role/github-runner + aws-region: eu-central-1 + role-skip-session-tagging: true + role-duration-seconds: 1800 + + - name: Configure environment + run: | + helm repo add neondatabase https://neondatabase.github.io/helm-charts + aws --region ${{ 
matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }} + + - name: Deploy storage-broker + run: + helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s + + - name: Cleanup helm folder + run: rm -rf ~/.cache diff --git a/.github/workflows/deploy-prod.yml b/.github/workflows/deploy-prod.yml new file mode 100644 index 0000000000..e1954b5540 --- /dev/null +++ b/.github/workflows/deploy-prod.yml @@ -0,0 +1,277 @@ +name: Neon Deploy prod + +on: + workflow_dispatch: + inputs: + dockerTag: + description: 'Docker tag to deploy' + required: true + type: string + branch: + description: 'Branch or commit used for deploy scripts and configs' + required: true + type: string + default: 'main' + deployStorage: + description: 'Deploy storage' + required: true + type: boolean + default: true + deployProxy: + description: 'Deploy proxy' + required: true + type: boolean + default: true + deployStorageBroker: + description: 'Deploy storage-broker' + required: true + type: boolean + default: true + +concurrency: + group: deploy-prod + cancel-in-progress: false + +jobs: + deploy-prod-new: + runs-on: prod + container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest + if: inputs.deployStorage + defaults: + run: + shell: bash + strategy: + matrix: + target_region: [ us-east-2, us-west-2, eu-central-1, ap-southeast-1 ] + environment: + name: prod-${{ matrix.target_region }} + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + ref: ${{ inputs.branch }} + + - name: Redeploy + run: | + export DOCKER_TAG=${{ inputs.dockerTag }} + cd "$(pwd)/.github/ansible" + + ./get_binaries.sh + + ansible-galaxy collection install sivel.toiletwater + ansible-playbook deploy.yaml -i prod.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_PRODUCTION_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} + rm -f neon_install.tar.gz .neon_current_version + + deploy-proxy-prod-new: + runs-on: prod + container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest + if: inputs.deployProxy + defaults: + run: + shell: bash + strategy: + matrix: + include: + - target_region: us-east-2 + target_cluster: prod-us-east-2-delta + deploy_link_proxy: true + deploy_legacy_scram_proxy: false + - target_region: us-west-2 + target_cluster: prod-us-west-2-eta + deploy_link_proxy: false + deploy_legacy_scram_proxy: true + - target_region: eu-central-1 + target_cluster: prod-eu-central-1-gamma + deploy_link_proxy: false + deploy_legacy_scram_proxy: false + - target_region: ap-southeast-1 + target_cluster: prod-ap-southeast-1-epsilon + deploy_link_proxy: false + deploy_legacy_scram_proxy: false + environment: + name: prod-${{ matrix.target_region }} + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + ref: ${{ inputs.branch }} + + - name: Configure environment + run: | + helm repo add neondatabase https://neondatabase.github.io/helm-charts + aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }} + + - name: Re-deploy scram proxy + run: | + DOCKER_TAG=${{ inputs.dockerTag 
}} + helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s + + - name: Re-deploy link proxy + if: matrix.deploy_link_proxy + run: | + DOCKER_TAG=${{ inputs.dockerTag }} + helm upgrade neon-proxy-link neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-link.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s + + - name: Re-deploy legacy scram proxy + if: matrix.deploy_legacy_scram_proxy + run: | + DOCKER_TAG=${{ inputs.dockerTag }} + helm upgrade neon-proxy-scram-legacy neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram-legacy.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s + + deploy-storage-broker-prod-new: + runs-on: prod + container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest + if: inputs.deployStorageBroker + defaults: + run: + shell: bash + strategy: + matrix: + include: + - target_region: us-east-2 + target_cluster: prod-us-east-2-delta + - target_region: us-west-2 + target_cluster: prod-us-west-2-eta + - target_region: eu-central-1 + target_cluster: prod-eu-central-1-gamma + - target_region: ap-southeast-1 + target_cluster: prod-ap-southeast-1-epsilon + environment: + name: prod-${{ matrix.target_region }} + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + ref: ${{ inputs.branch }} + + - name: Configure environment + run: | + helm repo add neondatabase https://neondatabase.github.io/helm-charts + aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }} + + - name: Deploy storage-broker + run: + helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s + + # Deploy to old account below + + deploy: + runs-on: [ self-hosted, gen3, small ] + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned + if: inputs.deployStorage + defaults: + run: + shell: bash + environment: + name: prod-old + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + ref: ${{ inputs.branch }} + + - name: Redeploy + run: | + export DOCKER_TAG=${{ inputs.dockerTag }} + cd "$(pwd)/.github/ansible" + + if [[ "$GITHUB_REF_NAME" == "main" ]]; then + ./get_binaries.sh + elif [[ "$GITHUB_REF_NAME" == "release" ]]; then + RELEASE=true ./get_binaries.sh + else + echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" + exit 1 + fi + + eval $(ssh-agent) + echo "${{ secrets.TELEPORT_SSH_KEY }}" | tr -d '\n'| base64 --decode >ssh-key + echo "${{ secrets.TELEPORT_SSH_CERT }}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub + chmod 0600 ssh-key + ssh-add ssh-key + rm -f ssh-key ssh-key-cert.pub + ANSIBLE_CONFIG=./ansible.cfg ansible-galaxy collection install sivel.toiletwater + 
ANSIBLE_CONFIG=./ansible.cfg ansible-playbook deploy.yaml -i production.hosts.yaml -e CONSOLE_API_TOKEN=${{ secrets.NEON_PRODUCTION_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} + rm -f neon_install.tar.gz .neon_current_version + + # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ansible/collections': Permission denied + - name: Cleanup ansible folder + run: rm -rf ~/.ansible + + deploy-proxy: + runs-on: [ self-hosted, gen3, small ] + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned + if: inputs.deployProxy + defaults: + run: + shell: bash + environment: + name: prod-old + env: + KUBECONFIG: .kubeconfig + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + ref: ${{ inputs.branch }} + + - name: Store kubeconfig file + run: | + echo "${{ secrets.PRODUCTION_KUBECONFIG_DATA }}" | base64 --decode > ${KUBECONFIG} + chmod 0600 ${KUBECONFIG} + + - name: Add neon helm chart + run: helm repo add neondatabase https://neondatabase.github.io/helm-charts + + - name: Re-deploy proxy + run: | + DOCKER_TAG=${{ inputs.dockerTag }} + helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --install --atomic -f .github/helm-values/production.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s + + - name: Cleanup helm folder + run: rm -rf ~/.cache + + deploy-storage-broker: + name: deploy storage broker on old staging and old prod + runs-on: [ self-hosted, gen3, small ] + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned + if: inputs.deployStorageBroker + defaults: + run: + shell: bash + environment: + name: prod-old + env: + KUBECONFIG: .kubeconfig + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + ref: ${{ inputs.branch }} + + - name: Store kubeconfig file + run: | + echo "${{ secrets.PRODUCTION_KUBECONFIG_DATA }}" | base64 --decode > ${KUBECONFIG} + chmod 0600 ${KUBECONFIG} + + - name: Add neon helm chart + run: helm repo add neondatabase https://neondatabase.github.io/helm-charts + + - name: Deploy storage-broker + run: + helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace neon-storage-broker --create-namespace --install --atomic -f .github/helm-values/production.neon-storage-broker.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s + + - name: Cleanup helm folder + run: rm -rf ~/.cache diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000000..49e04ee001 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,33 @@ +name: Create Release Branch + +on: + schedule: + - cron: '0 10 * * 2' + +jobs: + create_release_branch: + runs-on: [ubuntu-latest] + + steps: + - name: Check out code + uses: actions/checkout@v3 + with: + ref: main + + - name: Get current date + id: date + run: echo "date=(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT + + - name: Create release branch + run: git checkout -b release/${{ steps.date.outputs.date }} + + - name: Push new branch + run: git push origin release/${{ steps.date.outputs.date }} + + - name: Create pull request into release + uses: thomaseizinger/create-pull-request@e3972219c86a56550fb70708d96800d8e24ba862 # 1.3.0 + with: + GITHUB_TOKEN: ${{ 
secrets.GITHUB_TOKEN }} + head: release/${{ steps.date.outputs.date }} + base: release + title: Release ${{ steps.date.outputs.date }} diff --git a/Cargo.lock b/Cargo.lock index d8aba9ba68..2985a654f3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -37,11 +37,6 @@ dependencies = [ "memchr", ] -[[package]] -name = "amplify_num" -version = "0.4.1" -source = "git+https://github.com/rust-amplify/rust-amplify.git?tag=v4.0.0-beta.1#3ad006cf2804e1862ec7725a7684a493f3023523" - [[package]] name = "android_system_properties" version = "0.1.5" @@ -66,6 +61,15 @@ dependencies = [ "backtrace", ] +[[package]] +name = "archery" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a8da9bc4c4053ee067669762bcaeea6e241841295a2b6c948312dad6ef4cc02" +dependencies = [ + "static_assertions", +] + [[package]] name = "asn1-rs" version = "0.5.1" @@ -137,15 +141,6 @@ dependencies = [ "syn", ] -[[package]] -name = "atomic-polyfill" -version = "0.1.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3ff7eb3f316534d83a8a2c3d1674ace8a5a71198eba31e2e2b597833f699b28" -dependencies = [ - "critical-section", -] - [[package]] name = "atty" version = "0.2.14" @@ -629,9 +624,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.11.1" +version = "3.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "572f695136211188308f16ad2ca5c851a712c464060ae6974944458eb83880ba" +checksum = "0d261e256854913907f67ed06efbc3338dfe6179796deefc1ff763fc1aee5535" [[package]] name = "byteorder" @@ -750,13 +745,13 @@ dependencies = [ [[package]] name = "clap" -version = "4.0.32" +version = "4.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7db700bc935f9e43e88d00b0850dae18a63773cfbec6d8e070fccf7fef89a39" +checksum = "4ec7a4128863c188deefe750ac1d1dfe66c236909f845af04beed823638dc1b2" dependencies = [ "bitflags", "clap_derive", - "clap_lex 0.3.0", + "clap_lex 0.3.1", "is-terminal", "once_cell", "strsim", @@ -765,9 +760,9 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.0.21" +version = "4.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0177313f9f02afc995627906bbd8967e2be069f5261954222dac78290c2b9014" +checksum = "684a277d672e91966334af371f1a7b5833f9aa00b07c84e92fbce95e00208ce8" dependencies = [ "heck", "proc-macro-error", @@ -787,9 +782,9 @@ dependencies = [ [[package]] name = "clap_lex" -version = "0.3.0" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d4198f73e42b4936b35b5bb248d81d2b595ecb170da0bac7655c54eedfa8da8" +checksum = "783fe232adfca04f90f56201b26d79682d4cd2625e0bc7290b95123afe558ade" dependencies = [ "os_str_bytes", ] @@ -832,10 +827,11 @@ version = "0.1.0" dependencies = [ "anyhow", "chrono", - "clap 4.0.32", + "clap 4.1.1", "futures", "hyper", "notify", + "opentelemetry", "postgres", "regex", "serde", @@ -844,7 +840,9 @@ dependencies = [ "tokio", "tokio-postgres", "tracing", + "tracing-opentelemetry", "tracing-subscriber", + "tracing-utils", "url", "workspace_hack", ] @@ -887,7 +885,7 @@ name = "control_plane" version = "0.1.0" dependencies = [ "anyhow", - "clap 4.0.32", + "clap 4.1.1", "comfy-table", "git-version", "nix", @@ -988,12 +986,6 @@ dependencies = [ "itertools", ] -[[package]] -name = "critical-section" -version = "1.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6548a0ad5d2549e111e1f6a11a6c2e2d00ce6a3dafe22948d67c2b443f775e52" - [[package]] 
name = "crossbeam-channel" version = "0.5.6" @@ -1030,12 +1022,11 @@ dependencies = [ [[package]] name = "crossbeam-utils" -version = "0.8.11" +version = "0.8.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51887d4adc7b564537b15adcfb307936f8075dfcd5f00dde9a9f1d29383682bc" +checksum = "4fb766fa798726286dbbb842f174001dab8abc7b627a1dd86e0b7222a95d929f" dependencies = [ "cfg-if", - "once_cell", ] [[package]] @@ -1152,6 +1143,19 @@ dependencies = [ "syn", ] +[[package]] +name = "dashmap" +version = "5.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "907076dfda823b0b36d2a1bb5f90c96660a5bbcd7729e10727f07858f22c4edc" +dependencies = [ + "cfg-if", + "hashbrown 0.12.3", + "lock_api", + "once_cell", + "parking_lot_core", +] + [[package]] name = "data-encoding" version = "2.3.3" @@ -1506,15 +1510,6 @@ version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" -[[package]] -name = "hash32" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0c35f58762feb77d74ebe43bdbc3210f09be9fe6742234d573bacc26ed92b67" -dependencies = [ - "byteorder", -] - [[package]] name = "hashbrown" version = "0.12.3" @@ -1530,19 +1525,6 @@ dependencies = [ "ahash", ] -[[package]] -name = "heapless" -version = "0.7.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db04bc24a18b9ea980628ecf00e6c0264f3c1426dac36c00cb49b6fbad8b0743" -dependencies = [ - "atomic-polyfill", - "hash32", - "rustc_version", - "spin 0.9.4", - "stable_deref_trait", -] - [[package]] name = "heck" version = "0.4.0" @@ -1804,9 +1786,9 @@ dependencies = [ [[package]] name = "io-lifetimes" -version = "1.0.3" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46112a93252b123d31a119a8d1a1ac19deac4fac6e0e8b0df58f0d4e5870e63c" +checksum = "e7d6c6f8c91b4b9ed43484ad1a938e393caf35960fce7f82a040497207bd8e9e" dependencies = [ "libc", "windows-sys", @@ -1916,12 +1898,6 @@ dependencies = [ "winapi", ] -[[package]] -name = "libm" -version = "0.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "348108ab3fba42ec82ff6e9564fc4ca0247bdccdc68dd8af9764bbc79c3c8ffb" - [[package]] name = "link-cplusplus" version = "1.0.8" @@ -2067,9 +2043,9 @@ checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" [[package]] name = "nix" -version = "0.26.1" +version = "0.26.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46a58d1d356c6597d08cde02c2f09d785b09e28711837b1ed667dc652c08a694" +checksum = "bfdda3d196821d6af13126e40375cdf7da646a96114af134d5f417a9a1dc8e1a" dependencies = [ "bitflags", "cfg-if", @@ -2081,9 +2057,9 @@ dependencies = [ [[package]] name = "nom" -version = "7.1.2" +version = "7.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5507769c4919c998e69e49c839d9dc6e693ede4cc4290d6ad8b41d4f09c548c" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" dependencies = [ "memchr", "minimal-lexical", @@ -2154,7 +2130,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd" dependencies = [ "autocfg", - "libm", ] [[package]] @@ -2203,6 +2178,108 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" +[[package]] +name = "opentelemetry" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69d6c3d7288a106c0a363e4b0e8d308058d56902adefb16f4936f417ffef086e" +dependencies = [ + "opentelemetry_api", + "opentelemetry_sdk", +] + +[[package]] +name = "opentelemetry-http" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1edc79add46364183ece1a4542592ca593e6421c60807232f5b8f7a31703825d" +dependencies = [ + "async-trait", + "bytes", + "http", + "opentelemetry_api", + "reqwest", +] + +[[package]] +name = "opentelemetry-otlp" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1c928609d087790fc936a1067bdc310ae702bdf3b090c3f281b713622c8bbde" +dependencies = [ + "async-trait", + "futures", + "futures-util", + "http", + "opentelemetry", + "opentelemetry-http", + "opentelemetry-proto", + "prost", + "reqwest", + "thiserror", +] + +[[package]] +name = "opentelemetry-proto" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d61a2f56df5574508dd86aaca016c917489e589ece4141df1b5e349af8d66c28" +dependencies = [ + "futures", + "futures-util", + "opentelemetry", + "prost", + "tonic", + "tonic-build", +] + +[[package]] +name = "opentelemetry-semantic-conventions" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b02e0230abb0ab6636d18e2ba8fa02903ea63772281340ccac18e0af3ec9eeb" +dependencies = [ + "opentelemetry", +] + +[[package]] +name = "opentelemetry_api" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c24f96e21e7acc813c7a8394ee94978929db2bcc46cf6b5014fc612bf7760c22" +dependencies = [ + "fnv", + "futures-channel", + "futures-util", + "indexmap", + "js-sys", + "once_cell", + "pin-project-lite", + "thiserror", +] + +[[package]] +name = "opentelemetry_sdk" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ca41c4933371b61c2a2f214bf16931499af4ec90543604ec828f7a625c09113" +dependencies = [ + "async-trait", + "crossbeam-channel", + "dashmap", + "fnv", + "futures-channel", + "futures-executor", + "futures-util", + "once_cell", + "opentelemetry_api", + "percent-encoding", + "rand", + "thiserror", + "tokio", + "tokio-stream", +] + [[package]] name = "os_info" version = "3.5.1" @@ -2230,14 +2307,13 @@ checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" name = "pageserver" version = "0.1.0" dependencies = [ - "amplify_num", "anyhow", "async-stream", "async-trait", "byteorder", "bytes", "chrono", - "clap 4.0.32", + "clap 4.1.1", "close_fds", "const_format", "consumption_metrics", @@ -2269,7 +2345,7 @@ dependencies = [ "regex", "remote_storage", "reqwest", - "rstar", + "rpds", "scopeguard", "serde", "serde_json", @@ -2581,9 +2657,9 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" [[package]] name = "proc-macro2" -version = "1.0.49" +version = "1.0.50" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57a8eca9f9c4ffde41714334dee777596264c7825420f521abc92b5b5deb63a5" +checksum = "6ef7d57beacfaf2d8aee5937dab7b7f28de3cb8b1828479bb5de2a7106f2bae2" dependencies = [ "unicode-ident", ] @@ -2683,7 +2759,7 @@ dependencies = [ "bstr", "bytes", "chrono", - "clap 4.0.32", + "clap 4.1.1", "consumption_metrics", "futures", "git-version", @@ -2742,14 
+2818,13 @@ dependencies = [ [[package]] name = "rand" -version = "0.8.4" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e7573632e6454cf6b99d7aac4ccca54be06da05aca2ef7423d22d27d4d4bcd8" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" dependencies = [ "libc", "rand_chacha", "rand_core", - "rand_hc", ] [[package]] @@ -2771,15 +2846,6 @@ dependencies = [ "getrandom", ] -[[package]] -name = "rand_hc" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d51e9f596de227fda2ea6c84607f5558e196eeaf43c986b724ba4fb8fdf497e7" -dependencies = [ - "rand_core", -] - [[package]] name = "rayon" version = "1.6.1" @@ -2930,7 +2996,7 @@ dependencies = [ "cc", "libc", "once_cell", - "spin 0.5.2", + "spin", "untrusted", "web-sys", "winapi", @@ -2950,14 +3016,12 @@ dependencies = [ ] [[package]] -name = "rstar" -version = "0.9.3" +name = "rpds" +version = "0.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b40f1bfe5acdab44bc63e6699c28b74f75ec43afb59f3eda01e145aff86a25fa" +checksum = "66262ea963eff99163e6b741fbc3417a52cc13074728c1047e9911789df9b000" dependencies = [ - "heapless", - "num-traits", - "smallvec", + "archery", ] [[package]] @@ -3018,9 +3082,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.36.6" +version = "0.36.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4feacf7db682c6c329c4ede12649cd36ecab0f3be5b7d74e6a20304725db4549" +checksum = "d4fdebc4b395b7fbb9ab11e462e20ed9051e7b16e42d24042c776eca0ac81b03" dependencies = [ "bitflags", "errno", @@ -3093,7 +3157,7 @@ dependencies = [ "async-trait", "byteorder", "bytes", - "clap 4.0.32", + "clap 4.1.1", "const_format", "crc32c", "fs2", @@ -3479,21 +3543,6 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" -[[package]] -name = "spin" -version = "0.9.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f6002a767bff9e83f8eeecf883ecb8011875a21ae8da43bffb817a57e78cc09" -dependencies = [ - "lock_api", -] - -[[package]] -name = "stable_deref_trait" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" - [[package]] name = "static_assertions" version = "1.1.0" @@ -3507,7 +3556,7 @@ dependencies = [ "anyhow", "async-stream", "bytes", - "clap 4.0.32", + "clap 4.1.1", "const_format", "futures", "futures-core", @@ -3639,9 +3688,9 @@ dependencies = [ [[package]] name = "termcolor" -version = "1.1.3" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bab24d30b911b2376f3a13cc2cd443142f0c81dda04c118693e35b3835757755" +checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6" dependencies = [ "winapi-util", ] @@ -3749,9 +3798,9 @@ dependencies = [ [[package]] name = "tokio" -version = "1.24.1" +version = "1.24.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d9f76183f91ecfb55e1d7d5602bd1d979e38a3a522fe900241cf195624d67ae" +checksum = "597a12a59981d9e3c38d216785b0c37399f6e415e8d0712047620f189371b0bb" dependencies = [ "autocfg", "bytes", @@ -4071,6 +4120,20 @@ dependencies = [ "tracing-core", ] +[[package]] +name = "tracing-opentelemetry" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "21ebb87a95ea13271332df069020513ab70bdb5637ca42d6e492dc3bbbad48de" +dependencies = [ + "once_cell", + "opentelemetry", + "tracing", + "tracing-core", + "tracing-log", + "tracing-subscriber", +] + [[package]] name = "tracing-serde" version = "0.1.3" @@ -4102,6 +4165,22 @@ dependencies = [ "tracing-serde", ] +[[package]] +name = "tracing-utils" +version = "0.1.0" +dependencies = [ + "hyper", + "opentelemetry", + "opentelemetry-otlp", + "opentelemetry-semantic-conventions", + "reqwest", + "tokio", + "tracing", + "tracing-opentelemetry", + "tracing-subscriber", + "workspace_hack", +] + [[package]] name = "try-lock" version = "0.2.4" @@ -4183,9 +4262,9 @@ checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a" [[package]] name = "ureq" -version = "2.6.1" +version = "2.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "733b5ad78377302af52c0dbcb2623d78fe50e4b3bf215948ff29e9ee031d8566" +checksum = "338b31dd1314f68f3aabf3ed57ab922df95ffcd902476ca7ba3c4ce7b908c46d" dependencies = [ "base64 0.13.1", "log", @@ -4226,6 +4305,7 @@ version = "0.1.0" dependencies = [ "anyhow", "async-trait", + "atty", "bincode", "byteorder", "bytes", @@ -4287,7 +4367,7 @@ name = "wal_craft" version = "0.1.0" dependencies = [ "anyhow", - "clap 4.0.32", + "clap 4.1.1", "env_logger", "log", "once_cell", @@ -4534,11 +4614,13 @@ dependencies = [ "anyhow", "bytes", "chrono", - "clap 4.0.32", + "clap 4.1.1", "crossbeam-utils", "either", "fail", + "futures", "futures-channel", + "futures-executor", "futures-task", "futures-util", "indexmap", @@ -4554,6 +4636,9 @@ dependencies = [ "rand", "regex", "regex-syntax", + "reqwest", + "ring", + "rustls", "scopeguard", "serde", "serde_json", @@ -4561,6 +4646,7 @@ dependencies = [ "syn", "tokio", "tokio-util", + "tonic", "tower", "tracing", "tracing-core", diff --git a/Cargo.toml b/Cargo.toml index 74cc16d690..e6695c4246 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -61,6 +61,10 @@ nix = "0.26" notify = "5.0.0" num-traits = "0.2.15" once_cell = "1.13" +opentelemetry = "0.18.0" +opentelemetry-otlp = { version = "0.11.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } +opentelemetry-semantic-conventions = "0.10.0" +tracing-opentelemetry = "0.18.0" parking_lot = "0.12" pin-project-lite = "0.2" prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency @@ -69,7 +73,7 @@ rand = "0.8" regex = "1.4" reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] } routerify = "3" -rstar = "0.9.3" +rpds = "0.12.0" rustls = "0.20" rustls-pemfile = "1" rustls-split = "0.3" @@ -107,9 +111,6 @@ x509-parser = "0.14" env_logger = "0.10" log = "0.4" -## TODO switch when the new release is made -amplify_num = { git = "https://github.com/rust-amplify/rust-amplify.git", tag = "v4.0.0-beta.1" } - ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } @@ -128,6 +129,7 @@ remote_storage = { version = "0.1", path = "./libs/remote_storage/" } safekeeper_api = { version = "0.1", path = "./libs/safekeeper_api" } storage_broker = { version = "0.1", path = "./storage_broker/" } # Note: main broker code is inside the binary crate, so linking with the library shouldn't be 
heavy. tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" } +tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" } utils = { version = "0.1", path = "./libs/utils/" } ## Common library dependency diff --git a/Dockerfile.compute-node-v14 b/Dockerfile.compute-node similarity index 86% rename from Dockerfile.compute-node-v14 rename to Dockerfile.compute-node index 2deb95a93f..936f368833 100644 --- a/Dockerfile.compute-node-v14 +++ b/Dockerfile.compute-node @@ -1,8 +1,5 @@ -# -# This file is identical to the Dockerfile.compute-node-v15 file -# except for the version of Postgres that is built. -# - +ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com +ARG IMAGE=rust ARG TAG=pinned ######################################################################################### @@ -22,7 +19,8 @@ RUN apt update && \ # ######################################################################################### FROM build-deps AS pg-build -COPY vendor/postgres-v14 postgres +ARG PG_VERSION +COPY vendor/postgres-${PG_VERSION} postgres RUN cd postgres && \ ./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \ @@ -135,6 +133,27 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3 echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3_postgis.control +######################################################################################### +# +# Layer "unit-pg-build" +# compile unit extension +# +######################################################################################### +FROM build-deps AS unit-pg-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz && \ + tar xvzf 7.7.tar.gz && \ + cd postgresql-unit-7.7 && \ + make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + # unit extension's "create extension" script relies on absolute install path to fill some reference tables. + # We move the extension from '/usr/local/pgsql/' to '/usr/local/' after it is build. So we need to adjust the path. + # This one-liner removes pgsql/ part of the path. + # NOTE: Other extensions that rely on MODULEDIR variable after building phase will need the same fix. 
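+ # (Concretely: an embedded '/usr/local/pgsql/share/...' reference inside those unit*.sql files becomes '/usr/local/share/...'.)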
+ find /usr/local/pgsql/share/extension/ -name "unit*.sql" -print0 | xargs -0 sed -i "s|pgsql/||g" && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/unit.control + ######################################################################################### # # Layer "neon-pg-ext-build" @@ -146,6 +165,7 @@ COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=h3-pg-build /h3/usr / +COPY --from=unit-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY pgxn/ pgxn/ RUN make -j $(getconf _NPROCESSORS_ONLN) \ @@ -158,7 +178,7 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \ # Compile and run the Neon-specific `compute_ctl` binary # ######################################################################################### -FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:$TAG AS compute-tools +FROM $REPOSITORY/$IMAGE:$TAG AS compute-tools USER nonroot # Copy entire project to get Cargo.* files with proper dependencies for the whole project COPY --chown=nonroot . . diff --git a/Dockerfile.compute-node-v15 b/Dockerfile.compute-node-v15 deleted file mode 100644 index 8647ce2bf4..0000000000 --- a/Dockerfile.compute-node-v15 +++ /dev/null @@ -1,220 +0,0 @@ -# -# This file is identical to the Dockerfile.compute-node-v14 file -# except for the version of Postgres that is built. -# - -ARG TAG=pinned - -######################################################################################### -# -# Layer "build-deps" -# -######################################################################################### -FROM debian:bullseye-slim AS build-deps -RUN apt update && \ - apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \ - zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev - -######################################################################################### -# -# Layer "pg-build" -# Build Postgres from the neon postgres repository. -# -######################################################################################### -FROM build-deps AS pg-build -COPY vendor/postgres-v15 postgres -RUN cd postgres && \ - ./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp && \ - make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \ - make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \ - # Install headers - make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \ - make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install && \ - # Enable some of contrib extensions - echo 'trusted = true' >> /usr/local/pgsql/share/extension/bloom.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/intagg.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control - -######################################################################################### -# -# Layer "postgis-build" -# Build PostGIS from the upstream PostGIS mirror. 
-# -######################################################################################### -FROM build-deps AS postgis-build -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN apt update && \ - apt install -y gdal-bin libgdal-dev libprotobuf-c-dev protobuf-c-compiler xsltproc - -RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \ - tar xvzf postgis-3.3.1.tar.gz && \ - cd postgis-3.3.1 && \ - ./autogen.sh && \ - export PATH="/usr/local/pgsql/bin:$PATH" && \ - ./configure && \ - make -j $(getconf _NPROCESSORS_ONLN) install && \ - cd extensions/postgis && \ - make clean && \ - make -j $(getconf _NPROCESSORS_ONLN) install && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_raster.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer_data_us.control - -######################################################################################### -# -# Layer "plv8-build" -# Build plv8 -# -######################################################################################### -FROM build-deps AS plv8-build -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN apt update && \ - apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5 binutils - -# https://github.com/plv8/plv8/issues/475: -# v8 uses gold for linking and sets `--thread-count=4` which breaks -# gold version <= 1.35 (https://sourceware.org/bugzilla/show_bug.cgi?id=23607) -# Install newer gold version manually as debian-testing binutils version updates -# libc version, which in turn breaks other extension built against non-testing libc. 
-RUN wget https://ftp.gnu.org/gnu/binutils/binutils-2.38.tar.gz && \ - tar xvzf binutils-2.38.tar.gz && \ - cd binutils-2.38 && \ - cd libiberty && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && \ - cd ../bfd && ./configure && make bfdver.h && \ - cd ../gold && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && make install && \ - cp /usr/local/bin/ld.gold /usr/bin/gold - -# Sed is used to patch for https://github.com/plv8/plv8/issues/503 -RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \ - tar xvzf v3.1.4.tar.gz && \ - cd plv8-3.1.4 && \ - export PATH="/usr/local/pgsql/bin:$PATH" && \ - sed -i 's/MemoryContextAlloc(/MemoryContextAllocZero(/' plv8.cc && \ - make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \ - rm -rf /plv8-* && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control - -######################################################################################### -# -# Layer "h3-pg-build" -# Build h3_pg -# -######################################################################################### -FROM build-deps AS h3-pg-build -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ - -# packaged cmake is too old -RUN wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2-linux-x86_64.sh \ - -q -O /tmp/cmake-install.sh \ - && chmod u+x /tmp/cmake-install.sh \ - && /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \ - && rm /tmp/cmake-install.sh - -RUN wget https://github.com/uber/h3/archive/refs/tags/v4.0.1.tar.gz -O h3.tgz && \ - tar xvzf h3.tgz && \ - cd h3-4.0.1 && \ - mkdir build && \ - cd build && \ - cmake .. -DCMAKE_BUILD_TYPE=Release && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ - DESTDIR=/h3 make install && \ - cp -R /h3/usr / && \ - rm -rf build - -RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3-pg.tgz && \ - tar xvzf h3-pg.tgz && \ - cd h3-pg-4.0.1 && \ - export PATH="/usr/local/pgsql/bin:$PATH" && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ - make -j $(getconf _NPROCESSORS_ONLN) install && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3_postgis.control - -######################################################################################### -# -# Layer "neon-pg-ext-build" -# compile neon extensions -# -######################################################################################### -FROM build-deps AS neon-pg-ext-build -COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=h3-pg-build /h3/usr / -COPY pgxn/ pgxn/ - -RUN make -j $(getconf _NPROCESSORS_ONLN) \ - PG_CONFIG=/usr/local/pgsql/bin/pg_config \ - -C pgxn/neon \ - -s install - -######################################################################################### -# -# Compile and run the Neon-specific `compute_ctl` binary -# -######################################################################################### -FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:$TAG AS compute-tools -USER nonroot -# Copy entire project to get Cargo.* files with proper dependencies for the whole project -COPY --chown=nonroot . . 
-RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto - -######################################################################################### -# -# Clean up postgres folder before inclusion -# -######################################################################################### -FROM neon-pg-ext-build AS postgres-cleanup-layer -COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql - -# Remove binaries from /bin/ that we won't use (or would manually copy & install otherwise) -RUN cd /usr/local/pgsql/bin && rm ecpg raster2pgsql shp2pgsql pgtopo_export pgtopo_import pgsql2shp - -# Remove headers that we won't need anymore - we've completed installation of all extensions -RUN rm -r /usr/local/pgsql/include - -# Remove static postgresql libraries - all compilation is finished, so we -# can now remove these files - they must be included in other binaries by now -# if they were to be used by other libraries. -RUN rm /usr/local/pgsql/lib/lib*.a - -######################################################################################### -# -# Final layer -# Put it all together into the final image -# -######################################################################################### -FROM debian:bullseye-slim -# Add user postgres -RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ - echo "postgres:test_console_pass" | chpasswd && \ - mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \ - chown -R postgres:postgres /var/db/postgres && \ - chmod 0750 /var/db/postgres/compute && \ - echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig - -COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local -COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl - -# Install: -# libreadline8 for psql -# libossp-uuid16 for extension ossp-uuid -# libgeos, libgdal, libproj and libprotobuf-c1 for PostGIS -RUN apt update && \ - apt install --no-install-recommends -y \ - libreadline8 \ - libossp-uuid16 \ - libgeos-c1v5 \ - libgdal28 \ - libproj19 \ - libprotobuf-c1 \ - gdb && \ - rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* - -USER postgres -ENTRYPOINT ["/usr/local/bin/compute_ctl"] diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 4536604bdf..f8c3481f57 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -11,6 +11,7 @@ clap.workspace = true futures.workspace = true hyper = { workspace = true, features = ["full"] } notify.workspace = true +opentelemetry.workspace = true postgres.workspace = true regex.workspace = true serde.workspace = true @@ -19,7 +20,9 @@ tar.workspace = true tokio = { workspace = true, features = ["rt", "rt-multi-thread"] } tokio-postgres.workspace = true tracing.workspace = true +tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true +tracing-utils.workspace = true url.workspace = true workspace_hack.workspace = true diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index e5ab8eb153..2c42662020 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -53,7 +53,7 @@ use compute_tools::spec::*; use url::Url; fn main() -> Result<()> { - init_logger(DEFAULT_LOG_LEVEL)?; + init_tracing_and_logging(DEFAULT_LOG_LEVEL)?; let matches = cli().get_matches(); @@ -84,6 +84,29 @@ fn main() -> Result<()> { } }; + // Extract OpenTelemetry context for the startup actions from the spec, and 
+ // attach it to the current tracing context. + // + // This is used to propagate the context for the 'start_compute' operation + // from the neon control plane. This allows linking together the wider + // 'start_compute' operation that creates the compute container, with the + // startup actions here within the container. + // + // Switch to the startup context here, and exit it once the startup has + // completed and Postgres is up and running. + // + // NOTE: This is supposed to only cover the *startup* actions. Once + // postgres is configured and up-and-running, we exit this span. Any other + // actions that are performed on incoming HTTP requests, for example, are + // performed in separate spans. + let startup_context_guard = if let Some(ref carrier) = spec.startup_tracing_context { + use opentelemetry::propagation::TextMapPropagator; + use opentelemetry::sdk::propagation::TraceContextPropagator; + Some(TraceContextPropagator::new().extract(carrier).attach()) + } else { + None + }; + let pageserver_connstr = spec .cluster .settings @@ -140,6 +163,9 @@ fn main() -> Result<()> { // Wait for the child Postgres process forever. In this state Ctrl+C will // propagate to Postgres and it will be shut down as well. if let Some(mut pg) = pg { + // Startup is finished, exit the startup tracing span + drop(startup_context_guard); + let ecode = pg .wait() .expect("failed to start waiting on Postgres process"); @@ -159,6 +185,10 @@ fn main() -> Result<()> { info!("shutting down"); } + // Shutdown trace pipeline gracefully, so that it has a chance to send any + // pending traces before we exit. + tracing_utils::shutdown_tracing(); + exit(exit_code.unwrap_or(1)) } diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index f2a49f332c..589a8e1434 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -3,16 +3,21 @@ use std::net::SocketAddr; use std::sync::Arc; use std::thread; +use crate::compute::ComputeNode; use anyhow::Result; use hyper::service::{make_service_fn, service_fn}; use hyper::{Body, Method, Request, Response, Server, StatusCode}; use serde_json; use tracing::{error, info}; - -use crate::compute::ComputeNode; +use tracing_utils::http::OtelName; // Service function to handle all available routes. -async fn routes(req: Request, compute: Arc) -> Response { +async fn routes(req: Request, compute: &Arc) -> Response { + // + // NOTE: The URI path is currently included in traces. That's OK because + // it doesn't contain any variable parts or sensitive information. But + // please keep that in mind if you change the routing here. + // match (req.method(), req.uri().path()) { // Serialized compute state. (&Method::GET, "/status") => { @@ -30,7 +35,7 @@ async fn routes(req: Request, compute: Arc) -> Response (&Method::POST, "/check_writability") => { info!("serving /check_writability POST request"); - let res = crate::checker::check_writability(&compute).await; + let res = crate::checker::check_writability(compute).await; match res { Ok(_) => Response::new(Body::from("true")), Err(e) => Response::new(Body::from(e.to_string())), @@ -56,7 +61,19 @@ async fn serve(state: Arc) { async move { Ok::<_, Infallible>(service_fn(move |req: Request| { let state = state.clone(); - async move { Ok::<_, Infallible>(routes(req, state).await) } + async move { + Ok::<_, Infallible>( + // NOTE: We include the URI path in the string. It + // doesn't contain any variable parts or sensitive + // information in this API. 
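+ // (The wrapper below, from libs/tracing-utils, runs routes() inside an
+ // OpenTelemetry span and attaches any trace context found in the incoming
+ // request headers, so the request is recorded as part of the caller's trace.)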
+ tracing_utils::http::tracing_handler( + req, + |req| routes(req, &state), + OtelName::UriPath, + ) + .await, + ) + } })) } }); diff --git a/compute_tools/src/logger.rs b/compute_tools/src/logger.rs index 57e5496e86..1b5cf647b0 100644 --- a/compute_tools/src/logger.rs +++ b/compute_tools/src/logger.rs @@ -1,21 +1,37 @@ -use anyhow::Result; +use tracing_opentelemetry::OpenTelemetryLayer; use tracing_subscriber::layer::SubscriberExt; use tracing_subscriber::prelude::*; -/// Initialize `env_logger` using either `default_level` or +/// Initialize logging to stderr, and OpenTelemetry tracing and exporter. +/// +/// Logging is configured using either `default_log_level` or /// `RUST_LOG` environment variable as default log level. -pub fn init_logger(default_level: &str) -> Result<()> { +/// +/// OpenTelemetry is configured with OTLP/HTTP exporter. It picks up +/// configuration from environment variables. For example, to change the destination, +/// set `OTEL_EXPORTER_OTLP_ENDPOINT=http://jaeger:4318`. See +/// `tracing-utils` package description. +/// +pub fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> { + // Initialize Logging let env_filter = tracing_subscriber::EnvFilter::try_from_default_env() - .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_level)); + .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_log_level)); let fmt_layer = tracing_subscriber::fmt::layer() .with_target(false) .with_writer(std::io::stderr); + // Initialize OpenTelemetry + let otlp_layer = + tracing_utils::init_tracing_without_runtime("compute_ctl").map(OpenTelemetryLayer::new); + + // Put it all together tracing_subscriber::registry() .with(env_filter) + .with(otlp_layer) .with(fmt_layer) .init(); + tracing::info!("logging and tracing started"); Ok(()) } diff --git a/compute_tools/src/params.rs b/compute_tools/src/params.rs index 925a2f8ef3..0ce01ff478 100644 --- a/compute_tools/src/params.rs +++ b/compute_tools/src/params.rs @@ -1,3 +1,9 @@ pub const DEFAULT_LOG_LEVEL: &str = "info"; -pub const DEFAULT_CONNSTRING: &str = "host=localhost user=postgres"; +// From Postgres docs: +// To ease transition from the md5 method to the newer SCRAM method, if md5 is specified +// as a method in pg_hba.conf but the user's password on the server is encrypted for SCRAM +// (see below), then SCRAM-based authentication will automatically be chosen instead. +// https://www.postgresql.org/docs/15/auth-password.html +// +// So it's safe to set md5 here, as `control-plane` anyway uses SCRAM for all roles. pub const PG_HBA_ALL_MD5: &str = "host\tall\t\tall\t\t0.0.0.0/0\t\tmd5"; diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 97cd623052..bbd0ec21ed 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -1,3 +1,4 @@ +use std::collections::HashMap; use std::path::Path; use std::str::FromStr; @@ -22,6 +23,8 @@ pub struct ComputeSpec { /// Expected cluster state at the end of transition process. pub cluster: Cluster, pub delta_operations: Option>, + + pub startup_tracing_context: Option>, } /// Cluster state seen from the perspective of the external tools @@ -152,8 +155,20 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> { { RoleAction::Update } else if let Some(pg_pwd) = &r.encrypted_password { - // Check whether password changed or not (trim 'md5:' prefix first) - if pg_pwd[3..] 
!= *role.encrypted_password.as_ref().unwrap() { + // Check whether password changed or not (trim 'md5' prefix first if any) + // + // This is a backward compatibility hack, which comes from the times when we were using + // md5 for everyone and hashes were stored in the console db without md5 prefix. So when + // role comes from the control-plane (json spec) `Role.encrypted_password` doesn't have md5 prefix, + // but when role comes from Postgres (`get_existing_roles` / `existing_roles`) it has this prefix. + // Here is the only place so far where we compare hashes, so it seems to be the best candidate + // to place this compatibility layer. + let pg_pwd = if let Some(stripped) = pg_pwd.strip_prefix("md5") { + stripped + } else { + pg_pwd + }; + if pg_pwd != *role.encrypted_password.as_ref().unwrap() { RoleAction::Update } else { RoleAction::None @@ -372,13 +387,13 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { name.pg_quote(), db.owner.pg_quote() ); - let _ = info_span!("executing", query).entered(); + let _guard = info_span!("executing", query).entered(); client.execute(query.as_str(), &[])?; } DatabaseAction::Create => { let mut query: String = format!("CREATE DATABASE {} ", name.pg_quote()); query.push_str(&db.to_pg_options()); - let _ = info_span!("executing", query).entered(); + let _guard = info_span!("executing", query).entered(); client.execute(query.as_str(), &[])?; } }; diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index 880ab0e83c..07d220195b 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -8,6 +8,7 @@ pub use prometheus::opts; pub use prometheus::register; pub use prometheus::{core, default_registry, proto}; pub use prometheus::{exponential_buckets, linear_buckets}; +pub use prometheus::{register_counter_vec, Counter, CounterVec}; pub use prometheus::{register_gauge, Gauge}; pub use prometheus::{register_gauge_vec, GaugeVec}; pub use prometheus::{register_histogram, Histogram}; diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index b5027cb331..0d7aa2db55 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -29,6 +29,14 @@ pub enum TenantState { Broken, } +pub mod state { + pub const LOADING: &str = "loading"; + pub const ATTACHING: &str = "attaching"; + pub const ACTIVE: &str = "active"; + pub const STOPPING: &str = "stopping"; + pub const BROKEN: &str = "broken"; +} + impl TenantState { pub fn has_in_progress_downloads(&self) -> bool { match self { @@ -39,23 +47,32 @@ impl TenantState { Self::Broken => false, } } + + pub fn as_str(&self) -> &'static str { + match self { + TenantState::Loading => state::LOADING, + TenantState::Attaching => state::ATTACHING, + TenantState::Active => state::ACTIVE, + TenantState::Stopping => state::STOPPING, + TenantState::Broken => state::BROKEN, + } + } } /// A state of a timeline in pageserver's memory. #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] pub enum TimelineState { - /// Timeline is fully operational. If the containing Tenant is Active, the timeline's - /// background jobs are running otherwise they will be launched when the tenant is activated. + /// The timeline is recognized by the pageserver but is not yet operational. + /// In particular, the walreceiver connection loop is not running for this timeline. + /// It will eventually transition to state Active or Broken. + Loading, + /// The timeline is fully operational. 
+ /// It can be queried, and the walreceiver connection loop is running. Active, - /// A timeline is recognized by pageserver, but not yet ready to operate. - /// The status indicates, that the timeline could eventually go back to Active automatically: - /// for example, if the owning tenant goes back to Active again. - Suspended, - /// A timeline is recognized by pageserver, but not yet ready to operate and not allowed to - /// automatically become Active after certain events: only a management call can change this status. + /// The timeline was previously Loading or Active but is shutting down. + /// It cannot transition back into any other state. Stopping, - /// A timeline is recognized by the pageserver, but can no longer be used for - /// any operations, because it failed to be activated. + /// The timeline is broken and not operational (previous states: Loading or Active). Broken, } diff --git a/libs/tracing-utils/Cargo.toml b/libs/tracing-utils/Cargo.toml new file mode 100644 index 0000000000..8c3d3f9063 --- /dev/null +++ b/libs/tracing-utils/Cargo.toml @@ -0,0 +1,17 @@ +[package] +name = "tracing-utils" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +[dependencies] +hyper.workspace = true +opentelemetry = { workspace = true, features=["rt-tokio"] } +opentelemetry-otlp = { workspace = true, default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } +opentelemetry-semantic-conventions.workspace = true +reqwest = { workspace = true, default-features = false, features = ["rustls-tls"] } +tokio = { workspace = true, features = ["rt", "rt-multi-thread"] } +tracing.workspace = true +tracing-opentelemetry.workspace = true +tracing-subscriber.workspace = true +workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/tracing-utils/src/http.rs b/libs/tracing-utils/src/http.rs new file mode 100644 index 0000000000..3f80f49de1 --- /dev/null +++ b/libs/tracing-utils/src/http.rs @@ -0,0 +1,96 @@ +//! Tracing wrapper for Hyper HTTP server + +use hyper::HeaderMap; +use hyper::{Body, Request, Response}; +use std::future::Future; +use tracing::Instrument; +use tracing_opentelemetry::OpenTelemetrySpanExt; + +/// Configuration option for what to use as the "otel.name" field in the traces. +pub enum OtelName<'a> { + /// Use a constant string + Constant(&'a str), + + /// Use the path from the request. + /// + /// That's very useful information, but is not appropriate if the + /// path contains parameters that differ on ever request, or worse, + /// sensitive information like usernames or email addresses. + /// + /// See + UriPath, +} + +/// Handle an incoming HTTP request using the given handler function, +/// with OpenTelemetry tracing. +/// +/// This runs 'handler' on the request in a new span, with fields filled in +/// from the request. Notably, if the request contains tracing information, +/// it is propagated to the span, so that this request is traced as part of +/// the same trace. +/// +/// XXX: Usually, this is handled by existing libraries, or built +/// directly into HTTP servers. However, I couldn't find one for Hyper, +/// so I had to write our own. OpenTelemetry website has a registry of +/// instrumentation libraries at: +/// https://opentelemetry.io/registry/?language=rust&component=instrumentation +/// If a Hyper crate appears, consider switching to that. 
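+/// # Example
+///
+/// A minimal usage sketch; `hello` and `serve_one` are hypothetical names, and the
+/// real call site in this patch is in `compute_tools/src/http/api.rs`:
+///
+/// ```rust,no_run
+/// use hyper::{Body, Request, Response};
+/// use tracing_utils::http::{tracing_handler, OtelName};
+///
+/// async fn hello(_req: Request<Body>) -> Response<Body> {
+///     Response::new(Body::from("hello"))
+/// }
+///
+/// async fn serve_one(req: Request<Body>) -> Response<Body> {
+///     // Runs `hello` inside a span named "hello"; any W3C trace context in the
+///     // request headers becomes the parent of that span.
+///     tracing_handler(req, hello, OtelName::Constant("hello")).await
+/// }
+/// ```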
+pub async fn tracing_handler( + req: Request, + handler: F, + otel_name: OtelName<'_>, +) -> Response +where + F: Fn(Request) -> R, + R: Future>, +{ + // Create a tracing span, with context propagated from the incoming + // request if any. + // + // See list of standard fields defined for HTTP requests at + // https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/trace/semantic_conventions/http.md + // We only fill in a few of the most useful ones here. + let otel_name = match otel_name { + OtelName::Constant(s) => s, + OtelName::UriPath => req.uri().path(), + }; + + let span = tracing::info_span!( + "http request", + otel.name= %otel_name, + http.method = %req.method(), + http.status_code = tracing::field::Empty, + ); + let parent_ctx = extract_remote_context(req.headers()); + span.set_parent(parent_ctx); + + // Handle the request within the span + let response = handler(req).instrument(span.clone()).await; + + // Fill in the fields from the response code + let status = response.status(); + span.record("http.status_code", status.as_str()); + span.record( + "otel.status_code", + if status.is_success() { "OK" } else { "ERROR" }, + ); + + response +} + +// Extract remote tracing context from the HTTP headers +fn extract_remote_context(headers: &HeaderMap) -> opentelemetry::Context { + struct HeaderExtractor<'a>(&'a HeaderMap); + + impl<'a> opentelemetry::propagation::Extractor for HeaderExtractor<'a> { + fn get(&self, key: &str) -> Option<&str> { + self.0.get(key).and_then(|value| value.to_str().ok()) + } + + fn keys(&self) -> Vec<&str> { + self.0.keys().map(|value| value.as_str()).collect() + } + } + let extractor = HeaderExtractor(headers); + opentelemetry::global::get_text_map_propagator(|propagator| propagator.extract(&extractor)) +} diff --git a/libs/tracing-utils/src/lib.rs b/libs/tracing-utils/src/lib.rs new file mode 100644 index 0000000000..de0e2ad799 --- /dev/null +++ b/libs/tracing-utils/src/lib.rs @@ -0,0 +1,168 @@ +//! Helper functions to set up OpenTelemetry tracing. +//! +//! This comes in two variants, depending on whether you have a Tokio runtime available. +//! If you do, call `init_tracing()`. It sets up the trace processor and exporter to use +//! the current tokio runtime. If you don't have a runtime available, or you don't want +//! to share the runtime with the tracing tasks, call `init_tracing_without_runtime()` +//! instead. It sets up a dedicated single-threaded Tokio runtime for the tracing tasks. +//! +//! Example: +//! +//! ```rust,no_run +//! use tracing_subscriber::prelude::*; +//! use tracing_opentelemetry::OpenTelemetryLayer; +//! +//! #[tokio::main] +//! async fn main() { +//! // Set up logging to stderr +//! let env_filter = tracing_subscriber::EnvFilter::try_from_default_env() +//! .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info")); +//! let fmt_layer = tracing_subscriber::fmt::layer() +//! .with_target(false) +//! .with_writer(std::io::stderr); +//! +//! // Initialize OpenTelemetry. Exports tracing spans as OpenTelemetry traces +//! let otlp_layer = tracing_utils::init_tracing("my_application").await.map(OpenTelemetryLayer::new); +//! +//! // Put it all together +//! tracing_subscriber::registry() +//! .with(env_filter) +//! .with(otlp_layer) +//! .with(fmt_layer) +//! .init(); +//! } +//! 
``` + +use opentelemetry::sdk::Resource; +use opentelemetry::KeyValue; +use opentelemetry_otlp::WithExportConfig; +use opentelemetry_otlp::{OTEL_EXPORTER_OTLP_ENDPOINT, OTEL_EXPORTER_OTLP_TRACES_ENDPOINT}; + +pub use tracing_opentelemetry::OpenTelemetryLayer; + +pub mod http; + +/// Set up OpenTelemetry exporter, using configuration from environment variables. +/// +/// `service_name` is set as the OpenTelemetry 'service.name' resource (see +/// ) +/// +/// We try to follow the conventions for the environment variables specified in +/// +/// +/// However, we only support a subset of those options: +/// +/// - OTEL_SDK_DISABLED is supported. The default is "false", meaning tracing +/// is enabled by default. Set it to "true" to disable. +/// +/// - We use the OTLP exporter, with HTTP protocol. Most of the OTEL_EXPORTER_OTLP_* +/// settings specified in +/// +/// are supported, as they are handled by the `opentelemetry-otlp` crate. +/// Settings related to other exporters have no effect. +/// +/// - Some other settings are supported by the `opentelemetry` crate. +/// +/// If you need some other setting, please test if it works first. And perhaps +/// add a comment in the list above to save the effort of testing for the next +/// person. +/// +/// This doesn't block, but is marked as 'async' to hint that this must be called in +/// asynchronous execution context. +pub async fn init_tracing(service_name: &str) -> Option { + if std::env::var("OTEL_SDK_DISABLED") == Ok("true".to_string()) { + return None; + }; + Some(init_tracing_internal(service_name.to_string())) +} + +/// Like `init_tracing`, but creates a separate tokio Runtime for the tracing +/// tasks. +pub fn init_tracing_without_runtime( + service_name: &str, +) -> Option { + if std::env::var("OTEL_SDK_DISABLED") == Ok("true".to_string()) { + return None; + }; + + // The opentelemetry batch processor and the OTLP exporter needs a Tokio + // runtime. Create a dedicated runtime for them. One thread should be + // enough. + // + // (Alternatively, instead of batching, we could use the "simple + // processor", which doesn't need Tokio, and use "reqwest-blocking" + // feature for the OTLP exporter, which also doesn't need Tokio. However, + // batching is considered best practice, and also I have the feeling that + // the non-Tokio codepaths in the opentelemetry crate are less used and + // might be more buggy, so better to stay on the well-beaten path.) + // + // We leak the runtime so that it keeps running after we exit the + // function. + let runtime = Box::leak(Box::new( + tokio::runtime::Builder::new_multi_thread() + .enable_all() + .thread_name("otlp runtime thread") + .worker_threads(1) + .build() + .unwrap(), + )); + let _guard = runtime.enter(); + + Some(init_tracing_internal(service_name.to_string())) +} + +fn init_tracing_internal(service_name: String) -> opentelemetry::sdk::trace::Tracer { + // Set up exporter from the OTEL_EXPORTER_* environment variables + let mut exporter = opentelemetry_otlp::new_exporter().http().with_env(); + + // XXX opentelemetry-otlp v0.18.0 has a bug in how it uses the + // OTEL_EXPORTER_OTLP_ENDPOINT env variable. According to the + // OpenTelemetry spec at + // , + // the full exporter URL is formed by appending "/v1/traces" to the value + // of OTEL_EXPORTER_OTLP_ENDPOINT. However, opentelemetry-otlp only does + // that with the grpc-tonic exporter. Other exporters, like the HTTP + // exporter, use the URL from OTEL_EXPORTER_OTLP_ENDPOINT as is, without + // appending "/v1/traces". 
+ // + // See https://github.com/open-telemetry/opentelemetry-rust/pull/950 + // + // Work around that by checking OTEL_EXPORTER_OTLP_ENDPOINT, and setting + // the endpoint url with the "/v1/traces" path ourselves. If the bug is + // fixed in a later version, we can remove this code. But if we don't + // remember to remove this, it won't do any harm either, as the crate will + // just ignore the OTEL_EXPORTER_OTLP_ENDPOINT setting when the endpoint + // is set directly with `with_endpoint`. + if std::env::var(OTEL_EXPORTER_OTLP_TRACES_ENDPOINT).is_err() { + if let Ok(mut endpoint) = std::env::var(OTEL_EXPORTER_OTLP_ENDPOINT) { + if !endpoint.ends_with('/') { + endpoint.push('/'); + } + endpoint.push_str("v1/traces"); + exporter = exporter.with_endpoint(endpoint); + } + } + + // Propagate trace information in the standard W3C TraceContext format. + opentelemetry::global::set_text_map_propagator( + opentelemetry::sdk::propagation::TraceContextPropagator::new(), + ); + + opentelemetry_otlp::new_pipeline() + .tracing() + .with_exporter(exporter) + .with_trace_config( + opentelemetry::sdk::trace::config().with_resource(Resource::new(vec![KeyValue::new( + opentelemetry_semantic_conventions::resource::SERVICE_NAME, + service_name, + )])), + ) + .install_batch(opentelemetry::runtime::Tokio) + .expect("could not initialize opentelemetry exporter") +} + +// Shutdown trace pipeline gracefully, so that it has a chance to send any +// pending traces before we exit. +pub fn shutdown_tracing() { + opentelemetry::global::shutdown_tracer_provider(); +} diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 020e4d9dd7..1f6c96bdbe 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -5,6 +5,7 @@ edition.workspace = true license.workspace = true [dependencies] +atty.workspace = true sentry.workspace = true async-trait.workspace = true anyhow.workspace = true diff --git a/libs/utils/src/http/error.rs b/libs/utils/src/http/error.rs index b0ecb746d9..1ba0422993 100644 --- a/libs/utils/src/http/error.rs +++ b/libs/utils/src/http/error.rs @@ -1,6 +1,7 @@ use hyper::{header, Body, Response, StatusCode}; use serde::{Deserialize, Serialize}; use thiserror::Error; +use tracing::error; #[derive(Debug, Error)] pub enum ApiError { @@ -76,8 +77,16 @@ impl HttpErrorBody { } pub async fn handler(err: routerify::RouteError) -> Response { - tracing::error!("Error processing HTTP request: {:?}", err); - err.downcast::() - .expect("handler should always return api error") - .into_response() + let api_error = err + .downcast::() + .expect("handler should always return api error"); + + // Print a stack trace for Internal Server errors + if let ApiError::InternalServerError(_) = api_error.as_ref() { + error!("Error processing HTTP request: {api_error:?}"); + } else { + error!("Error processing HTTP request: {api_error:#}"); + } + + api_error.into_response() } diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index 82c9267f4a..02684d3d16 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -34,7 +34,7 @@ pub fn init(log_format: LogFormat) -> anyhow::Result<()> { let base_logger = tracing_subscriber::fmt() .with_env_filter(env_filter) .with_target(false) - .with_ansi(false) + .with_ansi(atty::is(atty::Stream::Stdout)) .with_writer(std::io::stdout); match log_format { diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index cb9e4478bf..66c25e8576 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -11,7 +11,6 @@ default = [] testing = 
["fail/failpoints"] [dependencies] -amplify_num.workspace = true anyhow.workspace = true async-stream.workspace = true async-trait.workspace = true @@ -41,7 +40,6 @@ postgres-protocol.workspace = true postgres-types.workspace = true rand.workspace = true regex.workspace = true -rstar.workspace = true scopeguard.workspace = true serde.workspace = true serde_json = { workspace = true, features = ["raw_value"] } @@ -68,6 +66,7 @@ tenant_size_model.workspace = true utils.workspace = true workspace_hack.workspace = true reqwest.workspace = true +rpds.workspace = true [dev-dependencies] criterion.workspace = true diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs index 6a01fdfc6f..e18c00da96 100644 --- a/pageserver/benches/bench_layer_map.rs +++ b/pageserver/benches/bench_layer_map.rs @@ -1,13 +1,12 @@ -use anyhow::Result; +use pageserver::keyspace::{KeyPartitioning, KeySpace}; use pageserver::repository::Key; use pageserver::tenant::layer_map::LayerMap; -use pageserver::tenant::storage_layer::{DeltaFileName, ImageFileName, ValueReconstructState}; -use pageserver::tenant::storage_layer::{Layer, ValueReconstructResult}; +use pageserver::tenant::storage_layer::Layer; +use pageserver::tenant::storage_layer::{DeltaFileName, ImageFileName, LayerDescriptor}; use rand::prelude::{SeedableRng, SliceRandom, StdRng}; use std::cmp::{max, min}; use std::fs::File; use std::io::{BufRead, BufReader}; -use std::ops::Range; use std::path::PathBuf; use std::str::FromStr; use std::sync::Arc; @@ -17,102 +16,35 @@ use utils::lsn::Lsn; use criterion::{criterion_group, criterion_main, Criterion}; -struct DummyDelta { - key_range: Range, - lsn_range: Range, -} - -impl Layer for DummyDelta { - fn get_key_range(&self) -> Range { - self.key_range.clone() - } - - fn get_lsn_range(&self) -> Range { - self.lsn_range.clone() - } - fn get_value_reconstruct_data( - &self, - _key: Key, - _lsn_range: Range, - _reconstruct_data: &mut ValueReconstructState, - ) -> Result { - panic!() - } - - fn is_incremental(&self) -> bool { - true - } - - fn dump(&self, _verbose: bool) -> Result<()> { - unimplemented!() - } - - fn short_id(&self) -> String { - unimplemented!() - } -} - -struct DummyImage { - key_range: Range, - lsn: Lsn, -} - -impl Layer for DummyImage { - fn get_key_range(&self) -> Range { - self.key_range.clone() - } - - fn get_lsn_range(&self) -> Range { - // End-bound is exclusive - self.lsn..(self.lsn + 1) - } - - fn get_value_reconstruct_data( - &self, - _key: Key, - _lsn_range: Range, - _reconstruct_data: &mut ValueReconstructState, - ) -> Result { - panic!() - } - - fn is_incremental(&self) -> bool { - false - } - - fn dump(&self, _verbose: bool) -> Result<()> { - unimplemented!() - } - - fn short_id(&self) -> String { - unimplemented!() - } -} - -fn build_layer_map(filename_dump: PathBuf) -> LayerMap { - let mut layer_map = LayerMap::::default(); +fn build_layer_map(filename_dump: PathBuf) -> LayerMap { + let mut layer_map = LayerMap::::default(); let mut min_lsn = Lsn(u64::MAX); let mut max_lsn = Lsn(0); let filenames = BufReader::new(File::open(filename_dump).unwrap()).lines(); + let mut updates = layer_map.batch_update(); for fname in filenames { let fname = &fname.unwrap(); if let Some(imgfilename) = ImageFileName::parse_str(fname) { - let layer = DummyImage { - key_range: imgfilename.key_range, - lsn: imgfilename.lsn, + let layer = LayerDescriptor { + key: imgfilename.key_range, + lsn: imgfilename.lsn..(imgfilename.lsn + 1), + is_incremental: false, + short_id: 
fname.to_string(), }; - layer_map.insert_historic(Arc::new(layer)); + updates.insert_historic(Arc::new(layer)); min_lsn = min(min_lsn, imgfilename.lsn); max_lsn = max(max_lsn, imgfilename.lsn); } else if let Some(deltafilename) = DeltaFileName::parse_str(fname) { - let layer = DummyDelta { - key_range: deltafilename.key_range, - lsn_range: deltafilename.lsn_range.clone(), + let layer = LayerDescriptor { + key: deltafilename.key_range.clone(), + lsn: deltafilename.lsn_range.clone(), + is_incremental: true, + short_id: fname.to_string(), }; - layer_map.insert_historic(Arc::new(layer)); + updates.insert_historic(Arc::new(layer)); min_lsn = min(min_lsn, deltafilename.lsn_range.start); max_lsn = max(max_lsn, deltafilename.lsn_range.end); } else { @@ -122,11 +54,12 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap { println!("min: {min_lsn}, max: {max_lsn}"); + updates.flush(); layer_map } /// Construct a layer map query pattern for benchmarks -fn uniform_query_pattern(layer_map: &LayerMap) -> Vec<(Key, Lsn)> { +fn uniform_query_pattern(layer_map: &LayerMap) -> Vec<(Key, Lsn)> { // For each image layer we query one of the pages contained, at LSN right // before the image layer was created. This gives us a somewhat uniform // coverage of both the lsn and key space because image layers have @@ -150,6 +83,41 @@ fn uniform_query_pattern(layer_map: &LayerMap) -> Vec<(Key, Lsn)> { .collect() } +// Construct a partitioning for testing get_difficulty map when we +// don't have an exact result of `collect_keyspace` to work with. +fn uniform_key_partitioning(layer_map: &LayerMap, _lsn: Lsn) -> KeyPartitioning { + let mut parts = Vec::new(); + + // We add a partition boundary at the start of each image layer, + // no matter what lsn range it covers. This is just the easiest + // thing to do. A better thing to do would be to get a real + // partitioning from some database. Even better, remove the need + // for key partitions by deciding where to create image layers + // directly based on a coverage-based difficulty map. + let mut keys: Vec<_> = layer_map + .iter_historic_layers() + .filter_map(|l| { + if l.is_incremental() { + None + } else { + let kr = l.get_key_range(); + Some(kr.start.next()) + } + }) + .collect(); + keys.sort(); + + let mut current_key = Key::from_hex("000000000000000000000000000000000000").unwrap(); + for key in keys { + parts.push(KeySpace { + ranges: vec![current_key..key], + }); + current_key = key; + } + + KeyPartitioning { parts } +} + // Benchmark using metadata extracted from our performance test environment, from // a project where we have run pgbench many times. The pgbench database was initialized // between each test run. @@ -183,24 +151,68 @@ fn bench_from_captest_env(c: &mut Criterion) { // Benchmark using metadata extracted from a real project that was taking // too long processing layer map queries.
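As an aside before the real-project benchmark below: the benchmarks above now go through the batch-update API (`batch_update()`, `insert_historic()`, `flush()`) instead of inserting layers one by one. Here is a minimal, self-contained sketch of that flow, assuming the same crate paths the benchmark uses; the single hard-coded image layer is purely illustrative.

```rust
// Illustrative sketch only (not part of the patch): build a LayerMap with one
// image layer via the batch-update flow used by the benchmarks above.
use std::sync::Arc;

use pageserver::repository::Key;
use pageserver::tenant::layer_map::LayerMap;
use pageserver::tenant::storage_layer::LayerDescriptor;
use utils::lsn::Lsn;

fn example() {
    let mut layer_map = LayerMap::default();

    // Inserts are staged on the update handle and become visible on flush().
    let mut updates = layer_map.batch_update();
    let zero = Key::from_hex("000000000000000000000000000000000000").unwrap();
    updates.insert_historic(Arc::new(LayerDescriptor {
        key: zero..zero.add(10),
        // Image layers cover a single LSN; the end bound is exclusive.
        lsn: Lsn(10)..Lsn(11),
        is_incremental: false,
        short_id: "example image layer".to_string(),
    }));
    updates.flush();

    // Queries go through the map itself, as in the benchmarks.
    let _ = layer_map.search(zero.add(5), Lsn(20));
}
```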
fn bench_from_real_project(c: &mut Criterion) { - // TODO consider compressing this file + // Init layer map + let now = Instant::now(); let layer_map = build_layer_map(PathBuf::from("benches/odd-brook-layernames.txt")); + println!("Finished layer map init in {:?}", now.elapsed()); + + // Choose uniformly distributed queries let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map); - // Test with uniform query pattern - c.bench_function("real_map_uniform_queries", |b| { + // Choose inputs for get_difficulty_map + let latest_lsn = layer_map + .iter_historic_layers() + .map(|l| l.get_lsn_range().end) + .max() + .unwrap(); + let partitioning = uniform_key_partitioning(&layer_map, latest_lsn); + + // Check correctness of get_difficulty_map + // TODO put this in a dedicated test outside of this mod + { + println!("running correctness check"); + + let now = Instant::now(); + let result_bruteforce = layer_map.get_difficulty_map_bruteforce(latest_lsn, &partitioning); + assert!(result_bruteforce.len() == partitioning.parts.len()); + println!("Finished bruteforce in {:?}", now.elapsed()); + + let now = Instant::now(); + let result_fast = layer_map.get_difficulty_map(latest_lsn, &partitioning, None); + assert!(result_fast.len() == partitioning.parts.len()); + println!("Finished fast in {:?}", now.elapsed()); + + // Assert results are equal. Manually iterate for easier debugging. + let zip = std::iter::zip( + &partitioning.parts, + std::iter::zip(result_bruteforce, result_fast), + ); + for (_part, (bruteforce, fast)) in zip { + assert_eq!(bruteforce, fast); + } + + println!("No issues found"); + } + + // Define and name the benchmark function + let mut group = c.benchmark_group("real_map"); + group.bench_function("uniform_queries", |b| { b.iter(|| { for q in queries.clone().into_iter() { layer_map.search(q.0, q.1); } }); }); + group.bench_function("get_difficulty_map", |b| { + b.iter(|| { + layer_map.get_difficulty_map(latest_lsn, &partitioning, Some(3)); + }); + }); + group.finish(); } // Benchmark using synthetic data. Arrange image layers on stacked diagonal lines. fn bench_sequential(c: &mut Criterion) { - let mut layer_map: LayerMap = LayerMap::default(); - // Init layer map. Create 100_000 layers arranged in 1000 diagonal lines. // // TODO This code is pretty slow and runs even if we're only running other @@ -208,39 +220,39 @@ fn bench_sequential(c: &mut Criterion) { // Putting it inside the `bench_function` closure is not a solution // because then it runs multiple times during warmup. let now = Instant::now(); + let mut layer_map = LayerMap::default(); + let mut updates = layer_map.batch_update(); for i in 0..100_000 { - // TODO try inserting a super-wide layer in between every 10 to reflect - // what often happens with L1 layers that include non-rel changes. - // Maybe do that as a separate test. let i32 = (i as u32) % 100; let zero = Key::from_hex("000000000000000000000000000000000000").unwrap(); - let layer = DummyImage { - key_range: zero.add(10 * i32)..zero.add(10 * i32 + 1), - lsn: Lsn(10 * i), + let layer = LayerDescriptor { + key: zero.add(10 * i32)..zero.add(10 * i32 + 1), + lsn: Lsn(i)..Lsn(i + 1), + is_incremental: false, + short_id: format!("Layer {}", i), }; - layer_map.insert_historic(Arc::new(layer)); + updates.insert_historic(Arc::new(layer)); } - - // Manually measure runtime without criterion because criterion - // has a minimum sample size of 10 and I don't want to run it 10 times. 
- println!("Finished init in {:?}", now.elapsed()); + updates.flush(); + println!("Finished layer map init in {:?}", now.elapsed()); // Choose 100 uniformly random queries let rng = &mut StdRng::seed_from_u64(1); let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map) - .choose_multiple(rng, 1) + .choose_multiple(rng, 100) .copied() .collect(); // Define and name the benchmark function - c.bench_function("sequential_uniform_queries", |b| { - // Run the search queries + let mut group = c.benchmark_group("sequential"); + group.bench_function("uniform_queries", |b| { b.iter(|| { for q in queries.clone().into_iter() { layer_map.search(q.0, q.1); } }); }); + group.finish(); } criterion_group!(group_1, bench_from_captest_env); diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index f1d92ac36b..06d4853274 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -27,6 +27,7 @@ use tracing::*; /// use tokio_tar::{Builder, EntryType, Header}; +use crate::context::RequestContext; use crate::tenant::Timeline; use pageserver_api::reltag::{RelTag, SlruKind}; @@ -52,6 +53,7 @@ pub async fn send_basebackup_tarball<'a, W>( req_lsn: Option, prev_lsn: Option, full_backup: bool, + ctx: &'a RequestContext, ) -> anyhow::Result<()> where W: AsyncWrite + Send + Sync + Unpin, @@ -110,6 +112,7 @@ where lsn: backup_lsn, prev_record_lsn: prev_lsn, full_backup, + ctx, }; basebackup .send_tarball() @@ -129,6 +132,7 @@ where lsn: Lsn, prev_record_lsn: Lsn, full_backup: bool, + ctx: &'a RequestContext, } impl<'a, W> Basebackup<'a, W> @@ -171,23 +175,37 @@ where SlruKind::MultiXactOffsets, SlruKind::MultiXactMembers, ] { - for segno in self.timeline.list_slru_segments(kind, self.lsn).await? { + for segno in self + .timeline + .list_slru_segments(kind, self.lsn, self.ctx) + .await? + { self.add_slru_segment(kind, segno).await?; } } // Create tablespace directories - for ((spcnode, dbnode), has_relmap_file) in self.timeline.list_dbdirs(self.lsn).await? { + for ((spcnode, dbnode), has_relmap_file) in + self.timeline.list_dbdirs(self.lsn, self.ctx).await? + { self.add_dbdir(spcnode, dbnode, has_relmap_file).await?; // Gather and send relational files in each database if full backup is requested. if self.full_backup { - for rel in self.timeline.list_rels(spcnode, dbnode, self.lsn).await? { + for rel in self + .timeline + .list_rels(spcnode, dbnode, self.lsn, self.ctx) + .await? + { self.add_rel(rel).await?; } } } - for xid in self.timeline.list_twophase_files(self.lsn).await? { + for xid in self + .timeline + .list_twophase_files(self.lsn, self.ctx) + .await? 
+ { self.add_twophase_file(xid).await?; } @@ -203,7 +221,10 @@ where } async fn add_rel(&mut self, tag: RelTag) -> anyhow::Result<()> { - let nblocks = self.timeline.get_rel_size(tag, self.lsn, false).await?; + let nblocks = self + .timeline + .get_rel_size(tag, self.lsn, false, self.ctx) + .await?; // If the relation is empty, create an empty file if nblocks == 0 { @@ -223,7 +244,7 @@ where for blknum in startblk..endblk { let img = self .timeline - .get_rel_page_at_lsn(tag, blknum, self.lsn, false) + .get_rel_page_at_lsn(tag, blknum, self.lsn, false, self.ctx) .await?; segment_data.extend_from_slice(&img[..]); } @@ -245,14 +266,14 @@ where async fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> { let nblocks = self .timeline - .get_slru_segment_size(slru, segno, self.lsn) + .get_slru_segment_size(slru, segno, self.lsn, self.ctx) .await?; let mut slru_buf: Vec = Vec::with_capacity(nblocks as usize * BLCKSZ as usize); for blknum in 0..nblocks { let img = self .timeline - .get_slru_page_at_lsn(slru, segno, blknum, self.lsn) + .get_slru_page_at_lsn(slru, segno, blknum, self.lsn, self.ctx) .await?; if slru == SlruKind::Clog { @@ -287,7 +308,7 @@ where let relmap_img = if has_relmap_file { let img = self .timeline - .get_relmap_file(spcnode, dbnode, self.lsn) + .get_relmap_file(spcnode, dbnode, self.lsn, self.ctx) .await?; ensure!(img.len() == 512); Some(img) @@ -323,7 +344,7 @@ where if !has_relmap_file && self .timeline - .list_rels(spcnode, dbnode, self.lsn) + .list_rels(spcnode, dbnode, self.lsn, self.ctx) .await? .is_empty() { @@ -356,7 +377,10 @@ where // Extract twophase state files // async fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> { - let img = self.timeline.get_twophase_file(xid, self.lsn).await?; + let img = self + .timeline + .get_twophase_file(xid, self.lsn, self.ctx) + .await?; let mut buf = BytesMut::new(); buf.extend_from_slice(&img[..]); @@ -394,12 +418,12 @@ where let checkpoint_bytes = self .timeline - .get_checkpoint(self.lsn) + .get_checkpoint(self.lsn, self.ctx) .await .context("failed to get checkpoint bytes")?; let pg_control_bytes = self .timeline - .get_control_file(self.lsn) + .get_control_file(self.lsn, self.ctx) .await .context("failed get control bytes")?; diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 5de6e4def5..f2cd93bd3a 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -13,6 +13,7 @@ use tracing::*; use metrics::set_build_info_metric; use pageserver::{ config::{defaults::*, PageServerConf}, + context::{DownloadBehavior, RequestContext}, http, page_cache, page_service, task_mgr, task_mgr::TaskKind, task_mgr::{ @@ -26,7 +27,7 @@ use utils::{ logging, postgres_backend::AuthType, project_git_version, - sentry_init::{init_sentry, release_name}, + sentry_init::init_sentry, signals::{self, Signal}, tcp_listener, }; @@ -85,7 +86,10 @@ fn main() -> anyhow::Result<()> { }; // initialize sentry if SENTRY_DSN is provided - let _sentry_guard = init_sentry(release_name!(), &[("node_id", &conf.id.to_string())]); + let _sentry_guard = init_sentry( + Some(GIT_VERSION.into()), + &[("node_id", &conf.id.to_string())], + ); let tenants_path = conf.tenants_path(); if !tenants_path.exists() { @@ -246,7 +250,7 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> { let signals = signals::install_shutdown_handlers()?; // Launch broker client - WALRECEIVER_RUNTIME.block_on(pageserver::walreceiver::init_broker_client(conf))?; 
+ WALRECEIVER_RUNTIME.block_on(pageserver::broker_client::init_broker_client(conf))?; // Initialize authentication for incoming connections let auth = match &conf.auth_type { @@ -325,6 +329,13 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> { ); if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint { + let metrics_ctx = RequestContext::todo_child( + TaskKind::MetricsCollection, + // This task itself shouldn't download anything. + // The actual size calculation does need downloads, and + // creates a child context with the right DownloadBehavior. + DownloadBehavior::Error, + ); task_mgr::spawn( MGMT_REQUEST_RUNTIME.handle(), TaskKind::MetricsCollection, @@ -338,6 +349,7 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> { conf.metric_collection_interval, conf.synthetic_size_calculation_interval, conf.id, + metrics_ctx, ) .instrument(info_span!("metrics_collection")) .await?; @@ -349,17 +361,34 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> { // Spawn a task to listen for libpq connections. It will spawn further tasks // for each connection. We created the listener earlier already. - task_mgr::spawn( - COMPUTE_REQUEST_RUNTIME.handle(), - TaskKind::LibpqEndpointListener, - None, - None, - "libpq endpoint listener", - true, - async move { - page_service::libpq_listener_main(conf, auth, pageserver_listener, conf.auth_type).await - }, - ); + { + let libpq_ctx = RequestContext::todo_child( + TaskKind::LibpqEndpointListener, + // listener task shouldn't need to download anything. (We will + // create a separate sub-context for each connection, with its + // own download behavior. This context is used only to listen and + // accept connections.) + DownloadBehavior::Error, + ); + task_mgr::spawn( + COMPUTE_REQUEST_RUNTIME.handle(), + TaskKind::LibpqEndpointListener, + None, + None, + "libpq endpoint listener", + true, + async move { + page_service::libpq_listener_main( + conf, + auth, + pageserver_listener, + conf.auth_type, + libpq_ctx, + ) + .await + }, + ); + } // All started up! Now just sit and wait for shutdown signal. signals.handle(|signal| match signal { diff --git a/pageserver/src/broker_client.rs b/pageserver/src/broker_client.rs new file mode 100644 index 0000000000..6c92967ca3 --- /dev/null +++ b/pageserver/src/broker_client.rs @@ -0,0 +1,48 @@ +//! The broker client instance of the pageserver, created during pageserver startup. +//! Used by each timeline's [`walreceiver`]. + +use crate::config::PageServerConf; + +use anyhow::Context; +use once_cell::sync::OnceCell; +use storage_broker::BrokerClientChannel; +use tracing::*; + +static BROKER_CLIENT: OnceCell = OnceCell::new(); + +/// +/// Initialize the broker client. This must be called once at page server startup. +/// +pub async fn init_broker_client(conf: &'static PageServerConf) -> anyhow::Result<()> { + let broker_endpoint = conf.broker_endpoint.clone(); + + // Note: we do not attempt connecting here (but validate endpoints sanity).
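The remainder of this new module (continued below) follows a plain init-once/read-many pattern around a `OnceCell`. A tiny self-contained sketch of that pattern, with a `String` standing in for the real `BrokerClientChannel`; only `once_cell::sync::OnceCell` is real here, the other names are illustrative.

```rust
// Sketch of the init-once / read-many pattern used by broker_client.
// A String stands in for BrokerClientChannel.
use once_cell::sync::OnceCell;

static CLIENT: OnceCell<String> = OnceCell::new();

fn init_client(endpoint: &str) {
    // Called exactly once at startup; a second call panics, mirroring
    // how init_broker_client treats an already-set BROKER_CLIENT.
    if CLIENT.set(format!("client for {endpoint}")).is_err() {
        panic!("client already initialized");
    }
}

fn get_client() -> &'static String {
    CLIENT.get().expect("client not initialized")
}

fn main() {
    init_client("http://storage-broker.local:50051");
    println!("using {}", get_client());
}
```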
+ let broker_client = + storage_broker::connect(broker_endpoint.clone(), conf.broker_keepalive_interval).context( + format!( + "Failed to create broker client to {}", + &conf.broker_endpoint + ), + )?; + + if BROKER_CLIENT.set(broker_client).is_err() { + panic!("broker already initialized"); + } + + info!( + "Initialized broker client with endpoints: {}", + broker_endpoint + ); + Ok(()) +} + +/// +/// Get a handle to the broker client +/// +pub fn get_broker_client() -> &'static BrokerClientChannel { + BROKER_CLIENT.get().expect("broker client not initialized") +} + +pub fn is_broker_client_initialized() -> bool { + BROKER_CLIENT.get().is_some() +} diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 51d1664e52..a3b051279d 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -158,6 +158,8 @@ pub struct PageServerConf { pub synthetic_size_calculation_interval: Duration, pub test_remote_failures: u64, + + pub ondemand_download_behavior_treat_error_as_warn: bool, } /// We do not want to store this in a PageServerConf because the latter may be logged @@ -222,6 +224,8 @@ struct PageServerConfigBuilder { synthetic_size_calculation_interval: BuilderValue, test_remote_failures: BuilderValue, + + ondemand_download_behavior_treat_error_as_warn: BuilderValue, } impl Default for PageServerConfigBuilder { @@ -267,6 +271,8 @@ impl Default for PageServerConfigBuilder { metric_collection_endpoint: Set(DEFAULT_METRIC_COLLECTION_ENDPOINT), test_remote_failures: Set(0), + + ondemand_download_behavior_treat_error_as_warn: Set(false), } } } @@ -363,6 +369,14 @@ impl PageServerConfigBuilder { self.test_remote_failures = BuilderValue::Set(fail_first); } + pub fn ondemand_download_behavior_treat_error_as_warn( + &mut self, + ondemand_download_behavior_treat_error_as_warn: bool, + ) { + self.ondemand_download_behavior_treat_error_as_warn = + BuilderValue::Set(ondemand_download_behavior_treat_error_as_warn); + } + pub fn build(self) -> anyhow::Result { Ok(PageServerConf { listen_pg_addr: self @@ -422,6 +436,11 @@ impl PageServerConfigBuilder { test_remote_failures: self .test_remote_failures .ok_or(anyhow!("missing test_remote_failuers"))?, + ondemand_download_behavior_treat_error_as_warn: self + .ondemand_download_behavior_treat_error_as_warn + .ok_or(anyhow!( + "missing ondemand_download_behavior_treat_error_as_warn" + ))?, }) } } @@ -600,6 +619,7 @@ impl PageServerConf { "synthetic_size_calculation_interval" => builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?), "test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?), + "ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?), _ => bail!("unrecognized pageserver option '{key}'"), } } @@ -724,6 +744,7 @@ impl PageServerConf { metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT, synthetic_size_calculation_interval: Duration::from_secs(60), test_remote_failures: 0, + ondemand_download_behavior_treat_error_as_warn: false, } } } @@ -749,6 +770,11 @@ fn parse_toml_u64(name: &str, item: &Item) -> Result { Ok(i as u64) } +fn parse_toml_bool(name: &str, item: &Item) -> Result { + item.as_bool() + .with_context(|| format!("configure option {name} is not a bool")) +} + fn parse_toml_duration(name: &str, item: &Item) -> Result { let s = item .as_str() @@ -907,6 +933,7 @@ log_format = 'json' defaults::DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL )?, test_remote_failures: 0, + 
ondemand_download_behavior_treat_error_as_warn: false, }, "Correct defaults should be used when no config values are provided" ); @@ -954,6 +981,7 @@ log_format = 'json' metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?), synthetic_size_calculation_interval: Duration::from_secs(333), test_remote_failures: 0, + ondemand_download_behavior_treat_error_as_warn: false, }, "Should be able to parse all basic config values correctly" ); diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index c07026261d..d848ec5ee5 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -3,6 +3,7 @@ //! and push them to a HTTP endpoint. //! Cache metrics to send only the updated ones. //! +use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}; use crate::tenant::mgr; use anyhow; @@ -47,12 +48,15 @@ pub async fn collect_metrics( metric_collection_interval: Duration, synthetic_size_calculation_interval: Duration, node_id: NodeId, + ctx: RequestContext, ) -> anyhow::Result<()> { let mut ticker = tokio::time::interval(metric_collection_interval); info!("starting collect_metrics"); // spin up background worker that caclulates tenant sizes + let worker_ctx = + ctx.detached_child(TaskKind::CalculateSyntheticSize, DownloadBehavior::Download); task_mgr::spawn( BACKGROUND_RUNTIME.handle(), TaskKind::CalculateSyntheticSize, @@ -61,7 +65,7 @@ pub async fn collect_metrics( "synthetic size calculation", false, async move { - calculate_synthetic_size_worker(synthetic_size_calculation_interval) + calculate_synthetic_size_worker(synthetic_size_calculation_interval, &worker_ctx) .instrument(info_span!("synthetic_size_worker")) .await?; Ok(()) @@ -79,7 +83,7 @@ pub async fn collect_metrics( return Ok(()); }, _ = ticker.tick() => { - if let Err(err) = collect_metrics_iteration(&client, &mut cached_metrics, metric_collection_endpoint, node_id).await + if let Err(err) = collect_metrics_iteration(&client, &mut cached_metrics, metric_collection_endpoint, node_id, &ctx).await { error!("metrics collection failed: {err:?}"); } @@ -102,6 +106,7 @@ pub async fn collect_metrics_iteration( cached_metrics: &mut HashMap, metric_collection_endpoint: &reqwest::Url, node_id: NodeId, + ctx: &RequestContext, ) -> anyhow::Result<()> { let mut current_metrics: Vec<(PageserverConsumptionMetricsKey, u64)> = Vec::new(); trace!( @@ -110,7 +115,7 @@ pub async fn collect_metrics_iteration( ); // get list of tenants - let tenants = mgr::list_tenants().await; + let tenants = mgr::list_tenants().await?; // iterate through list of Active tenants and collect metrics for (tenant_id, tenant_state) in tenants { @@ -137,7 +142,7 @@ pub async fn collect_metrics_iteration( timeline_written_size, )); - let (timeline_logical_size, is_exact) = timeline.get_current_logical_size()?; + let (timeline_logical_size, is_exact) = timeline.get_current_logical_size(ctx)?; // Only send timeline logical size when it is fully calculated. 
if is_exact { current_metrics.push(( @@ -258,6 +263,7 @@ pub async fn collect_metrics_iteration( /// Caclculate synthetic size for each active tenant pub async fn calculate_synthetic_size_worker( synthetic_size_calculation_interval: Duration, + ctx: &RequestContext, ) -> anyhow::Result<()> { info!("starting calculate_synthetic_size_worker"); @@ -270,7 +276,13 @@ pub async fn calculate_synthetic_size_worker( }, _ = ticker.tick() => { - let tenants = mgr::list_tenants().await; + let tenants = match mgr::list_tenants().await { + Ok(tenants) => tenants, + Err(e) => { + warn!("cannot get tenant list: {e:#}"); + continue; + } + }; // iterate through list of Active tenants and collect metrics for (tenant_id, tenant_state) in tenants { @@ -280,7 +292,7 @@ pub async fn calculate_synthetic_size_worker( if let Ok(tenant) = mgr::get_tenant(tenant_id, true).await { - if let Err(e) = tenant.calculate_synthetic_size().await { + if let Err(e) = tenant.calculate_synthetic_size(ctx).await { error!("failed to calculate synthetic size for tenant {}: {}", tenant_id, e); } } diff --git a/pageserver/src/context.rs b/pageserver/src/context.rs new file mode 100644 index 0000000000..e826d28e6d --- /dev/null +++ b/pageserver/src/context.rs @@ -0,0 +1,199 @@ +//! This module defines `RequestContext`, a structure that we use throughout +//! the pageserver to propagate high-level context from places +//! that _originate_ activity down to the shared code paths at the +//! heart of the pageserver. It's inspired by Golang's `context.Context`. +//! +//! For example, in `Timeline::get(page_nr, lsn)` we need to answer the following questions: +//! 1. What high-level activity ([`TaskKind`]) needs this page? +//! We need that information as a categorical dimension for page access +//! statistics, which we, in turn, need to guide layer eviction policy design. +//! 2. How should we behave if, to produce the page image, we need to +//! on-demand download a layer file ([`DownloadBehavior`]). +//! +//! [`RequestContext`] satisfies those needs. +//! The current implementation is a small `struct` that is passed through +//! the call chain by reference. +//! +//! ### Future Work +//! +//! However, we do not intend to stop here, since there are other needs that +//! require carrying information from high to low levels of the app. +//! +//! Most importantly, **cancellation signaling** in response to +//! 1. timeouts (page_service max response time) and +//! 2. lifecycle requests (detach tenant, delete timeline). +//! +//! Related to that, there is sometimes a need to ensure that all tokio tasks spawned +//! by the transitive callees of a request have finished. The keyword here +//! is **Structured Concurrency**, and right now, we use `task_mgr` in most places, +//! `TaskHandle` in some places, and careful code review around `FuturesUnordered` +//! or `JoinSet` in other places. +//! +//! We do not yet have a systematic cancellation story in pageserver, and it is +//! pretty clear that [`RequestContext`] will be responsible for that. +//! So, the API already prepares for this role through the +//! [`RequestContext::detached_child`] and [`RequestContext::attached_child`] methods. +//! See their doc comments for details on how we will use them in the future. +//! +//! It is not clear whether or how we will enforce Structured Concurrency, and +//! what role [`RequestContext`] will play there. +//! So, the API doesn't prepare us for this topic. +//! +//! Other future uses of `RequestContext`: +//! 
- Communicate compute & IO priorities (user-initiated request vs. background-loop) +//! - Request IDs for distributed tracing +//! - Request/Timeline/Tenant-scoped log levels +//! +//! RequestContext might look quite different once it supports those features. +//! Likely, it will have a shape similar to Golang's `context.Context`. +//! +//! ### Why A Struct Instead Of Method Parameters +//! +//! What's typical about such information is that it needs to be passed down +//! along the call chain from high level to low level, but few of the functions +//! in the middle need to understand it. +//! Further, it is to be expected that we will need to propagate more data +//! in the future (see the earlier section on future work). +//! Hence, for functions in the middle of the call chain, we have the following +//! requirements: +//! 1. It should be easy to forward the context to callees. +//! 2. To propagate more data from high-level to low-level code, the functions in +//! the middle should not need to be modified. +//! The solution is to have a container structure ([`RequestContext`]) that +//! carries the information. Functions that don't care about what's in it +//! pass it along to callees. +//! +//! ### Why Not Task-Local Variables +//! +//! One could use task-local variables (the equivalent of thread-local variables) +//! to address the immediate needs outlined above. +//! However, we reject task-local variables because: +//! 1. they are implicit, thereby making it harder to trace the data flow in code +//! reviews and during debugging, +//! 2. they can be mutable, which enables implicit return data flow, +//! 3. they are restrictive in that code which fans out into multiple tasks, +//! or even threads, needs to carefully propagate the state. +//! +//! In contrast, information flow with [`RequestContext`] is +//! 1. always explicit, +//! 2. strictly uni-directional because RequestContext is immutable, +//! 3. tangible because a [`RequestContext`] is just a value. +//! When creating child activities, regardless of whether it's a task, +//! thread, or even an RPC to another service, the value can +//! be used like any other argument. +//! +//! The solution is that all code paths are infected with precisely one +//! [`RequestContext`] argument. Functions in the middle of the call chain +//! only need to pass it on. +use crate::task_mgr::TaskKind; + +// The main structure of this module, see module-level comment. +pub struct RequestContext { + task_kind: TaskKind, + download_behavior: DownloadBehavior, +} + +/// Desired behavior if the operation requires an on-demand download +/// to proceed. +#[derive(Clone, Copy, PartialEq, Eq)] +pub enum DownloadBehavior { + /// Download the layer file. It can take a while. + Download, + + /// Download the layer file, but print a warning to the log. This should be used + /// in code where the layer file is expected to already exist locally. + Warn, + + /// Return a PageReconstructError::NeedsDownload error + Error, +} + +impl RequestContext { + /// Create a new RequestContext that has no parent. + /// + /// The function is called `new` because, once we add children + /// to it using `detached_child` or `attached_child`, the context + /// form a tree (not implemented yet since cancellation will be + /// the first feature that requires a tree). + /// + /// # Future: Cancellation + /// + /// The only reason why a context like this one can be canceled is + /// because someone explicitly canceled it. 
+ /// It has no parent, so it cannot inherit cancellation from there. + pub fn new(task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self { + RequestContext { + task_kind, + download_behavior, + } + } + + /// Create a detached child context for a task that may outlive `self`. + /// + /// Use this when spawning new background activity that should complete + /// even if the current request is canceled. + /// + /// # Future: Cancellation + /// + /// Cancellation of `self` will not propagate to the child context returned + /// by this method. + /// + /// # Future: Structured Concurrency + /// + /// We could add the Future as a parameter to this function, spawn it as a task, + /// and pass to the new task the child context as an argument. + /// That would be an ergonomic improvement. + /// + /// We could make new calls to this function fail if `self` is already canceled. + pub fn detached_child(&self, task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self { + self.child_impl(task_kind, download_behavior) + } + + /// Create a child of context `self` for a task that shall not outlive `self`. + /// + /// Use this when fanning-out work to other async tasks. + /// + /// # Future: Cancellation + /// + /// Cancelling a context will propagate to its attached children. + /// + /// # Future: Structured Concurrency + /// + /// We could add the Future as a parameter to this function, spawn it as a task, + /// and track its `JoinHandle` inside the `RequestContext`. + /// + /// We could then provide another method to allow waiting for all child tasks + /// to finish. + /// + /// We could make new calls to this function fail if `self` is already canceled. + /// Alternatively, we could allow the creation but not spawn the task. + /// The method to wait for child tasks would return an error, indicating + /// that the child task was not started because the context was canceled. + pub fn attached_child(&self) -> Self { + self.child_impl(self.task_kind(), self.download_behavior()) + } + + /// Use this function when you should be creating a child context using + /// [`attached_child`] or [`detached_child`], but your caller doesn't provide + /// a context and you are unwilling to change all callers to provide one. + /// + /// Before we add cancellation, we should get rid of this method. + pub fn todo_child(task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self { + Self::new(task_kind, download_behavior) + } + + fn child_impl(&self, task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self { + RequestContext { + task_kind, + download_behavior, + } + } + + pub fn task_kind(&self) -> TaskKind { + self.task_kind + } + + pub fn download_behavior(&self) -> DownloadBehavior { + self.download_behavior + } +} diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index f9b8a81dad..23faff7ace 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -430,6 +430,13 @@ paths: schema: type: string format: hex + - name: inputs_only + in: query + required: false + schema: + type: boolean + description: | + When true, skip calculation and only provide the model inputs (for debugging). Defaults to false. get: description: | Calculate tenant's size, which is a mixture of WAL (bytes) and logical_size (bytes). @@ -449,8 +456,9 @@ paths: format: hex size: type: integer + nullable: true description: | - Size metric in bytes. + Size metric in bytes or null if inputs_only=true was given. 
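To make the intended call pattern of the `RequestContext` API above concrete, here is a hedged sketch (not taken from the patch); the `TaskKind` variants are ones used elsewhere in this diff.

```rust
// Sketch of RequestContext usage as described in the module docs above.
use pageserver::context::{DownloadBehavior, RequestContext};
use pageserver::task_mgr::TaskKind;

fn example() {
    // A management request creates its root context.
    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);

    // Work fanned out on behalf of the same request inherits its settings.
    let per_query = ctx.attached_child();
    assert!(per_query.download_behavior() == DownloadBehavior::Warn);

    // Background work that may outlive the request gets its own task kind
    // and is allowed to download layer files.
    let _background =
        ctx.detached_child(TaskKind::CalculateSyntheticSize, DownloadBehavior::Download);
}
```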
"401": description: Unauthorized Error content: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 1eb24c1507..a7802f3cbe 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -12,8 +12,11 @@ use super::models::{ StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo, TimelineCreateRequest, TimelineInfo, }; +use crate::context::{DownloadBehavior, RequestContext}; use crate::pgdatadir_mapping::LsnForTimestamp; +use crate::task_mgr::TaskKind; use crate::tenant::config::TenantConfOpt; +use crate::tenant::mgr::TenantMapInsertError; use crate::tenant::{PageReconstructError, Timeline}; use crate::{config::PageServerConf, tenant::mgr}; use utils::{ @@ -81,18 +84,39 @@ fn check_permission(request: &Request, tenant_id: Option) -> Res fn apierror_from_prerror(err: PageReconstructError) -> ApiError { match err { PageReconstructError::Other(err) => ApiError::InternalServerError(err), + PageReconstructError::NeedsDownload(_, _) => { + // This shouldn't happen, because we use a RequestContext that requests to + // download any missing layer files on-demand. + ApiError::InternalServerError(anyhow::anyhow!("need to download remote layer file")) + } + PageReconstructError::Cancelled => { + ApiError::InternalServerError(anyhow::anyhow!("request was cancelled")) + } PageReconstructError::WalRedo(err) => { ApiError::InternalServerError(anyhow::Error::new(err)) } } } +fn apierror_from_tenant_map_insert_error(e: TenantMapInsertError) -> ApiError { + match e { + TenantMapInsertError::StillInitializing | TenantMapInsertError::ShuttingDown => { + ApiError::InternalServerError(anyhow::Error::new(e)) + } + TenantMapInsertError::TenantAlreadyExists(id, state) => { + ApiError::Conflict(format!("tenant {id} already exists, state: {state:?}")) + } + TenantMapInsertError::Closure(e) => ApiError::InternalServerError(e), + } +} + // Helper function to construct a TimelineInfo struct for a timeline async fn build_timeline_info( timeline: &Arc, include_non_incremental_logical_size: bool, + ctx: &RequestContext, ) -> anyhow::Result { - let mut info = build_timeline_info_common(timeline)?; + let mut info = build_timeline_info_common(timeline, ctx)?; if include_non_incremental_logical_size { // XXX we should be using spawn_ondemand_logical_size_calculation here. // Otherwise, if someone deletes the timeline / detaches the tenant while @@ -102,6 +126,7 @@ async fn build_timeline_info( .get_current_logical_size_non_incremental( info.last_record_lsn, CancellationToken::new(), + ctx, ) .await?, ); @@ -109,7 +134,10 @@ async fn build_timeline_info( Ok(info) } -fn build_timeline_info_common(timeline: &Arc) -> anyhow::Result { +fn build_timeline_info_common( + timeline: &Arc, + ctx: &RequestContext, +) -> anyhow::Result { let last_record_lsn = timeline.get_last_record_lsn(); let (wal_source_connstr, last_received_msg_lsn, last_received_msg_ts) = { let guard = timeline.last_received_wal.lock().unwrap(); @@ -129,7 +157,7 @@ fn build_timeline_info_common(timeline: &Arc) -> anyhow::Result None, lsn @ Lsn(_) => Some(lsn), }; - let current_logical_size = match timeline.get_current_logical_size() { + let current_logical_size = match timeline.get_current_logical_size(ctx) { Ok((size, _)) => Some(size), Err(err) => { error!("Timeline info creation failed to get current logical size: {err:?}"); @@ -180,6 +208,8 @@ async fn timeline_create_handler(mut request: Request) -> Result) -> Result { // Created. Construct a TimelineInfo for it. 
- let timeline_info = build_timeline_info_common(&new_timeline) + let timeline_info = build_timeline_info_common(&new_timeline, &ctx) .map_err(ApiError::InternalServerError)?; json_response(StatusCode::CREATED, timeline_info) } @@ -208,6 +239,8 @@ async fn timeline_list_handler(request: Request) -> Result, query_param_present(&request, "include-non-incremental-logical-size"); check_permission(&request, Some(tenant_id))?; + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); + let response_data = async { let tenant = mgr::get_tenant(tenant_id, true) .await @@ -217,7 +250,7 @@ async fn timeline_list_handler(request: Request) -> Result, let mut response_data = Vec::with_capacity(timelines.len()); for timeline in timelines { let timeline_info = - build_timeline_info(&timeline, include_non_incremental_logical_size) + build_timeline_info(&timeline, include_non_incremental_logical_size, &ctx) .await .context( "Failed to convert tenant timeline {timeline_id} into the local one: {e:?}", @@ -239,11 +272,7 @@ fn query_param_present(request: &Request, param: &str) -> bool { request .uri() .query() - .map(|v| { - url::form_urlencoded::parse(v.as_bytes()) - .into_owned() - .any(|(p, _)| p == param) - }) + .map(|v| url::form_urlencoded::parse(v.as_bytes()).any(|(p, _)| p == param)) .unwrap_or(false) } @@ -252,13 +281,12 @@ fn get_query_param(request: &Request, param_name: &str) -> Result) -> Result) -> Result(timeline_info) } @@ -304,12 +336,13 @@ async fn get_lsn_by_timestamp_handler(request: Request) -> Result) -> Result, let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); + info!("Handling tenant attach {tenant_id}"); let state = get_state(&request); if let Some(remote_storage) = &state.remote_storage { - // FIXME: distinguish between "Tenant already exists" and other errors - mgr::attach_tenant(state.conf, tenant_id, remote_storage.clone()) + mgr::attach_tenant(state.conf, tenant_id, remote_storage.clone(), &ctx) .instrument(info_span!("tenant_attach", tenant = %tenant_id)) .await - .map_err(ApiError::InternalServerError)?; + .map_err(apierror_from_tenant_map_insert_error)?; } else { return Err(ApiError::BadRequest(anyhow!( "attach_tenant is not possible because pageserver was configured without remote storage" @@ -351,7 +385,9 @@ async fn timeline_delete_handler(request: Request) -> Result) -> Result, A let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); + let state = get_state(&request); - mgr::load_tenant(state.conf, tenant_id, state.remote_storage.clone()) + mgr::load_tenant(state.conf, tenant_id, state.remote_storage.clone(), &ctx) .instrument(info_span!("load", tenant = %tenant_id)) .await - .map_err(ApiError::InternalServerError)?; + .map_err(apierror_from_tenant_map_insert_error)?; json_response(StatusCode::ACCEPTED, ()) } @@ -413,6 +451,8 @@ async fn tenant_list_handler(request: Request) -> Result, A let response_data = mgr::list_tenants() .instrument(info_span!("tenant_list")) .await + .map_err(anyhow::Error::new) + .map_err(ApiError::InternalServerError)? 
.iter() .map(|(id, state)| TenantInfo { id: *id, @@ -453,21 +493,40 @@ async fn tenant_status(request: Request) -> Result, ApiErro json_response(StatusCode::OK, tenant_info) } +/// HTTP endpoint to query the current tenant_size of a tenant. +/// +/// This is not used by consumption metrics under [`crate::consumption_metrics`], but can be used +/// to debug any of the calculations. Requires `tenant_id` request parameter, supports +/// `inputs_only=true|false` (default false) which supports debugging failure to calculate model +/// values. async fn tenant_size_handler(request: Request) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; + let inputs_only = if query_param_present(&request, "inputs_only") { + get_query_param(&request, "inputs_only")? + .parse() + .map_err(|_| ApiError::BadRequest(anyhow!("failed to parse inputs_only")))? + } else { + false + }; + + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let tenant = mgr::get_tenant(tenant_id, true) .await .map_err(ApiError::InternalServerError)?; - // this can be long operation, it currently is not backed by any request coalescing or similar + // this can be long operation let inputs = tenant - .gather_size_inputs() + .gather_size_inputs(&ctx) .await .map_err(ApiError::InternalServerError)?; - let size = inputs.calculate().map_err(ApiError::InternalServerError)?; + let size = if !inputs_only { + Some(inputs.calculate().map_err(ApiError::InternalServerError)?) + } else { + None + }; /// Private response type with the additional "unstable" `inputs` field. /// @@ -479,7 +538,9 @@ async fn tenant_size_handler(request: Request) -> Result, A #[serde_as(as = "serde_with::DisplayFromStr")] id: TenantId, /// Size is a mixture of WAL and logical size, so the unit is bytes. - size: u64, + /// + /// Will be none if `?inputs_only=true` was given. + size: Option, inputs: crate::tenant::size::ModelInputs, } @@ -506,6 +567,8 @@ fn bad_duration<'a>(field_name: &'static str, value: &'a str) -> impl 'a + Fn() async fn tenant_create_handler(mut request: Request) -> Result, ApiError> { check_permission(&request, None)?; + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); + let request_data: TenantCreateRequest = json_request(&mut request).await?; let mut tenant_conf = TenantConfOpt::default(); @@ -583,34 +646,28 @@ async fn tenant_create_handler(mut request: Request) -> Result { - // We created the tenant. Existing API semantics are that the tenant - // is Active when this function returns. - if let res @ Err(_) = tenant.wait_to_become_active().await { - // This shouldn't happen because we just created the tenant directory - // in tenant::mgr::create_tenant, and there aren't any remote timelines - // to load, so, nothing can really fail during load. - // Don't do cleanup because we don't know how we got here. - // The tenant will likely be in `Broken` state and subsequent - // calls will fail. - res.context("created tenant failed to become active") - .map_err(ApiError::InternalServerError)?; - } - json_response( - StatusCode::CREATED, - TenantCreateResponse(tenant.tenant_id()), - )? - } - None => json_response(StatusCode::CONFLICT, ())?, - }) + // We created the tenant. Existing API semantics are that the tenant + // is Active when this function returns. 
+ if let res @ Err(_) = new_tenant.wait_to_become_active().await { + // This shouldn't happen because we just created the tenant directory + // in tenant::mgr::create_tenant, and there aren't any remote timelines + // to load, so, nothing can really fail during load. + // Don't do cleanup because we don't know how we got here. + // The tenant will likely be in `Broken` state and subsequent + // calls will fail. + res.context("created tenant failed to become active") + .map_err(ApiError::InternalServerError)?; + } + json_response( + StatusCode::CREATED, + TenantCreateResponse(new_tenant.tenant_id()), + ) } async fn tenant_config_handler(mut request: Request) -> Result, ApiError> { @@ -732,7 +789,8 @@ async fn timeline_gc_handler(mut request: Request) -> Result) -> Result) -> Result) -> Result Result<()> { let mut pg_control: Option = None; @@ -69,7 +71,7 @@ pub async fn import_timeline_from_postgres_datadir( let mut file = tokio::fs::File::open(absolute_path).await?; let len = metadata.len() as usize; if let Some(control_file) = - import_file(&mut modification, relative_path, &mut file, len).await? + import_file(&mut modification, relative_path, &mut file, len, ctx).await? { pg_control = Some(control_file); } @@ -99,6 +101,7 @@ pub async fn import_timeline_from_postgres_datadir( tline, Lsn(pg_control.checkPointCopy.redo), pgdata_lsn, + ctx, ) .await?; @@ -113,6 +116,7 @@ async fn import_rel( dboid: Oid, reader: &mut (impl AsyncRead + Send + Sync + Unpin), len: usize, + ctx: &RequestContext, ) -> anyhow::Result<()> { // Does it look like a relation file? trace!("importing rel file {}", path.display()); @@ -147,7 +151,10 @@ async fn import_rel( // FIXME: use proper error type for this, instead of parsing the error message. // Or better yet, keep track of which relations we've already created // https://github.com/neondatabase/neon/issues/3309 - if let Err(e) = modification.put_rel_creation(rel, nblocks as u32).await { + if let Err(e) = modification + .put_rel_creation(rel, nblocks as u32, ctx) + .await + { if e.to_string().contains("already exists") { debug!("relation {} already exists. we must be extending it", rel); } else { @@ -182,7 +189,7 @@ async fn import_rel( // // If we process rel segments out of order, // put_rel_extend will skip the update. 
- modification.put_rel_extend(rel, blknum).await?; + modification.put_rel_extend(rel, blknum, ctx).await?; Ok(()) } @@ -195,6 +202,7 @@ async fn import_slru( path: &Path, reader: &mut (impl AsyncRead + Send + Sync + Unpin), len: usize, + ctx: &RequestContext, ) -> anyhow::Result<()> { info!("importing slru file {path:?}"); @@ -211,7 +219,7 @@ async fn import_slru( ensure!(nblocks <= pg_constants::SLRU_PAGES_PER_SEGMENT as usize); modification - .put_slru_segment_creation(slru, segno, nblocks as u32) + .put_slru_segment_creation(slru, segno, nblocks as u32, ctx) .await?; let mut rpageno = 0; @@ -252,15 +260,15 @@ async fn import_wal( tline: &Timeline, startpoint: Lsn, endpoint: Lsn, + ctx: &RequestContext, ) -> anyhow::Result<()> { - use std::io::Read; let mut waldecoder = WalStreamDecoder::new(startpoint, tline.pg_version); let mut segno = startpoint.segment_number(WAL_SEGMENT_SIZE); let mut offset = startpoint.segment_offset(WAL_SEGMENT_SIZE); let mut last_lsn = startpoint; - let mut walingest = WalIngest::new(tline, startpoint).await?; + let mut walingest = WalIngest::new(tline, startpoint, ctx).await?; while last_lsn <= endpoint { // FIXME: assume postgresql tli 1 for now @@ -283,6 +291,7 @@ async fn import_wal( file.seek(std::io::SeekFrom::Start(offset as u64))?; } + use std::io::Read; let nread = file.read_to_end(&mut buf)?; if nread != WAL_SEGMENT_SIZE - offset { // Maybe allow this for .partial files? @@ -297,7 +306,7 @@ async fn import_wal( while last_lsn <= endpoint { if let Some((lsn, recdata)) = waldecoder.poll_decode()? { walingest - .ingest_record(recdata, lsn, &mut modification, &mut decoded) + .ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx) .await?; last_lsn = lsn; @@ -326,6 +335,7 @@ pub async fn import_basebackup_from_tar( tline: &Timeline, reader: &mut (impl AsyncRead + Send + Sync + Unpin), base_lsn: Lsn, + ctx: &RequestContext, ) -> Result<()> { info!("importing base at {base_lsn}"); let mut modification = tline.begin_modification(base_lsn); @@ -344,7 +354,7 @@ pub async fn import_basebackup_from_tar( match header.entry_type() { tokio_tar::EntryType::Regular => { if let Some(res) = - import_file(&mut modification, file_path.as_ref(), &mut entry, len).await? + import_file(&mut modification, file_path.as_ref(), &mut entry, len, ctx).await? { // We found the pg_control file. pg_control = Some(res); @@ -376,13 +386,14 @@ pub async fn import_wal_from_tar( reader: &mut (impl AsyncRead + Send + Sync + Unpin), start_lsn: Lsn, end_lsn: Lsn, + ctx: &RequestContext, ) -> Result<()> { // Set up walingest mutable state let mut waldecoder = WalStreamDecoder::new(start_lsn, tline.pg_version); let mut segno = start_lsn.segment_number(WAL_SEGMENT_SIZE); let mut offset = start_lsn.segment_offset(WAL_SEGMENT_SIZE); let mut last_lsn = start_lsn; - let mut walingest = WalIngest::new(tline, start_lsn).await?; + let mut walingest = WalIngest::new(tline, start_lsn, ctx).await?; // Ingest wal until end_lsn info!("importing wal until {}", end_lsn); @@ -431,7 +442,7 @@ pub async fn import_wal_from_tar( while last_lsn <= end_lsn { if let Some((lsn, recdata)) = waldecoder.poll_decode()? 
{ walingest - .ingest_record(recdata, lsn, &mut modification, &mut decoded) + .ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx) .await?; last_lsn = lsn; @@ -466,6 +477,7 @@ async fn import_file( file_path: &Path, reader: &mut (impl AsyncRead + Send + Sync + Unpin), len: usize, + ctx: &RequestContext, ) -> Result> { let file_name = match file_path.file_name() { Some(name) => name.to_string_lossy(), @@ -498,14 +510,16 @@ async fn import_file( } "pg_filenode.map" => { let bytes = read_all_bytes(reader).await?; - modification.put_relmap_file(spcnode, dbnode, bytes).await?; + modification + .put_relmap_file(spcnode, dbnode, bytes, ctx) + .await?; debug!("imported relmap file") } "PG_VERSION" => { debug!("ignored PG_VERSION file"); } _ => { - import_rel(modification, file_path, spcnode, dbnode, reader, len).await?; + import_rel(modification, file_path, spcnode, dbnode, reader, len, ctx).await?; debug!("imported rel creation"); } } @@ -521,38 +535,40 @@ async fn import_file( match file_name.as_ref() { "pg_filenode.map" => { let bytes = read_all_bytes(reader).await?; - modification.put_relmap_file(spcnode, dbnode, bytes).await?; + modification + .put_relmap_file(spcnode, dbnode, bytes, ctx) + .await?; debug!("imported relmap file") } "PG_VERSION" => { debug!("ignored PG_VERSION file"); } _ => { - import_rel(modification, file_path, spcnode, dbnode, reader, len).await?; + import_rel(modification, file_path, spcnode, dbnode, reader, len, ctx).await?; debug!("imported rel creation"); } } } else if file_path.starts_with("pg_xact") { let slru = SlruKind::Clog; - import_slru(modification, slru, file_path, reader, len).await?; + import_slru(modification, slru, file_path, reader, len, ctx).await?; debug!("imported clog slru"); } else if file_path.starts_with("pg_multixact/offsets") { let slru = SlruKind::MultiXactOffsets; - import_slru(modification, slru, file_path, reader, len).await?; + import_slru(modification, slru, file_path, reader, len, ctx).await?; debug!("imported multixact offsets slru"); } else if file_path.starts_with("pg_multixact/members") { let slru = SlruKind::MultiXactMembers; - import_slru(modification, slru, file_path, reader, len).await?; + import_slru(modification, slru, file_path, reader, len, ctx).await?; debug!("imported multixact members slru"); } else if file_path.starts_with("pg_twophase") { let xid = u32::from_str_radix(file_name.as_ref(), 16)?; let bytes = read_all_bytes(reader).await?; modification - .put_twophase_file(xid, Bytes::copy_from_slice(&bytes[..])) + .put_twophase_file(xid, Bytes::copy_from_slice(&bytes[..]), ctx) .await?; debug!("imported twophase file"); } else if file_path.starts_with("pg_wal") { diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 91cde477ad..09e21ae755 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -1,7 +1,9 @@ mod auth; pub mod basebackup; +pub mod broker_client; pub mod config; pub mod consumption_metrics; +pub mod context; pub mod http; pub mod import_datadir; pub mod keyspace; @@ -15,7 +17,6 @@ pub mod tenant; pub mod trace; pub mod virtual_file; pub mod walingest; -pub mod walreceiver; pub mod walrecord; pub mod walredo; diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index b61e64048b..6bd0eddbb5 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1,10 +1,12 @@ use metrics::core::{AtomicU64, GenericCounter}; use metrics::{ - register_histogram, register_histogram_vec, register_int_counter, register_int_counter_vec, - register_int_gauge, 
register_int_gauge_vec, register_uint_gauge_vec, Histogram, HistogramVec, - IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec, + register_counter_vec, register_histogram, register_histogram_vec, register_int_counter, + register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge_vec, + Counter, CounterVec, Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, + UIntGauge, UIntGaugeVec, }; use once_cell::sync::Lazy; +use pageserver_api::models::state; use utils::id::{TenantId, TimelineId}; /// Prometheus histogram buckets (in seconds) that capture the majority of @@ -35,11 +37,29 @@ const STORAGE_TIME_OPERATIONS: &[&str] = &[ "gc", ]; -pub static STORAGE_TIME: Lazy = Lazy::new(|| { - register_histogram_vec!( - "pageserver_storage_operations_seconds", - "Time spent on storage operations", +pub static STORAGE_TIME_SUM_PER_TIMELINE: Lazy = Lazy::new(|| { + register_counter_vec!( + "pageserver_storage_operations_seconds_sum", + "Total time spent on storage operations with operation, tenant and timeline dimensions", &["operation", "tenant_id", "timeline_id"], + ) + .expect("failed to define a metric") +}); + +pub static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_storage_operations_seconds_count", + "Count of storage operations with operation, tenant and timeline dimensions", + &["operation", "tenant_id", "timeline_id"], + ) + .expect("failed to define a metric") +}); + +pub static STORAGE_TIME_GLOBAL: Lazy = Lazy::new(|| { + register_histogram_vec!( + "pageserver_storage_operations_seconds_global", + "Time spent on storage operations", + &["operation"], get_buckets_for_critical_operations(), ) .expect("failed to define a metric") @@ -112,6 +132,24 @@ static CURRENT_LOGICAL_SIZE: Lazy = Lazy::new(|| { .expect("failed to define current logical size metric") }); +// Metrics collected on tenant states. +const TENANT_STATE_OPTIONS: &[&str] = &[ + state::LOADING, + state::ATTACHING, + state::ACTIVE, + state::STOPPING, + state::BROKEN, +]; + +pub static TENANT_STATE_METRIC: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_tenant_states_count", + "Count of tenants per state", + &["tenant_id", "state"] + ) + .expect("Failed to register pageserver_tenant_states_count metric") +}); + // Metrics for cloud upload. These metrics reflect data uploaded to cloud storage, // or in testing they estimate how much we would upload if we did. static NUM_PERSISTENT_FILES_CREATED: Lazy = Lazy::new(|| { @@ -375,18 +413,81 @@ pub static WAL_REDO_RECORD_COUNTER: Lazy = Lazy::new(|| { .unwrap() }); +/// Similar to [`prometheus::HistogramTimer`] but does not record on drop. +pub struct StorageTimeMetricsTimer { + metrics: StorageTimeMetrics, + start: Instant, +} + +impl StorageTimeMetricsTimer { + fn new(metrics: StorageTimeMetrics) -> Self { + Self { + metrics, + start: Instant::now(), + } + } + + /// Record the time from creation to now. + pub fn stop_and_record(self) { + let duration = self.start.elapsed().as_secs_f64(); + self.metrics.timeline_sum.inc_by(duration); + self.metrics.timeline_count.inc(); + self.metrics.global_histogram.observe(duration); + } +} + +/// Timing facilities for a globally histogrammed metric, which is supported by per tenant and +/// timeline total sum and count.
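Before the struct definition that follows, a short crate-internal sketch of how this timer is meant to be used; unlike `prometheus::HistogramTimer`, nothing is recorded unless `stop_and_record()` is called. The function name and error type here are illustrative, not from the patch.

```rust
// Sketch: time one storage operation and record it only on the success path.
// Returning early on an error records nothing, by design.
use crate::metrics::StorageTimeMetrics;

fn compact_with_timing(metrics: &StorageTimeMetrics) -> anyhow::Result<()> {
    let timer = metrics.start_timer();

    // ... do the compaction work here; a `?` on an error would skip recording ...

    // Adds the elapsed seconds to the per-timeline sum/count counters and to
    // the global histogram defined above.
    timer.stop_and_record();
    Ok(())
}
```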
+#[derive(Clone, Debug)] +pub struct StorageTimeMetrics { + /// Sum of f64 seconds, per operation, tenant_id and timeline_id + timeline_sum: Counter, + /// Number of operations, per operation, tenant_id and timeline_id + timeline_count: IntCounter, + /// Global histogram having only the "operation" label. + global_histogram: Histogram, +} + +impl StorageTimeMetrics { + pub fn new(operation: &str, tenant_id: &str, timeline_id: &str) -> Self { + let timeline_sum = STORAGE_TIME_SUM_PER_TIMELINE + .get_metric_with_label_values(&[operation, tenant_id, timeline_id]) + .unwrap(); + let timeline_count = STORAGE_TIME_COUNT_PER_TIMELINE + .get_metric_with_label_values(&[operation, tenant_id, timeline_id]) + .unwrap(); + let global_histogram = STORAGE_TIME_GLOBAL + .get_metric_with_label_values(&[operation]) + .unwrap(); + + StorageTimeMetrics { + timeline_sum, + timeline_count, + global_histogram, + } + } + + /// Starts timing a new operation. + /// + /// Note: unlike [`prometheus::HistogramTimer`] the returned timer does not record on drop. + pub fn start_timer(&self) -> StorageTimeMetricsTimer { + StorageTimeMetricsTimer::new(self.clone()) + } +} + #[derive(Debug)] pub struct TimelineMetrics { tenant_id: String, timeline_id: String, pub reconstruct_time_histo: Histogram, pub materialized_page_cache_hit_counter: GenericCounter, - pub flush_time_histo: Histogram, - pub compact_time_histo: Histogram, - pub create_images_time_histo: Histogram, - pub init_logical_size_histo: Histogram, - pub logical_size_histo: Histogram, - pub load_layer_map_histo: Histogram, + pub flush_time_histo: StorageTimeMetrics, + pub compact_time_histo: StorageTimeMetrics, + pub create_images_time_histo: StorageTimeMetrics, + pub init_logical_size_histo: StorageTimeMetrics, + pub logical_size_histo: StorageTimeMetrics, + pub load_layer_map_histo: StorageTimeMetrics, + pub garbage_collect_histo: StorageTimeMetrics, pub last_record_gauge: IntGauge, pub wait_lsn_time_histo: Histogram, pub resident_physical_size_gauge: UIntGauge, @@ -406,24 +507,16 @@ impl TimelineMetrics { let materialized_page_cache_hit_counter = MATERIALIZED_PAGE_CACHE_HIT .get_metric_with_label_values(&[&tenant_id, &timeline_id]) .unwrap(); - let flush_time_histo = STORAGE_TIME - .get_metric_with_label_values(&["layer flush", &tenant_id, &timeline_id]) - .unwrap(); - let compact_time_histo = STORAGE_TIME - .get_metric_with_label_values(&["compact", &tenant_id, &timeline_id]) - .unwrap(); - let create_images_time_histo = STORAGE_TIME - .get_metric_with_label_values(&["create images", &tenant_id, &timeline_id]) - .unwrap(); - let init_logical_size_histo = STORAGE_TIME - .get_metric_with_label_values(&["init logical size", &tenant_id, &timeline_id]) - .unwrap(); - let logical_size_histo = STORAGE_TIME - .get_metric_with_label_values(&["logical size", &tenant_id, &timeline_id]) - .unwrap(); - let load_layer_map_histo = STORAGE_TIME - .get_metric_with_label_values(&["load layer map", &tenant_id, &timeline_id]) - .unwrap(); + let flush_time_histo = StorageTimeMetrics::new("layer flush", &tenant_id, &timeline_id); + let compact_time_histo = StorageTimeMetrics::new("compact", &tenant_id, &timeline_id); + let create_images_time_histo = + StorageTimeMetrics::new("create images", &tenant_id, &timeline_id); + let init_logical_size_histo = + StorageTimeMetrics::new("init logical size", &tenant_id, &timeline_id); + let logical_size_histo = StorageTimeMetrics::new("logical size", &tenant_id, &timeline_id); + let load_layer_map_histo = + StorageTimeMetrics::new("load layer 
map", &tenant_id, &timeline_id); + let garbage_collect_histo = StorageTimeMetrics::new("gc", &tenant_id, &timeline_id); let last_record_gauge = LAST_RECORD_LSN .get_metric_with_label_values(&[&tenant_id, &timeline_id]) .unwrap(); @@ -453,6 +546,7 @@ impl TimelineMetrics { create_images_time_histo, init_logical_size_histo, logical_size_histo, + garbage_collect_histo, load_layer_map_histo, last_record_gauge, wait_lsn_time_histo, @@ -478,7 +572,10 @@ impl Drop for TimelineMetrics { let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]); for op in STORAGE_TIME_OPERATIONS { - let _ = STORAGE_TIME.remove_label_values(&[op, tenant_id, timeline_id]); + let _ = + STORAGE_TIME_SUM_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]); + let _ = + STORAGE_TIME_COUNT_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]); } for op in STORAGE_IO_TIME_OPERATIONS { let _ = STORAGE_IO_TIME.remove_label_values(&[op, tenant_id, timeline_id]); @@ -495,7 +592,10 @@ impl Drop for TimelineMetrics { } pub fn remove_tenant_metrics(tenant_id: &TenantId) { - let _ = STORAGE_TIME.remove_label_values(&["gc", &tenant_id.to_string(), "-"]); + let tid = tenant_id.to_string(); + for state in TENANT_STATE_OPTIONS { + let _ = TENANT_STATE_METRIC.remove_label_values(&[&tid, state]); + } } use futures::Future; diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 344a8d1c00..878928ae06 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -13,6 +13,7 @@ use anyhow::Context; use bytes::Buf; use bytes::Bytes; use futures::{Stream, StreamExt}; +use pageserver_api::models::TenantState; use pageserver_api::models::{ PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse, PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse, @@ -30,19 +31,19 @@ use std::sync::Arc; use std::time::Duration; use tracing::*; use utils::id::ConnectionId; -use utils::postgres_backend_async::QueryError; use utils::{ auth::{Claims, JwtAuth, Scope}, id::{TenantId, TimelineId}, lsn::Lsn, postgres_backend::AuthType, - postgres_backend_async::{self, PostgresBackend}, + postgres_backend_async::{self, is_expected_io_error, PostgresBackend, QueryError}, simple_rcu::RcuReadGuard, }; use crate::auth::check_permission; use crate::basebackup; use crate::config::PageServerConf; +use crate::context::{DownloadBehavior, RequestContext}; use crate::import_datadir::import_wal_from_tar; use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME}; use crate::task_mgr; @@ -123,6 +124,7 @@ pub async fn libpq_listener_main( auth: Option>, listener: TcpListener, auth_type: AuthType, + listener_ctx: RequestContext, ) -> anyhow::Result<()> { listener.set_nonblocking(true)?; let tokio_listener = tokio::net::TcpListener::from_std(listener)?; @@ -146,6 +148,9 @@ pub async fn libpq_listener_main( debug!("accepted connection from {}", peer_addr); let local_auth = auth.clone(); + let connection_ctx = listener_ctx + .detached_child(TaskKind::PageRequestHandler, DownloadBehavior::Download); + // PageRequestHandler tasks are not associated with any particular // timeline in the task manager. 
In practice most connections will // only deal with a particular timeline, but we don't know which one @@ -157,7 +162,7 @@ pub async fn libpq_listener_main( None, "serving compute connection task", false, - page_service_conn_main(conf, local_auth, socket, auth_type), + page_service_conn_main(conf, local_auth, socket, auth_type, connection_ctx), ); } Err(err) => { @@ -177,6 +182,7 @@ async fn page_service_conn_main( auth: Option>, socket: tokio::net::TcpStream, auth_type: AuthType, + connection_ctx: RequestContext, ) -> anyhow::Result<()> { // Immediately increment the gauge, then create a job to decrement it on task exit. // One of the pros of `defer!` is that this will *most probably* @@ -191,24 +197,24 @@ async fn page_service_conn_main( .set_nodelay(true) .context("could not set TCP_NODELAY")?; - let mut conn_handler = PageServerHandler::new(conf, auth); + // XXX: pgbackend.run() should take the connection_ctx, + // and create a child per-query context when it invokes process_query. + // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler + // and create the per-query context in process_query ourselves. + let mut conn_handler = PageServerHandler::new(conf, auth, connection_ctx); + let pgbackend = PostgresBackend::new(socket, auth_type, None)?; - let result = pgbackend + match pgbackend .run(&mut conn_handler, task_mgr::shutdown_watcher) - .await; - match result { + .await + { Ok(()) => { // we've been requested to shut down Ok(()) } Err(QueryError::Disconnected(ConnectionError::Socket(io_error))) => { - // `ConnectionReset` error happens when the Postgres client closes the connection. - // As this disconnection happens quite often and is expected, - // we decided to downgrade the logging level to `INFO`. - // See: https://github.com/neondatabase/neon/issues/1683. - if io_error.kind() == io::ErrorKind::ConnectionReset { - info!("Postgres client disconnected"); + if is_expected_io_error(&io_error) { + info!("Postgres client disconnected ({io_error})"); Ok(()) } else { Err(io_error).context("Postgres connection error") @@ -255,30 +261,42 @@ struct PageServerHandler { _conf: &'static PageServerConf, auth: Option>, claims: Option, + + /// The context created for the lifetime of the connection + /// serviced by this PageServerHandler. + /// For each query received over the connection, + /// `process_query` creates a child context from this one.
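// Illustrative sketch of the RequestContext hierarchy described above, assuming the
// context API this patch introduces (`detached_child`, `attached_child`); the function
// name here is hypothetical.
use crate::context::{DownloadBehavior, RequestContext};
use crate::task_mgr::TaskKind;

fn request_context_hierarchy_example(listener_ctx: &RequestContext) {
    // One detached child per accepted connection: its own task kind, and on-demand
    // downloads of missing layers are allowed while serving compute requests.
    let connection_ctx =
        listener_ctx.detached_child(TaskKind::PageRequestHandler, DownloadBehavior::Download);
    // process_query later derives a per-query child that stays attached to the
    // connection's context (stored in the `connection_ctx` field below).
    let _per_query_ctx = connection_ctx.attached_child();
}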
+ connection_ctx: RequestContext, } impl PageServerHandler { - pub fn new(conf: &'static PageServerConf, auth: Option>) -> Self { + pub fn new( + conf: &'static PageServerConf, + auth: Option>, + connection_ctx: RequestContext, + ) -> Self { PageServerHandler { _conf: conf, auth, claims: None, + connection_ctx, } } - #[instrument(skip(self, pgb))] + #[instrument(skip(self, pgb, ctx))] async fn handle_pagerequests( &self, pgb: &mut PostgresBackend, tenant_id: TenantId, timeline_id: TimelineId, + ctx: RequestContext, ) -> anyhow::Result<()> { // NOTE: pagerequests handler exits when connection is closed, // so there is no need to reset the association task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); // Make request tracer if needed - let tenant = get_active_tenant_with_timeout(tenant_id).await?; + let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?; let mut tracer = if tenant.get_trace_read_requests() { let connection_id = ConnectionId::generate(); let path = tenant @@ -329,22 +347,27 @@ impl PageServerHandler { let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?; + // TODO: We could create a new per-request context here, with unique ID. + // Currently we use the same per-timeline context for all requests + let response = match neon_fe_msg { PagestreamFeMessage::Exists(req) => { let _timer = metrics.get_rel_exists.start_timer(); - self.handle_get_rel_exists_request(&timeline, &req).await + self.handle_get_rel_exists_request(&timeline, &req, &ctx) + .await } PagestreamFeMessage::Nblocks(req) => { let _timer = metrics.get_rel_size.start_timer(); - self.handle_get_nblocks_request(&timeline, &req).await + self.handle_get_nblocks_request(&timeline, &req, &ctx).await } PagestreamFeMessage::GetPage(req) => { let _timer = metrics.get_page_at_lsn.start_timer(); - self.handle_get_page_at_lsn_request(&timeline, &req).await + self.handle_get_page_at_lsn_request(&timeline, &req, &ctx) + .await } PagestreamFeMessage::DbSize(req) => { let _timer = metrics.get_db_size.start_timer(); - self.handle_db_size_request(&timeline, &req).await + self.handle_db_size_request(&timeline, &req, &ctx).await } }; @@ -363,7 +386,8 @@ impl PageServerHandler { Ok(()) } - #[instrument(skip(self, pgb))] + #[allow(clippy::too_many_arguments)] + #[instrument(skip(self, pgb, ctx))] async fn handle_import_basebackup( &self, pgb: &mut PostgresBackend, @@ -372,12 +396,13 @@ impl PageServerHandler { base_lsn: Lsn, _end_lsn: Lsn, pg_version: u32, + ctx: RequestContext, ) -> Result<(), QueryError> { task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); // Create empty timeline info!("creating new timeline"); - let tenant = get_active_tenant_with_timeout(tenant_id).await?; - let timeline = tenant.create_empty_timeline(timeline_id, base_lsn, pg_version)?; + let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?; + let timeline = tenant.create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)?; // TODO mark timeline as not ready until it reaches end_lsn. 
// We might have some wal to import as well, and we should prevent compute @@ -396,7 +421,7 @@ impl PageServerHandler { let mut copyin_stream = Box::pin(copyin_stream(pgb)); timeline - .import_basebackup_from_tar(&mut copyin_stream, base_lsn) + .import_basebackup_from_tar(&mut copyin_stream, base_lsn, &ctx) .await?; // Drain the rest of the Copy data @@ -418,7 +443,7 @@ impl PageServerHandler { Ok(()) } - #[instrument(skip(self, pgb))] + #[instrument(skip(self, pgb, ctx))] async fn handle_import_wal( &self, pgb: &mut PostgresBackend, @@ -426,10 +451,11 @@ impl PageServerHandler { timeline_id: TimelineId, start_lsn: Lsn, end_lsn: Lsn, + ctx: RequestContext, ) -> Result<(), QueryError> { task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); - let timeline = get_active_timeline_with_timeout(tenant_id, timeline_id).await?; + let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?; let last_record_lsn = timeline.get_last_record_lsn(); if last_record_lsn != start_lsn { return Err(QueryError::Other( @@ -446,7 +472,7 @@ impl PageServerHandler { pgb.flush().await?; let mut copyin_stream = Box::pin(copyin_stream(pgb)); let mut reader = tokio_util::io::StreamReader::new(&mut copyin_stream); - import_wal_from_tar(&timeline, &mut reader, start_lsn, end_lsn).await?; + import_wal_from_tar(&timeline, &mut reader, start_lsn, end_lsn, &ctx).await?; info!("wal import complete"); // Drain the rest of the Copy data @@ -492,6 +518,7 @@ impl PageServerHandler { mut lsn: Lsn, latest: bool, latest_gc_cutoff_lsn: &RcuReadGuard, + ctx: &RequestContext, ) -> anyhow::Result { if latest { // Latest page version was requested. If LSN is given, it is a hint @@ -515,7 +542,7 @@ impl PageServerHandler { if lsn <= last_record_lsn { lsn = last_record_lsn; } else { - timeline.wait_lsn(lsn).await?; + timeline.wait_lsn(lsn, ctx).await?; // Since we waited for 'lsn' to arrive, that is now the last // record LSN. 
(Or close enough for our purposes; the // last-record LSN can advance immediately after we return @@ -525,7 +552,7 @@ impl PageServerHandler { if lsn == Lsn(0) { anyhow::bail!("invalid LSN(0) in request"); } - timeline.wait_lsn(lsn).await?; + timeline.wait_lsn(lsn, ctx).await?; } anyhow::ensure!( lsn >= **latest_gc_cutoff_lsn, @@ -535,52 +562,60 @@ impl PageServerHandler { Ok(lsn) } - #[instrument(skip(self, timeline, req), fields(rel = %req.rel, req_lsn = %req.lsn))] + #[instrument(skip(self, timeline, req, ctx), fields(rel = %req.rel, req_lsn = %req.lsn))] async fn handle_get_rel_exists_request( &self, timeline: &Timeline, req: &PagestreamExistsRequest, + ctx: &RequestContext, ) -> anyhow::Result { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) - .await?; + let lsn = + Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) + .await?; - let exists = timeline.get_rel_exists(req.rel, lsn, req.latest).await?; + let exists = timeline + .get_rel_exists(req.rel, lsn, req.latest, ctx) + .await?; Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse { exists, })) } - #[instrument(skip(self, timeline, req), fields(rel = %req.rel, req_lsn = %req.lsn))] + #[instrument(skip(self, timeline, req, ctx), fields(rel = %req.rel, req_lsn = %req.lsn))] async fn handle_get_nblocks_request( &self, timeline: &Timeline, req: &PagestreamNblocksRequest, + ctx: &RequestContext, ) -> anyhow::Result { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) - .await?; + let lsn = + Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) + .await?; - let n_blocks = timeline.get_rel_size(req.rel, lsn, req.latest).await?; + let n_blocks = timeline.get_rel_size(req.rel, lsn, req.latest, ctx).await?; Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse { n_blocks, })) } - #[instrument(skip(self, timeline, req), fields(dbnode = %req.dbnode, req_lsn = %req.lsn))] + #[instrument(skip(self, timeline, req, ctx), fields(dbnode = %req.dbnode, req_lsn = %req.lsn))] async fn handle_db_size_request( &self, timeline: &Timeline, req: &PagestreamDbSizeRequest, + ctx: &RequestContext, ) -> anyhow::Result { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) - .await?; + let lsn = + Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) + .await?; let total_blocks = timeline - .get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest) + .get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest, ctx) .await?; let db_size = total_blocks as i64 * BLCKSZ as i64; @@ -589,15 +624,17 @@ impl PageServerHandler { })) } - #[instrument(skip(self, timeline, req), fields(rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn))] + #[instrument(skip(self, timeline, req, ctx), fields(rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn))] async fn handle_get_page_at_lsn_request( &self, timeline: &Timeline, req: &PagestreamGetPageRequest, + ctx: &RequestContext, ) -> anyhow::Result { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) - .await?; + let lsn = + Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, 
&latest_gc_cutoff_lsn, ctx) + .await?; /* // Add a 1s delay to some requests. The delay helps the requests to // hit the race condition from github issue #1047 more easily. @@ -608,7 +645,7 @@ impl PageServerHandler { */ let page = timeline - .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest) + .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx) .await?; Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse { @@ -616,23 +653,25 @@ impl PageServerHandler { })) } - #[instrument(skip(self, pgb))] + #[allow(clippy::too_many_arguments)] + #[instrument(skip(self, pgb, ctx))] async fn handle_basebackup_request( - &self, + &mut self, pgb: &mut PostgresBackend, tenant_id: TenantId, timeline_id: TimelineId, lsn: Option, prev_lsn: Option, full_backup: bool, + ctx: RequestContext, ) -> anyhow::Result<()> { // check that the timeline exists - let timeline = get_active_timeline_with_timeout(tenant_id, timeline_id).await?; + let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?; let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); if let Some(lsn) = lsn { // Backup was requested at a particular LSN. Wait for it to arrive. info!("waiting for {}", lsn); - timeline.wait_lsn(lsn).await?; + timeline.wait_lsn(lsn, &ctx).await?; timeline .check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn) .context("invalid basebackup lsn")?; @@ -645,8 +684,15 @@ impl PageServerHandler { // Send a tarball of the latest layer on the timeline { let mut writer = pgb.copyout_writer(); - basebackup::send_basebackup_tarball(&mut writer, &timeline, lsn, prev_lsn, full_backup) - .await?; + basebackup::send_basebackup_tarball( + &mut writer, + &timeline, + lsn, + prev_lsn, + full_backup, + &ctx, + ) + .await?; } pgb.write_message(&BeMessage::CopyDone)?; @@ -717,6 +763,7 @@ impl postgres_backend_async::Handler for PageServerHandler { pgb: &mut PostgresBackend, query_string: &str, ) -> Result<(), QueryError> { + let ctx = self.connection_ctx.attached_child(); debug!("process query {query_string:?}"); if query_string.starts_with("pagestream ") { @@ -734,7 +781,7 @@ impl postgres_backend_async::Handler for PageServerHandler { self.check_permission(Some(tenant_id))?; - self.handle_pagerequests(pgb, tenant_id, timeline_id) + self.handle_pagerequests(pgb, tenant_id, timeline_id, ctx) .await?; } else if query_string.starts_with("basebackup ") { let (_, params_raw) = query_string.split_at("basebackup ".len()); @@ -763,7 +810,7 @@ impl postgres_backend_async::Handler for PageServerHandler { }; // Check that the timeline exists - self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, None, false) + self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, None, false, ctx) .await?; pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; } @@ -784,7 +831,7 @@ impl postgres_backend_async::Handler for PageServerHandler { .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?; self.check_permission(Some(tenant_id))?; - let timeline = get_active_timeline_with_timeout(tenant_id, timeline_id).await?; + let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?; let end_of_timeline = timeline.get_last_record_rlsn(); @@ -835,7 +882,7 @@ impl postgres_backend_async::Handler for PageServerHandler { self.check_permission(Some(tenant_id))?; // Check that the timeline exists - self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, prev_lsn, true) + self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, prev_lsn, true, ctx) 
.await?; pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; } else if query_string.starts_with("import basebackup ") { @@ -878,6 +925,7 @@ impl postgres_backend_async::Handler for PageServerHandler { base_lsn, end_lsn, pg_version, + ctx, ) .await { @@ -914,7 +962,7 @@ impl postgres_backend_async::Handler for PageServerHandler { self.check_permission(Some(tenant_id))?; match self - .handle_import_wal(pgb, tenant_id, timeline_id, start_lsn, end_lsn) + .handle_import_wal(pgb, tenant_id, timeline_id, start_lsn, end_lsn, ctx) .await { Ok(()) => pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?, @@ -944,7 +992,7 @@ impl postgres_backend_async::Handler for PageServerHandler { self.check_permission(Some(tenant_id))?; - let tenant = get_active_tenant_with_timeout(tenant_id).await?; + let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?; pgb.write_message(&BeMessage::RowDescription(&[ RowDescriptor::int8_col(b"checkpoint_distance"), RowDescriptor::int8_col(b"checkpoint_timeout"), @@ -990,27 +1038,66 @@ impl postgres_backend_async::Handler for PageServerHandler { } } +#[derive(thiserror::Error, Debug)] +enum GetActiveTenantError { + #[error( + "Timed out waiting {wait_time:?} for tenant active state. Latest state: {latest_state:?}" + )] + WaitForActiveTimeout { + latest_state: TenantState, + wait_time: Duration, + }, + #[error(transparent)] + Other(#[from] anyhow::Error), +} + +impl From for QueryError { + fn from(e: GetActiveTenantError) -> Self { + match e { + GetActiveTenantError::WaitForActiveTimeout { .. } => QueryError::Disconnected( + ConnectionError::Socket(io::Error::new(io::ErrorKind::TimedOut, e.to_string())), + ), + GetActiveTenantError::Other(e) => QueryError::Other(e), + } + } +} + /// Get active tenant. /// /// If the tenant is Loading, waits for it to become Active, for up to 30 s. That /// ensures that queries don't fail immediately after pageserver startup, because /// all tenants are still loading. -async fn get_active_tenant_with_timeout(tenant_id: TenantId) -> anyhow::Result> { +async fn get_active_tenant_with_timeout( + tenant_id: TenantId, + _ctx: &RequestContext, /* requires a context to support cancellation in the future */ +) -> Result, GetActiveTenantError> { let tenant = mgr::get_tenant(tenant_id, false).await?; - match tokio::time::timeout(Duration::from_secs(30), tenant.wait_to_become_active()).await { - Ok(wait_result) => wait_result - // no .context(), the error message is good enough and some tests depend on it - .map(move |()| tenant), - Err(_) => anyhow::bail!("Timeout waiting for tenant {tenant_id} to become Active"), + let wait_time = Duration::from_secs(30); + match tokio::time::timeout(wait_time, tenant.wait_to_become_active()).await { + Ok(Ok(())) => Ok(tenant), + // no .context(), the error message is good enough and some tests depend on it + Ok(Err(wait_error)) => Err(GetActiveTenantError::Other(wait_error)), + Err(_) => { + let latest_state = tenant.current_state(); + if latest_state == TenantState::Active { + Ok(tenant) + } else { + Err(GetActiveTenantError::WaitForActiveTimeout { + latest_state, + wait_time, + }) + } + } } } /// Shorthand for getting a reference to a Timeline of an Active tenant.
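// Illustrative sketch (hypothetical handler, not from the diff): thanks to the
// `From<GetActiveTenantError> for QueryError` impl above, callers can use `?`, and a
// wait-for-active timeout surfaces to the client as a timed-out connection error
// rather than a generic internal error.
async fn active_tenant_caller_example(
    tenant_id: TenantId,
    ctx: &RequestContext,
) -> Result<(), QueryError> {
    // Waits up to 30 s for the tenant to leave Loading/Attaching; the final
    // current_state() re-check above avoids a spurious timeout if the tenant
    // became Active just as the timer fired.
    let _tenant = get_active_tenant_with_timeout(tenant_id, ctx).await?;
    Ok(())
}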
-async fn get_active_timeline_with_timeout( +async fn get_active_tenant_timeline( tenant_id: TenantId, timeline_id: TimelineId, -) -> anyhow::Result> { - get_active_tenant_with_timeout(tenant_id) - .await - .and_then(|tenant| tenant.get_timeline(timeline_id, true)) + ctx: &RequestContext, +) -> Result, GetActiveTenantError> { + let tenant = get_active_tenant_with_timeout(tenant_id, ctx).await?; + let timeline = tenant.get_timeline(timeline_id, true)?; + Ok(timeline) } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index cc521c5e35..6f9035305d 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -7,6 +7,7 @@ //! Clarify that) //! use super::tenant::{PageReconstructError, Timeline}; +use crate::context::RequestContext; use crate::keyspace::{KeySpace, KeySpaceAccum}; use crate::repository::*; use crate::walrecord::NeonWalRecord; @@ -97,6 +98,7 @@ impl Timeline { blknum: BlockNumber, lsn: Lsn, latest: bool, + ctx: &RequestContext, ) -> Result { if tag.relnode == 0 { return Err(PageReconstructError::Other(anyhow::anyhow!( @@ -104,7 +106,7 @@ impl Timeline { ))); } - let nblocks = self.get_rel_size(tag, lsn, latest).await?; + let nblocks = self.get_rel_size(tag, lsn, latest, ctx).await?; if blknum >= nblocks { debug!( "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page", @@ -114,7 +116,7 @@ impl Timeline { } let key = rel_block_to_key(tag, blknum); - self.get(key, lsn).await + self.get(key, lsn, ctx).await } // Get size of a database in blocks @@ -124,13 +126,14 @@ impl Timeline { dbnode: Oid, lsn: Lsn, latest: bool, + ctx: &RequestContext, ) -> Result { let mut total_blocks = 0; - let rels = self.list_rels(spcnode, dbnode, lsn).await?; + let rels = self.list_rels(spcnode, dbnode, lsn, ctx).await?; for rel in rels { - let n_blocks = self.get_rel_size(rel, lsn, latest).await?; + let n_blocks = self.get_rel_size(rel, lsn, latest, ctx).await?; total_blocks += n_blocks as usize; } Ok(total_blocks) @@ -142,6 +145,7 @@ impl Timeline { tag: RelTag, lsn: Lsn, latest: bool, + ctx: &RequestContext, ) -> Result { if tag.relnode == 0 { return Err(PageReconstructError::Other(anyhow::anyhow!( @@ -154,7 +158,7 @@ impl Timeline { } if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM) - && !self.get_rel_exists(tag, lsn, latest).await? + && !self.get_rel_exists(tag, lsn, latest, ctx).await? 
{ // FIXME: Postgres sometimes calls smgrcreate() to create // FSM, and smgrnblocks() on it immediately afterwards, @@ -164,7 +168,7 @@ impl Timeline { } let key = rel_size_to_key(tag); - let mut buf = self.get(key, lsn).await?; + let mut buf = self.get(key, lsn, ctx).await?; let nblocks = buf.get_u32_le(); if latest { @@ -186,6 +190,7 @@ impl Timeline { tag: RelTag, lsn: Lsn, _latest: bool, + ctx: &RequestContext, ) -> Result { if tag.relnode == 0 { return Err(PageReconstructError::Other(anyhow::anyhow!( @@ -199,7 +204,7 @@ impl Timeline { } // fetch directory listing let key = rel_dir_to_key(tag.spcnode, tag.dbnode); - let buf = self.get(key, lsn).await?; + let buf = self.get(key, lsn, ctx).await?; match RelDirectory::des(&buf).context("deserialization failure") { Ok(dir) => { @@ -216,10 +221,11 @@ impl Timeline { spcnode: Oid, dbnode: Oid, lsn: Lsn, + ctx: &RequestContext, ) -> Result, PageReconstructError> { // fetch directory listing let key = rel_dir_to_key(spcnode, dbnode); - let buf = self.get(key, lsn).await?; + let buf = self.get(key, lsn, ctx).await?; match RelDirectory::des(&buf).context("deserialization failure") { Ok(dir) => { @@ -244,9 +250,10 @@ impl Timeline { segno: u32, blknum: BlockNumber, lsn: Lsn, + ctx: &RequestContext, ) -> Result { let key = slru_block_to_key(kind, segno, blknum); - self.get(key, lsn).await + self.get(key, lsn, ctx).await } /// Get size of an SLRU segment @@ -255,9 +262,10 @@ impl Timeline { kind: SlruKind, segno: u32, lsn: Lsn, + ctx: &RequestContext, ) -> Result { let key = slru_segment_size_to_key(kind, segno); - let mut buf = self.get(key, lsn).await?; + let mut buf = self.get(key, lsn, ctx).await?; Ok(buf.get_u32_le()) } @@ -267,10 +275,11 @@ impl Timeline { kind: SlruKind, segno: u32, lsn: Lsn, + ctx: &RequestContext, ) -> Result { // fetch directory listing let key = slru_dir_to_key(kind); - let buf = self.get(key, lsn).await?; + let buf = self.get(key, lsn, ctx).await?; match SlruSegmentDirectory::des(&buf).context("deserialization failure") { Ok(dir) => { @@ -291,6 +300,7 @@ impl Timeline { pub async fn find_lsn_for_timestamp( &self, search_timestamp: TimestampTz, + ctx: &RequestContext, ) -> Result { let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn(); let min_lsn = *gc_cutoff_lsn_guard; @@ -313,6 +323,7 @@ impl Timeline { Lsn(mid * 8), &mut found_smaller, &mut found_larger, + ctx, ) .await?; @@ -362,14 +373,18 @@ impl Timeline { probe_lsn: Lsn, found_smaller: &mut bool, found_larger: &mut bool, + ctx: &RequestContext, ) -> Result { - for segno in self.list_slru_segments(SlruKind::Clog, probe_lsn).await? { + for segno in self + .list_slru_segments(SlruKind::Clog, probe_lsn, ctx) + .await? 
+ { let nblocks = self - .get_slru_segment_size(SlruKind::Clog, segno, probe_lsn) + .get_slru_segment_size(SlruKind::Clog, segno, probe_lsn, ctx) .await?; for blknum in (0..nblocks).rev() { let clog_page = self - .get_slru_page_at_lsn(SlruKind::Clog, segno, blknum, probe_lsn) + .get_slru_page_at_lsn(SlruKind::Clog, segno, blknum, probe_lsn, ctx) .await?; if clog_page.len() == BLCKSZ as usize + 8 { @@ -394,11 +409,12 @@ impl Timeline { &self, kind: SlruKind, lsn: Lsn, + ctx: &RequestContext, ) -> Result, PageReconstructError> { // fetch directory entry let key = slru_dir_to_key(kind); - let buf = self.get(key, lsn).await?; + let buf = self.get(key, lsn, ctx).await?; match SlruSegmentDirectory::des(&buf).context("deserialization failure") { Ok(dir) => Ok(dir.segments), Err(e) => Err(PageReconstructError::from(e)), @@ -410,18 +426,21 @@ impl Timeline { spcnode: Oid, dbnode: Oid, lsn: Lsn, + ctx: &RequestContext, ) -> Result { let key = relmap_file_key(spcnode, dbnode); - self.get(key, lsn).await + let buf = self.get(key, lsn, ctx).await?; + Ok(buf) } pub async fn list_dbdirs( &self, lsn: Lsn, + ctx: &RequestContext, ) -> Result, PageReconstructError> { // fetch directory entry - let buf = self.get(DBDIR_KEY, lsn).await?; + let buf = self.get(DBDIR_KEY, lsn, ctx).await?; match DbDirectory::des(&buf).context("deserialization failure") { Ok(dir) => Ok(dir.dbdirs), @@ -433,18 +452,20 @@ impl Timeline { &self, xid: TransactionId, lsn: Lsn, + ctx: &RequestContext, ) -> Result { let key = twophase_file_key(xid); - let buf = self.get(key, lsn).await?; + let buf = self.get(key, lsn, ctx).await?; Ok(buf) } pub async fn list_twophase_files( &self, lsn: Lsn, + ctx: &RequestContext, ) -> Result, PageReconstructError> { // fetch directory entry - let buf = self.get(TWOPHASEDIR_KEY, lsn).await?; + let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?; match TwoPhaseDirectory::des(&buf).context("deserialization failure") { Ok(dir) => Ok(dir.xids), @@ -452,12 +473,20 @@ impl Timeline { } } - pub async fn get_control_file(&self, lsn: Lsn) -> Result { - self.get(CONTROLFILE_KEY, lsn).await + pub async fn get_control_file( + &self, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result { + self.get(CONTROLFILE_KEY, lsn, ctx).await } - pub async fn get_checkpoint(&self, lsn: Lsn) -> Result { - self.get(CHECKPOINT_KEY, lsn).await + pub async fn get_checkpoint( + &self, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result { + self.get(CHECKPOINT_KEY, lsn, ctx).await } /// Does the same as get_current_logical_size but counted on demand. @@ -469,15 +498,16 @@ impl Timeline { &self, lsn: Lsn, cancel: CancellationToken, + ctx: &RequestContext, ) -> Result { // Fetch list of database dirs and iterate them - let buf = self.get(DBDIR_KEY, lsn).await.context("read dbdir")?; + let buf = self.get(DBDIR_KEY, lsn, ctx).await.context("read dbdir")?; let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?; let mut total_size: u64 = 0; for (spcnode, dbnode) in dbdir.dbdirs.keys() { for rel in self - .list_rels(*spcnode, *dbnode, lsn) + .list_rels(*spcnode, *dbnode, lsn, ctx) .await .context("list rels")? 
{ @@ -486,9 +516,9 @@ impl Timeline { } let relsize_key = rel_size_to_key(rel); let mut buf = self - .get(relsize_key, lsn) + .get(relsize_key, lsn, ctx) .await - .context("read relation size of {rel:?}")?; + .with_context(|| format!("read relation size of {rel:?}"))?; let relsize = buf.get_u32_le(); total_size += relsize as u64; @@ -501,7 +531,11 @@ impl Timeline { /// Get a KeySpace that covers all the Keys that are in use at the given LSN. /// Anything that's not listed maybe removed from the underlying storage (from /// that LSN forwards). - pub async fn collect_keyspace(&self, lsn: Lsn) -> anyhow::Result { + pub async fn collect_keyspace( + &self, + lsn: Lsn, + ctx: &RequestContext, + ) -> anyhow::Result { // Iterate through key ranges, greedily packing them into partitions let mut result = KeySpaceAccum::new(); @@ -509,7 +543,7 @@ impl Timeline { result.add_key(DBDIR_KEY); // Fetch list of database dirs and iterate them - let buf = self.get(DBDIR_KEY, lsn).await?; + let buf = self.get(DBDIR_KEY, lsn, ctx).await?; let dbdir = DbDirectory::des(&buf).context("deserialization failure")?; let mut dbs: Vec<(Oid, Oid)> = dbdir.dbdirs.keys().cloned().collect(); @@ -519,14 +553,14 @@ impl Timeline { result.add_key(rel_dir_to_key(spcnode, dbnode)); let mut rels: Vec = self - .list_rels(spcnode, dbnode, lsn) + .list_rels(spcnode, dbnode, lsn, ctx) .await? .into_iter() .collect(); rels.sort_unstable(); for rel in rels { let relsize_key = rel_size_to_key(rel); - let mut buf = self.get(relsize_key, lsn).await?; + let mut buf = self.get(relsize_key, lsn, ctx).await?; let relsize = buf.get_u32_le(); result.add_range(rel_block_to_key(rel, 0)..rel_block_to_key(rel, relsize)); @@ -542,13 +576,13 @@ impl Timeline { ] { let slrudir_key = slru_dir_to_key(kind); result.add_key(slrudir_key); - let buf = self.get(slrudir_key, lsn).await?; + let buf = self.get(slrudir_key, lsn, ctx).await?; let dir = SlruSegmentDirectory::des(&buf).context("deserialization failure")?; let mut segments: Vec = dir.segments.iter().cloned().collect(); segments.sort_unstable(); for segno in segments { let segsize_key = slru_segment_size_to_key(kind, segno); - let mut buf = self.get(segsize_key, lsn).await?; + let mut buf = self.get(segsize_key, lsn, ctx).await?; let segsize = buf.get_u32_le(); result.add_range( @@ -560,7 +594,7 @@ impl Timeline { // Then pg_twophase result.add_key(TWOPHASEDIR_KEY); - let buf = self.get(TWOPHASEDIR_KEY, lsn).await?; + let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?; let twophase_dir = TwoPhaseDirectory::des(&buf).context("deserialization failure")?; let mut xids: Vec = twophase_dir.xids.iter().cloned().collect(); xids.sort_unstable(); @@ -723,9 +757,10 @@ impl<'a> DatadirModification<'a> { spcnode: Oid, dbnode: Oid, img: Bytes, + ctx: &RequestContext, ) -> anyhow::Result<()> { // Add it to the directory (if it doesn't exist already) - let buf = self.get(DBDIR_KEY).await?; + let buf = self.get(DBDIR_KEY, ctx).await?; let mut dbdir = DbDirectory::des(&buf)?; let r = dbdir.dbdirs.insert((spcnode, dbnode), true); @@ -755,9 +790,10 @@ impl<'a> DatadirModification<'a> { &mut self, xid: TransactionId, img: Bytes, + ctx: &RequestContext, ) -> anyhow::Result<()> { // Add it to the directory entry - let buf = self.get(TWOPHASEDIR_KEY).await?; + let buf = self.get(TWOPHASEDIR_KEY, ctx).await?; let mut dir = TwoPhaseDirectory::des(&buf)?; if !dir.xids.insert(xid) { anyhow::bail!("twophase file for xid {} already exists", xid); @@ -781,16 +817,21 @@ impl<'a> DatadirModification<'a> { Ok(()) } - pub async 
fn drop_dbdir(&mut self, spcnode: Oid, dbnode: Oid) -> anyhow::Result<()> { + pub async fn drop_dbdir( + &mut self, + spcnode: Oid, + dbnode: Oid, + ctx: &RequestContext, + ) -> anyhow::Result<()> { let req_lsn = self.tline.get_last_record_lsn(); let total_blocks = self .tline - .get_db_size(spcnode, dbnode, req_lsn, true) + .get_db_size(spcnode, dbnode, req_lsn, true, ctx) .await?; // Remove entry from dbdir - let buf = self.get(DBDIR_KEY).await?; + let buf = self.get(DBDIR_KEY, ctx).await?; let mut dir = DbDirectory::des(&buf)?; if dir.dbdirs.remove(&(spcnode, dbnode)).is_some() { let buf = DbDirectory::ser(&dir)?; @@ -817,11 +858,12 @@ impl<'a> DatadirModification<'a> { &mut self, rel: RelTag, nblocks: BlockNumber, + ctx: &RequestContext, ) -> anyhow::Result<()> { anyhow::ensure!(rel.relnode != 0, "invalid relnode"); // It's possible that this is the first rel for this db in this // tablespace. Create the reldir entry for it if so. - let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY).await?)?; + let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await?)?; let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode); let mut rel_dir = if dbdir.dbdirs.get(&(rel.spcnode, rel.dbnode)).is_none() { // Didn't exist. Update dbdir @@ -833,7 +875,7 @@ impl<'a> DatadirModification<'a> { RelDirectory::default() } else { // reldir already exists, fetch it - RelDirectory::des(&self.get(rel_dir_key).await?)? + RelDirectory::des(&self.get(rel_dir_key, ctx).await?)? }; // Add the new relation to the rel directory entry, and write it back @@ -865,13 +907,14 @@ impl<'a> DatadirModification<'a> { &mut self, rel: RelTag, nblocks: BlockNumber, + ctx: &RequestContext, ) -> anyhow::Result<()> { anyhow::ensure!(rel.relnode != 0, "invalid relnode"); let last_lsn = self.tline.get_last_record_lsn(); - if self.tline.get_rel_exists(rel, last_lsn, true).await? { + if self.tline.get_rel_exists(rel, last_lsn, true, ctx).await? { let size_key = rel_size_to_key(rel); // Fetch the old size first - let old_size = self.get(size_key).await?.get_u32_le(); + let old_size = self.get(size_key, ctx).await?.get_u32_le(); // Update the entry with the new size. let buf = nblocks.to_le_bytes(); @@ -895,12 +938,13 @@ impl<'a> DatadirModification<'a> { &mut self, rel: RelTag, nblocks: BlockNumber, + ctx: &RequestContext, ) -> anyhow::Result<()> { anyhow::ensure!(rel.relnode != 0, "invalid relnode"); // Put size let size_key = rel_size_to_key(rel); - let old_size = self.get(size_key).await?.get_u32_le(); + let old_size = self.get(size_key, ctx).await?.get_u32_le(); // only extend relation here. never decrease the size if nblocks > old_size { @@ -916,12 +960,12 @@ impl<'a> DatadirModification<'a> { } /// Drop a relation. 
- pub async fn put_rel_drop(&mut self, rel: RelTag) -> anyhow::Result<()> { + pub async fn put_rel_drop(&mut self, rel: RelTag, ctx: &RequestContext) -> anyhow::Result<()> { anyhow::ensure!(rel.relnode != 0, "invalid relnode"); // Remove it from the directory entry let dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode); - let buf = self.get(dir_key).await?; + let buf = self.get(dir_key, ctx).await?; let mut dir = RelDirectory::des(&buf)?; if dir.rels.remove(&(rel.relnode, rel.forknum)) { @@ -932,7 +976,7 @@ impl<'a> DatadirModification<'a> { // update logical size let size_key = rel_size_to_key(rel); - let old_size = self.get(size_key).await?.get_u32_le(); + let old_size = self.get(size_key, ctx).await?.get_u32_le(); self.pending_nblocks -= old_size as i64; // Remove enty from relation size cache @@ -949,10 +993,11 @@ impl<'a> DatadirModification<'a> { kind: SlruKind, segno: u32, nblocks: BlockNumber, + ctx: &RequestContext, ) -> anyhow::Result<()> { // Add it to the directory entry let dir_key = slru_dir_to_key(kind); - let buf = self.get(dir_key).await?; + let buf = self.get(dir_key, ctx).await?; let mut dir = SlruSegmentDirectory::des(&buf)?; if !dir.segments.insert(segno) { @@ -988,10 +1033,15 @@ impl<'a> DatadirModification<'a> { } /// This method is used for marking truncated SLRU files - pub async fn drop_slru_segment(&mut self, kind: SlruKind, segno: u32) -> anyhow::Result<()> { + pub async fn drop_slru_segment( + &mut self, + kind: SlruKind, + segno: u32, + ctx: &RequestContext, + ) -> anyhow::Result<()> { // Remove it from the directory entry let dir_key = slru_dir_to_key(kind); - let buf = self.get(dir_key).await?; + let buf = self.get(dir_key, ctx).await?; let mut dir = SlruSegmentDirectory::des(&buf)?; if !dir.segments.remove(&segno) { @@ -1015,9 +1065,13 @@ impl<'a> DatadirModification<'a> { } /// This method is used for marking truncated SLRU files - pub async fn drop_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> { + pub async fn drop_twophase_file( + &mut self, + xid: TransactionId, + ctx: &RequestContext, + ) -> anyhow::Result<()> { // Remove it from the directory entry - let buf = self.get(TWOPHASEDIR_KEY).await?; + let buf = self.get(TWOPHASEDIR_KEY, ctx).await?; let mut dir = TwoPhaseDirectory::des(&buf)?; if !dir.xids.remove(&xid) { @@ -1111,7 +1165,7 @@ impl<'a> DatadirModification<'a> { // Internal helper functions to batch the modifications - async fn get(&self, key: Key) -> Result { + async fn get(&self, key: Key, ctx: &RequestContext) -> Result { // Have we already updated the same key? Read the pending updated // version in that case. // @@ -1132,7 +1186,7 @@ impl<'a> DatadirModification<'a> { } } else { let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn); - self.tline.get(key, lsn).await + self.tline.get(key, lsn, ctx).await } } @@ -1542,10 +1596,11 @@ pub fn create_test_timeline( tenant: &crate::tenant::Tenant, timeline_id: utils::id::TimelineId, pg_version: u32, + ctx: &RequestContext, ) -> anyhow::Result> { let tline = tenant - .create_empty_timeline(timeline_id, Lsn(8), pg_version)? - .initialize()?; + .create_empty_timeline(timeline_id, Lsn(8), pg_version, ctx)? 
+ .initialize(ctx)?; let mut m = tline.begin_modification(Lsn(8)); m.init_empty()?; m.commit()?; @@ -1598,7 +1653,7 @@ mod tests { assert!(tline.list_rels(0, TESTDB, Lsn(0x30))?.contains(&TESTREL_A)); // Create a branch, check that the relation is visible there - repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x30))?; + repo.branch_timeline(&tline, NEW_TIMELINE_ID, Lsn(0x30))?; let newtline = match repo.get_timeline(NEW_TIMELINE_ID)?.local_timeline() { Some(timeline) => timeline, None => panic!("Should have a local timeline"), diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 586fd20886..092503b7c5 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -37,6 +37,17 @@ impl Key { | self.field6 as i128 } + pub fn from_i128(x: i128) -> Self { + Key { + field1: ((x >> 120) & 0xf) as u8, + field2: ((x >> 104) & 0xFFFF) as u32, + field3: (x >> 72) as u32, + field4: (x >> 40) as u32, + field5: (x >> 32) as u8, + field6: x as u32, + } + } + pub fn next(&self) -> Key { self.add(1) } diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 02e2e2ee14..09716ba0e0 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -171,6 +171,9 @@ task_local! { /// #[derive(Debug, PartialEq, Eq, Clone, Copy)] pub enum TaskKind { + // Pageserver startup, i.e., `main` + Startup, + // libpq listener task. It just accepts connection and spawns a // PageRequestHandler task for each connection. LibpqEndpointListener, @@ -183,13 +186,37 @@ pub enum TaskKind { // associated with one later, after receiving a command from the client. PageRequestHandler, - // Manages the WAL receiver connection for one timeline. It subscribes to - // events from storage_broker, decides which safekeeper to connect to. It spawns a - // separate WalReceiverConnection task to handle each connection. + /// Manages the WAL receiver connection for one timeline. + /// It subscribes to events from storage_broker and decides which safekeeper to connect to. + /// Once the decision has been made, it establishes the connection using the `tokio-postgres` library. + /// There is at most one connection at any given time. + /// + /// That `tokio-postgres` library represents a connection as two objects: a `Client` and a `Connection`. + /// The `Client` object is what library users use to make requests & get responses. + /// Internally, `Client` hands over requests to the `Connection` object. + /// The `Connection` object is responsible for speaking the wire protocol. + /// + /// Walreceiver uses its own abstraction called `TaskHandle` to represent the activity of establishing and handling a connection. + /// That abstraction doesn't use `task_mgr`. + /// The [`WalReceiverManager`] task ensures that this `TaskHandle` task does not outlive the [`WalReceiverManager`] task. + /// For the `RequestContext` that we hand to the TaskHandle, we use the [`WalReceiverConnectionHandler`] task kind. + /// + /// Once the connection is established, the `TaskHandle` task creates a + /// [`WalReceiverConnectionPoller`] task_mgr task that is responsible for polling + /// the `Connection` object. + /// A `CancellationToken` created by the `TaskHandle` task ensures + /// that the [`WalReceiverConnectionPoller`] task will cancel soon after the `TaskHandle` is dropped. WalReceiverManager, - // Handles a connection to a safekeeper, to stream WAL to a timeline.
- WalReceiverConnection, + /// The `TaskHandle` task that executes [`walreceiver_connection::handle_walreceiver_connection`]. + /// Not a `task_mgr` task, but we use this `TaskKind` for its `RequestContext`. + /// See the comment on [`WalReceiverManager`]. + WalReceiverConnectionHandler, + + /// The task that polls the `tokio-postgres::Connection` object. + /// Spawned by task [`WalReceiverConnectionHandler`]. + /// See the comment on [`WalReceiverManager`]. + WalReceiverConnectionPoller, // Garbage collection worker. One per tenant GarbageCollector, @@ -200,6 +227,8 @@ pub enum TaskKind { // Initial logical size calculation InitialLogicalSizeCalculation, + OndemandLogicalSizeCalculation, + // Task that flushes frozen in-memory layers to disk LayerFlushTask, @@ -222,6 +251,12 @@ pub enum TaskKind { DownloadAllRemoteLayers, // Task that calculates synthetis size for all active tenants CalculateSyntheticSize, + + // A request that comes in via the pageserver HTTP API. + MgmtRequest, + + #[cfg(test)] + UnitTest, } #[derive(Default)] diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index c18c645e5b..2f45fe0dfc 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -48,9 +48,10 @@ use std::time::{Duration, Instant}; use self::metadata::TimelineMetadata; use self::remote_timeline_client::RemoteTimelineClient; use crate::config::PageServerConf; +use crate::context::{DownloadBehavior, RequestContext}; use crate::import_datadir; use crate::is_uninit_mark; -use crate::metrics::{remove_tenant_metrics, STORAGE_TIME}; +use crate::metrics::{remove_tenant_metrics, TENANT_STATE_METRIC}; use crate::repository::GcResult; use crate::task_mgr; use crate::task_mgr::TaskKind; @@ -174,7 +175,7 @@ impl UninitializedTimeline<'_> { /// /// The new timeline is initialized in Active state, and its background jobs are /// started - pub fn initialize(self) -> anyhow::Result> { + pub fn initialize(self, _ctx: &RequestContext) -> anyhow::Result> { let mut timelines = self.owning_tenant.timelines.lock().unwrap(); self.initialize_with_lock(&mut timelines, true, true) } @@ -188,7 +189,7 @@ impl UninitializedTimeline<'_> { mut self, timelines: &mut HashMap>, load_layer_map: bool, - launch_wal_receiver: bool, + activate: bool, ) -> anyhow::Result> { let timeline_id = self.timeline_id; let tenant_id = self.owning_tenant.tenant_id; @@ -221,13 +222,12 @@ impl UninitializedTimeline<'_> { "Failed to remove uninit mark file for timeline {tenant_id}/{timeline_id}" ) })?; - new_timeline.set_state(TimelineState::Active); v.insert(Arc::clone(&new_timeline)); new_timeline.maybe_spawn_flush_loop(); - if launch_wal_receiver { - new_timeline.launch_wal_receiver(); + if activate { + new_timeline.activate(); } } } @@ -240,11 +240,12 @@ impl UninitializedTimeline<'_> { self, copyin_stream: &mut (impl Stream> + Sync + Send + Unpin), base_lsn: Lsn, + ctx: &RequestContext, ) -> anyhow::Result> { let raw_timeline = self.raw_timeline()?; let mut reader = tokio_util::io::StreamReader::new(copyin_stream); - import_datadir::import_basebackup_from_tar(raw_timeline, &mut reader, base_lsn) + import_datadir::import_basebackup_from_tar(raw_timeline, &mut reader, base_lsn, ctx) .await .context("Failed to import basebackup")?; @@ -262,9 +263,7 @@ impl UninitializedTimeline<'_> { .await .context("Failed to flush after basebackup import")?; - let timeline = self.initialize()?; - - Ok(timeline) + self.initialize(ctx) } fn raw_timeline(&self) -> anyhow::Result<&Arc> { @@ -450,6 +449,7 @@ impl Tenant { /// /// If the operation 
fails, the timeline is left in the tenant's hash map in Broken state. On success, /// it is marked as Active. + #[allow(clippy::too_many_arguments)] async fn timeline_init_and_sync( &self, timeline_id: TimelineId, @@ -458,6 +458,7 @@ impl Tenant { local_metadata: Option, ancestor: Option>, first_save: bool, + _ctx: &RequestContext, ) -> anyhow::Result<()> { let tenant_id = self.tenant_id; @@ -573,6 +574,7 @@ impl Tenant { conf: &'static PageServerConf, tenant_id: TenantId, remote_storage: GenericRemoteStorage, + ctx: &RequestContext, ) -> Arc { // XXX: Attach should provide the config, especially during tenant migration. // See https://github.com/neondatabase/neon/issues/1555 @@ -591,6 +593,7 @@ impl Tenant { // Do all the hard work in the background let tenant_clone = Arc::clone(&tenant); + let ctx = ctx.detached_child(TaskKind::Attach, DownloadBehavior::Warn); task_mgr::spawn( &tokio::runtime::Handle::current(), TaskKind::Attach, @@ -599,7 +602,7 @@ impl Tenant { "attach tenant", false, async move { - match tenant_clone.attach().await { + match tenant_clone.attach(ctx).await { Ok(_) => {} Err(e) => { tenant_clone.set_broken(&e.to_string()); @@ -615,8 +618,8 @@ impl Tenant { /// /// Background task that downloads all data for a tenant and brings it to Active state. /// - #[instrument(skip(self), fields(tenant_id=%self.tenant_id))] - async fn attach(self: &Arc) -> anyhow::Result<()> { + #[instrument(skip(self, ctx), fields(tenant_id=%self.tenant_id))] + async fn attach(self: &Arc, ctx: RequestContext) -> anyhow::Result<()> { // Create directory with marker file to indicate attaching state. // The load_local_tenants() function in tenant::mgr relies on the marker file // to determine whether a tenant has finished attaching. @@ -716,6 +719,7 @@ impl Tenant { index_parts.remove(&timeline_id).unwrap(), remote_metadata, remote_clients.remove(&timeline_id).unwrap(), + &ctx, ) .await .with_context(|| { @@ -765,6 +769,7 @@ impl Tenant { index_part: IndexPart, remote_metadata: TimelineMetadata, remote_client: RemoteTimelineClient, + ctx: &RequestContext, ) -> anyhow::Result<()> { info!("downloading index file for timeline {}", timeline_id); tokio::fs::create_dir_all(self.conf.timeline_path(&timeline_id, &self.tenant_id)) @@ -799,6 +804,7 @@ impl Tenant { local_metadata, ancestor, true, + ctx, ) .await } @@ -827,11 +833,12 @@ impl Tenant { /// If the loading fails for some reason, the Tenant will go into Broken /// state. /// - #[instrument(skip(conf, remote_storage), fields(tenant_id=%tenant_id))] + #[instrument(skip(conf, remote_storage, ctx), fields(tenant_id=%tenant_id))] pub fn spawn_load( conf: &'static PageServerConf, tenant_id: TenantId, remote_storage: Option, + ctx: &RequestContext, ) -> Arc { let tenant_conf = match Self::load_tenant_config(conf, tenant_id) { Ok(conf) => conf, @@ -855,6 +862,7 @@ impl Tenant { // Do all the hard work in a background task let tenant_clone = Arc::clone(&tenant); + let ctx = ctx.detached_child(TaskKind::InitialLoad, DownloadBehavior::Warn); let _ = task_mgr::spawn( &tokio::runtime::Handle::current(), TaskKind::InitialLoad, @@ -863,7 +871,7 @@ impl Tenant { "initial tenant load", false, async move { - match tenant_clone.load().await { + match tenant_clone.load(&ctx).await { Ok(()) => {} Err(err) => { tenant_clone.set_broken(&err.to_string()); @@ -884,8 +892,8 @@ impl Tenant { /// Background task to load in-memory data structures for this tenant, from /// files on disk. Used at pageserver startup. 
/// - #[instrument(skip(self), fields(tenant_id=%self.tenant_id))] - async fn load(self: &Arc) -> anyhow::Result<()> { + #[instrument(skip(self, ctx), fields(tenant_id=%self.tenant_id))] + async fn load(self: &Arc, ctx: &RequestContext) -> anyhow::Result<()> { info!("loading tenant task"); utils::failpoint_sleep_millis_async!("before-loading-tenant"); @@ -996,7 +1004,7 @@ impl Tenant { // 1. "Timeline has no ancestor and no layer files" for (timeline_id, local_metadata) in sorted_timelines { - self.load_local_timeline(timeline_id, local_metadata) + self.load_local_timeline(timeline_id, local_metadata, ctx) .await .with_context(|| format!("load local timeline {timeline_id}"))?; } @@ -1013,11 +1021,12 @@ impl Tenant { /// Subroutine of `load_tenant`, to load an individual timeline /// /// NB: The parent is assumed to be already loaded! - #[instrument(skip(self, local_metadata), fields(timeline_id=%timeline_id))] + #[instrument(skip(self, local_metadata, ctx), fields(timeline_id=%timeline_id))] async fn load_local_timeline( &self, timeline_id: TimelineId, local_metadata: TimelineMetadata, + ctx: &RequestContext, ) -> anyhow::Result<()> { let ancestor = if let Some(ancestor_timeline_id) = local_metadata.ancestor_timeline() { let ancestor_timeline = self.get_timeline(ancestor_timeline_id, false) @@ -1061,6 +1070,7 @@ impl Tenant { Some(local_metadata), ancestor, false, + ctx, ) .await } @@ -1112,6 +1122,7 @@ impl Tenant { new_timeline_id: TimelineId, initdb_lsn: Lsn, pg_version: u32, + _ctx: &RequestContext, ) -> anyhow::Result { anyhow::ensure!( self.is_active(), @@ -1153,6 +1164,7 @@ impl Tenant { ancestor_timeline_id: Option, mut ancestor_start_lsn: Option, pg_version: u32, + ctx: &RequestContext, ) -> anyhow::Result>> { anyhow::ensure!( self.is_active(), @@ -1190,13 +1202,16 @@ impl Tenant { // decoding the new WAL might need to look up previous pages, relation // sizes etc. and that would get confused if the previous page versions // are not in the repository yet. - ancestor_timeline.wait_lsn(*lsn).await?; + ancestor_timeline.wait_lsn(*lsn, ctx).await?; } - self.branch_timeline(ancestor_timeline_id, new_timeline_id, ancestor_start_lsn) + self.branch_timeline(&ancestor_timeline, new_timeline_id, ancestor_start_lsn, ctx) + .await? + } + None => { + self.bootstrap_timeline(new_timeline_id, pg_version, ctx) .await? } - None => self.bootstrap_timeline(new_timeline_id, pg_version).await?, }; Ok(Some(loaded_timeline)) @@ -1220,30 +1235,25 @@ impl Tenant { target_timeline_id: Option, horizon: u64, pitr: Duration, + ctx: &RequestContext, ) -> anyhow::Result { anyhow::ensure!( self.is_active(), "Cannot run GC iteration on inactive tenant" ); - let timeline_str = target_timeline_id - .map(|x| x.to_string()) - .unwrap_or_else(|| "-".to_string()); + let gc_result = self + .gc_iteration_internal(target_timeline_id, horizon, pitr, ctx) + .await; - { - let _timer = STORAGE_TIME - .with_label_values(&["gc", &self.tenant_id.to_string(), &timeline_str]) - .start_timer(); - self.gc_iteration_internal(target_timeline_id, horizon, pitr) - .await - } + gc_result } /// Perform one compaction iteration. /// This function is periodically called by compactor task. /// Also it can be explicitly requested per timeline through page server /// api's 'compact' command. 
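// Illustrative sketch (hypothetical background loop, not from the diff): after this
// change, periodic maintenance threads a RequestContext down into the tenant, e.g.
// via `compaction_iteration(&ctx)` below; the interval and error handling here are
// illustrative only.
async fn compaction_loop_example(tenant: &Tenant, ctx: &RequestContext) {
    loop {
        if let Err(err) = tenant.compaction_iteration(ctx).await {
            tracing::error!("compaction iteration failed: {err:#}");
        }
        tokio::time::sleep(std::time::Duration::from_secs(20)).await;
    }
}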
- pub async fn compaction_iteration(&self) -> anyhow::Result<()> { + pub async fn compaction_iteration(&self, ctx: &RequestContext) -> anyhow::Result<()> { anyhow::ensure!( self.is_active(), "Cannot run compaction iteration on inactive tenant" @@ -1265,7 +1275,7 @@ impl Tenant { for (timeline_id, timeline) in &timelines_to_compact { timeline - .compact() + .compact(ctx) .instrument(info_span!("compact_timeline", timeline = %timeline_id)) .await?; } @@ -1298,7 +1308,11 @@ impl Tenant { } /// Removes timeline-related in-memory data - pub async fn delete_timeline(&self, timeline_id: TimelineId) -> anyhow::Result<()> { + pub async fn delete_timeline( + &self, + timeline_id: TimelineId, + _ctx: &RequestContext, + ) -> anyhow::Result<()> { // Transition the timeline into TimelineState::Stopping. // This should prevent new operations from starting. let timeline = { @@ -1462,8 +1476,7 @@ impl Tenant { tasks::start_background_loops(self.tenant_id); for timeline in not_broken_timelines { - timeline.set_state(TimelineState::Active); - timeline.launch_wal_receiver(); + timeline.activate(); } } } @@ -1487,7 +1500,7 @@ impl Tenant { .values() .filter(|timeline| timeline.current_state() != TimelineState::Broken); for timeline in not_broken_timelines { - timeline.set_state(TimelineState::Suspended); + timeline.set_state(TimelineState::Stopping); } } TenantState::Broken => { @@ -1717,7 +1730,33 @@ impl Tenant { tenant_id: TenantId, remote_storage: Option, ) -> Tenant { - let (state, _) = watch::channel(state); + let (state, mut rx) = watch::channel(state); + + tokio::spawn(async move { + let current_state = *rx.borrow_and_update(); + let tid = tenant_id.to_string(); + TENANT_STATE_METRIC + .with_label_values(&[&tid, current_state.as_str()]) + .inc(); + loop { + match rx.changed().await { + Ok(()) => { + let new_state = *rx.borrow(); + TENANT_STATE_METRIC + .with_label_values(&[&tid, current_state.as_str()]) + .dec(); + TENANT_STATE_METRIC + .with_label_values(&[&tid, new_state.as_str()]) + .inc(); + } + Err(_sender_dropped_error) => { + info!("Tenant dropped the state updates sender, quitting waiting for tenant state change"); + return; + } + } + } + }); + Tenant { tenant_id, conf, @@ -1776,69 +1815,70 @@ impl Tenant { } pub(super) fn persist_tenant_config( + tenant_id: &TenantId, target_config_path: &Path, tenant_conf: TenantConfOpt, - first_save: bool, + creating_tenant: bool, ) -> anyhow::Result<()> { let _enter = info_span!("saving tenantconf").entered(); - info!("persisting tenantconf to {}", target_config_path.display()); - // TODO this will prepend comments endlessly ? - let mut conf_content = r#"# This file contains a specific per-tenant's config. -# It is read in case of pageserver restart. - -[tenant_config] -"# - .to_string(); - - // Convert the config to a toml file. 
- conf_content += &toml_edit::easy::to_string(&tenant_conf)?; - - let mut target_config_file = VirtualFile::open_with_options( - target_config_path, - OpenOptions::new() - .truncate(true) // This needed for overwriting with small config files - .write(true) - .create_new(first_save), - )?; - - target_config_file - .write(conf_content.as_bytes()) - .context("Failed to write toml bytes into file") - .and_then(|_| { - target_config_file - .sync_all() - .context("Faile to fsync config file") - }) - .with_context(|| { + // imitate a try-block with a closure + let do_persist = |target_config_path: &Path| -> anyhow::Result<()> { + let target_config_parent = target_config_path.parent().with_context(|| { format!( - "Failed to write config file into path '{}'", + "Config path does not have a parent: {}", target_config_path.display() ) })?; - // fsync the parent directory to ensure the directory entry is durable - if first_save { - target_config_path - .parent() - .context("Config file does not have a parent") - .and_then(|target_config_parent| { - File::open(target_config_parent).context("Failed to open config parent") - }) - .and_then(|tenant_dir| { - tenant_dir - .sync_all() - .context("Failed to fsync config parent") - }) - .with_context(|| { - format!( - "Failed to fsync on first save for config {}", - target_config_path.display() - ) - })?; - } + info!("persisting tenantconf to {}", target_config_path.display()); - Ok(()) + let mut conf_content = r#"# This file contains a specific per-tenant's config. +# It is read in case of pageserver restart. + +[tenant_config] +"# + .to_string(); + + // Convert the config to a toml file. + conf_content += &toml_edit::easy::to_string(&tenant_conf)?; + + let mut target_config_file = VirtualFile::open_with_options( + target_config_path, + OpenOptions::new() + .truncate(true) // This needed for overwriting with small config files + .write(true) + .create_new(creating_tenant) + // when creating a new tenant, first_save will be true and `.create(true)` will be + // ignored (per rust std docs). + // + // later when updating the config of created tenant, or persisting config for the + // first time for attached tenant, the `.create(true)` is used. + .create(true), + )?; + + target_config_file + .write(conf_content.as_bytes()) + .context("write toml bytes into file") + .and_then(|_| target_config_file.sync_all().context("fsync config file")) + .context("write config file")?; + + // fsync the parent directory to ensure the directory entry is durable. + // before this was done conditionally on creating_tenant, but these management actions are rare + // enough to just fsync it always. + + crashsafe::fsync(target_config_parent)?; + Ok(()) + }; + + // this function is called from creating the tenant and updating the tenant config, which + // would otherwise share this context, so keep it here in one place. 
+ do_persist(target_config_path).with_context(|| { + format!( + "write tenant {tenant_id} config to {}", + target_config_path.display() + ) + }) } // @@ -1871,12 +1911,13 @@ impl Tenant { target_timeline_id: Option, horizon: u64, pitr: Duration, + ctx: &RequestContext, ) -> anyhow::Result { let mut totals: GcResult = Default::default(); let now = Instant::now(); let gc_timelines = self - .refresh_gc_info_internal(target_timeline_id, horizon, pitr) + .refresh_gc_info_internal(target_timeline_id, horizon, pitr, ctx) .await?; utils::failpoint_sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines"); @@ -1917,7 +1958,10 @@ impl Tenant { /// [`Tenant::get_gc_horizon`]. /// /// This is usually executed as part of periodic gc, but can now be triggered more often. - pub async fn refresh_gc_info(&self) -> anyhow::Result>> { + pub async fn refresh_gc_info( + &self, + ctx: &RequestContext, + ) -> anyhow::Result>> { // since this method can now be called at different rates than the configured gc loop, it // might be that these configuration values get applied faster than what it was previously, // since these were only read from the gc task. @@ -1927,7 +1971,7 @@ impl Tenant { // refresh all timelines let target_timeline_id = None; - self.refresh_gc_info_internal(target_timeline_id, horizon, pitr) + self.refresh_gc_info_internal(target_timeline_id, horizon, pitr, ctx) .await } @@ -1936,6 +1980,7 @@ impl Tenant { target_timeline_id: Option, horizon: u64, pitr: Duration, + ctx: &RequestContext, ) -> anyhow::Result>> { // grab mutex to prevent new timelines from being created here. let gc_cs = self.gc_cs.lock().await; @@ -2007,7 +2052,9 @@ impl Tenant { )) .map(|&x| x.1) .collect(); - timeline.update_gc_info(branchpoints, cutoff, pitr).await?; + timeline + .update_gc_info(branchpoints, cutoff, pitr, ctx) + .await?; gc_timelines.push(timeline); } @@ -2019,53 +2066,53 @@ impl Tenant { /// Branch an existing timeline async fn branch_timeline( &self, - src: TimelineId, - dst: TimelineId, + src_timeline: &Arc, + dst_id: TimelineId, start_lsn: Option, + _ctx: &RequestContext, ) -> anyhow::Result> { - // We need to hold this lock to prevent GC from starting at the same time. GC scans the directory to learn - // about timelines, so otherwise a race condition is possible, where we create new timeline and GC - // concurrently removes data that is needed by the new timeline. - let _gc_cs = self.gc_cs.lock().await; - let timeline_uninit_mark = { - let timelines = self.timelines.lock().unwrap(); - self.create_timeline_uninit_mark(dst, &timelines)? - }; - - // In order for the branch creation task to not wait for GC/compaction, - // we need to make sure that the starting LSN of the child branch is not out of scope midway by - // - // 1. holding the GC lock to prevent overwritting timeline's GC data - // 2. checking both the latest GC cutoff LSN and latest GC info of the source timeline - // - // Step 2 is to avoid initializing the new branch using data removed by past GC iterations - // or in-queue GC iterations. 
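// Most of the signature changes above are mechanical plumbing: a RequestContext
// created at a task's entry point is passed by reference down every call that
// may read timeline data. A sketch of the pattern from a background loop's
// point of view; the TaskKind::Compaction / DownloadBehavior::Download variants
// are assumptions for illustration (only UnitTest / Error appear verbatim in
// this patch).
async fn compaction_tick(tenant: &Tenant) -> anyhow::Result<()> {
    let ctx = RequestContext::new(TaskKind::Compaction, DownloadBehavior::Download);
    // The same &ctx flows through compaction, GC refresh, size calculation, ...
    tenant.compaction_iteration(&ctx).await
}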
- - let src_timeline = self.get_timeline(src, false).with_context(|| { - format!( - "No ancestor {} found for timeline {}/{}", - src, self.tenant_id, dst - ) - })?; - - let latest_gc_cutoff_lsn = src_timeline.get_latest_gc_cutoff_lsn(); + let src_id = src_timeline.timeline_id; // If no start LSN is specified, we branch the new timeline from the source timeline's last record LSN let start_lsn = start_lsn.unwrap_or_else(|| { let lsn = src_timeline.get_last_record_lsn(); - info!("branching timeline {dst} from timeline {src} at last record LSN: {lsn}"); + info!("branching timeline {dst_id} from timeline {src_id} at last record LSN: {lsn}"); lsn }); - // Check if the starting LSN is out of scope because it is less than - // 1. the latest GC cutoff LSN or - // 2. the planned GC cutoff LSN, which is from an in-queue GC iteration. + // First acquire the GC lock so that another task cannot advance the GC + // cutoff in 'gc_info', and make 'start_lsn' invalid, while we are + // creating the branch. + let _gc_cs = self.gc_cs.lock().await; + + // Create a placeholder for the new branch. This will error + // out if the new timeline ID is already in use. + let timeline_uninit_mark = { + let timelines = self.timelines.lock().unwrap(); + self.create_timeline_uninit_mark(dst_id, &timelines)? + }; + + // Ensure that `start_lsn` is valid, i.e. the LSN is within the PITR + // horizon on the source timeline + // + // We check it against both the planned GC cutoff stored in 'gc_info', + // and the 'latest_gc_cutoff' of the last GC that was performed. The + // planned GC cutoff in 'gc_info' is normally larger than + // 'latest_gc_cutoff_lsn', but beware of corner cases like if you just + // changed the GC settings for the tenant to make the PITR window + // larger, but some of the data was already removed by an earlier GC + // iteration. + + // check against last actual 'latest_gc_cutoff' first + let latest_gc_cutoff_lsn = src_timeline.get_latest_gc_cutoff_lsn(); src_timeline .check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn) .context(format!( "invalid branch start lsn: less than latest GC cutoff {}", *latest_gc_cutoff_lsn, ))?; + + // and then the planned GC cutoff { let gc_info = src_timeline.gc_info.read().unwrap(); let cutoff = min(gc_info.pitr_cutoff, gc_info.horizon_cutoff); @@ -2076,6 +2123,12 @@ impl Tenant { } } + // + // The branch point is valid, and we are still holding the 'gc_cs' lock + // so that GC cannot advance the GC cutoff until we are finished. + // Proceed with the branch creation. + // + // Determine prev-LSN for the new timeline. We can only determine it if // the timeline was branched at the current end of the source timeline. let RecordLsn { @@ -2094,7 +2147,7 @@ impl Tenant { let metadata = TimelineMetadata::new( start_lsn, dst_prev, - Some(src), + Some(src_id), start_lsn, *src_timeline.latest_gc_cutoff_lsn.read(), // FIXME: should we hold onto this guard longer? src_timeline.initdb_lsn, @@ -2103,15 +2156,15 @@ impl Tenant { let mut timelines = self.timelines.lock().unwrap(); let new_timeline = self .prepare_timeline( - dst, + dst_id, metadata, timeline_uninit_mark, false, - Some(src_timeline), + Some(Arc::clone(src_timeline)), )? 
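// The reordered checks above reduce to: take the GC lock, then require the
// requested branch point to be at or above both the last applied GC cutoff and
// the planned cutoff (the min of the PITR and horizon cutoffs in gc_info).
// A hedged sketch of that rule with plain u64 LSNs; the field names mirror the
// diff, the types are simplified:
fn validate_branch_point(
    start_lsn: u64,
    latest_gc_cutoff: u64,
    pitr_cutoff: u64,
    horizon_cutoff: u64,
) -> anyhow::Result<()> {
    anyhow::ensure!(
        start_lsn >= latest_gc_cutoff,
        "invalid branch start lsn: less than latest GC cutoff {latest_gc_cutoff}"
    );

    // The planned cutoff is normally ahead of the applied one, but not always,
    // e.g. right after the tenant's PITR window was enlarged.
    let planned_cutoff = std::cmp::min(pitr_cutoff, horizon_cutoff);
    anyhow::ensure!(
        start_lsn >= planned_cutoff,
        "invalid branch start lsn: less than planned GC cutoff {planned_cutoff}"
    );
    Ok(())
}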
.initialize_with_lock(&mut timelines, true, true)?; drop(timelines); - info!("branched timeline {dst} from {src} at {start_lsn}"); + info!("branched timeline {dst_id} from {src_id} at {start_lsn}"); Ok(new_timeline) } @@ -2122,6 +2175,7 @@ impl Tenant { &self, timeline_id: TimelineId, pg_version: u32, + ctx: &RequestContext, ) -> anyhow::Result> { let timeline_uninit_mark = { let timelines = self.timelines.lock().unwrap(); @@ -2181,6 +2235,7 @@ impl Tenant { unfinished_timeline, pgdata_path, pgdata_lsn, + ctx, ) .await .with_context(|| { @@ -2352,7 +2407,10 @@ impl Tenant { /// /// Future is cancellation safe. Only one calculation can be running at once per tenant. #[instrument(skip_all, fields(tenant_id=%self.tenant_id))] - pub async fn gather_size_inputs(&self) -> anyhow::Result { + pub async fn gather_size_inputs( + &self, + ctx: &RequestContext, + ) -> anyhow::Result { let logical_sizes_at_once = self .conf .concurrent_tenant_size_logical_size_queries @@ -2364,15 +2422,15 @@ impl Tenant { // See more for on the issue #2748 condenced out of the initial PR review. let mut shared_cache = self.cached_logical_sizes.lock().await; - size::gather_inputs(self, logical_sizes_at_once, &mut shared_cache).await + size::gather_inputs(self, logical_sizes_at_once, &mut shared_cache, ctx).await } /// Calculate synthetic tenant size /// This is periodically called by background worker. /// result is cached in tenant struct #[instrument(skip_all, fields(tenant_id=%self.tenant_id))] - pub async fn calculate_synthetic_size(&self) -> anyhow::Result { - let inputs = self.gather_size_inputs().await?; + pub async fn calculate_synthetic_size(&self, ctx: &RequestContext) -> anyhow::Result { + let inputs = self.gather_size_inputs(ctx).await?; let size = inputs.calculate()?; @@ -2475,26 +2533,19 @@ fn try_create_target_tenant_dir( target_tenant_directory, temporary_tenant_dir, ) - .with_context(|| format!("Failed to resolve tenant {tenant_id} temporary timelines dir"))?; + .with_context(|| format!("resolve tenant {tenant_id} temporary timelines dir"))?; let temporary_tenant_config_path = rebase_directory( &conf.tenant_config_path(tenant_id), target_tenant_directory, temporary_tenant_dir, ) - .with_context(|| format!("Failed to resolve tenant {tenant_id} temporary config path"))?; + .with_context(|| format!("resolve tenant {tenant_id} temporary config path"))?; + + Tenant::persist_tenant_config(&tenant_id, &temporary_tenant_config_path, tenant_conf, true)?; - Tenant::persist_tenant_config(&temporary_tenant_config_path, tenant_conf, true).with_context( - || { - format!( - "Failed to write tenant {} config to {}", - tenant_id, - temporary_tenant_config_path.display() - ) - }, - )?; crashsafe::create_dir(&temporary_tenant_timelines_dir).with_context(|| { format!( - "could not create tenant {} temporary timelines directory {}", + "create tenant {} temporary timelines directory {}", tenant_id, temporary_tenant_timelines_dir.display() ) @@ -2505,7 +2556,7 @@ fn try_create_target_tenant_dir( fs::rename(temporary_tenant_dir, target_tenant_directory).with_context(|| { format!( - "failed to move tenant {} temporary directory {} into the permanent one {}", + "move tenant {} temporary directory {} into the permanent one {}", tenant_id, temporary_tenant_dir.display(), target_tenant_directory.display() @@ -2513,14 +2564,14 @@ fn try_create_target_tenant_dir( })?; let target_dir_parent = target_tenant_directory.parent().with_context(|| { format!( - "Failed to get tenant {} dir parent for {}", + "get tenant {} dir parent for {}", 
tenant_id, target_tenant_directory.display() ) })?; crashsafe::fsync(target_dir_parent).with_context(|| { format!( - "Failed to fsync renamed directory's parent {} for tenant {}", + "fsync renamed directory's parent {} for tenant {}", target_dir_parent.display(), tenant_id, ) @@ -2743,11 +2794,17 @@ pub mod harness { }) } - pub async fn load(&self) -> Arc { - self.try_load().await.expect("failed to load test tenant") + pub async fn load(&self) -> (Arc, RequestContext) { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + ( + self.try_load(&ctx) + .await + .expect("failed to load test tenant"), + ctx, + ) } - pub async fn try_load(&self) -> anyhow::Result> { + pub async fn try_load(&self, ctx: &RequestContext) -> anyhow::Result> { let walredo_mgr = Arc::new(TestRedoManager); let tenant = Arc::new(Tenant::new( @@ -2775,8 +2832,7 @@ pub mod harness { timelines_to_load.insert(timeline_id, timeline_metadata); } // FIXME starts background jobs - tenant.load().await?; - + tenant.load(ctx).await?; Ok(tenant) } @@ -2833,10 +2889,9 @@ mod tests { #[tokio::test] async fn test_basic() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_basic")?.load().await; - let tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? - .initialize()?; + let (tenant, ctx) = TenantHarness::create("test_basic")?.load().await; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let tline = tline.initialize(&ctx)?; let writer = tline.writer(); writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?; @@ -2849,15 +2904,15 @@ mod tests { drop(writer); assert_eq!( - tline.get(*TEST_KEY, Lsn(0x10)).await?, + tline.get(*TEST_KEY, Lsn(0x10), &ctx).await?, TEST_IMG("foo at 0x10") ); assert_eq!( - tline.get(*TEST_KEY, Lsn(0x1f)).await?, + tline.get(*TEST_KEY, Lsn(0x1f), &ctx).await?, TEST_IMG("foo at 0x10") ); assert_eq!( - tline.get(*TEST_KEY, Lsn(0x20)).await?, + tline.get(*TEST_KEY, Lsn(0x20), &ctx).await?, TEST_IMG("foo at 0x20") ); @@ -2866,14 +2921,14 @@ mod tests { #[tokio::test] async fn no_duplicate_timelines() -> anyhow::Result<()> { - let tenant = TenantHarness::create("no_duplicate_timelines")? + let (tenant, ctx) = TenantHarness::create("no_duplicate_timelines")? .load() .await; - let _ = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? - .initialize()?; + let timeline = + tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let _ = timeline.initialize(&ctx)?; - match tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION) { + match tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx) { Ok(_) => panic!("duplicate timeline creation should fail"), Err(e) => assert_eq!( e.to_string(), @@ -2899,13 +2954,13 @@ mod tests { /// #[tokio::test] async fn test_branch() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_branch")?.load().await; - let tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? 
- .initialize()?; - let writer = tline.writer(); use std::str::from_utf8; + let (tenant, ctx) = TenantHarness::create("test_branch")?.load().await; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let tline = tline.initialize(&ctx)?; + let writer = tline.writer(); + #[allow(non_snake_case)] let TEST_KEY_A: Key = Key::from_hex("112222222233333333444444445500000001").unwrap(); #[allow(non_snake_case)] @@ -2925,7 +2980,7 @@ mod tests { // Branch the history, modify relation differently on the new timeline tenant - .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x30))) + .branch_timeline(&tline, NEW_TIMELINE_ID, Some(Lsn(0x30)), &ctx) .await?; let newtline = tenant .get_timeline(NEW_TIMELINE_ID, true) @@ -2936,15 +2991,15 @@ mod tests { // Check page contents on both branches assert_eq!( - from_utf8(&tline.get(TEST_KEY_A, Lsn(0x40)).await?)?, + from_utf8(&tline.get(TEST_KEY_A, Lsn(0x40), &ctx).await?)?, "foo at 0x40" ); assert_eq!( - from_utf8(&newtline.get(TEST_KEY_A, Lsn(0x40)).await?)?, + from_utf8(&newtline.get(TEST_KEY_A, Lsn(0x40), &ctx).await?)?, "bar at 0x40" ); assert_eq!( - from_utf8(&newtline.get(TEST_KEY_B, Lsn(0x40)).await?)?, + from_utf8(&newtline.get(TEST_KEY_B, Lsn(0x40), &ctx).await?)?, "foobar at 0x20" ); @@ -2996,13 +3051,12 @@ mod tests { #[tokio::test] async fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> { - let tenant = + let (tenant, ctx) = TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")? .load() .await; - let tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? - .initialize()?; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let tline = tline.initialize(&ctx)?; make_some_layers(tline.as_ref(), Lsn(0x20)).await?; // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 @@ -3010,12 +3064,12 @@ mod tests { // and compaction works. But it does set the 'cutoff' point so that the cross check // below should fail. tenant - .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO) + .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, &ctx) .await?; // try to branch at lsn 25, should fail because we already garbage collected the data match tenant - .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) + .branch_timeline(&tline, NEW_TIMELINE_ID, Some(Lsn(0x25)), &ctx) .await { Ok(_) => panic!("branching should have failed"), @@ -3034,16 +3088,17 @@ mod tests { #[tokio::test] async fn test_prohibit_branch_creation_on_pre_initdb_lsn() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")? - .load() - .await; + let (tenant, ctx) = + TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")? + .load() + .await; - tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0x50), DEFAULT_PG_VERSION)? - .initialize()?; + let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0x50), DEFAULT_PG_VERSION, &ctx)? 
+ .initialize(&ctx)?; // try to branch at lsn 0x25, should fail because initdb lsn is 0x50 match tenant - .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) + .branch_timeline(&tline, NEW_TIMELINE_ID, Some(Lsn(0x25)), &ctx) .await { Ok(_) => panic!("branching should have failed"), @@ -3085,40 +3140,40 @@ mod tests { #[tokio::test] async fn test_retain_data_in_parent_which_is_needed_for_child() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")? - .load() - .await; - let tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? - .initialize()?; + let (tenant, ctx) = + TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")? + .load() + .await; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let tline = tline.initialize(&ctx)?; make_some_layers(tline.as_ref(), Lsn(0x20)).await?; tenant - .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40))) + .branch_timeline(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx) .await?; let newtline = tenant .get_timeline(NEW_TIMELINE_ID, true) .expect("Should have a local timeline"); // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 tenant - .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO) + .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, &ctx) .await?; - assert!(newtline.get(*TEST_KEY, Lsn(0x25)).await.is_ok()); + assert!(newtline.get(*TEST_KEY, Lsn(0x25), &ctx).await.is_ok()); Ok(()) } #[tokio::test] async fn test_parent_keeps_data_forever_after_branching() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_parent_keeps_data_forever_after_branching")? - .load() - .await; - let tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? - .initialize()?; + let (tenant, ctx) = + TenantHarness::create("test_parent_keeps_data_forever_after_branching")? + .load() + .await; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let tline = tline.initialize(&ctx)?; make_some_layers(tline.as_ref(), Lsn(0x20)).await?; tenant - .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40))) + .branch_timeline(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx) .await?; let newtline = tenant .get_timeline(NEW_TIMELINE_ID, true) @@ -3128,12 +3183,12 @@ mod tests { // run gc on parent tenant - .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO) + .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, &ctx) .await?; // Check that the data is still accessible on the branch. assert_eq!( - newtline.get(*TEST_KEY, Lsn(0x50)).await?, + newtline.get(*TEST_KEY, Lsn(0x50), &ctx).await?, TEST_IMG(&format!("foo at {}", Lsn(0x40))) ); @@ -3145,14 +3200,14 @@ mod tests { const TEST_NAME: &str = "timeline_load"; let harness = TenantHarness::create(TEST_NAME)?; { - let tenant = harness.load().await; - let tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0x8000), DEFAULT_PG_VERSION)? 
- .initialize()?; + let (tenant, ctx) = harness.load().await; + let tline = + tenant.create_empty_timeline(TIMELINE_ID, Lsn(0x8000), DEFAULT_PG_VERSION, &ctx)?; + let tline = tline.initialize(&ctx)?; make_some_layers(tline.as_ref(), Lsn(0x8000)).await?; } - let tenant = harness.load().await; + let (tenant, _ctx) = harness.load().await; tenant .get_timeline(TIMELINE_ID, true) .expect("cannot load timeline"); @@ -3166,15 +3221,15 @@ mod tests { let harness = TenantHarness::create(TEST_NAME)?; // create two timelines { - let tenant = harness.load().await; - let tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? - .initialize()?; + let (tenant, ctx) = harness.load().await; + let tline = + tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let tline = tline.initialize(&ctx)?; make_some_layers(tline.as_ref(), Lsn(0x20)).await?; tenant - .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40))) + .branch_timeline(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx) .await?; let newtline = tenant @@ -3185,7 +3240,7 @@ mod tests { } // check that both of them are initially unloaded - let tenant = harness.load().await; + let (tenant, _ctx) = harness.load().await; // check that both, child and ancestor are loaded let _child_tline = tenant @@ -3203,11 +3258,11 @@ mod tests { async fn corrupt_metadata() -> anyhow::Result<()> { const TEST_NAME: &str = "corrupt_metadata"; let harness = TenantHarness::create(TEST_NAME)?; - let tenant = harness.load().await; + let (tenant, ctx) = harness.load().await; tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? - .initialize()?; + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)? + .initialize(&ctx)?; drop(tenant); let metadata_path = harness.timeline_path(&TIMELINE_ID).join(METADATA_FILE_NAME); @@ -3219,7 +3274,7 @@ mod tests { metadata_bytes[8] ^= 1; std::fs::write(metadata_path, metadata_bytes)?; - let err = harness.try_load().await.err().expect("should fail"); + let err = harness.try_load(&ctx).await.err().expect("should fail"); assert!(err .to_string() .starts_with("Failed to parse metadata bytes from path")); @@ -3243,10 +3298,9 @@ mod tests { #[tokio::test] async fn test_images() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_images")?.load().await; - let tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? 
- .initialize()?; + let (tenant, ctx) = TenantHarness::create("test_images")?.load().await; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let tline = tline.initialize(&ctx)?; let writer = tline.writer(); writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?; @@ -3254,7 +3308,7 @@ mod tests { drop(writer); tline.freeze_and_flush().await?; - tline.compact().await?; + tline.compact(&ctx).await?; let writer = tline.writer(); writer.put(*TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20")))?; @@ -3262,7 +3316,7 @@ mod tests { drop(writer); tline.freeze_and_flush().await?; - tline.compact().await?; + tline.compact(&ctx).await?; let writer = tline.writer(); writer.put(*TEST_KEY, Lsn(0x30), &Value::Image(TEST_IMG("foo at 0x30")))?; @@ -3270,7 +3324,7 @@ mod tests { drop(writer); tline.freeze_and_flush().await?; - tline.compact().await?; + tline.compact(&ctx).await?; let writer = tline.writer(); writer.put(*TEST_KEY, Lsn(0x40), &Value::Image(TEST_IMG("foo at 0x40")))?; @@ -3278,26 +3332,26 @@ mod tests { drop(writer); tline.freeze_and_flush().await?; - tline.compact().await?; + tline.compact(&ctx).await?; assert_eq!( - tline.get(*TEST_KEY, Lsn(0x10)).await?, + tline.get(*TEST_KEY, Lsn(0x10), &ctx).await?, TEST_IMG("foo at 0x10") ); assert_eq!( - tline.get(*TEST_KEY, Lsn(0x1f)).await?, + tline.get(*TEST_KEY, Lsn(0x1f), &ctx).await?, TEST_IMG("foo at 0x10") ); assert_eq!( - tline.get(*TEST_KEY, Lsn(0x20)).await?, + tline.get(*TEST_KEY, Lsn(0x20), &ctx).await?, TEST_IMG("foo at 0x20") ); assert_eq!( - tline.get(*TEST_KEY, Lsn(0x30)).await?, + tline.get(*TEST_KEY, Lsn(0x30), &ctx).await?, TEST_IMG("foo at 0x30") ); assert_eq!( - tline.get(*TEST_KEY, Lsn(0x40)).await?, + tline.get(*TEST_KEY, Lsn(0x40), &ctx).await?, TEST_IMG("foo at 0x40") ); @@ -3310,10 +3364,9 @@ mod tests { // #[tokio::test] async fn test_bulk_insert() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_bulk_insert")?.load().await; - let tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? - .initialize()?; + let (tenant, ctx) = TenantHarness::create("test_bulk_insert")?.load().await; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let tline = tline.initialize(&ctx)?; let mut lsn = Lsn(0x10); @@ -3342,10 +3395,10 @@ mod tests { let cutoff = tline.get_last_record_lsn(); tline - .update_gc_info(Vec::new(), cutoff, Duration::ZERO) + .update_gc_info(Vec::new(), cutoff, Duration::ZERO, &ctx) .await?; tline.freeze_and_flush().await?; - tline.compact().await?; + tline.compact(&ctx).await?; tline.gc().await?; } @@ -3354,10 +3407,9 @@ mod tests { #[tokio::test] async fn test_random_updates() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_random_updates")?.load().await; - let tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? 
- .initialize()?; + let (tenant, ctx) = TenantHarness::create("test_random_updates")?.load().await; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let tline = tline.initialize(&ctx)?; const NUM_KEYS: usize = 1000; @@ -3407,7 +3459,7 @@ mod tests { for (blknum, last_lsn) in updated.iter().enumerate() { test_key.field6 = blknum as u32; assert_eq!( - tline.get(test_key, lsn).await?, + tline.get(test_key, lsn, &ctx).await?, TEST_IMG(&format!("{} at {}", blknum, last_lsn)) ); } @@ -3415,10 +3467,10 @@ mod tests { // Perform a cycle of flush, compact, and GC let cutoff = tline.get_last_record_lsn(); tline - .update_gc_info(Vec::new(), cutoff, Duration::ZERO) + .update_gc_info(Vec::new(), cutoff, Duration::ZERO, &ctx) .await?; tline.freeze_and_flush().await?; - tline.compact().await?; + tline.compact(&ctx).await?; tline.gc().await?; } @@ -3427,12 +3479,12 @@ mod tests { #[tokio::test] async fn test_traverse_branches() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_traverse_branches")? + let (tenant, ctx) = TenantHarness::create("test_traverse_branches")? .load() .await; let mut tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? - .initialize()?; + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)? + .initialize(&ctx)?; const NUM_KEYS: usize = 1000; @@ -3462,16 +3514,14 @@ mod tests { keyspace.add_key(test_key); } - let mut tline_id = TIMELINE_ID; for _ in 0..50 { let new_tline_id = TimelineId::generate(); tenant - .branch_timeline(tline_id, new_tline_id, Some(lsn)) + .branch_timeline(&tline, new_tline_id, Some(lsn), &ctx) .await?; tline = tenant .get_timeline(new_tline_id, true) .expect("Should have the branched timeline"); - tline_id = new_tline_id; for _ in 0..NUM_KEYS { lsn = Lsn(lsn.0 + 0x10); @@ -3493,7 +3543,7 @@ mod tests { for (blknum, last_lsn) in updated.iter().enumerate() { test_key.field6 = blknum as u32; assert_eq!( - tline.get(test_key, lsn).await?, + tline.get(test_key, lsn, &ctx).await?, TEST_IMG(&format!("{} at {}", blknum, last_lsn)) ); } @@ -3501,10 +3551,10 @@ mod tests { // Perform a cycle of flush, compact, and GC let cutoff = tline.get_last_record_lsn(); tline - .update_gc_info(Vec::new(), cutoff, Duration::ZERO) + .update_gc_info(Vec::new(), cutoff, Duration::ZERO, &ctx) .await?; tline.freeze_and_flush().await?; - tline.compact().await?; + tline.compact(&ctx).await?; tline.gc().await?; } @@ -3513,12 +3563,12 @@ mod tests { #[tokio::test] async fn test_traverse_ancestors() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_traverse_ancestors")? + let (tenant, ctx) = TenantHarness::create("test_traverse_ancestors")? .load() .await; let mut tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? - .initialize()?; + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)? 
+ .initialize(&ctx)?; const NUM_KEYS: usize = 100; const NUM_TLINES: usize = 50; @@ -3528,18 +3578,16 @@ mod tests { let mut updated = [[Lsn(0); NUM_KEYS]; NUM_TLINES]; let mut lsn = Lsn(0); - let mut tline_id = TIMELINE_ID; #[allow(clippy::needless_range_loop)] for idx in 0..NUM_TLINES { let new_tline_id = TimelineId::generate(); tenant - .branch_timeline(tline_id, new_tline_id, Some(lsn)) + .branch_timeline(&tline, new_tline_id, Some(lsn), &ctx) .await?; tline = tenant .get_timeline(new_tline_id, true) .expect("Should have the branched timeline"); - tline_id = new_tline_id; for _ in 0..NUM_KEYS { lsn = Lsn(lsn.0 + 0x10); @@ -3568,7 +3616,7 @@ mod tests { println!("checking [{idx}][{blknum}] at {lsn}"); test_key.field6 = blknum as u32; assert_eq!( - tline.get(test_key, *lsn).await?, + tline.get(test_key, *lsn, &ctx).await?, TEST_IMG(&format!("{idx} {blknum} at {lsn}")) ); } diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index c95a98fbc7..e66ee0ae36 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -28,7 +28,12 @@ pub mod defaults { pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10; pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024; - pub const DEFAULT_GC_PERIOD: &str = "100 s"; + + // Large DEFAULT_GC_PERIOD is fine as long as PITR_INTERVAL is larger. + // If there's a need to decrease this value, first make sure that GC + // doesn't hold a layer map write lock for non-trivial operations. + // Relevant: https://github.com/neondatabase/neon/issues/3394 + pub const DEFAULT_GC_PERIOD: &str = "1 hr"; pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3; pub const DEFAULT_PITR_INTERVAL: &str = "7 days"; pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "2 seconds"; diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 01c5359e88..ed1a32c8fd 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -9,24 +9,57 @@ //! are frozen, and it is split up into new image and delta layers and the //! corresponding files are written to disk. //! +//! Design overview: +//! +//! The `search` method of the layer map is on the read critical path, so we've +//! built an efficient data structure for fast reads, stored in `LayerMap::historic`. +//! Other read methods are less critical but still impact performance of background tasks. +//! +//! This data structure relies on a persistent/immutable binary search tree. See the +//! following lecture for an introduction https://www.youtube.com/watch?v=WqCWghETNDc&t=581s +//! Summary: A persistent/immutable BST (and persistent data structures in general) allows +//! you to modify the tree in such a way that each modification creates a new "version" +//! of the tree. When you modify it, you get a new version, but all previous versions are +//! still accessible too. So if someone is still holding a reference to an older version, +//! they continue to see the tree as it was then. The persistent BST stores all the +//! different versions in an efficient way. +//! +//! Our persistent BST maintains a map of which layer file "covers" each key. It has only +//! one dimension, the key. See `layer_coverage.rs`. We use the persistent/immutable property +//! to handle the LSN dimension. +//! +//! To build the layer map, we insert each layer to the persistent BST in LSN.start order, +//! starting from the oldest one. After each insertion, we grab a reference to that "version" +//! 
of the tree, and store it in another tree, a BtreeMap keyed by the LSN. See +//! `historic_layer_coverage.rs`. +//! +//! To search for a particular key-LSN pair, you first look up the right "version" in the +//! BTreeMap. Then you search that version of the BST with the key. +//! +//! The persistent BST keeps all the versions, but there is no way to change the old versions +//! afterwards. We can add layers as long as they have larger LSNs than any previous layer in +//! the map, but if we need to remove a layer, or insert anything with an older LSN, we need +//! to throw away most of the persistent BST and build a new one, starting from the oldest +//! LSN. See `LayerMap::flush_updates()`. +//! +mod historic_layer_coverage; +mod layer_coverage; + +use crate::keyspace::KeyPartitioning; use crate::metrics::NUM_ONDISK_LAYERS; use crate::repository::Key; -use crate::tenant::storage_layer::{range_eq, range_overlaps}; -use amplify_num::i256; +use crate::tenant::storage_layer::InMemoryLayer; +use crate::tenant::storage_layer::Layer; use anyhow::Result; -use num_traits::identities::{One, Zero}; -use num_traits::{Bounded, Num, Signed}; -use rstar::{RTree, RTreeObject, AABB}; -use std::cmp::Ordering; use std::collections::VecDeque; use std::ops::Range; -use std::ops::{Add, Div, Mul, Neg, Rem, Sub}; use std::sync::Arc; -use tracing::*; use utils::lsn::Lsn; -use super::storage_layer::{InMemoryLayer, Layer}; +use historic_layer_coverage::BufferedHistoricLayerCoverage; + +use super::storage_layer::range_eq; /// /// LayerMap tracks what layers exist on a timeline. @@ -51,8 +84,8 @@ pub struct LayerMap { /// pub frozen_layers: VecDeque>, - /// All the historic layers are kept here - historic_layers: RTree>, + /// Index of the historic layers optimized for search + historic: BufferedHistoricLayerCoverage>, /// L0 layers have key range Key::MIN..Key::MAX, and locating them using R-Tree search is very inefficient. /// So L0 layers are held in l0_delta_layers vector, in addition to the R-tree. @@ -65,177 +98,64 @@ impl Default for LayerMap { open_layer: None, next_open_layer_at: None, frozen_layers: VecDeque::default(), - historic_layers: RTree::default(), l0_delta_layers: Vec::default(), + historic: BufferedHistoricLayerCoverage::default(), } } } -struct LayerRTreeObject { - layer: Arc, - - envelope: AABB<[IntKey; 2]>, +/// The primary update API for the layer map. +/// +/// Batching historic layer insertions and removals is good for +/// performance and this struct helps us do that correctly. +#[must_use] +pub struct BatchedUpdates<'a, L: ?Sized + Layer> { + // While we hold this exclusive reference to the layer map the type checker + // will prevent us from accidentally reading any unflushed updates. + layer_map: &'a mut LayerMap, } -// Representation of Key as numeric type. -// We can not use native implementation of i128, because rstar::RTree -// doesn't handle properly integer overflow during area calculation: sum(Xi*Yi). -// Overflow will cause panic in debug mode and incorrect area calculation in release mode, -// which leads to non-optimally balanced R-Tree (but doesn't fit correctness of R-Tree work). -// By using i256 as the type, even though all the actual values would fit in i128, we can be -// sure that multiplication doesn't overflow. 
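// A toy model of the versioning scheme described in the module docs above:
// layers are inserted in lsn.start order, and after each insertion we remember
// a snapshot of the key coverage, keyed by that LSN. A lookup first picks the
// newest snapshot at or below the requested LSN, then queries it by key.
// Here a snapshot is a plain BTreeMap clone of "coverage change points"; the
// real code uses a persistent tree so taking and keeping snapshots is cheap.
use std::collections::BTreeMap;
use std::sync::Arc;

#[derive(Default)]
struct ToyHistoricCoverage {
    // lsn.start -> coverage snapshot taken right after that insertion
    versions: BTreeMap<u64, Arc<BTreeMap<i128, String>>>,
    head: BTreeMap<i128, String>,
}

impl ToyHistoricCoverage {
    fn insert(&mut self, key_start: i128, lsn_start: u64, layer: String) {
        assert!(
            self.versions.keys().next_back().map_or(true, |l| *l <= lsn_start),
            "no retroactive inserts: layers must arrive in lsn.start order"
        );
        // The value at a change point stands for "the layer covering keys from
        // here up to the next change point".
        self.head.insert(key_start, layer);
        self.versions.insert(lsn_start, Arc::new(self.head.clone()));
    }

    fn query(&self, key: i128, lsn: u64) -> Option<&String> {
        // Newest version at or below `lsn`, then the change point at or below `key`.
        let (_, version) = self.versions.range(..=lsn).next_back()?;
        version.range(..=key).next_back().map(|(_, layer)| layer)
    }
}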
-// - -#[derive(Clone, PartialEq, Eq, PartialOrd, Debug)] -struct IntKey(i256); - -impl Copy for IntKey {} - -impl IntKey { - fn from(i: i128) -> Self { - IntKey(i256::from(i)) - } -} - -impl Bounded for IntKey { - fn min_value() -> Self { - IntKey(i256::MIN) - } - fn max_value() -> Self { - IntKey(i256::MAX) - } -} - -impl Signed for IntKey { - fn is_positive(&self) -> bool { - self.0 > i256::ZERO - } - fn is_negative(&self) -> bool { - self.0 < i256::ZERO - } - fn signum(&self) -> Self { - match self.0.cmp(&i256::ZERO) { - Ordering::Greater => IntKey(i256::ONE), - Ordering::Less => IntKey(-i256::ONE), - Ordering::Equal => IntKey(i256::ZERO), - } - } - fn abs(&self) -> Self { - IntKey(self.0.abs()) - } - fn abs_sub(&self, other: &Self) -> Self { - if self.0 <= other.0 { - IntKey(i256::ZERO) - } else { - IntKey(self.0 - other.0) - } - } -} - -impl Neg for IntKey { - type Output = Self; - fn neg(self) -> Self::Output { - IntKey(-self.0) - } -} - -impl Rem for IntKey { - type Output = Self; - fn rem(self, rhs: Self) -> Self::Output { - IntKey(self.0 % rhs.0) - } -} - -impl Div for IntKey { - type Output = Self; - fn div(self, rhs: Self) -> Self::Output { - IntKey(self.0 / rhs.0) - } -} - -impl Add for IntKey { - type Output = Self; - fn add(self, rhs: Self) -> Self::Output { - IntKey(self.0 + rhs.0) - } -} - -impl Sub for IntKey { - type Output = Self; - fn sub(self, rhs: Self) -> Self::Output { - IntKey(self.0 - rhs.0) - } -} - -impl Mul for IntKey { - type Output = Self; - fn mul(self, rhs: Self) -> Self::Output { - IntKey(self.0 * rhs.0) - } -} - -impl One for IntKey { - fn one() -> Self { - IntKey(i256::ONE) - } -} - -impl Zero for IntKey { - fn zero() -> Self { - IntKey(i256::ZERO) - } - fn is_zero(&self) -> bool { - self.0 == i256::ZERO - } -} - -impl Num for IntKey { - type FromStrRadixErr = ::FromStrRadixErr; - fn from_str_radix(str: &str, radix: u32) -> Result { - Ok(IntKey(i256::from(i128::from_str_radix(str, radix)?))) - } -} - -impl PartialEq for LayerRTreeObject { - fn eq(&self, other: &Self) -> bool { - // FIXME: ptr_eq might fail to return true for 'dyn' - // references. Clippy complains about this. In practice it - // seems to work, the assertion below would be triggered - // otherwise but this ought to be fixed. - #[allow(clippy::vtable_address_comparisons)] - Arc::ptr_eq(&self.layer, &other.layer) - } -} - -impl RTreeObject for LayerRTreeObject -where - L: ?Sized, -{ - type Envelope = AABB<[IntKey; 2]>; - fn envelope(&self) -> Self::Envelope { - self.envelope - } -} - -impl LayerRTreeObject +/// Provide ability to batch more updates while hiding the read +/// API so we don't accidentally read without flushing. +impl BatchedUpdates<'_, L> where L: ?Sized + Layer, { - fn new(layer: Arc) -> Self { - let key_range = layer.get_key_range(); - let lsn_range = layer.get_lsn_range(); + /// + /// Insert an on-disk layer. + /// + pub fn insert_historic(&mut self, layer: Arc) { + self.layer_map.insert_historic_noflush(layer) + } - let envelope = AABB::from_corners( - [ - IntKey::from(key_range.start.to_i128()), - IntKey::from(lsn_range.start.0 as i128), - ], - [ - IntKey::from(key_range.end.to_i128() - 1), - IntKey::from(lsn_range.end.0 as i128 - 1), - ], // AABB::upper is inclusive, while `key_range.end` and `lsn_range.end` are exclusive - ); - LayerRTreeObject { layer, envelope } + /// + /// Remove an on-disk layer from the map. + /// + /// This should be called when the corresponding file on disk has been deleted. 
+ /// + pub fn remove_historic(&mut self, layer: Arc) { + self.layer_map.remove_historic_noflush(layer) + } + + // We will flush on drop anyway, but this method makes it + // more explicit that there is some work being done. + /// Apply all updates + pub fn flush(self) { + // Flush happens on drop + } +} + +// Ideally the flush() method should be called explicitly for more +// controlled execution. But if we forget we'd rather flush on drop +// than panic later or read without flushing. +// +// TODO maybe warn if flush hasn't explicitly been called +impl Drop for BatchedUpdates<'_, L> +where + L: ?Sized + Layer, +{ + fn drop(&mut self) { + self.layer_map.flush_updates(); } } @@ -281,125 +201,91 @@ where /// 'open' and 'frozen' layers! /// pub fn search(&self, key: Key, end_lsn: Lsn) -> Option> { - // Find the latest image layer that covers the given key - let mut latest_img: Option> = None; - let mut latest_img_lsn: Option = None; - let envelope = AABB::from_corners( - [IntKey::from(key.to_i128()), IntKey::from(0i128)], - [ - IntKey::from(key.to_i128()), - IntKey::from(end_lsn.0 as i128 - 1), - ], - ); - for e in self - .historic_layers - .locate_in_envelope_intersecting(&envelope) - { - let l = &e.layer; - if l.is_incremental() { - continue; - } - assert!(l.get_key_range().contains(&key)); - let img_lsn = l.get_lsn_range().start; - assert!(img_lsn < end_lsn); - if Lsn(img_lsn.0 + 1) == end_lsn { - // found exact match - return Some(SearchResult { - layer: Arc::clone(l), - lsn_floor: img_lsn, - }); - } - if img_lsn > latest_img_lsn.unwrap_or(Lsn(0)) { - latest_img = Some(Arc::clone(l)); - latest_img_lsn = Some(img_lsn); - } - } + let version = self.historic.get().unwrap().get_version(end_lsn.0 - 1)?; + let latest_delta = version.delta_coverage.query(key.to_i128()); + let latest_image = version.image_coverage.query(key.to_i128()); - // Search the delta layers - let mut latest_delta: Option> = None; - for e in self - .historic_layers - .locate_in_envelope_intersecting(&envelope) - { - let l = &e.layer; - if !l.is_incremental() { - continue; + match (latest_delta, latest_image) { + (None, None) => None, + (None, Some(image)) => { + let lsn_floor = image.get_lsn_range().start; + Some(SearchResult { + layer: image, + lsn_floor, + }) } - assert!(l.get_key_range().contains(&key)); - if l.get_lsn_range().start >= end_lsn { - info!( - "Candidate delta layer {}..{} is too new for lsn {}", - l.get_lsn_range().start, - l.get_lsn_range().end, - end_lsn - ); + (Some(delta), None) => { + let lsn_floor = delta.get_lsn_range().start; + Some(SearchResult { + layer: delta, + lsn_floor, + }) } - assert!(l.get_lsn_range().start < end_lsn); - if l.get_lsn_range().end >= end_lsn { - // this layer contains the requested point in the key/lsn space. - // No need to search any further - trace!( - "found layer {} for request on {key} at {end_lsn}", - l.short_id(), - ); - latest_delta.replace(Arc::clone(l)); - break; - } - if l.get_lsn_range().end > latest_img_lsn.unwrap_or(Lsn(0)) { - // this layer's end LSN is smaller than the requested point. If there's - // nothing newer, this is what we need to return. Remember this. 
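// Typical use of the batching API introduced above: open a batch, apply any
// number of insertions and removals, then flush once so the historic coverage
// is rebuilt a single time (flush also runs on drop, so a forgotten call costs
// performance, not correctness). Sketch only; `L` stands for whatever layer
// trait object the surrounding LayerMap is instantiated with.
fn swap_layer<L: ?Sized + Layer>(
    layer_map: &mut LayerMap<L>,
    old_layer: Arc<L>,
    new_layer: Arc<L>,
) {
    let mut updates = layer_map.batch_update();
    updates.remove_historic(old_layer);
    updates.insert_historic(new_layer);
    // One rebuild of the coverage index for the whole batch.
    updates.flush();
}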
- if let Some(old_candidate) = &latest_delta { - if l.get_lsn_range().end > old_candidate.get_lsn_range().end { - latest_delta.replace(Arc::clone(l)); - } + (Some(delta), Some(image)) => { + let img_lsn = image.get_lsn_range().start; + let image_is_newer = image.get_lsn_range().end >= delta.get_lsn_range().end; + let image_exact_match = img_lsn + 1 == end_lsn; + if image_is_newer || image_exact_match { + Some(SearchResult { + layer: image, + lsn_floor: img_lsn, + }) } else { - latest_delta.replace(Arc::clone(l)); + let lsn_floor = + std::cmp::max(delta.get_lsn_range().start, image.get_lsn_range().start + 1); + Some(SearchResult { + layer: delta, + lsn_floor, + }) } } } - if let Some(l) = latest_delta { - trace!( - "found (old) layer {} for request on {key} at {end_lsn}", - l.short_id(), - ); - let lsn_floor = std::cmp::max( - Lsn(latest_img_lsn.unwrap_or(Lsn(0)).0 + 1), - l.get_lsn_range().start, - ); - Some(SearchResult { - lsn_floor, - layer: l, - }) - } else if let Some(l) = latest_img { - trace!("found img layer and no deltas for request on {key} at {end_lsn}"); - Some(SearchResult { - lsn_floor: latest_img_lsn.unwrap(), - layer: l, - }) - } else { - trace!("no layer found for request on {key} at {end_lsn}"); - None - } + } + + /// Start a batch of updates, applied on drop + pub fn batch_update(&mut self) -> BatchedUpdates<'_, L> { + BatchedUpdates { layer_map: self } } /// /// Insert an on-disk layer /// - pub fn insert_historic(&mut self, layer: Arc) { - if layer.get_key_range() == (Key::MIN..Key::MAX) { - self.l0_delta_layers.push(layer.clone()); + /// Helper function for BatchedUpdates::insert_historic + /// + pub(self) fn insert_historic_noflush(&mut self, layer: Arc) { + let kr = layer.get_key_range(); + let lr = layer.get_lsn_range(); + self.historic.insert( + historic_layer_coverage::LayerKey { + key: kr.start.to_i128()..kr.end.to_i128(), + lsn: lr.start.0..lr.end.0, + is_image: !layer.is_incremental(), + }, + Arc::clone(&layer), + ); + + if Self::is_l0(&layer) { + self.l0_delta_layers.push(layer); } - self.historic_layers.insert(LayerRTreeObject::new(layer)); + NUM_ONDISK_LAYERS.inc(); } /// /// Remove an on-disk layer from the map. /// - /// This should be called when the corresponding file on disk has been deleted. + /// Helper function for BatchedUpdates::remove_historic /// - pub fn remove_historic(&mut self, layer: Arc) { - if layer.get_key_range() == (Key::MIN..Key::MAX) { + pub fn remove_historic_noflush(&mut self, layer: Arc) { + let kr = layer.get_key_range(); + let lr = layer.get_lsn_range(); + self.historic.remove(historic_layer_coverage::LayerKey { + key: kr.start.to_i128()..kr.end.to_i128(), + lsn: lr.start.0..lr.end.0, + is_image: !layer.is_incremental(), + }); + + if Self::is_l0(&layer) { let len_before = self.l0_delta_layers.len(); // FIXME: ptr_eq might fail to return true for 'dyn' @@ -411,98 +297,57 @@ where .retain(|other| !Arc::ptr_eq(other, &layer)); assert_eq!(self.l0_delta_layers.len(), len_before - 1); } - assert!(self - .historic_layers - .remove(&LayerRTreeObject::new(layer)) - .is_some()); + NUM_ONDISK_LAYERS.dec(); } + /// Helper function for BatchedUpdates::drop. + pub(self) fn flush_updates(&mut self) { + self.historic.rebuild(); + } + /// Is there a newer image layer for given key- and LSN-range? Or a set /// of image layers within the specified lsn range that cover the entire /// specified key range? /// /// This is used for garbage collection, to determine if an old layer can /// be deleted. 
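// The rewritten search above boils down to a small decision rule once the
// coverage maps have produced the latest delta and the latest image for the
// key. A sketch over plain u64 LSN ranges mirroring the match arms; it returns
// (picked_layer_is_image, lsn_floor).
fn pick_layer(
    end_lsn: u64,
    delta: Option<std::ops::Range<u64>>,
    image: Option<std::ops::Range<u64>>,
) -> Option<(bool, u64)> {
    match (delta, image) {
        (None, None) => None,
        (None, Some(img)) => Some((true, img.start)),
        (Some(del), None) => Some((false, del.start)),
        (Some(del), Some(img)) => {
            let image_is_newer = img.end >= del.end;
            let image_exact_match = img.start + 1 == end_lsn;
            if image_is_newer || image_exact_match {
                Some((true, img.start))
            } else {
                // Reconstruction starts from the delta, but only needs to go
                // back to just above the image, or to the delta's own start,
                // whichever is higher.
                Some((false, std::cmp::max(del.start, img.start + 1)))
            }
        }
    }
}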
- pub fn image_layer_exists( - &self, - key_range: &Range, - lsn_range: &Range, - ) -> Result { - let mut range_remain = key_range.clone(); + pub fn image_layer_exists(&self, key: &Range, lsn: &Range) -> Result { + if key.is_empty() { + // Vacuously true. There's a newer image for all 0 of the kerys in the range. + return Ok(true); + } - loop { - let mut made_progress = false; - let envelope = AABB::from_corners( - [ - IntKey::from(range_remain.start.to_i128()), - IntKey::from(lsn_range.start.0 as i128), - ], - [ - IntKey::from(range_remain.end.to_i128() - 1), - IntKey::from(lsn_range.end.0 as i128 - 1), - ], - ); - for e in self - .historic_layers - .locate_in_envelope_intersecting(&envelope) - { - let l = &e.layer; - if l.is_incremental() { - continue; - } - let img_lsn = l.get_lsn_range().start; - if l.get_key_range().contains(&range_remain.start) && lsn_range.contains(&img_lsn) { - made_progress = true; - let img_key_end = l.get_key_range().end; + let version = match self.historic.get().unwrap().get_version(lsn.end.0 - 1) { + Some(v) => v, + None => return Ok(false), + }; - if img_key_end >= range_remain.end { - return Ok(true); - } - range_remain.start = img_key_end; - } - } + let start = key.start.to_i128(); + let end = key.end.to_i128(); - if !made_progress { + let layer_covers = |layer: Option>| match layer { + Some(layer) => layer.get_lsn_range().start >= lsn.start, + None => false, + }; + + // Check the start is covered + if !layer_covers(version.image_coverage.query(start)) { + return Ok(false); + } + + // Check after all changes of coverage + for (_, change_val) in version.image_coverage.range(start..end) { + if !layer_covers(change_val) { return Ok(false); } } + + Ok(true) } pub fn iter_historic_layers(&self) -> impl '_ + Iterator> { - self.historic_layers.iter().map(|e| e.layer.clone()) - } - - /// Find the last image layer that covers 'key', ignoring any image layers - /// newer than 'lsn'. 
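// The GC-side question answered by image_layer_exists above, in isolation:
// "is every key in this range covered by some image layer whose LSN is at
// least `min_lsn`?" A simplified model where images are disjoint
// (key_start, key_end, image_lsn) tuples sorted by key_start; the real code
// asks the same thing of the image coverage at one particular version.
fn whole_range_has_new_image(
    images: &[(i128, i128, u64)],
    key: std::ops::Range<i128>,
    min_lsn: u64,
) -> bool {
    if key.is_empty() {
        // Vacuously true, as in the diff.
        return true;
    }
    let mut cursor = key.start;
    for &(start, end, lsn) in images {
        if end <= cursor || lsn < min_lsn {
            continue; // ends before our position, or too old to count
        }
        if start > cursor {
            return false; // a gap with no new-enough image
        }
        cursor = end;
        if cursor >= key.end {
            return true;
        }
    }
    false
}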
- fn find_latest_image(&self, key: Key, lsn: Lsn) -> Option> { - let mut candidate_lsn = Lsn(0); - let mut candidate = None; - let envelope = AABB::from_corners( - [IntKey::from(key.to_i128()), IntKey::from(0)], - [IntKey::from(key.to_i128()), IntKey::from(lsn.0 as i128)], - ); - for e in self - .historic_layers - .locate_in_envelope_intersecting(&envelope) - { - let l = &e.layer; - if l.is_incremental() { - continue; - } - - assert!(l.get_key_range().contains(&key)); - let this_lsn = l.get_lsn_range().start; - assert!(this_lsn <= lsn); - if this_lsn < candidate_lsn { - // our previous candidate was better - continue; - } - candidate_lsn = this_lsn; - candidate = Some(Arc::clone(l)); - } - - candidate + self.historic.iter() } /// @@ -518,94 +363,288 @@ where key_range: &Range, lsn: Lsn, ) -> Result, Option>)>> { - let mut points = vec![key_range.start]; - let envelope = AABB::from_corners( - [IntKey::from(key_range.start.to_i128()), IntKey::from(0)], - [ - IntKey::from(key_range.end.to_i128()), - IntKey::from(lsn.0 as i128), - ], - ); - for e in self - .historic_layers - .locate_in_envelope_intersecting(&envelope) - { - let l = &e.layer; - assert!(l.get_lsn_range().start <= lsn); - let range = l.get_key_range(); - if key_range.contains(&range.start) { - points.push(l.get_key_range().start); - } - if key_range.contains(&range.end) { - points.push(l.get_key_range().end); - } + let version = match self.historic.get().unwrap().get_version(lsn.0) { + Some(v) => v, + None => return Ok(vec![]), + }; + + let start = key_range.start.to_i128(); + let end = key_range.end.to_i128(); + + // Initialize loop variables + let mut coverage: Vec<(Range, Option>)> = vec![]; + let mut current_key = start; + let mut current_val = version.image_coverage.query(start); + + // Loop through the change events and push intervals + for (change_key, change_val) in version.image_coverage.range(start..end) { + let kr = Key::from_i128(current_key)..Key::from_i128(change_key); + coverage.push((kr, current_val.take())); + current_key = change_key; + current_val = change_val.clone(); } - points.push(key_range.end); - points.sort(); - points.dedup(); + // Add the final interval + let kr = Key::from_i128(current_key)..Key::from_i128(end); + coverage.push((kr, current_val.take())); - // Ok, we now have a list of "interesting" points in the key space - - // For each range between the points, find the latest image - let mut start = *points.first().unwrap(); - let mut ranges = Vec::new(); - for end in points[1..].iter() { - let img = self.find_latest_image(start, lsn); - - ranges.push((start..*end, img)); - - start = *end; - } - Ok(ranges) + Ok(coverage) } - /// Count the height of the tallest stack of deltas in this 2d region. + pub fn is_l0(layer: &L) -> bool { + range_eq(&layer.get_key_range(), &(Key::MIN..Key::MAX)) + } + + /// This function determines which layers are counted in `count_deltas`: + /// layers that should count towards deciding whether or not to reimage + /// a certain partition range. + /// + /// There are two kinds of layers we currently consider reimage-worthy: + /// + /// Case 1: Non-L0 layers are currently reimage-worthy by default. + /// TODO Some of these layers are very sparse and cover the entire key + /// range. Replacing 256MB of data (or less!) with terabytes of + /// images doesn't seem wise. 
We need a better heuristic, possibly + /// based on some of these factors: + /// a) whether this layer has any wal in this partition range + /// b) the size of the layer + /// c) the number of images needed to cover it + /// d) the estimated time until we'll have to reimage over it for GC + /// + /// Case 2: Since L0 layers by definition cover the entire key space, we consider + /// them reimage-worthy only when the entire key space can be covered by very few + /// images (currently 1). + /// TODO The optimal number should probably be slightly higher than 1, but to + /// implement that we need to plumb a lot more context into this function + /// than just the current partition_range. + pub fn is_reimage_worthy(layer: &L, partition_range: &Range) -> bool { + // Case 1 + if !Self::is_l0(layer) { + return true; + } + + // Case 2 + if range_eq(partition_range, &(Key::MIN..Key::MAX)) { + return true; + } + + false + } + + /// Count the height of the tallest stack of reimage-worthy deltas + /// in this 2d region. + /// + /// If `limit` is provided we don't try to count above that number. /// /// This number is used to compute the largest number of deltas that /// we'll need to visit for any page reconstruction in this region. /// We use this heuristic to decide whether to create an image layer. - /// - /// TODO currently we just return the total number of deltas in the - /// region, no matter if they're stacked on top of each other - /// or next to each other. - pub fn count_deltas(&self, key_range: &Range, lsn_range: &Range) -> Result { - let mut result = 0; - if lsn_range.start >= lsn_range.end { + pub fn count_deltas( + &self, + key: &Range, + lsn: &Range, + limit: Option, + ) -> Result { + // We get the delta coverage of the region, and for each part of the coverage + // we recurse right underneath the delta. The recursion depth is limited by + // the largest result this function could return, which is in practice between + // 3 and 10 (since we usually try to create an image when the number gets larger). + + if lsn.is_empty() || key.is_empty() || limit == Some(0) { return Ok(0); } - let envelope = AABB::from_corners( - [ - IntKey::from(key_range.start.to_i128()), - IntKey::from(lsn_range.start.0 as i128), - ], - [ - IntKey::from(key_range.end.to_i128() - 1), - IntKey::from(lsn_range.end.0 as i128 - 1), - ], - ); - for e in self - .historic_layers - .locate_in_envelope_intersecting(&envelope) - { - let l = &e.layer; - if !l.is_incremental() { - continue; - } - assert!(range_overlaps(&l.get_lsn_range(), lsn_range)); - assert!(range_overlaps(&l.get_key_range(), key_range)); - // We ignore level0 delta layers. 
Unless the whole keyspace fits - // into one partition - if !range_eq(key_range, &(Key::MIN..Key::MAX)) - && range_eq(&l.get_key_range(), &(Key::MIN..Key::MAX)) - { - continue; + let version = match self.historic.get().unwrap().get_version(lsn.end.0 - 1) { + Some(v) => v, + None => return Ok(0), + }; + + let start = key.start.to_i128(); + let end = key.end.to_i128(); + + // Initialize loop variables + let mut max_stacked_deltas = 0; + let mut current_key = start; + let mut current_val = version.delta_coverage.query(start); + + // Loop through the delta coverage and recurse on each part + for (change_key, change_val) in version.delta_coverage.range(start..end) { + // If there's a relevant delta in this part, add 1 and recurse down + if let Some(val) = current_val { + if val.get_lsn_range().end > lsn.start { + let kr = Key::from_i128(current_key)..Key::from_i128(change_key); + let lr = lsn.start..val.get_lsn_range().start; + if !kr.is_empty() { + let base_count = Self::is_reimage_worthy(&val, key) as usize; + let new_limit = limit.map(|l| l - base_count); + let max_stacked_deltas_underneath = + self.count_deltas(&kr, &lr, new_limit)?; + max_stacked_deltas = std::cmp::max( + max_stacked_deltas, + base_count + max_stacked_deltas_underneath, + ); + } + } } - result += 1; + current_key = change_key; + current_val = change_val.clone(); } - Ok(result) + + // Consider the last part + if let Some(val) = current_val { + if val.get_lsn_range().end > lsn.start { + let kr = Key::from_i128(current_key)..Key::from_i128(end); + let lr = lsn.start..val.get_lsn_range().start; + + if !kr.is_empty() { + let base_count = Self::is_reimage_worthy(&val, key) as usize; + let new_limit = limit.map(|l| l - base_count); + let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit)?; + max_stacked_deltas = std::cmp::max( + max_stacked_deltas, + base_count + max_stacked_deltas_underneath, + ); + } + } + } + + Ok(max_stacked_deltas) + } + + /// Count how many reimage-worthy layers we need to visit for given key-lsn pair. + /// + /// The `partition_range` argument is used as context for the reimage-worthiness decision. + /// + /// Used as a helper for correctness checks only. Performance not critical. + pub fn get_difficulty(&self, lsn: Lsn, key: Key, partition_range: &Range) -> usize { + match self.search(key, lsn) { + Some(search_result) => { + if search_result.layer.is_incremental() { + (Self::is_reimage_worthy(&search_result.layer, partition_range) as usize) + + self.get_difficulty(search_result.lsn_floor, key, partition_range) + } else { + 0 + } + } + None => 0, + } + } + + /// Used for correctness checking. Results are expected to be identical to + /// self.get_difficulty_map. Assumes self.search is correct. + pub fn get_difficulty_map_bruteforce( + &self, + lsn: Lsn, + partitioning: &KeyPartitioning, + ) -> Vec { + // Looking at the difficulty as a function of key, it could only increase + // when a delta layer starts or an image layer ends. 
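// The `limit` plumbing in the new count_deltas above is the usual "count, but
// stop once the caller's threshold is reached" trick for a recursive maximum.
// A stripped-down illustration on an explicit tree of stacked deltas; the real
// function derives the recursion from the delta coverage instead.
struct StackNode {
    reimage_worthy: bool,
    deltas_underneath: Vec<StackNode>,
}

fn max_stacked(node: &StackNode, limit: Option<usize>) -> usize {
    if limit == Some(0) {
        // The caller no longer cares about the exact number.
        return 0;
    }
    let base = node.reimage_worthy as usize;
    let child_limit = limit.map(|l| l - base);
    let below = node
        .deltas_underneath
        .iter()
        .map(|child| max_stacked(child, child_limit))
        .max()
        .unwrap_or(0);
    base + below
}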
Therefore it's sufficient + // to check the difficulties at: + // - the key.start for each non-empty part range + // - the key.start for each delta + // - the key.end for each image + let keys_iter: Box> = { + let mut keys: Vec = self + .iter_historic_layers() + .map(|layer| { + if layer.is_incremental() { + layer.get_key_range().start + } else { + layer.get_key_range().end + } + }) + .collect(); + keys.sort(); + Box::new(keys.into_iter()) + }; + let mut keys_iter = keys_iter.peekable(); + + // Iter the partition and keys together and query all the necessary + // keys, computing the max difficulty for each part. + partitioning + .parts + .iter() + .map(|part| { + let mut difficulty = 0; + // Partition ranges are assumed to be sorted and disjoint + // TODO assert it + for range in &part.ranges { + if !range.is_empty() { + difficulty = + std::cmp::max(difficulty, self.get_difficulty(lsn, range.start, range)); + } + while let Some(key) = keys_iter.peek() { + if key >= &range.end { + break; + } + let key = keys_iter.next().unwrap(); + if key < range.start { + continue; + } + difficulty = + std::cmp::max(difficulty, self.get_difficulty(lsn, key, range)); + } + } + difficulty + }) + .collect() + } + + /// For each part of a keyspace partitioning, return the maximum number of layers + /// that would be needed for page reconstruction in that part at the given LSN. + /// + /// If `limit` is provided we don't try to count above that number. + /// + /// This method is used to decide where to create new image layers. Computing the + /// result for the entire partitioning at once allows this function to be more + /// efficient, and further optimization is possible by using iterators instead, + /// to allow early return. + /// + /// TODO actually use this method instead of count_deltas. Currently we only use + /// it for benchmarks. + pub fn get_difficulty_map( + &self, + lsn: Lsn, + partitioning: &KeyPartitioning, + limit: Option, + ) -> Vec { + // TODO This is a naive implementation. Perf improvements to do: + // 1. Instead of calling self.image_coverage and self.count_deltas, + // iterate the image and delta coverage only once. + partitioning + .parts + .iter() + .map(|part| { + let mut difficulty = 0; + for range in &part.ranges { + if limit == Some(difficulty) { + break; + } + for (img_range, last_img) in self + .image_coverage(range, lsn) + .expect("why would this err?") + { + if limit == Some(difficulty) { + break; + } + let img_lsn = if let Some(last_img) = last_img { + last_img.get_lsn_range().end + } else { + Lsn(0) + }; + + if img_lsn < lsn { + let num_deltas = self + .count_deltas(&img_range, &(img_lsn..lsn), limit) + .expect("why would this err lol?"); + difficulty = std::cmp::max(difficulty, num_deltas); + } + } + } + difficulty + }) + .collect() } /// Return all L0 delta layers @@ -629,8 +668,8 @@ where } println!("historic_layers:"); - for e in self.historic_layers.iter() { - e.layer.dump(verbose)?; + for layer in self.iter_historic_layers() { + layer.dump(verbose)?; } println!("End dump LayerMap"); Ok(()) diff --git a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs new file mode 100644 index 0000000000..46821aef15 --- /dev/null +++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs @@ -0,0 +1,583 @@ +use std::collections::BTreeMap; +use std::ops::Range; + +use tracing::info; + +use super::layer_coverage::LayerCoverageTuple; + +/// Layers in this module are identified and indexed by this data. 
+/// +/// This is a helper struct to enable sorting layers by lsn.start. +/// +/// These three values are enough to uniquely identify a layer, since +/// a layer is obligated to contain all contents within range, so two +/// deltas (or images) with the same range have identical content. +#[derive(Debug, PartialEq, Eq, Clone)] +pub struct LayerKey { + // TODO I use i128 and u64 because it was easy for prototyping, + // testing, and benchmarking. If we can use the Lsn and Key + // types without overhead that would be preferable. + pub key: Range, + pub lsn: Range, + pub is_image: bool, +} + +impl PartialOrd for LayerKey { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for LayerKey { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + // NOTE we really care about comparing by lsn.start first + self.lsn + .start + .cmp(&other.lsn.start) + .then(self.lsn.end.cmp(&other.lsn.end)) + .then(self.key.start.cmp(&other.key.start)) + .then(self.key.end.cmp(&other.key.end)) + .then(self.is_image.cmp(&other.is_image)) + } +} + +/// Efficiently queryable layer coverage for each LSN. +/// +/// Allows answering layer map queries very efficiently, +/// but doesn't allow retroactive insertion, which is +/// sometimes necessary. See BufferedHistoricLayerCoverage. +pub struct HistoricLayerCoverage { + /// The latest state + head: LayerCoverageTuple, + + /// All previous states + historic: BTreeMap>, +} + +impl Default for HistoricLayerCoverage { + fn default() -> Self { + Self::new() + } +} + +impl HistoricLayerCoverage { + pub fn new() -> Self { + Self { + head: LayerCoverageTuple::default(), + historic: BTreeMap::default(), + } + } + + /// Add a layer + /// + /// Panics if new layer has older lsn.start than an existing layer. + /// See BufferedHistoricLayerCoverage for a more general insertion method. + pub fn insert(&mut self, layer_key: LayerKey, value: Value) { + // It's only a persistent map, not a retroactive one + if let Some(last_entry) = self.historic.iter().next_back() { + let last_lsn = last_entry.0; + if layer_key.lsn.start < *last_lsn { + panic!("unexpected retroactive insert"); + } + } + + // Insert into data structure + if layer_key.is_image { + self.head + .image_coverage + .insert(layer_key.key, layer_key.lsn.clone(), value); + } else { + self.head + .delta_coverage + .insert(layer_key.key, layer_key.lsn.clone(), value); + } + + // Remember history. Clone is O(1) + self.historic.insert(layer_key.lsn.start, self.head.clone()); + } + + /// Query at a particular LSN, inclusive + pub fn get_version(&self, lsn: u64) -> Option<&LayerCoverageTuple> { + match self.historic.range(..=lsn).next_back() { + Some((_, v)) => Some(v), + None => None, + } + } + + /// Remove all entries after a certain LSN (inclusive) + pub fn trim(&mut self, begin: &u64) { + self.historic.split_off(begin); + self.head = self + .historic + .iter() + .rev() + .next() + .map(|(_, v)| v.clone()) + .unwrap_or_default(); + } +} + +/// This is the most basic test that demonstrates intended usage. +/// All layers in this test have height 1. 
+#[test] +fn test_persistent_simple() { + let mut map = HistoricLayerCoverage::::new(); + map.insert( + LayerKey { + key: 0..5, + lsn: 100..101, + is_image: true, + }, + "Layer 1".to_string(), + ); + map.insert( + LayerKey { + key: 3..9, + lsn: 110..111, + is_image: true, + }, + "Layer 2".to_string(), + ); + map.insert( + LayerKey { + key: 5..6, + lsn: 120..121, + is_image: true, + }, + "Layer 3".to_string(), + ); + + // After Layer 1 insertion + let version = map.get_version(105).unwrap(); + assert_eq!(version.image_coverage.query(1), Some("Layer 1".to_string())); + assert_eq!(version.image_coverage.query(4), Some("Layer 1".to_string())); + + // After Layer 2 insertion + let version = map.get_version(115).unwrap(); + assert_eq!(version.image_coverage.query(4), Some("Layer 2".to_string())); + assert_eq!(version.image_coverage.query(8), Some("Layer 2".to_string())); + assert_eq!(version.image_coverage.query(11), None); + + // After Layer 3 insertion + let version = map.get_version(125).unwrap(); + assert_eq!(version.image_coverage.query(4), Some("Layer 2".to_string())); + assert_eq!(version.image_coverage.query(5), Some("Layer 3".to_string())); + assert_eq!(version.image_coverage.query(7), Some("Layer 2".to_string())); +} + +/// Cover simple off-by-one edge cases +#[test] +fn test_off_by_one() { + let mut map = HistoricLayerCoverage::::new(); + map.insert( + LayerKey { + key: 3..5, + lsn: 100..110, + is_image: true, + }, + "Layer 1".to_string(), + ); + + // Check different LSNs + let version = map.get_version(99); + assert!(version.is_none()); + let version = map.get_version(100).unwrap(); + assert_eq!(version.image_coverage.query(4), Some("Layer 1".to_string())); + let version = map.get_version(110).unwrap(); + assert_eq!(version.image_coverage.query(4), Some("Layer 1".to_string())); + + // Check different keys + let version = map.get_version(105).unwrap(); + assert_eq!(version.image_coverage.query(2), None); + assert_eq!(version.image_coverage.query(3), Some("Layer 1".to_string())); + assert_eq!(version.image_coverage.query(4), Some("Layer 1".to_string())); + assert_eq!(version.image_coverage.query(5), None); +} + +/// Cover edge cases where layers begin or end on the same key +#[test] +fn test_key_collision() { + let mut map = HistoricLayerCoverage::::new(); + + map.insert( + LayerKey { + key: 3..5, + lsn: 100..110, + is_image: true, + }, + "Layer 10".to_string(), + ); + map.insert( + LayerKey { + key: 5..8, + lsn: 100..110, + is_image: true, + }, + "Layer 11".to_string(), + ); + map.insert( + LayerKey { + key: 3..4, + lsn: 200..210, + is_image: true, + }, + "Layer 20".to_string(), + ); + + // Check after layer 11 + let version = map.get_version(105).unwrap(); + assert_eq!(version.image_coverage.query(2), None); + assert_eq!( + version.image_coverage.query(3), + Some("Layer 10".to_string()) + ); + assert_eq!( + version.image_coverage.query(5), + Some("Layer 11".to_string()) + ); + assert_eq!( + version.image_coverage.query(7), + Some("Layer 11".to_string()) + ); + assert_eq!(version.image_coverage.query(8), None); + + // Check after layer 20 + let version = map.get_version(205).unwrap(); + assert_eq!(version.image_coverage.query(2), None); + assert_eq!( + version.image_coverage.query(3), + Some("Layer 20".to_string()) + ); + assert_eq!( + version.image_coverage.query(5), + Some("Layer 11".to_string()) + ); + assert_eq!( + version.image_coverage.query(7), + Some("Layer 11".to_string()) + ); + assert_eq!(version.image_coverage.query(8), None); +} + +/// Test when rectangles have nontrivial 
height and possibly overlap +#[test] +fn test_persistent_overlapping() { + let mut map = HistoricLayerCoverage::::new(); + + // Add 3 key-disjoint layers with varying LSN ranges + map.insert( + LayerKey { + key: 1..2, + lsn: 100..200, + is_image: true, + }, + "Layer 1".to_string(), + ); + map.insert( + LayerKey { + key: 4..5, + lsn: 110..200, + is_image: true, + }, + "Layer 2".to_string(), + ); + map.insert( + LayerKey { + key: 7..8, + lsn: 120..300, + is_image: true, + }, + "Layer 3".to_string(), + ); + + // Add wide and short layer + map.insert( + LayerKey { + key: 0..9, + lsn: 130..199, + is_image: true, + }, + "Layer 4".to_string(), + ); + + // Add wide layer taller than some + map.insert( + LayerKey { + key: 0..9, + lsn: 140..201, + is_image: true, + }, + "Layer 5".to_string(), + ); + + // Add wide layer taller than all + map.insert( + LayerKey { + key: 0..9, + lsn: 150..301, + is_image: true, + }, + "Layer 6".to_string(), + ); + + // After layer 4 insertion + let version = map.get_version(135).unwrap(); + assert_eq!(version.image_coverage.query(0), Some("Layer 4".to_string())); + assert_eq!(version.image_coverage.query(1), Some("Layer 1".to_string())); + assert_eq!(version.image_coverage.query(2), Some("Layer 4".to_string())); + assert_eq!(version.image_coverage.query(4), Some("Layer 2".to_string())); + assert_eq!(version.image_coverage.query(5), Some("Layer 4".to_string())); + assert_eq!(version.image_coverage.query(7), Some("Layer 3".to_string())); + assert_eq!(version.image_coverage.query(8), Some("Layer 4".to_string())); + + // After layer 5 insertion + let version = map.get_version(145).unwrap(); + assert_eq!(version.image_coverage.query(0), Some("Layer 5".to_string())); + assert_eq!(version.image_coverage.query(1), Some("Layer 5".to_string())); + assert_eq!(version.image_coverage.query(2), Some("Layer 5".to_string())); + assert_eq!(version.image_coverage.query(4), Some("Layer 5".to_string())); + assert_eq!(version.image_coverage.query(5), Some("Layer 5".to_string())); + assert_eq!(version.image_coverage.query(7), Some("Layer 3".to_string())); + assert_eq!(version.image_coverage.query(8), Some("Layer 5".to_string())); + + // After layer 6 insertion + let version = map.get_version(155).unwrap(); + assert_eq!(version.image_coverage.query(0), Some("Layer 6".to_string())); + assert_eq!(version.image_coverage.query(1), Some("Layer 6".to_string())); + assert_eq!(version.image_coverage.query(2), Some("Layer 6".to_string())); + assert_eq!(version.image_coverage.query(4), Some("Layer 6".to_string())); + assert_eq!(version.image_coverage.query(5), Some("Layer 6".to_string())); + assert_eq!(version.image_coverage.query(7), Some("Layer 6".to_string())); + assert_eq!(version.image_coverage.query(8), Some("Layer 6".to_string())); +} + +/// Wrapper for HistoricLayerCoverage that allows us to hack around the lack +/// of support for retroactive insertion by rebuilding the map since the +/// change. +/// +/// Why is this needed? We most often insert new layers with newer LSNs, +/// but during compaction we create layers with non-latest LSN, and during +/// GC we delete historic layers. +/// +/// Even though rebuilding is an expensive (N log N) solution to the problem, +/// it's not critical since we do something equally expensive just to decide +/// whether or not to create new image layers. +/// TODO It's not expensive but it's not great to hold a layer map write lock +/// for that long. 
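One thing the coverage tests above do not exercise directly is trim(). A minimal sketch of its effect on the version history, assuming the `HistoricLayerCoverage` and `LayerKey` types from this module are in scope; `trim_example` is an invented name used only for illustration:

fn trim_example() {
    let mut map = HistoricLayerCoverage::<String>::new();
    map.insert(
        LayerKey {
            key: 0..10,
            lsn: 100..101,
            is_image: true,
        },
        "Old image".to_string(),
    );
    map.insert(
        LayerKey {
            key: 0..10,
            lsn: 110..111,
            is_image: true,
        },
        "New image".to_string(),
    );

    // Before trimming, the latest version wins.
    let version = map.get_version(115).unwrap();
    assert_eq!(version.image_coverage.query(5), Some("New image".to_string()));

    // trim() drops every stored version whose lsn.start is >= 110 and rewinds
    // `head`, so queries above that LSN now see the older coverage.
    map.trim(&110);
    let version = map.get_version(115).unwrap();
    assert_eq!(version.image_coverage.query(5), Some("Old image".to_string()));
}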
+/// +/// If this becomes an actual bottleneck, one solution would be to build a +/// segment tree that holds PersistentLayerMaps. Though this would mean that +/// we take an additional log(N) performance hit for queries, which will probably +/// still be more critical. +/// +/// See this for more on persistent and retroactive techniques: +/// https://www.youtube.com/watch?v=WqCWghETNDc&t=581s +pub struct BufferedHistoricLayerCoverage { + /// A persistent layer map that we rebuild when we need to retroactively update + historic_coverage: HistoricLayerCoverage, + + /// We buffer insertion into the PersistentLayerMap to decrease the number of rebuilds. + buffer: BTreeMap>, + + /// All current layers. This is not used for search. Only to make rebuilds easier. + layers: BTreeMap, +} + +impl std::fmt::Debug for BufferedHistoricLayerCoverage { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("RetroactiveLayerMap") + .field("buffer", &self.buffer) + .field("layers", &self.layers) + .finish() + } +} + +impl Default for BufferedHistoricLayerCoverage { + fn default() -> Self { + Self::new() + } +} + +impl BufferedHistoricLayerCoverage { + pub fn new() -> Self { + Self { + historic_coverage: HistoricLayerCoverage::::new(), + buffer: BTreeMap::new(), + layers: BTreeMap::new(), + } + } + + pub fn insert(&mut self, layer_key: LayerKey, value: Value) { + self.buffer.insert(layer_key, Some(value)); + } + + pub fn remove(&mut self, layer_key: LayerKey) { + self.buffer.insert(layer_key, None); + } + + pub fn rebuild(&mut self) { + // Find the first LSN that needs to be rebuilt + let rebuild_since: u64 = match self.buffer.iter().next() { + Some((LayerKey { lsn, .. }, _)) => lsn.start, + None => return, // No need to rebuild if buffer is empty + }; + + // Apply buffered updates to self.layers + let num_updates = self.buffer.len(); + self.buffer.retain(|layer_key, layer| { + match layer { + Some(l) => { + self.layers.insert(layer_key.clone(), l.clone()); + } + None => { + self.layers.remove(layer_key); + } + }; + false + }); + + // Rebuild + let mut num_inserted = 0; + self.historic_coverage.trim(&rebuild_since); + for (layer_key, layer) in self.layers.range( + LayerKey { + lsn: rebuild_since..0, + key: 0..0, + is_image: false, + }.., + ) { + self.historic_coverage + .insert(layer_key.clone(), layer.clone()); + num_inserted += 1; + } + + // TODO maybe only warn if ratio is at least 10 + info!( + "Rebuilt layer map. Did {} insertions to process a batch of {} updates.", + num_inserted, num_updates, + ) + } + + /// Iterate all the layers + pub fn iter(&self) -> impl '_ + Iterator { + // NOTE we can actually perform this without rebuilding, + // but it's not necessary for now. + if !self.buffer.is_empty() { + panic!("rebuild pls") + } + + self.layers.values().cloned() + } + + /// Return a reference to a queryable map, assuming all updates + /// have already been processed using self.rebuild() + pub fn get(&self) -> anyhow::Result<&HistoricLayerCoverage> { + // NOTE we error here instead of implicitly rebuilding because + // rebuilding is somewhat expensive. + // TODO maybe implicitly rebuild and log/sentry an error? 
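    // Illustrative calling pattern only (layer_key_a, layer_a, layer_key_b and
    // lsn below are placeholders, not items from this module): batch updates,
    // then rebuild once before reading.
    //
    //     let mut map = BufferedHistoricLayerCoverage::new();
    //     map.insert(layer_key_a, layer_a);
    //     map.remove(layer_key_b);
    //     assert!(map.get().is_err()); // buffered updates not applied yet
    //     map.rebuild();
    //     let version = map.get().unwrap().get_version(lsn);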
+ if !self.buffer.is_empty() { + anyhow::bail!("rebuild required") + } + + Ok(&self.historic_coverage) + } +} + +#[test] +fn test_retroactive_regression_1() { + let mut map = BufferedHistoricLayerCoverage::new(); + + map.insert( + LayerKey { + key: 0..21267647932558653966460912964485513215, + lsn: 23761336..23761457, + is_image: false, + }, + "sdfsdfs".to_string(), + ); + + map.rebuild(); + + let version = map.get().unwrap().get_version(23761457).unwrap(); + assert_eq!( + version.delta_coverage.query(100), + Some("sdfsdfs".to_string()) + ); +} + +#[test] +fn test_retroactive_simple() { + let mut map = BufferedHistoricLayerCoverage::new(); + + // Append some images in increasing LSN order + map.insert( + LayerKey { + key: 0..5, + lsn: 100..101, + is_image: true, + }, + "Image 1".to_string(), + ); + map.insert( + LayerKey { + key: 3..9, + lsn: 110..111, + is_image: true, + }, + "Image 2".to_string(), + ); + map.insert( + LayerKey { + key: 4..6, + lsn: 120..121, + is_image: true, + }, + "Image 3".to_string(), + ); + map.insert( + LayerKey { + key: 8..9, + lsn: 120..121, + is_image: true, + }, + "Image 4".to_string(), + ); + + // Add a delta layer out of order + map.insert( + LayerKey { + key: 2..5, + lsn: 105..106, + is_image: true, + }, + "Delta 1".to_string(), + ); + + // Rebuild so we can start querying + map.rebuild(); + + // Query key 4 + let version = map.get().unwrap().get_version(90); + assert!(version.is_none()); + let version = map.get().unwrap().get_version(102).unwrap(); + assert_eq!(version.image_coverage.query(4), Some("Image 1".to_string())); + let version = map.get().unwrap().get_version(107).unwrap(); + assert_eq!(version.image_coverage.query(4), Some("Delta 1".to_string())); + let version = map.get().unwrap().get_version(115).unwrap(); + assert_eq!(version.image_coverage.query(4), Some("Image 2".to_string())); + let version = map.get().unwrap().get_version(125).unwrap(); + assert_eq!(version.image_coverage.query(4), Some("Image 3".to_string())); + + // Remove Image 3 + map.remove(LayerKey { + key: 4..6, + lsn: 120..121, + is_image: true, + }); + map.rebuild(); + + // Check deletion worked + let version = map.get().unwrap().get_version(125).unwrap(); + assert_eq!(version.image_coverage.query(4), Some("Image 2".to_string())); + assert_eq!(version.image_coverage.query(8), Some("Image 4".to_string())); +} diff --git a/pageserver/src/tenant/layer_map/layer_coverage.rs b/pageserver/src/tenant/layer_map/layer_coverage.rs new file mode 100644 index 0000000000..4e3b4516dc --- /dev/null +++ b/pageserver/src/tenant/layer_map/layer_coverage.rs @@ -0,0 +1,154 @@ +use std::ops::Range; + +// TODO the `im` crate has 20x more downloads and also has +// persistent/immutable BTree. It also runs a bit faster but +// results are not the same on some tests. +use rpds::RedBlackTreeMapSync; + +/// Data structure that can efficiently: +/// - find the latest layer by lsn.end at a given key +/// - iterate the latest layers in a key range +/// - insert layers in non-decreasing lsn.start order +/// +/// The struct is parameterized over Value for easier +/// testing, but in practice it's some sort of layer. +pub struct LayerCoverage { + /// For every change in coverage (as we sweep the key space) + /// we store (lsn.end, value). + /// + /// We use an immutable/persistent tree so that we can keep historic + /// versions of this coverage without cloning the whole thing and + /// incurring quadratic memory cost. See HistoricLayerCoverage. 
+ /// + /// We use the Sync version of the map because we want Self to + /// be Sync. Using nonsync might be faster, if we can work with + /// that. + nodes: RedBlackTreeMapSync>, +} + +impl Default for LayerCoverage { + fn default() -> Self { + Self::new() + } +} + +impl LayerCoverage { + pub fn new() -> Self { + Self { + nodes: RedBlackTreeMapSync::default(), + } + } + + /// Helper function to subdivide the key range without changing any values + /// + /// Complexity: O(log N) + fn add_node(&mut self, key: i128) { + let value = match self.nodes.range(..=key).last() { + Some((_, Some(v))) => Some(v.clone()), + Some((_, None)) => None, + None => None, + }; + self.nodes.insert_mut(key, value); + } + + /// Insert a layer. + /// + /// Complexity: worst case O(N), in practice O(log N). See NOTE in implementation. + pub fn insert(&mut self, key: Range, lsn: Range, value: Value) { + // Add nodes at endpoints + // + // NOTE The order of lines is important. We add nodes at the start + // and end of the key range **before updating any nodes** in order + // to pin down the current coverage outside of the relevant key range. + // Only the coverage inside the layer's key range should change. + self.add_node(key.start); + self.add_node(key.end); + + // Raise the height where necessary + // + // NOTE This loop is worst case O(N), but amortized O(log N) in the special + // case when rectangles have no height. In practice I don't think we'll see + // the kind of layer intersections needed to trigger O(N) behavior. The worst + // case is N/2 horizontal layers overlapped with N/2 vertical layers in a + // grid pattern. + let mut to_update = Vec::new(); + let mut to_remove = Vec::new(); + let mut prev_covered = false; + for (k, node) in self.nodes.range(key.clone()) { + let needs_cover = match node { + None => true, + Some((h, _)) => h < &lsn.end, + }; + if needs_cover { + match prev_covered { + true => to_remove.push(*k), + false => to_update.push(*k), + } + } + prev_covered = needs_cover; + } + if !prev_covered { + to_remove.push(key.end); + } + for k in to_update { + self.nodes.insert_mut(k, Some((lsn.end, value.clone()))); + } + for k in to_remove { + self.nodes.remove_mut(&k); + } + } + + /// Get the latest (by lsn.end) layer at a given key + /// + /// Complexity: O(log N) + pub fn query(&self, key: i128) -> Option { + self.nodes + .range(..=key) + .rev() + .next()? + .1 + .as_ref() + .map(|(_, v)| v.clone()) + } + + /// Iterate the changes in layer coverage in a given range. You will likely + /// want to start with self.query(key.start), and then follow up with self.range + /// + /// Complexity: O(log N + result_size) + pub fn range(&self, key: Range) -> impl '_ + Iterator)> { + self.nodes + .range(key) + .map(|(k, v)| (*k, v.as_ref().map(|x| x.1.clone()))) + } + + /// O(1) clone + pub fn clone(&self) -> Self { + Self { + nodes: self.nodes.clone(), + } + } +} + +/// Image and delta coverage at a specific LSN. 
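A small usage sketch of the structure just defined, assuming `LayerCoverage` is in scope; `coverage_example` is an invented name, and the asserts restate the query()/range() semantics documented above:

fn coverage_example() {
    let mut coverage: LayerCoverage<String> = LayerCoverage::new();

    // A wide layer, then a narrower one that ends at a higher LSN.
    coverage.insert(0..10, 100..200, "wide".to_string());
    coverage.insert(3..6, 100..300, "narrow".to_string());

    // query() returns the latest layer (by lsn.end) covering the key.
    assert_eq!(coverage.query(1), Some("wide".to_string()));
    assert_eq!(coverage.query(4), Some("narrow".to_string()));
    assert_eq!(coverage.query(8), Some("wide".to_string()));
    assert_eq!(coverage.query(12), None);

    // range() yields the coverage changes inside a key range.
    let changes: Vec<_> = coverage.range(0..10).collect();
    assert_eq!(changes.len(), 3); // changes at keys 0, 3 and 6
}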
+pub struct LayerCoverageTuple { + pub image_coverage: LayerCoverage, + pub delta_coverage: LayerCoverage, +} + +impl Default for LayerCoverageTuple { + fn default() -> Self { + Self { + image_coverage: LayerCoverage::default(), + delta_coverage: LayerCoverage::default(), + } + } +} + +impl LayerCoverageTuple { + pub fn clone(&self) -> Self { + Self { + image_coverage: self.image_coverage.clone(), + delta_coverage: self.delta_coverage.clone(), + } + } +} diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index dce7cd8bae..a9edee3794 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -16,6 +16,7 @@ use remote_storage::GenericRemoteStorage; use utils::crashsafe; use crate::config::PageServerConf; +use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::{self, TaskKind}; use crate::tenant::config::TenantConfOpt; use crate::tenant::{Tenant, TenantState}; @@ -24,8 +25,35 @@ use crate::IGNORED_TENANT_FILE_NAME; use utils::fs_ext::PathExt; use utils::id::{TenantId, TimelineId}; -static TENANTS: Lazy>>> = - Lazy::new(|| RwLock::new(HashMap::new())); +/// The tenants known to the pageserver. +/// The enum variants are used to distinguish the different states that the pageserver can be in. +enum TenantsMap { + /// [`init_tenant_mgr`] is not done yet. + Initializing, + /// [`init_tenant_mgr`] is done, all on-disk tenants have been loaded. + /// New tenants can be added using [`tenant_map_insert`]. + Open(HashMap>), + /// The pageserver has entered shutdown mode via [`shutdown_all_tenants`]. + /// Existing tenants are still accessible, but no new tenants can be created. + ShuttingDown(HashMap>), +} + +impl TenantsMap { + fn get(&self, tenant_id: &TenantId) -> Option<&Arc> { + match self { + TenantsMap::Initializing => None, + TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m.get(tenant_id), + } + } + fn remove(&mut self, tenant_id: &TenantId) -> Option> { + match self { + TenantsMap::Initializing => None, + TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m.remove(tenant_id), + } + } +} + +static TENANTS: Lazy> = Lazy::new(|| RwLock::new(TenantsMap::Initializing)); /// Initialize repositories with locally available timelines. 
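To make the state machine concrete: a hypothetical module-internal helper (the name `peek_tenant_state` is invented) showing how a lookup behaves in each of the three states; it follows the same match shape as `TenantsMap::get` above:

async fn peek_tenant_state(tenant_id: TenantId) -> Option<TenantState> {
    let tenants = TENANTS.read().await;
    match &*tenants {
        // Before init_tenant_mgr() finishes, no tenant is visible at all.
        TenantsMap::Initializing => None,
        // Open and ShuttingDown keep existing entries visible, which is what
        // keeps the management API answering during shutdown.
        TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
            m.get(&tenant_id).map(|tenant| tenant.current_state())
        }
    }
}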
/// Timelines that are only partially available locally (remote storage has more data than this pageserver) @@ -36,13 +64,16 @@ pub async fn init_tenant_mgr( remote_storage: Option, ) -> anyhow::Result<()> { // Scan local filesystem for attached tenants - let mut number_of_tenants = 0; let tenants_dir = conf.tenants_path(); + let mut tenants = HashMap::new(); + let mut dir_entries = fs::read_dir(&tenants_dir) .await .with_context(|| format!("Failed to list tenants dir {tenants_dir:?}"))?; + let ctx = RequestContext::todo_child(TaskKind::Startup, DownloadBehavior::Warn); + loop { match dir_entries.next_entry().await { Ok(None) => break, @@ -86,10 +117,10 @@ pub async fn init_tenant_mgr( conf, &tenant_dir_path, remote_storage.clone(), + &ctx, ) { Ok(tenant) => { - TENANTS.write().await.insert(tenant.tenant_id(), tenant); - number_of_tenants += 1; + tenants.insert(tenant.tenant_id(), tenant); } Err(e) => { error!("Failed to collect tenant files from dir {tenants_dir:?} for entry {dir_entry:?}, reason: {e:#}"); @@ -108,7 +139,11 @@ pub async fn init_tenant_mgr( } } - info!("Processed {number_of_tenants} local tenants at startup"); + info!("Processed {} local tenants at startup", tenants.len()); + + let mut tenants_map = TENANTS.write().await; + assert!(matches!(&*tenants_map, &TenantsMap::Initializing)); + *tenants_map = TenantsMap::Open(tenants); Ok(()) } @@ -116,6 +151,7 @@ pub fn schedule_local_tenant_processing( conf: &'static PageServerConf, tenant_path: &Path, remote_storage: Option, + ctx: &RequestContext, ) -> anyhow::Result> { anyhow::ensure!( tenant_path.is_dir(), @@ -150,7 +186,7 @@ pub fn schedule_local_tenant_processing( let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() { info!("tenant {tenant_id} has attaching mark file, resuming its attach operation"); if let Some(remote_storage) = remote_storage { - Tenant::spawn_attach(conf, tenant_id, remote_storage) + Tenant::spawn_attach(conf, tenant_id, remote_storage, ctx) } else { warn!("tenant {tenant_id} has attaching mark file, but pageserver has no remote storage configured"); Tenant::create_broken_tenant(conf, tenant_id) @@ -158,7 +194,7 @@ pub fn schedule_local_tenant_processing( } else { info!("tenant {tenant_id} is assumed to be loadable, starting load operation"); // Start loading the tenant into memory. It will initially be in Loading state. - Tenant::spawn_load(conf, tenant_id, remote_storage) + Tenant::spawn_load(conf, tenant_id, remote_storage, ctx) }; Ok(tenant) } @@ -166,21 +202,44 @@ pub fn schedule_local_tenant_processing( /// /// Shut down all tenants. This runs as part of pageserver shutdown. /// +/// NB: We leave the tenants in the map, so that they remain accessible through +/// the management API until we shut it down. If we removed the shut-down tenants +/// from the tenants map, the management API would return 404 for these tenants, +/// because TenantsMap::get() now returns `None`. +/// That could be easily misinterpreted by control plane, the consumer of the +/// management API. For example, it could attach the tenant on a different pageserver. +/// We would then be in split-brain once this pageserver restarts. pub async fn shutdown_all_tenants() { + // Prevent new tenants from being created. 
let tenants_to_shut_down = { let mut m = TENANTS.write().await; - let mut tenants_to_shut_down = Vec::with_capacity(m.len()); - for (_, tenant) in m.drain() { - if tenant.is_active() { - // updates tenant state, forbidding new GC and compaction iterations from starting - tenant.set_stopping(); - tenants_to_shut_down.push(tenant) + match &mut *m { + TenantsMap::Initializing => { + *m = TenantsMap::ShuttingDown(HashMap::default()); + info!("tenants map is empty"); + return; + } + TenantsMap::Open(tenants) => { + let tenants_clone = tenants.clone(); + *m = TenantsMap::ShuttingDown(std::mem::take(tenants)); + tenants_clone + } + TenantsMap::ShuttingDown(_) => { + error!("already shutting down, this function isn't supposed to be called more than once"); + return; } } - drop(m); - tenants_to_shut_down }; + let mut tenants_to_freeze_and_flush = Vec::with_capacity(tenants_to_shut_down.len()); + for (_, tenant) in tenants_to_shut_down { + if tenant.is_active() { + // updates tenant state, forbidding new GC and compaction iterations from starting + tenant.set_stopping(); + tenants_to_freeze_and_flush.push(tenant); + } + } + // Shut down all existing walreceiver connections and stop accepting the new ones. task_mgr::shutdown_tasks(Some(TaskKind::WalReceiverManager), None, None).await; @@ -192,7 +251,7 @@ pub async fn shutdown_all_tenants() { // should be no more activity in any of the repositories. // // On error, log it but continue with the shutdown for other tenants. - for tenant in tenants_to_shut_down { + for tenant in tenants_to_freeze_and_flush { let tenant_id = tenant.tenant_id(); debug!("shutdown tenant {tenant_id}"); @@ -207,27 +266,23 @@ pub async fn create_tenant( tenant_conf: TenantConfOpt, tenant_id: TenantId, remote_storage: Option, -) -> anyhow::Result>> { - match TENANTS.write().await.entry(tenant_id) { - hash_map::Entry::Occupied(_) => { - debug!("tenant {tenant_id} already exists"); - Ok(None) - } - hash_map::Entry::Vacant(v) => { - // Hold the write_tenants() lock, since all of this is local IO. - // If this section ever becomes contentious, introduce a new `TenantState::Creating`. - let tenant_directory = super::create_tenant_files(conf, tenant_conf, tenant_id)?; - let created_tenant = - schedule_local_tenant_processing(conf, &tenant_directory, remote_storage)?; - let crated_tenant_id = created_tenant.tenant_id(); - anyhow::ensure!( + ctx: &RequestContext, +) -> Result, TenantMapInsertError> { + tenant_map_insert(tenant_id, |vacant_entry| { + // We're holding the tenants lock in write mode while doing local IO. + // If this section ever becomes contentious, introduce a new `TenantState::Creating` + // and do the work in that state. + let tenant_directory = super::create_tenant_files(conf, tenant_conf, tenant_id)?; + let created_tenant = + schedule_local_tenant_processing(conf, &tenant_directory, remote_storage, ctx)?; + let crated_tenant_id = created_tenant.tenant_id(); + anyhow::ensure!( tenant_id == crated_tenant_id, "loaded created tenant has unexpected tenant id (expect {tenant_id} != actual {crated_tenant_id})", ); - v.insert(Arc::clone(&created_tenant)); - Ok(Some(created_tenant)) - } - } + vacant_entry.insert(Arc::clone(&created_tenant)); + Ok(created_tenant) + }).await } pub async fn update_tenant_config( @@ -236,10 +291,11 @@ pub async fn update_tenant_config( tenant_id: TenantId, ) -> anyhow::Result<()> { info!("configuring tenant {tenant_id}"); - get_tenant(tenant_id, true) - .await? 
- .update_tenant_config(tenant_conf); - Tenant::persist_tenant_config(&conf.tenant_config_path(tenant_id), tenant_conf, false)?; + let tenant = get_tenant(tenant_id, true).await?; + + tenant.update_tenant_config(tenant_conf); + let tenant_config_path = conf.tenant_config_path(tenant_id); + Tenant::persist_tenant_config(&tenant.tenant_id(), &tenant_config_path, tenant_conf, false)?; Ok(()) } @@ -260,10 +316,14 @@ pub async fn get_tenant(tenant_id: TenantId, active_only: bool) -> anyhow::Resul } } -pub async fn delete_timeline(tenant_id: TenantId, timeline_id: TimelineId) -> anyhow::Result<()> { +pub async fn delete_timeline( + tenant_id: TenantId, + timeline_id: TimelineId, + ctx: &RequestContext, +) -> anyhow::Result<()> { match get_tenant(tenant_id, true).await { Ok(tenant) => { - tenant.delete_timeline(timeline_id).await?; + tenant.delete_timeline(timeline_id, ctx).await?; } Err(e) => anyhow::bail!("Cannot access tenant {tenant_id} in local tenant state: {e:?}"), } @@ -291,8 +351,9 @@ pub async fn load_tenant( conf: &'static PageServerConf, tenant_id: TenantId, remote_storage: Option, -) -> anyhow::Result<()> { - run_if_no_tenant_in_memory(tenant_id, |vacant_entry| { + ctx: &RequestContext, +) -> Result<(), TenantMapInsertError> { + tenant_map_insert(tenant_id, |vacant_entry| { let tenant_path = conf.tenant_path(&tenant_id); let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(tenant_id); if tenant_ignore_mark.exists() { @@ -300,7 +361,7 @@ pub async fn load_tenant( .with_context(|| format!("Failed to remove tenant ignore mark {tenant_ignore_mark:?} during tenant loading"))?; } - let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, remote_storage) + let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, remote_storage, ctx) .with_context(|| { format!("Failed to schedule tenant processing in path {tenant_path:?}") })?; @@ -329,16 +390,24 @@ pub async fn ignore_tenant( .await } +#[derive(Debug, thiserror::Error)] +pub enum TenantMapListError { + #[error("tenant map is still initiailizing")] + Initializing, +} + /// /// Get list of tenants, for the mgmt API /// -pub async fn list_tenants() -> Vec<(TenantId, TenantState)> { - TENANTS - .read() - .await - .iter() +pub async fn list_tenants() -> Result, TenantMapListError> { + let tenants = TENANTS.read().await; + let m = match &*tenants { + TenantsMap::Initializing => return Err(TenantMapListError::Initializing), + TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m, + }; + Ok(m.iter() .map(|(id, tenant)| (*id, tenant.current_state())) - .collect() + .collect()) } /// Execute Attach mgmt API command. 
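A sketch of a caller of the new `list_tenants`; `print_tenants` is a hypothetical function, but the signature and error variant are the ones introduced above:

async fn print_tenants() {
    match list_tenants().await {
        Ok(tenants) => {
            for (tenant_id, state) in tenants {
                println!("{tenant_id}: {state:?}");
            }
        }
        // Previously this case was indistinguishable from "no tenants on disk".
        Err(TenantMapListError::Initializing) => {
            println!("tenant manager is still initializing, try again later");
        }
    }
}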
@@ -349,34 +418,62 @@ pub async fn attach_tenant( conf: &'static PageServerConf, tenant_id: TenantId, remote_storage: GenericRemoteStorage, -) -> anyhow::Result<()> { - run_if_no_tenant_in_memory(tenant_id, |vacant_entry| { + ctx: &RequestContext, +) -> Result<(), TenantMapInsertError> { + tenant_map_insert(tenant_id, |vacant_entry| { let tenant_path = conf.tenant_path(&tenant_id); anyhow::ensure!( !tenant_path.exists(), "Cannot attach tenant {tenant_id}, local tenant directory already exists" ); - let tenant = Tenant::spawn_attach(conf, tenant_id, remote_storage); + let tenant = Tenant::spawn_attach(conf, tenant_id, remote_storage, ctx); vacant_entry.insert(tenant); - Ok(()) }) .await } -async fn run_if_no_tenant_in_memory(tenant_id: TenantId, run: F) -> anyhow::Result +#[derive(Debug, thiserror::Error)] +pub enum TenantMapInsertError { + #[error("tenant map is still initializing")] + StillInitializing, + #[error("tenant map is shutting down")] + ShuttingDown, + #[error("tenant {0} already exists, state: {1:?}")] + TenantAlreadyExists(TenantId, TenantState), + #[error(transparent)] + Closure(#[from] anyhow::Error), +} + +/// Give the given closure access to the tenants map entry for the given `tenant_id`, iff that +/// entry is vacant. The closure is responsible for creating the tenant object and inserting +/// it into the tenants map through the vacnt entry that it receives as argument. +/// +/// NB: the closure should return quickly because the current implementation of tenants map +/// serializes access through an `RwLock`. +async fn tenant_map_insert( + tenant_id: TenantId, + insert_fn: F, +) -> Result where F: FnOnce(hash_map::VacantEntry>) -> anyhow::Result, { - match TENANTS.write().await.entry(tenant_id) { - hash_map::Entry::Occupied(e) => { - anyhow::bail!( - "tenant {tenant_id} already exists, state: {:?}", - e.get().current_state() - ) - } - hash_map::Entry::Vacant(v) => run(v), + let mut guard = TENANTS.write().await; + let m = match &mut *guard { + TenantsMap::Initializing => return Err(TenantMapInsertError::StillInitializing), + TenantsMap::ShuttingDown(_) => return Err(TenantMapInsertError::ShuttingDown), + TenantsMap::Open(m) => m, + }; + match m.entry(tenant_id) { + hash_map::Entry::Occupied(e) => Err(TenantMapInsertError::TenantAlreadyExists( + tenant_id, + e.get().current_state(), + )), + hash_map::Entry::Vacant(v) => match insert_fn(v) { + Ok(v) => Ok(v), + Err(e) => Err(TenantMapInsertError::Closure(e)), + }, } } @@ -449,9 +546,9 @@ pub async fn immediate_gc( tenant_id: TenantId, timeline_id: TimelineId, gc_req: TimelineGcRequest, + ctx: &RequestContext, ) -> Result>, ApiError> { let guard = TENANTS.read().await; - let tenant = guard .get(&tenant_id) .map(Arc::clone) @@ -462,7 +559,8 @@ pub async fn immediate_gc( // Use tenant's pitr setting let pitr = tenant.get_pitr_interval(); - // Run in task_mgr to avoid race with detach operation + // Run in task_mgr to avoid race with tenant_detach operation + let ctx = ctx.detached_child(TaskKind::GarbageCollector, DownloadBehavior::Download); let (task_done, wait_task_done) = tokio::sync::oneshot::channel(); task_mgr::spawn( &tokio::runtime::Handle::current(), @@ -474,7 +572,7 @@ pub async fn immediate_gc( async move { fail::fail_point!("immediate_gc_task_pre"); let result = tenant - .gc_iteration(Some(timeline_id), gc_horizon, pitr) + .gc_iteration(Some(timeline_id), gc_horizon, pitr, &ctx) .instrument(info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id)) .await; // FIXME: `gc_iteration` can return an error 
for multiple reasons; we should handle it @@ -497,6 +595,7 @@ pub async fn immediate_gc( pub async fn immediate_compact( tenant_id: TenantId, timeline_id: TimelineId, + ctx: &RequestContext, ) -> Result>, ApiError> { let guard = TENANTS.read().await; @@ -510,7 +609,8 @@ pub async fn immediate_compact( .get_timeline(timeline_id, true) .map_err(ApiError::NotFound)?; - // Run in task_mgr to avoid race with detach operation + // Run in task_mgr to avoid race with tenant_detach operation + let ctx = ctx.detached_child(TaskKind::Compaction, DownloadBehavior::Download); let (task_done, wait_task_done) = tokio::sync::oneshot::channel(); task_mgr::spawn( &tokio::runtime::Handle::current(), @@ -523,7 +623,7 @@ pub async fn immediate_compact( false, async move { let result = timeline - .compact() + .compact(&ctx) .instrument( info_span!("manual_compact", tenant = %tenant_id, timeline = %timeline_id), ) diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 013591caee..3f69017160 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -1010,7 +1010,10 @@ impl RemoteTimelineClient { #[cfg(test)] mod tests { use super::*; - use crate::tenant::harness::{TenantHarness, TIMELINE_ID}; + use crate::{ + tenant::harness::{TenantHarness, TIMELINE_ID}, + DEFAULT_PG_VERSION, + }; use remote_storage::{RemoteStorageConfig, RemoteStorageKind}; use std::{collections::HashSet, path::Path}; use utils::lsn::Lsn; @@ -1064,9 +1067,19 @@ mod tests { // Test scheduling #[test] fn upload_scheduling() -> anyhow::Result<()> { + // Use a current-thread runtime in the test + let runtime = Box::leak(Box::new( + tokio::runtime::Builder::new_current_thread() + .enable_all() + .build()?, + )); + let _entered = runtime.enter(); + let harness = TenantHarness::create("upload_scheduling")?; + let (tenant, ctx) = runtime.block_on(harness.load()); + let _timeline = + tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; let timeline_path = harness.timeline_path(&TIMELINE_ID); - std::fs::create_dir_all(&timeline_path)?; let remote_fs_dir = harness.conf.workdir.join("remote_fs"); std::fs::create_dir_all(remote_fs_dir)?; @@ -1084,14 +1097,6 @@ mod tests { storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()), }; - // Use a current-thread runtime in the test - let runtime = Box::leak(Box::new( - tokio::runtime::Builder::new_current_thread() - .enable_all() - .build()?, - )); - let _entered = runtime.enter(); - // Test outline: // // Schedule upload of a bunch of layers. Check that they are started immediately, not queued diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index 61cb32fc76..2fed4f88b3 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -6,6 +6,7 @@ use anyhow::Context; use tokio::sync::oneshot::error::RecvError; use tokio::sync::Semaphore; +use crate::context::RequestContext; use crate::pgdatadir_mapping::CalculateLogicalSizeError; use super::Tenant; @@ -181,6 +182,7 @@ pub(super) async fn gather_inputs( tenant: &Tenant, limit: &Arc, logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>, + ctx: &RequestContext, ) -> anyhow::Result { // with joinset, on drop, all of the tasks will just be de-scheduled, which we can use to // our advantage with `?` error handling. 
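Returning briefly to the `TenantMapInsertError` introduced for `tenant_map_insert` above: a hypothetical helper (the name `describe_insert_error` is invented, and any HTTP status mapping is left out) showing how a caller might fold the variants into a user-facing message:

fn describe_insert_error(err: &TenantMapInsertError) -> String {
    match err {
        // Both of these mean "try again later": the map is not accepting inserts.
        TenantMapInsertError::StillInitializing | TenantMapInsertError::ShuttingDown => {
            format!("pageserver is not accepting new tenants: {err}")
        }
        TenantMapInsertError::TenantAlreadyExists(tenant_id, state) => {
            format!("tenant {tenant_id} already exists, state: {state:?}")
        }
        // Errors returned by the insert closure itself (e.g. local IO failures).
        TenantMapInsertError::Closure(e) => format!("could not insert tenant: {e:#}"),
    }
}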
@@ -188,7 +190,7 @@ pub(super) async fn gather_inputs( // refresh is needed to update gc related pitr_cutoff and horizon_cutoff tenant - .refresh_gc_info() + .refresh_gc_info(ctx) .await .context("Failed to refresh gc_info before gathering inputs")?; @@ -329,7 +331,13 @@ pub(super) async fn gather_inputs( } else { let timeline = Arc::clone(&timeline); let parallel_size_calcs = Arc::clone(limit); - joinset.spawn(calculate_logical_size(parallel_size_calcs, timeline, *lsn)); + let ctx = ctx.attached_child(); + joinset.spawn(calculate_logical_size( + parallel_size_calcs, + timeline, + *lsn, + ctx, + )); } } @@ -387,6 +395,7 @@ pub(super) async fn gather_inputs( parallel_size_calcs, timeline.clone(), lsn, + ctx.attached_child(), )); if let Some(parent_id) = timeline.get_ancestor_timeline_id() { @@ -582,13 +591,14 @@ async fn calculate_logical_size( limit: Arc, timeline: Arc, lsn: utils::lsn::Lsn, + ctx: RequestContext, ) -> Result { let _permit = tokio::sync::Semaphore::acquire_owned(limit) .await .expect("global semaphore should not had been closed"); let size_res = timeline - .spawn_ondemand_logical_size_calculation(lsn) + .spawn_ondemand_logical_size_calculation(lsn, ctx) .instrument(info_span!("spawn_ondemand_logical_size_calculation")) .await?; Ok(TimelineAtLsnSizeResult(timeline, lsn, size_res)) diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 6aee8ce23c..2149fc7eb7 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -196,3 +196,50 @@ pub fn downcast_remote_layer( None } } + +impl std::fmt::Debug for dyn Layer { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Layer") + .field("short_id", &self.short_id()) + .finish() + } +} + +/// Holds metadata about a layer without any content. Used mostly for testing. +pub struct LayerDescriptor { + pub key: Range, + pub lsn: Range, + pub is_incremental: bool, + pub short_id: String, +} + +impl Layer for LayerDescriptor { + fn get_key_range(&self) -> Range { + self.key.clone() + } + + fn get_lsn_range(&self) -> Range { + self.lsn.clone() + } + + fn is_incremental(&self) -> bool { + self.is_incremental + } + + fn get_value_reconstruct_data( + &self, + _key: Key, + _lsn_range: Range, + _reconstruct_data: &mut ValueReconstructState, + ) -> Result { + todo!("This method shouldn't be part of the Layer trait") + } + + fn short_id(&self) -> String { + self.short_id.clone() + } + + fn dump(&self, _verbose: bool) -> Result<()> { + todo!() + } +} diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index b7ad8fe791..b126545ee4 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -5,6 +5,7 @@ use std::ops::ControlFlow; use std::sync::Arc; use std::time::Duration; +use crate::context::{DownloadBehavior, RequestContext}; use crate::metrics::TENANT_TASK_EVENTS; use crate::task_mgr; use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME}; @@ -52,19 +53,20 @@ async fn compaction_loop(tenant_id: TenantId) { info!("starting"); TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); async { + let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download); loop { trace!("waking up"); let tenant = tokio::select! 
{ _ = task_mgr::shutdown_watcher() => { info!("received cancellation request"); - return; + return; }, tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result { ControlFlow::Break(()) => return, ControlFlow::Continue(tenant) => tenant, }, - }; + }; let mut sleep_duration = tenant.get_compaction_period(); if sleep_duration == Duration::ZERO { @@ -73,7 +75,7 @@ async fn compaction_loop(tenant_id: TenantId) { sleep_duration = Duration::from_secs(10); } else { // Run compaction - if let Err(e) = tenant.compaction_iteration().await { + if let Err(e) = tenant.compaction_iteration(&ctx).await { sleep_duration = wait_duration; error!("Compaction failed, retrying in {:?}: {e:?}", sleep_duration); } @@ -103,6 +105,9 @@ async fn gc_loop(tenant_id: TenantId) { info!("starting"); TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); async { + // GC might require downloading, to find the cutoff LSN that corresponds to the + // cutoff specified as time. + let ctx = RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download); loop { trace!("waking up"); @@ -127,7 +132,7 @@ async fn gc_loop(tenant_id: TenantId) { } else { // Run gc if gc_horizon > 0 { - if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval()).await + if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx).await { sleep_duration = wait_duration; error!("Gc failed, retrying in {:?}: {e:?}", sleep_duration); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index d59858f582..0ca8a0e491 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1,5 +1,7 @@ //! +mod walreceiver; + use anyhow::{anyhow, bail, ensure, Context}; use bytes::Bytes; use fail::fail_point; @@ -13,6 +15,7 @@ use pageserver_api::models::{ use tokio::sync::{oneshot, watch, Semaphore, TryAcquireError}; use tokio_util::sync::CancellationToken; use tracing::*; +use utils::id::TenantTimelineId; use std::cmp::{max, min, Ordering}; use std::collections::HashMap; @@ -23,6 +26,8 @@ use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering}; use std::sync::{Arc, Mutex, MutexGuard, RwLock, Weak}; use std::time::{Duration, Instant, SystemTime}; +use crate::broker_client::is_broker_client_initialized; +use crate::context::{DownloadBehavior, RequestContext}; use crate::tenant::remote_timeline_client::{self, index::LayerFileMetadata}; use crate::tenant::storage_layer::{ DeltaFileName, DeltaLayerWriter, ImageFileName, ImageLayerWriter, InMemoryLayer, LayerFileName, @@ -58,11 +63,11 @@ use crate::page_cache; use crate::repository::GcResult; use crate::repository::{Key, Value}; use crate::task_mgr::TaskKind; -use crate::walreceiver::{is_broker_client_initialized, spawn_connection_manager_task}; use crate::walredo::WalRedoManager; use crate::METADATA_FILE_NAME; use crate::ZERO_PAGE; use crate::{is_temporary, task_mgr}; +use walreceiver::spawn_connection_manager_task; use super::remote_timeline_client::index::IndexPart; use super::remote_timeline_client::RemoteTimelineClient; @@ -128,7 +133,6 @@ pub struct Timeline { ancestor_timeline: Option>, ancestor_lsn: Lsn, - // Metrics metrics: TimelineMetrics, /// Ensures layers aren't frozen by checkpointer between @@ -377,6 +381,12 @@ pub enum PageReconstructError { #[error(transparent)] Other(#[from] anyhow::Error), // source and Display delegate to anyhow::Error + /// The operation would require downloading a layer that is missing locally. 
+ NeedsDownload(TenantTimelineId, LayerFileName), + + /// The operation was cancelled + Cancelled, + /// An error happened replaying WAL records #[error(transparent)] WalRedo(#[from] crate::walredo::WalRedoError), @@ -386,6 +396,33 @@ impl std::fmt::Debug for PageReconstructError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { match self { Self::Other(err) => err.fmt(f), + Self::NeedsDownload(tenant_timeline_id, layer_file_name) => { + write!( + f, + "layer {}/{} needs download", + tenant_timeline_id, + layer_file_name.file_name() + ) + } + Self::Cancelled => write!(f, "cancelled"), + Self::WalRedo(err) => err.fmt(f), + } + } +} + +impl std::fmt::Display for PageReconstructError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { + match self { + Self::Other(err) => err.fmt(f), + Self::NeedsDownload(tenant_timeline_id, layer_file_name) => { + write!( + f, + "layer {}/{} needs download", + tenant_timeline_id, + layer_file_name.file_name() + ) + } + Self::Cancelled => write!(f, "cancelled"), Self::WalRedo(err) => err.fmt(f), } } @@ -422,11 +459,24 @@ impl Timeline { /// an ancestor branch, for example, or waste a lot of cycles chasing the /// non-existing key. /// - pub async fn get(&self, key: Key, lsn: Lsn) -> Result { + pub async fn get( + &self, + key: Key, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result { if !lsn.is_valid() { return Err(PageReconstructError::Other(anyhow::anyhow!("Invalid LSN"))); } + // XXX: structured stats collection for layer eviction here. + trace!( + "get page request for {}@{} from task kind {:?}", + key, + lsn, + ctx.task_kind() + ); + // Check the page cache. We will get back the most recent page with lsn <= `lsn`. // The cached image can be returned directly if there is no WAL between the cached image // and requested LSN. The cached image can also be used to reduce the amount of WAL needed @@ -450,7 +500,7 @@ impl Timeline { img: cached_page_img, }; - self.get_reconstruct_data(key, lsn, &mut reconstruct_state) + self.get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx) .await?; self.metrics @@ -513,13 +563,25 @@ impl Timeline { /// You should call this before any of the other get_* or list_* functions. Calling /// those functions with an LSN that has been processed yet is an error. /// - pub async fn wait_lsn(&self, lsn: Lsn) -> anyhow::Result<()> { + pub async fn wait_lsn( + &self, + lsn: Lsn, + _ctx: &RequestContext, /* Prepare for use by cancellation */ + ) -> anyhow::Result<()> { anyhow::ensure!(self.is_active(), "Cannot wait for Lsn on inactive timeline"); // This should never be called from the WAL receiver, because that could lead // to a deadlock. 
anyhow::ensure!( - task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnection), + task_mgr::current_task_kind() != Some(TaskKind::WalReceiverManager), + "wait_lsn cannot be called in WAL receiver" + ); + anyhow::ensure!( + task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionHandler), + "wait_lsn cannot be called in WAL receiver" + ); + anyhow::ensure!( + task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionPoller), "wait_lsn cannot be called in WAL receiver" ); @@ -558,7 +620,7 @@ impl Timeline { self.flush_frozen_layers_and_wait().await } - pub async fn compact(&self) -> anyhow::Result<()> { + pub async fn compact(&self, ctx: &RequestContext) -> anyhow::Result<()> { let last_record_lsn = self.get_last_record_lsn(); // Last record Lsn could be zero in case the timeline was just created @@ -616,14 +678,16 @@ impl Timeline { .repartition( self.get_last_record_lsn(), self.get_compaction_target_size(), + ctx, ) .await { Ok((partitioning, lsn)) => { // 2. Create new image layers for partitions that have been modified // "enough". - let layer_paths_to_upload = - self.create_image_layers(&partitioning, lsn, false).await?; + let layer_paths_to_upload = self + .create_image_layers(&partitioning, lsn, false, ctx) + .await?; if let Some(remote_client) = &self.remote_client { for (path, layer_metadata) in layer_paths_to_upload { remote_client.schedule_layer_file_upload(&path, &layer_metadata)?; @@ -673,7 +737,10 @@ impl Timeline { /// the initial size calculation has not been run (gets triggered on the first size access). /// /// return size and boolean flag that shows if the size is exact - pub fn get_current_logical_size(self: &Arc) -> anyhow::Result<(u64, bool)> { + pub fn get_current_logical_size( + self: &Arc, + ctx: &RequestContext, + ) -> anyhow::Result<(u64, bool)> { let current_size = self.current_logical_size.current_size()?; debug!("Current size: {current_size:?}"); @@ -683,7 +750,7 @@ impl Timeline { (current_size, self.current_logical_size.initial_part_end) { is_exact = false; - self.try_spawn_size_init_task(init_lsn); + self.try_spawn_size_init_task(init_lsn, ctx); } Ok((size, is_exact)) @@ -729,16 +796,24 @@ impl Timeline { Ok(()) } + pub fn activate(self: &Arc) { + self.set_state(TimelineState::Active); + self.launch_wal_receiver(); + } + pub fn set_state(&self, new_state: TimelineState) { match (self.current_state(), new_state) { (equal_state_1, equal_state_2) if equal_state_1 == equal_state_2 => { - debug!("Ignoring new state, equal to the existing one: {equal_state_2:?}"); + warn!("Ignoring new state, equal to the existing one: {equal_state_2:?}"); + } + (st, TimelineState::Loading) => { + error!("ignoring transition from {st:?} into Loading state"); } (TimelineState::Broken, _) => { error!("Ignoring state update {new_state:?} for broken tenant"); } (TimelineState::Stopping, TimelineState::Active) => { - debug!("Not activating a Stopping timeline"); + error!("Not activating a Stopping timeline"); } (_, new_state) => { self.state.send_replace(new_state); @@ -812,7 +887,7 @@ impl Timeline { pg_version: u32, ) -> Arc { let disk_consistent_lsn = metadata.disk_consistent_lsn(); - let (state, _) = watch::channel(TimelineState::Suspended); + let (state, _) = watch::channel(TimelineState::Loading); let (layer_flush_start_tx, _) = tokio::sync::watch::channel(0); let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(()))); @@ -884,6 +959,10 @@ impl Timeline { }; result.repartition_threshold = result.get_checkpoint_distance() / 10; result 
+ .metrics + .last_record_gauge + .set(disk_consistent_lsn.0 as i64); + result }) } @@ -909,22 +988,25 @@ impl Timeline { let layer_flush_start_rx = self.layer_flush_start_tx.subscribe(); let self_clone = Arc::clone(self); + info!("spawning flush loop"); task_mgr::spawn( - task_mgr::BACKGROUND_RUNTIME.handle(), - task_mgr::TaskKind::LayerFlushTask, - Some(self.tenant_id), - Some(self.timeline_id), - "layer flush task", - false, - async move { - self_clone.flush_loop(layer_flush_start_rx).await; - let mut flush_loop_state = self_clone.flush_loop_state.lock().unwrap(); - assert_eq!(*flush_loop_state, FlushLoopState::Running); - *flush_loop_state = FlushLoopState::Exited; - Ok(()) } - .instrument(info_span!(parent: None, "layer flush task", tenant = %self.tenant_id, timeline = %self.timeline_id)) - ); + task_mgr::BACKGROUND_RUNTIME.handle(), + task_mgr::TaskKind::LayerFlushTask, + Some(self.tenant_id), + Some(self.timeline_id), + "layer flush task", + false, + async move { + let background_ctx = RequestContext::todo_child(TaskKind::LayerFlushTask, DownloadBehavior::Error); + self_clone.flush_loop(layer_flush_start_rx, &background_ctx).await; + let mut flush_loop_state = self_clone.flush_loop_state.lock().unwrap(); + assert_eq!(*flush_loop_state, FlushLoopState::Running); + *flush_loop_state = FlushLoopState::Exited; + Ok(()) + } + .instrument(info_span!(parent: None, "layer flush task", tenant = %self.tenant_id, timeline = %self.timeline_id)) + ); *flush_loop_state = FlushLoopState::Running; } @@ -955,12 +1037,16 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.max_lsn_wal_lag); drop(tenant_conf_guard); let self_clone = Arc::clone(self); + let background_ctx = + // XXX: this is a detached_child. Plumb through the ctx from call sites. + RequestContext::todo_child(TaskKind::WalReceiverManager, DownloadBehavior::Error); spawn_connection_manager_task( self_clone, walreceiver_connect_timeout, lagging_wal_timeout, max_lsn_wal_lag, crate::config::SAFEKEEPER_AUTH_TOKEN.get().cloned(), + background_ctx, ); } @@ -970,6 +1056,7 @@ impl Timeline { /// pub(super) fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> { let mut layers = self.layers.write().unwrap(); + let mut updates = layers.batch_update(); let mut num_layers = 0; let timer = self.metrics.load_layer_map_histo.start_timer(); @@ -1010,7 +1097,7 @@ impl Timeline { trace!("found layer {}", layer.path().display()); total_physical_size += file_size; - layers.insert_historic(Arc::new(layer)); + updates.insert_historic(Arc::new(layer)); num_layers += 1; } else if let Some(deltafilename) = DeltaFileName::parse_str(&fname) { // Create a DeltaLayer struct for each delta file. @@ -1041,7 +1128,7 @@ impl Timeline { trace!("found layer {}", layer.path().display()); total_physical_size += file_size; - layers.insert_historic(Arc::new(layer)); + updates.insert_historic(Arc::new(layer)); num_layers += 1; } else if fname == METADATA_FILE_NAME || fname.ends_with(".old") { // ignore these @@ -1067,6 +1154,7 @@ impl Timeline { } } + updates.flush(); layers.next_open_layer_at = Some(Lsn(disk_consistent_lsn.0) + 1); info!( @@ -1091,6 +1179,11 @@ impl Timeline { // Are we missing some files that are present in remote storage? // Create RemoteLayer instances for them. let mut local_only_layers = local_layers; + + // We're holding a layer map lock for a while but this + // method is only called during init so it's fine. 
+ let mut layer_map = self.layers.write().unwrap(); + let mut updates = layer_map.batch_update(); for remote_layer_name in &index_part.timeline_layers { let local_layer = local_only_layers.remove(remote_layer_name); @@ -1129,7 +1222,7 @@ impl Timeline { anyhow::bail!("could not rename file {local_layer_path:?}: {err:?}"); } else { self.metrics.resident_physical_size_gauge.sub(local_size); - self.layers.write().unwrap().remove_historic(local_layer); + updates.remove_historic(local_layer); // fall-through to adding the remote layer } } else { @@ -1171,7 +1264,7 @@ impl Timeline { ); let remote_layer = Arc::new(remote_layer); - self.layers.write().unwrap().insert_historic(remote_layer); + updates.insert_historic(remote_layer); } LayerFileName::Delta(deltafilename) => { // Create a RemoteLayer for the delta file. @@ -1194,13 +1287,14 @@ impl Timeline { &remote_layer_metadata, ); let remote_layer = Arc::new(remote_layer); - self.layers.write().unwrap().insert_historic(remote_layer); + updates.insert_historic(remote_layer); } #[cfg(test)] LayerFileName::Test(_) => unreachable!(), } } + updates.flush(); Ok(local_only_layers) } @@ -1280,7 +1374,7 @@ impl Timeline { Ok(()) } - fn try_spawn_size_init_task(self: &Arc, init_lsn: Lsn) { + fn try_spawn_size_init_task(self: &Arc, init_lsn: Lsn, ctx: &RequestContext) { let permit = match Arc::clone(&self.current_logical_size.initial_size_computation) .try_acquire_owned() { @@ -1296,8 +1390,18 @@ impl Timeline { .initial_logical_size .get() .is_none()); + + info!( + "spawning logical size computation from context of task kind {:?}", + ctx.task_kind() + ); // We need to start the computation task. + // It gets a separate context since it will outlive the request that called this function. let self_clone = Arc::clone(self); + let background_ctx = ctx.detached_child( + TaskKind::InitialLogicalSizeCalculation, + DownloadBehavior::Download, + ); task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), task_mgr::TaskKind::InitialLogicalSizeCalculation, @@ -1307,7 +1411,9 @@ impl Timeline { false, // NB: don't log errors here, task_mgr will do that. async move { - let calculated_size = match self_clone.logical_size_calculation_task(init_lsn).await + let calculated_size = match self_clone + .logical_size_calculation_task(init_lsn, &background_ctx) + .await { Ok(s) => s, Err(CalculateLogicalSizeError::Cancelled) => { @@ -1342,18 +1448,27 @@ impl Timeline { pub fn spawn_ondemand_logical_size_calculation( self: &Arc, lsn: Lsn, + ctx: RequestContext, ) -> oneshot::Receiver> { let (sender, receiver) = oneshot::channel(); let self_clone = Arc::clone(self); + // XXX if our caller loses interest, i.e., ctx is cancelled, + // we should stop the size calculation work and return an error. + // That would require restructuring this function's API to + // return the result directly, instead of a Receiver for the result. 
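        // Illustrative only: the restructured API hinted at in the XXX above
        // would let the caller await the result (and its cancellation) directly
        // instead of holding a oneshot Receiver, roughly along these lines
        // (hypothetical signature, not part of this change):
        //
        //     pub async fn ondemand_logical_size(
        //         self: &Arc<Self>,
        //         lsn: Lsn,
        //         ctx: &RequestContext,
        //     ) -> Result<u64, CalculateLogicalSizeError> {
        //         self.logical_size_calculation_task(lsn, ctx).await
        //     }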
+ let ctx = ctx.detached_child( + TaskKind::OndemandLogicalSizeCalculation, + DownloadBehavior::Download, + ); task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), - task_mgr::TaskKind::InitialLogicalSizeCalculation, + task_mgr::TaskKind::OndemandLogicalSizeCalculation, Some(self.tenant_id), Some(self.timeline_id), "ondemand logical size calculation", false, async move { - let res = self_clone.logical_size_calculation_task(lsn).await; + let res = self_clone.logical_size_calculation_task(lsn, &ctx).await; let _ = sender.send(res).ok(); Ok(()) // Receiver is responsible for handling errors }, @@ -1365,6 +1480,7 @@ impl Timeline { async fn logical_size_calculation_task( self: &Arc, init_lsn: Lsn, + ctx: &RequestContext, ) -> Result { let mut timeline_state_updates = self.subscribe_for_state_updates(); let self_calculation = Arc::clone(self); @@ -1372,12 +1488,13 @@ impl Timeline { let calculation = async { let cancel = cancel.child_token(); + let ctx = ctx.attached_child(); tokio::task::spawn_blocking(move || { // Run in a separate thread since this can do a lot of // synchronous file IO without .await inbetween // if there are no RemoteLayers that would require downloading. let h = tokio::runtime::Handle::current(); - h.block_on(self_calculation.calculate_logical_size(init_lsn, cancel)) + h.block_on(self_calculation.calculate_logical_size(init_lsn, cancel, &ctx)) }) .await .context("Failed to spawn calculation result task")? @@ -1392,7 +1509,7 @@ impl Timeline { TimelineState::Active => continue, TimelineState::Broken | TimelineState::Stopping - | TimelineState::Suspended => { + | TimelineState::Loading => { break format!("aborted because timeline became inactive (new state: {new_state:?})") } } @@ -1432,10 +1549,11 @@ impl Timeline { /// /// NOTE: counted incrementally, includes ancestors. This can be a slow operation, /// especially if we need to download remote layers. - async fn calculate_logical_size( + pub async fn calculate_logical_size( &self, up_to_lsn: Lsn, cancel: CancellationToken, + ctx: &RequestContext, ) -> Result { info!( "Calculating logical size for timeline {} at {}", @@ -1478,7 +1596,7 @@ impl Timeline { self.metrics.logical_size_histo.start_timer() }; let logical_size = self - .get_current_logical_size_non_incremental(up_to_lsn, cancel) + .get_current_logical_size_non_incremental(up_to_lsn, cancel, ctx) .await?; debug!("calculated logical size: {logical_size}"); timer.stop_and_record(); @@ -1555,6 +1673,7 @@ impl Timeline { key: Key, request_lsn: Lsn, reconstruct_state: &mut ValueReconstructState, + ctx: &RequestContext, ) -> Result<(), PageReconstructError> { // Start from the current timeline. let mut timeline_owned; @@ -1742,14 +1861,43 @@ impl Timeline { let remote_layer_as_persistent: Arc = Arc::clone(&remote_layer) as Arc; let id = remote_layer_as_persistent.traversal_id(); - info!("need remote layer {id}"); + info!( + "need remote layer {} for task kind {:?}", + id, + ctx.task_kind() + ); // The next layer doesn't exist locally. Need to download it. // (The control flow is a bit complicated here because we must drop the 'layers' // lock before awaiting on the Future.) 
- info!("on-demand downloading remote layer {id}"); - timeline.download_remote_layer(remote_layer).await?; - continue 'layer_map_search; + match ( + ctx.download_behavior(), + self.conf.ondemand_download_behavior_treat_error_as_warn, + ) { + (DownloadBehavior::Download, _) => { + info!( + "on-demand downloading remote layer {id} for task kind {:?}", + ctx.task_kind() + ); + timeline.download_remote_layer(remote_layer).await?; + continue 'layer_map_search; + } + (DownloadBehavior::Warn, _) | (DownloadBehavior::Error, true) => { + warn!( + "unexpectedly on-demand downloading remote layer {} for task kind {:?}", + id, + ctx.task_kind() + ); + timeline.download_remote_layer(remote_layer).await?; + continue 'layer_map_search; + } + (DownloadBehavior::Error, false) => { + return Err(PageReconstructError::NeedsDownload( + TenantTimelineId::new(self.tenant_id, self.timeline_id), + remote_layer.file_name.clone(), + )) + } + } } } } @@ -1871,7 +2019,11 @@ impl Timeline { } /// Layer flusher task's main loop. - async fn flush_loop(&self, mut layer_flush_start_rx: tokio::sync::watch::Receiver) { + async fn flush_loop( + &self, + mut layer_flush_start_rx: tokio::sync::watch::Receiver, + ctx: &RequestContext, + ) { info!("started flush loop"); loop { tokio::select! { @@ -1892,7 +2044,7 @@ impl Timeline { // drop 'layers' lock to allow concurrent reads and writes }; if let Some(layer_to_flush) = layer_to_flush { - if let Err(err) = self.flush_frozen_layer(layer_to_flush).await { + if let Err(err) = self.flush_frozen_layer(layer_to_flush, ctx).await { error!("could not flush frozen layer: {err:?}"); break Err(err); } @@ -1957,8 +2109,12 @@ impl Timeline { } /// Flush one frozen in-memory layer to disk, as a new delta layer. - #[instrument(skip(self, frozen_layer), fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%frozen_layer.short_id()))] - async fn flush_frozen_layer(&self, frozen_layer: Arc) -> anyhow::Result<()> { + #[instrument(skip(self, frozen_layer, ctx), fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%frozen_layer.short_id()))] + async fn flush_frozen_layer( + &self, + frozen_layer: Arc, + ctx: &RequestContext, + ) -> anyhow::Result<()> { // As a special case, when we have just imported an image into the repository, // instead of writing out a L0 delta layer, we directly write out image layer // files instead. This is possible as long as *all* the data imported into the @@ -1966,10 +2122,12 @@ impl Timeline { let lsn_range = frozen_layer.get_lsn_range(); let layer_paths_to_upload = if lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1) { + // Note: The 'ctx' in use here has DownloadBehavior::Error. We should not + // require downloading anything during initial import. let (partitioning, _lsn) = self - .repartition(self.initdb_lsn, self.get_compaction_target_size()) + .repartition(self.initdb_lsn, self.get_compaction_target_size(), ctx) .await?; - self.create_image_layers(&partitioning, self.initdb_lsn, true) + self.create_image_layers(&partitioning, self.initdb_lsn, true, ctx) .await? } else { // normal case, write out a L0 delta layer file. 
@@ -2099,10 +2257,11 @@ impl Timeline { ])?; // Add it to the layer map - { - let mut layers = self.layers.write().unwrap(); - layers.insert_historic(Arc::new(new_delta)); - } + self.layers + .write() + .unwrap() + .batch_update() + .insert_historic(Arc::new(new_delta)); // update the timeline's physical size let sz = new_delta_path.metadata()?.len(); @@ -2119,6 +2278,7 @@ impl Timeline { &self, lsn: Lsn, partition_size: u64, + ctx: &RequestContext, ) -> anyhow::Result<(KeyPartitioning, Lsn)> { { let partitioning_guard = self.partitioning.lock().unwrap(); @@ -2129,7 +2289,7 @@ impl Timeline { return Ok((partitioning_guard.0.clone(), partitioning_guard.1)); } } - let keyspace = self.collect_keyspace(lsn).await?; + let keyspace = self.collect_keyspace(lsn, ctx).await?; let partitioning = keyspace.partition(partition_size); let mut partitioning_guard = self.partitioning.lock().unwrap(); @@ -2166,13 +2326,15 @@ impl Timeline { // are some delta layers *later* than current 'lsn', if more WAL was processed and flushed // after we read last_record_lsn, which is passed here in the 'lsn' argument. if img_lsn < lsn { - let num_deltas = layers.count_deltas(&img_range, &(img_lsn..lsn))?; + let threshold = self.get_image_creation_threshold(); + let num_deltas = + layers.count_deltas(&img_range, &(img_lsn..lsn), Some(threshold))?; debug!( "key range {}-{}, has {} deltas on this timeline in LSN range {}..{}", img_range.start, img_range.end, num_deltas, img_lsn, lsn ); - if num_deltas >= self.get_image_creation_threshold() { + if num_deltas >= threshold { return Ok(true); } } @@ -2187,6 +2349,7 @@ impl Timeline { partitioning: &KeyPartitioning, lsn: Lsn, force: bool, + ctx: &RequestContext, ) -> Result, PageReconstructError> { let timer = self.metrics.create_images_time_histo.start_timer(); let mut image_layers: Vec = Vec::new(); @@ -2211,7 +2374,7 @@ impl Timeline { for range in &partition.ranges { let mut key = range.start; while key < range.end { - let img = match self.get(key, lsn).await { + let img = match self.get(key, lsn, ctx).await { Ok(img) => img, Err(err) => { // If we fail to reconstruct a VM or FSM page, we can zero the @@ -2267,21 +2430,23 @@ impl Timeline { let mut layer_paths_to_upload = HashMap::with_capacity(image_layers.len()); let mut layers = self.layers.write().unwrap(); + let mut updates = layers.batch_update(); let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id); for l in image_layers { let path = l.filename(); let metadata = timeline_path .join(path.file_name()) .metadata() - .context("reading metadata of layer file {path}")?; + .with_context(|| format!("reading metadata of layer file {}", path.file_name()))?; layer_paths_to_upload.insert(path, LayerFileMetadata::new(metadata.len())); self.metrics .resident_physical_size_gauge .add(metadata.len()); - layers.insert_historic(Arc::new(l)); + updates.insert_historic(Arc::new(l)); } + updates.flush(); drop(layers); timer.stop_and_record(); @@ -2577,6 +2742,7 @@ impl Timeline { } let mut layers = self.layers.write().unwrap(); + let mut updates = layers.batch_update(); let mut new_layer_paths = HashMap::with_capacity(new_layers.len()); for l in new_layers { let new_delta_path = l.path(); @@ -2597,7 +2763,7 @@ impl Timeline { new_layer_paths.insert(new_delta_path, LayerFileMetadata::new(metadata.len())); let x: Arc = Arc::new(l); - layers.insert_historic(x); + updates.insert_historic(x); } // Now that we have reshuffled the data to set of new delta layers, we can @@ -2611,8 +2777,9 @@ impl Timeline { } 
layer_names_to_delete.push(l.filename()); l.delete()?; - layers.remove_historic(l); + updates.remove_historic(l); } + updates.flush(); drop(layers); // Also schedule the deletions in remote storage @@ -2662,6 +2829,7 @@ impl Timeline { retain_lsns: Vec, cutoff_horizon: Lsn, pitr: Duration, + ctx: &RequestContext, ) -> anyhow::Result<()> { // First, calculate pitr_cutoff_timestamp and then convert it to LSN. // @@ -2674,7 +2842,7 @@ impl Timeline { if let Some(pitr_cutoff_timestamp) = now.checked_sub(pitr) { let pitr_timestamp = to_pg_timestamp(pitr_cutoff_timestamp); - match self.find_lsn_for_timestamp(pitr_timestamp).await? { + match self.find_lsn_for_timestamp(pitr_timestamp, ctx).await? { LsnForTimestamp::Present(lsn) => lsn, LsnForTimestamp::Future(lsn) => { // The timestamp is in the future. That sounds impossible, @@ -2725,6 +2893,8 @@ impl Timeline { /// obsolete. /// pub(super) async fn gc(&self) -> anyhow::Result { + let timer = self.metrics.garbage_collect_histo.start_timer(); + fail_point!("before-timeline-gc"); let _layer_removal_cs = self.layer_removal_cs.lock().await; @@ -2745,11 +2915,17 @@ impl Timeline { let new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff); - self.gc_timeline(horizon_cutoff, pitr_cutoff, retain_lsns, new_gc_cutoff) + let res = self + .gc_timeline(horizon_cutoff, pitr_cutoff, retain_lsns, new_gc_cutoff) .instrument( info_span!("gc_timeline", timeline = %self.timeline_id, cutoff = %new_gc_cutoff), ) - .await + .await?; + + // only record successes + timer.stop_and_record(); + + Ok(res) } async fn gc_timeline( @@ -2812,6 +2988,7 @@ impl Timeline { // 3. it doesn't need to be retained for 'retain_lsns'; // 4. newer on-disk image layers cover the layer's whole key range // + // TODO holding a write lock is too agressive and avoidable let mut layers = self.layers.write().unwrap(); 'outer: for l in layers.iter_historic_layers() { result.layers_total += 1; @@ -2843,6 +3020,8 @@ impl Timeline { // might be referenced by child branches forever. // We can track this in child timeline GC and delete parent layers when // they are no longer needed. This might be complicated with long inheritance chains. + // + // TODO Vec is not a great choice for `retain_lsns` for retain_lsn in &retain_lsns { // start_lsn is inclusive if &l.get_lsn_range().start <= retain_lsn { @@ -2896,6 +3075,7 @@ impl Timeline { layers_to_remove.push(Arc::clone(&l)); } + let mut updates = layers.batch_update(); if !layers_to_remove.is_empty() { // Persist the new GC cutoff value in the metadata file, before // we actually remove anything. @@ -2913,7 +3093,13 @@ impl Timeline { } layer_names_to_delete.push(doomed_layer.filename()); doomed_layer.delete()?; // FIXME: schedule succeeded deletions before returning? - layers.remove_historic(doomed_layer); + + // TODO Removing from the bottom of the layer map is expensive. + // Maybe instead discard all layer map historic versions that + // won't be needed for page reconstruction for this timeline, + // and mark what we can't delete yet as deleted from the layer + // map index without actually rebuilding the index. + updates.remove_historic(doomed_layer); result.layers_removed += 1; } @@ -2925,6 +3111,7 @@ impl Timeline { remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?; } } + updates.flush(); info!( "GC completed removing {} layers, cutoff {}", @@ -3081,11 +3268,13 @@ impl Timeline { // Delta- or ImageLayer in the layer map. 
let new_layer = remote_layer.create_downloaded_layer(self_clone.conf, *size); let mut layers = self_clone.layers.write().unwrap(); + let mut updates = layers.batch_update(); { let l: Arc = remote_layer.clone(); - layers.remove_historic(l); + updates.remove_historic(l); } - layers.insert_historic(new_layer); + updates.insert_historic(new_layer); + updates.flush(); drop(layers); // Now that we've inserted the download into the layer map, diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs similarity index 83% rename from pageserver/src/walreceiver.rs rename to pageserver/src/tenant/timeline/walreceiver.rs index fc9daadc5c..f33a12c5cc 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/tenant/timeline/walreceiver.rs @@ -23,58 +23,15 @@ mod connection_manager; mod walreceiver_connection; -use crate::config::PageServerConf; use crate::task_mgr::WALRECEIVER_RUNTIME; -use anyhow::Context; -use once_cell::sync::OnceCell; use std::future::Future; -use storage_broker::BrokerClientChannel; use tokio::sync::watch; use tokio_util::sync::CancellationToken; use tracing::*; pub use connection_manager::spawn_connection_manager_task; -static BROKER_CLIENT: OnceCell = OnceCell::new(); - -/// -/// Initialize the broker client. This must be called once at page server startup. -/// -pub async fn init_broker_client(conf: &'static PageServerConf) -> anyhow::Result<()> { - let broker_endpoint = conf.broker_endpoint.clone(); - - // Note: we do not attempt connecting here (but validate endpoints sanity). - let broker_client = - storage_broker::connect(broker_endpoint.clone(), conf.broker_keepalive_interval).context( - format!( - "Failed to create broker client to {}", - &conf.broker_endpoint - ), - )?; - - if BROKER_CLIENT.set(broker_client).is_err() { - panic!("broker already initialized"); - } - - info!( - "Initialized broker client with endpoints: {}", - broker_endpoint - ); - Ok(()) -} - -/// -/// Get a handle to the broker client -/// -pub fn get_broker_client() -> &'static BrokerClientChannel { - BROKER_CLIENT.get().expect("broker client not initialized") -} - -pub fn is_broker_client_initialized() -> bool { - BROKER_CLIENT.get().is_some() -} - /// A handle of an asynchronous task. /// The task has a channel that it can use to communicate its lifecycle events in a certain form, see [`TaskEvent`] /// and a cancellation token that it can listen to for earlier interrupts. 
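The block removed above is the process-global broker client that used to live in `walreceiver.rs`: a `OnceCell<BrokerClientChannel>` set once at startup and handed out by reference afterwards. That responsibility moves to the new `crate::broker_client` module (see the `use crate::broker_client::get_broker_client` import in the connection-manager hunk below). As a reference for the pattern only, here is a minimal sketch of such a once-initialized global; `Client` is a stand-in type, not the real `storage_broker` API:

```rust
// Minimal sketch of the once-initialized global used for the broker client.
// `Client` stands in for storage_broker::BrokerClientChannel; the real
// broker_client module created by this patch may differ in details.
use once_cell::sync::OnceCell;

#[derive(Clone)]
pub struct Client {
    pub endpoint: String,
}

static BROKER_CLIENT: OnceCell<Client> = OnceCell::new();

/// Must be called exactly once at pageserver startup, before any task
/// calls `get_broker_client()`.
pub fn init_broker_client(endpoint: &str) -> anyhow::Result<()> {
    let client = Client { endpoint: endpoint.to_owned() };
    BROKER_CLIENT
        .set(client)
        .map_err(|_| anyhow::anyhow!("broker client already initialized"))?;
    Ok(())
}

/// Panics if `init_broker_client` has not run yet, matching the removed code.
pub fn get_broker_client() -> &'static Client {
    BROKER_CLIENT.get().expect("broker client not initialized")
}
```

The removed `is_broker_client_initialized()` helper fits the same pattern as `BROKER_CLIENT.get().is_some()`.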
@@ -95,7 +52,6 @@ pub enum TaskEvent { #[derive(Debug, Clone)] pub enum TaskStateUpdate { - Init, Started, Progress(E), } diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs similarity index 96% rename from pageserver/src/walreceiver/connection_manager.rs rename to pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 8b60e59305..cd7c7c51d2 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -11,10 +11,12 @@ use std::{collections::HashMap, num::NonZeroU64, ops::ControlFlow, sync::Arc, time::Duration}; -use crate::task_mgr::TaskKind; +use super::TaskStateUpdate; +use crate::broker_client::get_broker_client; +use crate::context::RequestContext; use crate::task_mgr::WALRECEIVER_RUNTIME; +use crate::task_mgr::{self, TaskKind}; use crate::tenant::Timeline; -use crate::{task_mgr, walreceiver::TaskStateUpdate}; use anyhow::Context; use chrono::{NaiveDateTime, Utc}; use pageserver_api::models::TimelineState; @@ -27,10 +29,7 @@ use storage_broker::Streaming; use tokio::{select, sync::watch}; use tracing::*; -use crate::{ - exponential_backoff, walreceiver::get_broker_client, DEFAULT_BASE_BACKOFF_SECONDS, - DEFAULT_MAX_BACKOFF_SECONDS, -}; +use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS}; use postgres_connection::{parse_host_port, PgConnectionConfig}; use utils::{ id::{NodeId, TenantTimelineId}, @@ -46,6 +45,7 @@ pub fn spawn_connection_manager_task( lagging_wal_timeout: Duration, max_lsn_wal_lag: NonZeroU64, auth_token: Option>, + ctx: RequestContext, ) { let mut broker_client = get_broker_client().clone(); @@ -78,6 +78,7 @@ pub fn spawn_connection_manager_task( loop_step_result = connection_manager_loop_step( &mut broker_client, &mut walreceiver_state, + &ctx, ) => match loop_step_result { ControlFlow::Continue(()) => continue, ControlFlow::Break(()) => { @@ -101,6 +102,7 @@ pub fn spawn_connection_manager_task( async fn connection_manager_loop_step( broker_client: &mut BrokerClientChannel, walreceiver_state: &mut WalreceiverState, + ctx: &RequestContext, ) -> ControlFlow<(), ()> { let mut timeline_state_updates = walreceiver_state.timeline.subscribe_for_state_updates(); @@ -145,7 +147,7 @@ async fn connection_manager_loop_step( let wal_connection = walreceiver_state.wal_connection.as_mut() .expect("Should have a connection, as checked by the corresponding select! 
guard"); match wal_connection_update { - TaskEvent::Update(TaskStateUpdate::Init | TaskStateUpdate::Started) => {}, + TaskEvent::Update(TaskStateUpdate::Started) => {}, TaskEvent::Update(TaskStateUpdate::Progress(new_status)) => { if new_status.has_processed_wal { // We have advanced last_record_lsn by processing the WAL received @@ -183,13 +185,23 @@ async fn connection_manager_loop_step( new_event = async { loop { + if walreceiver_state.timeline.current_state() == TimelineState::Loading { + warn!("wal connection manager should only be launched after timeline has become active"); + } match timeline_state_updates.changed().await { Ok(()) => { let new_state = walreceiver_state.timeline.current_state(); match new_state { // we're already active as walreceiver, no need to reactivate TimelineState::Active => continue, - TimelineState::Broken | TimelineState::Stopping | TimelineState::Suspended => return ControlFlow::Continue(new_state), + TimelineState::Broken | TimelineState::Stopping => { + info!("timeline entered terminal state {new_state:?}, stopping wal connection manager loop"); + return ControlFlow::Break(()); + } + TimelineState::Loading => { + warn!("timeline transitioned back to Loading state, that should not happen"); + return ControlFlow::Continue(new_state); + } } } Err(_sender_dropped_error) => return ControlFlow::Break(()), @@ -197,7 +209,7 @@ async fn connection_manager_loop_step( } } => match new_event { ControlFlow::Continue(new_state) => { - info!("Timeline became inactive (new state: {new_state:?}), dropping current connections until it reactivates"); + info!("observed timeline state change, new state is {new_state:?}"); return ControlFlow::Continue(()); } ControlFlow::Break(()) => { @@ -226,6 +238,7 @@ async fn connection_manager_loop_step( .change_connection( new_candidate.safekeeper_id, new_candidate.wal_source_connconf, + ctx, ) .await } @@ -289,7 +302,9 @@ async fn subscribe_for_timeline_updates( return resp.into_inner(); } Err(e) => { - warn!("Attempt #{attempt}, failed to subscribe for timeline {id} updates in broker: {e:#}"); + // Safekeeper nodes can stop pushing timeline updates to the broker, when no new writes happen and + // entire WAL is streamed. Keep this noticeable with logging, but do not warn/error. 
+ info!("Attempt #{attempt}, failed to subscribe for timeline {id} updates in broker: {e:#}"); continue; } } @@ -389,12 +404,17 @@ impl WalreceiverState { &mut self, new_sk_id: NodeId, new_wal_source_connconf: PgConnectionConfig, + ctx: &RequestContext, ) { self.drop_old_connection(true).await; let id = self.id; let connect_timeout = self.wal_connect_timeout; let timeline = Arc::clone(&self.timeline); + let ctx = ctx.detached_child( + TaskKind::WalReceiverConnectionHandler, + ctx.download_behavior(), + ); let connection_handle = TaskHandle::spawn(move |events_sender, cancellation| { async move { super::walreceiver_connection::handle_walreceiver_connection( @@ -403,6 +423,7 @@ impl WalreceiverState { events_sender, cancellation, connect_timeout, + ctx, ) .await .context("walreceiver connection handling failure") @@ -1233,18 +1254,18 @@ mod tests { const DUMMY_SAFEKEEPER_HOST: &str = "safekeeper_connstr"; async fn dummy_state(harness: &TenantHarness<'_>) -> WalreceiverState { + let (tenant, ctx) = harness.load().await; + let timeline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), crate::DEFAULT_PG_VERSION, &ctx) + .expect("Failed to create an empty timeline for dummy wal connection manager"); + let timeline = timeline.initialize(&ctx).unwrap(); + WalreceiverState { id: TenantTimelineId { tenant_id: harness.tenant_id, timeline_id: TIMELINE_ID, }, - timeline: harness - .load() - .await - .create_empty_timeline(TIMELINE_ID, Lsn(0), crate::DEFAULT_PG_VERSION) - .expect("Failed to create an empty timeline for dummy wal connection manager") - .initialize() - .unwrap(), + timeline, wal_connect_timeout: Duration::from_secs(1), lagging_wal_timeout: Duration::from_secs(1), max_lsn_wal_lag: NonZeroU64::new(1024 * 1024).unwrap(), diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs similarity index 94% rename from pageserver/src/walreceiver/walreceiver_connection.rs rename to pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 1b9e4923fb..7e06c398af 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -22,7 +22,9 @@ use tokio_postgres::{replication::ReplicationStream, Client}; use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, trace, warn}; -use crate::{metrics::LIVE_CONNECTIONS_COUNT, walreceiver::TaskStateUpdate}; +use super::TaskStateUpdate; +use crate::context::RequestContext; +use crate::metrics::LIVE_CONNECTIONS_COUNT; use crate::{ task_mgr, task_mgr::TaskKind, @@ -62,6 +64,7 @@ pub async fn handle_walreceiver_connection( events_sender: watch::Sender>, cancellation: CancellationToken, connect_timeout: Duration, + ctx: RequestContext, ) -> anyhow::Result<()> { // Connect to the database in replication mode. info!("connecting to {wal_source_connconf:?}"); @@ -77,9 +80,13 @@ pub async fn handle_walreceiver_connection( info!("DB connection stream finished: {expected_error}"); return Ok(()); } - Err(elapsed) => anyhow::bail!( - "Timed out while waiting {elapsed} for walreceiver connection to open" - ), + Err(_) => { + // Timing out to connect to a safekeeper node could happen long time, due to + // many reasons that pageserver cannot control. + // Do not produce an error, but make it visible, that timeouts happen by logging the `event. 
+ info!("Timed out while waiting {connect_timeout:?} for walreceiver connection to open"); + return Ok(()); + } } }; @@ -99,10 +106,14 @@ pub async fn handle_walreceiver_connection( // The connection object performs the actual communication with the database, // so spawn it off to run on its own. + let _connection_ctx = ctx.detached_child( + TaskKind::WalReceiverConnectionPoller, + ctx.download_behavior(), + ); let connection_cancellation = cancellation.clone(); task_mgr::spawn( WALRECEIVER_RUNTIME.handle(), - TaskKind::WalReceiverConnection, + TaskKind::WalReceiverConnectionPoller, Some(timeline.tenant_id), Some(timeline.timeline_id), "walreceiver connection", @@ -117,7 +128,7 @@ pub async fn handle_walreceiver_connection( } } }, - + // Future: replace connection_cancellation with connection_ctx cancellation _ = connection_cancellation.cancelled() => info!("Connection cancelled"), } Ok(()) @@ -180,7 +191,7 @@ pub async fn handle_walreceiver_connection( let mut waldecoder = WalStreamDecoder::new(startpoint, timeline.pg_version); - let mut walingest = WalIngest::new(timeline.as_ref(), startpoint).await?; + let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx).await?; while let Some(replication_message) = { select! { @@ -251,7 +262,7 @@ pub async fn handle_walreceiver_connection( ensure!(lsn.is_aligned()); walingest - .ingest_record(recdata.clone(), lsn, &mut modification, &mut decoded) + .ingest_record(recdata, lsn, &mut modification, &mut decoded, &ctx) .await .with_context(|| format!("could not ingest record at {lsn}"))?; @@ -329,7 +340,7 @@ pub async fn handle_walreceiver_connection( // Send the replication feedback message. // Regular standby_status_update fields are put into this message. let (timeline_logical_size, _) = timeline - .get_current_logical_size() + .get_current_logical_size(&ctx) .context("Status update creation failed to get current logical size")?; let status_update = ReplicationFeedback { current_timeline_size: timeline_logical_size, diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 0de2e6654d..3761c65668 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -29,6 +29,7 @@ use anyhow::Result; use bytes::{Buf, Bytes, BytesMut}; use tracing::*; +use crate::context::RequestContext; use crate::pgdatadir_mapping::*; use crate::tenant::PageReconstructError; use crate::tenant::Timeline; @@ -52,10 +53,14 @@ pub struct WalIngest<'a> { } impl<'a> WalIngest<'a> { - pub async fn new(timeline: &Timeline, startpoint: Lsn) -> anyhow::Result { + pub async fn new( + timeline: &'a Timeline, + startpoint: Lsn, + ctx: &'_ RequestContext, + ) -> anyhow::Result> { // Fetch the latest checkpoint into memory, so that we can compare with it // quickly in `ingest_record` and update it when it changes. 
- let checkpoint_bytes = timeline.get_checkpoint(startpoint).await?; + let checkpoint_bytes = timeline.get_checkpoint(startpoint, ctx).await?; let checkpoint = CheckPoint::decode(&checkpoint_bytes)?; trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value); @@ -80,6 +85,7 @@ impl<'a> WalIngest<'a> { lsn: Lsn, modification: &mut DatadirModification<'_>, decoded: &mut DecodedWALRecord, + ctx: &RequestContext, ) -> anyhow::Result<()> { modification.lsn = lsn; decode_wal_record(recdata, decoded, self.timeline.pg_version)?; @@ -97,7 +103,7 @@ impl<'a> WalIngest<'a> { if decoded.xl_rmid == pg_constants::RM_HEAP_ID || decoded.xl_rmid == pg_constants::RM_HEAP2_ID { - self.ingest_heapam_record(&mut buf, modification, decoded) + self.ingest_heapam_record(&mut buf, modification, decoded, ctx) .await?; } // Handle other special record types @@ -106,13 +112,14 @@ impl<'a> WalIngest<'a> { == pg_constants::XLOG_SMGR_CREATE { let create = XlSmgrCreate::decode(&mut buf); - self.ingest_xlog_smgr_create(modification, &create).await?; + self.ingest_xlog_smgr_create(modification, &create, ctx) + .await?; } else if decoded.xl_rmid == pg_constants::RM_SMGR_ID && (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == pg_constants::XLOG_SMGR_TRUNCATE { let truncate = XlSmgrTruncate::decode(&mut buf); - self.ingest_xlog_smgr_truncate(modification, &truncate) + self.ingest_xlog_smgr_truncate(modification, &truncate, ctx) .await?; } else if decoded.xl_rmid == pg_constants::RM_DBASE_ID { debug!( @@ -126,7 +133,7 @@ impl<'a> WalIngest<'a> { let createdb = XlCreateDatabase::decode(&mut buf); debug!("XLOG_DBASE_CREATE v14"); - self.ingest_xlog_dbase_create(modification, &createdb) + self.ingest_xlog_dbase_create(modification, &createdb, ctx) .await?; } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == postgres_ffi::v14::bindings::XLOG_DBASE_DROP @@ -134,7 +141,9 @@ impl<'a> WalIngest<'a> { let dropdb = XlDropDatabase::decode(&mut buf); for tablespace_id in dropdb.tablespace_ids { trace!("Drop db {}, {}", tablespace_id, dropdb.db_id); - modification.drop_dbdir(tablespace_id, dropdb.db_id).await?; + modification + .drop_dbdir(tablespace_id, dropdb.db_id, ctx) + .await?; } } } else if self.timeline.pg_version == 15 { @@ -150,7 +159,7 @@ impl<'a> WalIngest<'a> { // So we can reuse XlCreateDatabase here. 
debug!("XLOG_DBASE_CREATE_FILE_COPY"); let createdb = XlCreateDatabase::decode(&mut buf); - self.ingest_xlog_dbase_create(modification, &createdb) + self.ingest_xlog_dbase_create(modification, &createdb, ctx) .await?; } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == postgres_ffi::v15::bindings::XLOG_DBASE_DROP @@ -158,7 +167,9 @@ impl<'a> WalIngest<'a> { let dropdb = XlDropDatabase::decode(&mut buf); for tablespace_id in dropdb.tablespace_ids { trace!("Drop db {}, {}", tablespace_id, dropdb.db_id); - modification.drop_dbdir(tablespace_id, dropdb.db_id).await?; + modification + .drop_dbdir(tablespace_id, dropdb.db_id, ctx) + .await?; } } } @@ -176,12 +187,13 @@ impl<'a> WalIngest<'a> { segno, rpageno, ZERO_PAGE.clone(), + ctx, ) .await?; } else { assert!(info == pg_constants::CLOG_TRUNCATE); let xlrec = XlClogTruncate::decode(&mut buf); - self.ingest_clog_truncate_record(modification, &xlrec) + self.ingest_clog_truncate_record(modification, &xlrec, ctx) .await?; } } else if decoded.xl_rmid == pg_constants::RM_XACT_ID { @@ -193,6 +205,7 @@ impl<'a> WalIngest<'a> { modification, &parsed_xact, info == pg_constants::XLOG_XACT_COMMIT, + ctx, ) .await?; } else if info == pg_constants::XLOG_XACT_COMMIT_PREPARED @@ -204,6 +217,7 @@ impl<'a> WalIngest<'a> { modification, &parsed_xact, info == pg_constants::XLOG_XACT_COMMIT_PREPARED, + ctx, ) .await?; // Remove twophase file. see RemoveTwoPhaseFile() in postgres code @@ -213,10 +227,12 @@ impl<'a> WalIngest<'a> { parsed_xact.xid, lsn, ); - modification.drop_twophase_file(parsed_xact.xid).await?; + modification + .drop_twophase_file(parsed_xact.xid, ctx) + .await?; } else if info == pg_constants::XLOG_XACT_PREPARE { modification - .put_twophase_file(decoded.xl_xid, Bytes::copy_from_slice(&buf[..])) + .put_twophase_file(decoded.xl_xid, Bytes::copy_from_slice(&buf[..]), ctx) .await?; } } else if decoded.xl_rmid == pg_constants::RM_MULTIXACT_ID { @@ -232,6 +248,7 @@ impl<'a> WalIngest<'a> { segno, rpageno, ZERO_PAGE.clone(), + ctx, ) .await?; } else if info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE { @@ -244,6 +261,7 @@ impl<'a> WalIngest<'a> { segno, rpageno, ZERO_PAGE.clone(), + ctx, ) .await?; } else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID { @@ -251,12 +269,12 @@ impl<'a> WalIngest<'a> { self.ingest_multixact_create_record(modification, &xlrec)?; } else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID { let xlrec = XlMultiXactTruncate::decode(&mut buf); - self.ingest_multixact_truncate_record(modification, &xlrec) + self.ingest_multixact_truncate_record(modification, &xlrec, ctx) .await?; } } else if decoded.xl_rmid == pg_constants::RM_RELMAP_ID { let xlrec = XlRelmapUpdate::decode(&mut buf); - self.ingest_relmap_page(modification, &xlrec, decoded) + self.ingest_relmap_page(modification, &xlrec, decoded, ctx) .await?; } else if decoded.xl_rmid == pg_constants::RM_XLOG_ID { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; @@ -292,7 +310,7 @@ impl<'a> WalIngest<'a> { // Iterate through all the blocks that the record modifies, and // "put" a separate copy of the record for each block. 
for blk in decoded.blocks.iter() { - self.ingest_decoded_block(modification, lsn, decoded, blk) + self.ingest_decoded_block(modification, lsn, decoded, blk, ctx) .await?; } @@ -317,6 +335,7 @@ impl<'a> WalIngest<'a> { lsn: Lsn, decoded: &DecodedWALRecord, blk: &DecodedBkpBlock, + ctx: &RequestContext, ) -> Result<(), PageReconstructError> { let rel = RelTag { spcnode: blk.rnode_spcnode, @@ -359,14 +378,14 @@ impl<'a> WalIngest<'a> { page_set_lsn(&mut image, lsn) } assert_eq!(image.len(), BLCKSZ as usize); - self.put_rel_page_image(modification, rel, blk.blkno, image.freeze()) + self.put_rel_page_image(modification, rel, blk.blkno, image.freeze(), ctx) .await?; } else { let rec = NeonWalRecord::Postgres { will_init: blk.will_init || blk.apply_image, rec: decoded.record.clone(), }; - self.put_rel_wal_record(modification, rel, blk.blkno, rec) + self.put_rel_wal_record(modification, rel, blk.blkno, rec, ctx) .await?; } Ok(()) @@ -377,6 +396,7 @@ impl<'a> WalIngest<'a> { buf: &mut Bytes, modification: &mut DatadirModification<'_>, decoded: &mut DecodedWALRecord, + ctx: &RequestContext, ) -> anyhow::Result<()> { // Handle VM bit updates that are implicitly part of heap records. @@ -456,7 +476,7 @@ impl<'a> WalIngest<'a> { // replaying it would fail to find the previous image of the page, because // it doesn't exist. So check if the VM page(s) exist, and skip the WAL // record if it doesn't. - let vm_size = self.get_relsize(vm_rel, modification.lsn).await?; + let vm_size = self.get_relsize(vm_rel, modification.lsn, ctx).await?; if let Some(blknum) = new_vm_blk { if blknum >= vm_size { new_vm_blk = None; @@ -481,6 +501,7 @@ impl<'a> WalIngest<'a> { old_heap_blkno, flags: pg_constants::VISIBILITYMAP_VALID_BITS, }, + ctx, ) .await?; } else { @@ -496,6 +517,7 @@ impl<'a> WalIngest<'a> { old_heap_blkno: None, flags: pg_constants::VISIBILITYMAP_VALID_BITS, }, + ctx, ) .await?; } @@ -509,6 +531,7 @@ impl<'a> WalIngest<'a> { old_heap_blkno, flags: pg_constants::VISIBILITYMAP_VALID_BITS, }, + ctx, ) .await?; } @@ -524,6 +547,7 @@ impl<'a> WalIngest<'a> { &mut self, modification: &mut DatadirModification<'_>, rec: &XlCreateDatabase, + ctx: &RequestContext, ) -> anyhow::Result<()> { let db_id = rec.db_id; let tablespace_id = rec.tablespace_id; @@ -539,7 +563,7 @@ impl<'a> WalIngest<'a> { let rels = modification .tline - .list_rels(src_tablespace_id, src_db_id, req_lsn) + .list_rels(src_tablespace_id, src_db_id, req_lsn, ctx) .await?; debug!("ingest_xlog_dbase_create: {} rels", rels.len()); @@ -547,10 +571,10 @@ impl<'a> WalIngest<'a> { // Copy relfilemap let filemap = modification .tline - .get_relmap_file(src_tablespace_id, src_db_id, req_lsn) + .get_relmap_file(src_tablespace_id, src_db_id, req_lsn, ctx) .await?; modification - .put_relmap_file(tablespace_id, db_id, filemap) + .put_relmap_file(tablespace_id, db_id, filemap, ctx) .await?; let mut num_rels_copied = 0; @@ -561,7 +585,7 @@ impl<'a> WalIngest<'a> { let nblocks = modification .tline - .get_rel_size(src_rel, req_lsn, true) + .get_rel_size(src_rel, req_lsn, true, ctx) .await?; let dst_rel = RelTag { spcnode: tablespace_id, @@ -570,7 +594,7 @@ impl<'a> WalIngest<'a> { forknum: src_rel.forknum, }; - modification.put_rel_creation(dst_rel, nblocks).await?; + modification.put_rel_creation(dst_rel, nblocks, ctx).await?; // Copy content debug!("copying rel {} to {}, {} blocks", src_rel, dst_rel, nblocks); @@ -579,7 +603,7 @@ impl<'a> WalIngest<'a> { let content = modification .tline - .get_rel_page_at_lsn(src_rel, blknum, req_lsn, true) + 
.get_rel_page_at_lsn(src_rel, blknum, req_lsn, true, ctx) .await?; modification.put_rel_page_image(dst_rel, blknum, content)?; num_blocks_copied += 1; @@ -599,6 +623,7 @@ impl<'a> WalIngest<'a> { &mut self, modification: &mut DatadirModification<'_>, rec: &XlSmgrCreate, + ctx: &RequestContext, ) -> anyhow::Result<()> { let rel = RelTag { spcnode: rec.rnode.spcnode, @@ -606,7 +631,7 @@ impl<'a> WalIngest<'a> { relnode: rec.rnode.relnode, forknum: rec.forknum, }; - self.put_rel_creation(modification, rel).await?; + self.put_rel_creation(modification, rel, ctx).await?; Ok(()) } @@ -617,6 +642,7 @@ impl<'a> WalIngest<'a> { &mut self, modification: &mut DatadirModification<'_>, rec: &XlSmgrTruncate, + ctx: &RequestContext, ) -> anyhow::Result<()> { let spcnode = rec.rnode.spcnode; let dbnode = rec.rnode.dbnode; @@ -629,7 +655,7 @@ impl<'a> WalIngest<'a> { relnode, forknum: MAIN_FORKNUM, }; - self.put_rel_truncation(modification, rel, rec.blkno) + self.put_rel_truncation(modification, rel, rec.blkno, ctx) .await?; } if (rec.flags & pg_constants::SMGR_TRUNCATE_FSM) != 0 { @@ -648,10 +674,10 @@ impl<'a> WalIngest<'a> { modification.put_rel_page_image(rel, fsm_physical_page_no, ZERO_PAGE.clone())?; fsm_physical_page_no += 1; } - let nblocks = self.get_relsize(rel, modification.lsn).await?; + let nblocks = self.get_relsize(rel, modification.lsn, ctx).await?; if nblocks > fsm_physical_page_no { // check if something to do: FSM is larger than truncate position - self.put_rel_truncation(modification, rel, fsm_physical_page_no) + self.put_rel_truncation(modification, rel, fsm_physical_page_no, ctx) .await?; } } @@ -670,10 +696,10 @@ impl<'a> WalIngest<'a> { modification.put_rel_page_image(rel, vm_page_no, ZERO_PAGE.clone())?; vm_page_no += 1; } - let nblocks = self.get_relsize(rel, modification.lsn).await?; + let nblocks = self.get_relsize(rel, modification.lsn, ctx).await?; if nblocks > vm_page_no { // check if something to do: VM is larger than truncate position - self.put_rel_truncation(modification, rel, vm_page_no) + self.put_rel_truncation(modification, rel, vm_page_no, ctx) .await?; } } @@ -687,6 +713,7 @@ impl<'a> WalIngest<'a> { modification: &mut DatadirModification<'_>, parsed: &XlXactParsedRecord, is_commit: bool, + ctx: &RequestContext, ) -> anyhow::Result<()> { // Record update of CLOG pages let mut pageno = parsed.xid / pg_constants::CLOG_XACTS_PER_PAGE; @@ -745,10 +772,10 @@ impl<'a> WalIngest<'a> { let last_lsn = self.timeline.get_last_record_lsn(); if modification .tline - .get_rel_exists(rel, last_lsn, true) + .get_rel_exists(rel, last_lsn, true, ctx) .await? { - self.put_rel_drop(modification, rel).await?; + self.put_rel_drop(modification, rel, ctx).await?; } } } @@ -759,6 +786,7 @@ impl<'a> WalIngest<'a> { &mut self, modification: &mut DatadirModification<'_>, xlrec: &XlClogTruncate, + ctx: &RequestContext, ) -> anyhow::Result<()> { info!( "RM_CLOG_ID truncate pageno {} oldestXid {} oldestXidDB {}", @@ -799,16 +827,15 @@ impl<'a> WalIngest<'a> { // it. So we use the previous record's LSN in the get calls // instead. let req_lsn = modification.tline.get_last_record_lsn(); - - let slru_segments = modification + for segno in modification .tline - .list_slru_segments(SlruKind::Clog, req_lsn) - .await?; - for segno in slru_segments { + .list_slru_segments(SlruKind::Clog, req_lsn, ctx) + .await? 
+ { let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT; if slru_may_delete_clogsegment(segpage, xlrec.pageno) { modification - .drop_slru_segment(SlruKind::Clog, segno) + .drop_slru_segment(SlruKind::Clog, segno, ctx) .await?; trace!("Drop CLOG segment {:>04X}", segno); } @@ -900,6 +927,7 @@ impl<'a> WalIngest<'a> { &mut self, modification: &mut DatadirModification<'_>, xlrec: &XlMultiXactTruncate, + ctx: &RequestContext, ) -> Result<()> { self.checkpoint.oldestMulti = xlrec.end_trunc_off; self.checkpoint.oldestMultiDB = xlrec.oldest_multi_db; @@ -915,7 +943,7 @@ impl<'a> WalIngest<'a> { // contain, possibly partially, valid data. while segment != endsegment { modification - .drop_slru_segment(SlruKind::MultiXactMembers, segment as u32) + .drop_slru_segment(SlruKind::MultiXactMembers, segment as u32, ctx) .await?; /* move to next segment, handling wraparound correctly */ @@ -937,6 +965,7 @@ impl<'a> WalIngest<'a> { modification: &mut DatadirModification<'_>, xlrec: &XlRelmapUpdate, decoded: &DecodedWALRecord, + ctx: &RequestContext, ) -> Result<()> { let mut buf = decoded.record.clone(); buf.advance(decoded.main_data_offset); @@ -944,18 +973,22 @@ impl<'a> WalIngest<'a> { buf.advance(12); modification - .put_relmap_file(xlrec.tsid, xlrec.dbid, Bytes::copy_from_slice(&buf[..])) - .await?; - - Ok(()) + .put_relmap_file( + xlrec.tsid, + xlrec.dbid, + Bytes::copy_from_slice(&buf[..]), + ctx, + ) + .await } async fn put_rel_creation( &mut self, modification: &mut DatadirModification<'_>, rel: RelTag, + ctx: &RequestContext, ) -> Result<()> { - modification.put_rel_creation(rel, 0).await?; + modification.put_rel_creation(rel, 0, ctx).await?; Ok(()) } @@ -965,8 +998,10 @@ impl<'a> WalIngest<'a> { rel: RelTag, blknum: BlockNumber, img: Bytes, - ) -> anyhow::Result<()> { - self.handle_rel_extend(modification, rel, blknum).await?; + ctx: &RequestContext, + ) -> Result<(), PageReconstructError> { + self.handle_rel_extend(modification, rel, blknum, ctx) + .await?; modification.put_rel_page_image(rel, blknum, img)?; Ok(()) } @@ -977,8 +1012,10 @@ impl<'a> WalIngest<'a> { rel: RelTag, blknum: BlockNumber, rec: NeonWalRecord, - ) -> anyhow::Result<()> { - self.handle_rel_extend(modification, rel, blknum).await?; + ctx: &RequestContext, + ) -> Result<()> { + self.handle_rel_extend(modification, rel, blknum, ctx) + .await?; modification.put_rel_wal_record(rel, blknum, rec)?; Ok(()) } @@ -988,8 +1025,9 @@ impl<'a> WalIngest<'a> { modification: &mut DatadirModification<'_>, rel: RelTag, nblocks: BlockNumber, + ctx: &RequestContext, ) -> anyhow::Result<()> { - modification.put_rel_truncation(rel, nblocks).await?; + modification.put_rel_truncation(rel, nblocks, ctx).await?; Ok(()) } @@ -997,17 +1035,22 @@ impl<'a> WalIngest<'a> { &mut self, modification: &mut DatadirModification<'_>, rel: RelTag, + ctx: &RequestContext, ) -> Result<()> { - modification.put_rel_drop(rel).await?; + modification.put_rel_drop(rel, ctx).await?; Ok(()) } - async fn get_relsize(&mut self, rel: RelTag, lsn: Lsn) -> anyhow::Result { - let exists = self.timeline.get_rel_exists(rel, lsn, true).await?; - let nblocks = if !exists { + async fn get_relsize( + &mut self, + rel: RelTag, + lsn: Lsn, + ctx: &RequestContext, + ) -> anyhow::Result { + let nblocks = if !self.timeline.get_rel_exists(rel, lsn, true, ctx).await? { 0 } else { - self.timeline.get_rel_size(rel, lsn, true).await? + self.timeline.get_rel_size(rel, lsn, true, ctx).await? 
}; Ok(nblocks) } @@ -1017,23 +1060,28 @@ impl<'a> WalIngest<'a> { modification: &mut DatadirModification<'_>, rel: RelTag, blknum: BlockNumber, - ) -> anyhow::Result<()> { + ctx: &RequestContext, + ) -> Result<(), PageReconstructError> { let new_nblocks = blknum + 1; // Check if the relation exists. We implicitly create relations on first // record. // TODO: would be nice if to be more explicit about it let last_lsn = modification.lsn; - let old_nblocks = if !self.timeline.get_rel_exists(rel, last_lsn, true).await? { + let old_nblocks = if !self + .timeline + .get_rel_exists(rel, last_lsn, true, ctx) + .await? + { // create it with 0 size initially, the logic below will extend it - modification.put_rel_creation(rel, 0).await?; + modification.put_rel_creation(rel, 0, ctx).await?; 0 } else { - self.timeline.get_rel_size(rel, last_lsn, true).await? + self.timeline.get_rel_size(rel, last_lsn, true, ctx).await? }; if new_nblocks > old_nblocks { //info!("extending {} {} to {}", rel, old_nblocks, new_nblocks); - modification.put_rel_extend(rel, new_nblocks).await?; + modification.put_rel_extend(rel, new_nblocks, ctx).await?; // fill the gap with zeros for gap_blknum in old_nblocks..blknum { @@ -1050,8 +1098,9 @@ impl<'a> WalIngest<'a> { segno: u32, blknum: BlockNumber, img: Bytes, - ) -> anyhow::Result<()> { - self.handle_slru_extend(modification, kind, segno, blknum) + ctx: &RequestContext, + ) -> Result<()> { + self.handle_slru_extend(modification, kind, segno, blknum, ctx) .await?; modification.put_slru_page_image(kind, segno, blknum, img)?; Ok(()) @@ -1063,6 +1112,7 @@ impl<'a> WalIngest<'a> { kind: SlruKind, segno: u32, blknum: BlockNumber, + ctx: &RequestContext, ) -> anyhow::Result<()> { // we don't use a cache for this like we do for relations. SLRUS are explcitly // extended with ZEROPAGE records, not with commit records, so it happens @@ -1075,17 +1125,17 @@ impl<'a> WalIngest<'a> { let last_lsn = self.timeline.get_last_record_lsn(); let old_nblocks = if !self .timeline - .get_slru_segment_exists(kind, segno, last_lsn) + .get_slru_segment_exists(kind, segno, last_lsn, ctx) .await? { // create it with 0 size initially, the logic below will extend it modification - .put_slru_segment_creation(kind, segno, 0) + .put_slru_segment_creation(kind, segno, 0, ctx) .await?; 0 } else { self.timeline - .get_slru_segment_size(kind, segno, last_lsn) + .get_slru_segment_size(kind, segno, last_lsn, ctx) .await? 
}; @@ -1134,41 +1184,44 @@ mod tests { static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]); - async fn init_walingest_test(tline: &Timeline) -> Result { + async fn init_walingest_test<'a>( + tline: &'a Timeline, + ctx: &RequestContext, + ) -> Result> { let mut m = tline.begin_modification(Lsn(0x10)); m.put_checkpoint(ZERO_CHECKPOINT.clone())?; - m.put_relmap_file(0, 111, Bytes::from("")).await?; // dummy relmapper file + m.put_relmap_file(0, 111, Bytes::from(""), ctx).await?; // dummy relmapper file m.commit()?; - let walingest = WalIngest::new(tline, Lsn(0x10)).await?; + let walingest = WalIngest::new(tline, Lsn(0x10), ctx).await?; Ok(walingest) } #[tokio::test] async fn test_relsize() -> Result<()> { - let tenant = TenantHarness::create("test_relsize")?.load().await; - let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; - let mut walingest = init_walingest_test(&tline).await?; + let (tenant, ctx) = TenantHarness::create("test_relsize")?.load().await; + let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?; + let mut walingest = init_walingest_test(&tline, &ctx).await?; let mut m = tline.begin_modification(Lsn(0x20)); - walingest.put_rel_creation(&mut m, TESTREL_A).await?; + walingest.put_rel_creation(&mut m, TESTREL_A, &ctx).await?; walingest - .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2")) + .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx) .await?; m.commit()?; let mut m = tline.begin_modification(Lsn(0x30)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3")) + .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3"), &ctx) .await?; m.commit()?; let mut m = tline.begin_modification(Lsn(0x40)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4")) + .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4"), &ctx) .await?; m.commit()?; let mut m = tline.begin_modification(Lsn(0x50)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5")) + .put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5"), &ctx) .await?; m.commit()?; @@ -1176,120 +1229,157 @@ mod tests { // The relation was created at LSN 2, not visible at LSN 1 yet. 
assert_eq!( - tline.get_rel_exists(TESTREL_A, Lsn(0x10), false).await?, + tline + .get_rel_exists(TESTREL_A, Lsn(0x10), false, &ctx) + .await?, false ); assert!(tline - .get_rel_size(TESTREL_A, Lsn(0x10), false) + .get_rel_size(TESTREL_A, Lsn(0x10), false, &ctx) .await .is_err()); - assert_eq!( - tline.get_rel_exists(TESTREL_A, Lsn(0x20), false).await?, + tline + .get_rel_exists(TESTREL_A, Lsn(0x20), false, &ctx) + .await?, true ); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20), false).await?, 1); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x50), false).await?, 3); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x20), false, &ctx) + .await?, + 1 + ); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x50), false, &ctx) + .await?, + 3 + ); // Check page contents at each LSN assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x20), false) + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x20), false, &ctx) .await?, TEST_IMG("foo blk 0 at 2") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x30), false) + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x30), false, &ctx) .await?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x40), false) + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x40), false, &ctx) .await?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x40), false) + .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x40), false, &ctx) .await?, TEST_IMG("foo blk 1 at 4") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x50), false) + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x50), false, &ctx) .await?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x50), false) + .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x50), false, &ctx) .await?, TEST_IMG("foo blk 1 at 4") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false) + .get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false, &ctx) .await?, TEST_IMG("foo blk 2 at 5") ); // Truncate last block let mut m = tline.begin_modification(Lsn(0x60)); - walingest.put_rel_truncation(&mut m, TESTREL_A, 2).await?; + walingest + .put_rel_truncation(&mut m, TESTREL_A, 2, &ctx) + .await?; m.commit()?; assert_current_logical_size(&tline, Lsn(0x60)); // Check reported size and contents after truncation - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x60), false).await?, 2); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x60), false) + .get_rel_size(TESTREL_A, Lsn(0x60), false, &ctx) + .await?, + 2 + ); + assert_eq!( + tline + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x60), false, &ctx) .await?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x60), false) + .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x60), false, &ctx) .await?, TEST_IMG("foo blk 1 at 4") ); // should still see the truncated block with older LSN - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x50), false).await?, 3); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false) + .get_rel_size(TESTREL_A, Lsn(0x50), false, &ctx) + .await?, + 3 + ); + assert_eq!( + tline + .get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false, &ctx) .await?, TEST_IMG("foo blk 2 at 5") ); // Truncate to zero length let mut m = tline.begin_modification(Lsn(0x68)); - walingest.put_rel_truncation(&mut m, TESTREL_A, 0).await?; + walingest + .put_rel_truncation(&mut m, TESTREL_A, 0, &ctx) + .await?; m.commit()?; - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x68), false).await?, 0); + assert_eq!( + 
tline + .get_rel_size(TESTREL_A, Lsn(0x68), false, &ctx) + .await?, + 0 + ); // Extend from 0 to 2 blocks, leaving a gap let mut m = tline.begin_modification(Lsn(0x70)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1")) + .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1"), &ctx) .await?; m.commit()?; - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x70), false).await?, 2); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x70), false) + .get_rel_size(TESTREL_A, Lsn(0x70), false, &ctx) + .await?, + 2 + ); + assert_eq!( + tline + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x70), false, &ctx) .await?, ZERO_PAGE ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x70), false) + .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x70), false, &ctx) .await?, TEST_IMG("foo blk 1") ); @@ -1297,21 +1387,26 @@ mod tests { // Extend a lot more, leaving a big gap that spans across segments let mut m = tline.begin_modification(Lsn(0x80)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500")) + .put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500"), &ctx) .await?; m.commit()?; - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x80), false).await?, 1501); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x80), false, &ctx) + .await?, + 1501 + ); for blk in 2..1500 { assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blk, Lsn(0x80), false) + .get_rel_page_at_lsn(TESTREL_A, blk, Lsn(0x80), false, &ctx) .await?, ZERO_PAGE ); } assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1500, Lsn(0x80), false) + .get_rel_page_at_lsn(TESTREL_A, 1500, Lsn(0x80), false, &ctx) .await?, TEST_IMG("foo blk 1500") ); @@ -1323,31 +1418,40 @@ mod tests { // and then created it again within the same layer. 
#[tokio::test] async fn test_drop_extend() -> Result<()> { - let tenant = TenantHarness::create("test_drop_extend")?.load().await; - let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; - let mut walingest = init_walingest_test(&tline).await?; + let (tenant, ctx) = TenantHarness::create("test_drop_extend")?.load().await; + let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?; + let mut walingest = init_walingest_test(&tline, &ctx).await?; let mut m = tline.begin_modification(Lsn(0x20)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2")) + .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx) .await?; m.commit()?; // Check that rel exists and size is correct assert_eq!( - tline.get_rel_exists(TESTREL_A, Lsn(0x20), false).await?, + tline + .get_rel_exists(TESTREL_A, Lsn(0x20), false, &ctx) + .await?, true ); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20), false).await?, 1); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x20), false, &ctx) + .await?, + 1 + ); // Drop rel let mut m = tline.begin_modification(Lsn(0x30)); - walingest.put_rel_drop(&mut m, TESTREL_A).await?; + walingest.put_rel_drop(&mut m, TESTREL_A, &ctx).await?; m.commit()?; // Check that rel is not visible anymore assert_eq!( - tline.get_rel_exists(TESTREL_A, Lsn(0x30), false).await?, + tline + .get_rel_exists(TESTREL_A, Lsn(0x30), false, &ctx) + .await?, false ); @@ -1357,16 +1461,23 @@ mod tests { // Re-create it let mut m = tline.begin_modification(Lsn(0x40)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4")) + .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4"), &ctx) .await?; m.commit()?; // Check that rel exists and size is correct assert_eq!( - tline.get_rel_exists(TESTREL_A, Lsn(0x40), false).await?, + tline + .get_rel_exists(TESTREL_A, Lsn(0x40), false, &ctx) + .await?, true ); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x40), false).await?, 1); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x40), false, &ctx) + .await?, + 1 + ); Ok(()) } @@ -1376,9 +1487,9 @@ mod tests { // and then extended it again within the same layer. #[tokio::test] async fn test_truncate_extend() -> Result<()> { - let tenant = TenantHarness::create("test_truncate_extend")?.load().await; - let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; - let mut walingest = init_walingest_test(&tline).await?; + let (tenant, ctx) = TenantHarness::create("test_truncate_extend")?.load().await; + let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?; + let mut walingest = init_walingest_test(&tline, &ctx).await?; // Create a 20 MB relation (the size is arbitrary) let relsize = 20 * 1024 * 1024 / 8192; @@ -1386,27 +1497,33 @@ mod tests { for blkno in 0..relsize { let data = format!("foo blk {} at {}", blkno, Lsn(0x20)); walingest - .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data)) + .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx) .await?; } m.commit()?; // The relation was created at LSN 20, not visible at LSN 1 yet. 
assert_eq!( - tline.get_rel_exists(TESTREL_A, Lsn(0x10), false).await?, + tline + .get_rel_exists(TESTREL_A, Lsn(0x10), false, &ctx) + .await?, false ); assert!(tline - .get_rel_size(TESTREL_A, Lsn(0x10), false) + .get_rel_size(TESTREL_A, Lsn(0x10), false, &ctx) .await .is_err()); assert_eq!( - tline.get_rel_exists(TESTREL_A, Lsn(0x20), false).await?, + tline + .get_rel_exists(TESTREL_A, Lsn(0x20), false, &ctx) + .await?, true ); assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(0x20), false).await?, + tline + .get_rel_size(TESTREL_A, Lsn(0x20), false, &ctx) + .await?, relsize ); @@ -1416,7 +1533,7 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, lsn, false) + .get_rel_page_at_lsn(TESTREL_A, blkno, lsn, false, &ctx) .await?, TEST_IMG(&data) ); @@ -1425,18 +1542,25 @@ mod tests { // Truncate relation so that second segment was dropped // - only leave one page let mut m = tline.begin_modification(Lsn(0x60)); - walingest.put_rel_truncation(&mut m, TESTREL_A, 1).await?; + walingest + .put_rel_truncation(&mut m, TESTREL_A, 1, &ctx) + .await?; m.commit()?; // Check reported size and contents after truncation - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x60), false).await?, 1); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x60), false, &ctx) + .await?, + 1 + ); for blkno in 0..1 { let lsn = Lsn(0x20); let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x60), false) + .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x60), false, &ctx) .await?, TEST_IMG(&data) ); @@ -1444,7 +1568,9 @@ mod tests { // should still see all blocks with older LSN assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(0x50), false).await?, + tline + .get_rel_size(TESTREL_A, Lsn(0x50), false, &ctx) + .await?, relsize ); for blkno in 0..relsize { @@ -1452,7 +1578,7 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x50), false) + .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x50), false, &ctx) .await?, TEST_IMG(&data) ); @@ -1465,17 +1591,21 @@ mod tests { for blkno in 0..relsize { let data = format!("foo blk {} at {}", blkno, lsn); walingest - .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data)) + .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx) .await?; } m.commit()?; assert_eq!( - tline.get_rel_exists(TESTREL_A, Lsn(0x80), false).await?, + tline + .get_rel_exists(TESTREL_A, Lsn(0x80), false, &ctx) + .await?, true ); assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(0x80), false).await?, + tline + .get_rel_size(TESTREL_A, Lsn(0x80), false, &ctx) + .await?, relsize ); // Check relation content @@ -1484,7 +1614,7 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x80), false) + .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x80), false, &ctx) .await?, TEST_IMG(&data) ); @@ -1497,9 +1627,9 @@ mod tests { /// split into multiple 1 GB segments in Postgres. 
#[tokio::test] async fn test_large_rel() -> Result<()> { - let tenant = TenantHarness::create("test_large_rel")?.load().await; - let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; - let mut walingest = init_walingest_test(&tline).await?; + let (tenant, ctx) = TenantHarness::create("test_large_rel")?.load().await; + let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?; + let mut walingest = init_walingest_test(&tline, &ctx).await?; let mut lsn = 0x10; for blknum in 0..RELSEG_SIZE + 1 { @@ -1507,7 +1637,7 @@ mod tests { let mut m = tline.begin_modification(Lsn(lsn)); let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn))); walingest - .put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img) + .put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img, &ctx) .await?; m.commit()?; } @@ -1515,7 +1645,7 @@ mod tests { assert_current_logical_size(&tline, Lsn(lsn)); assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn), false).await?, + tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?, RELSEG_SIZE + 1 ); @@ -1523,11 +1653,11 @@ mod tests { lsn += 0x10; let mut m = tline.begin_modification(Lsn(lsn)); walingest - .put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE) + .put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE, &ctx) .await?; m.commit()?; assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn), false).await?, + tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?, RELSEG_SIZE ); assert_current_logical_size(&tline, Lsn(lsn)); @@ -1536,11 +1666,11 @@ mod tests { lsn += 0x10; let mut m = tline.begin_modification(Lsn(lsn)); walingest - .put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE - 1) + .put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE - 1, &ctx) .await?; m.commit()?; assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn), false).await?, + tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?, RELSEG_SIZE - 1 ); assert_current_logical_size(&tline, Lsn(lsn)); @@ -1552,11 +1682,11 @@ mod tests { lsn += 0x10; let mut m = tline.begin_modification(Lsn(lsn)); walingest - .put_rel_truncation(&mut m, TESTREL_A, size as BlockNumber) + .put_rel_truncation(&mut m, TESTREL_A, size as BlockNumber, &ctx) .await?; m.commit()?; assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn), false).await?, + tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?, size as BlockNumber ); diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index fd0524016f..c943bf0a27 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -22,16 +22,18 @@ use byteorder::{ByteOrder, LittleEndian}; use bytes::{BufMut, Bytes, BytesMut}; use nix::poll::*; use serde::Serialize; +use std::collections::VecDeque; use std::fs::OpenOptions; use std::io::prelude::*; use std::io::{Error, ErrorKind}; use std::ops::{Deref, DerefMut}; +use std::os::fd::RawFd; use std::os::unix::io::AsRawFd; use std::os::unix::prelude::CommandExt; use std::path::PathBuf; use std::process::Stdio; use std::process::{Child, ChildStderr, ChildStdin, ChildStdout, Command}; -use std::sync::Mutex; +use std::sync::{Mutex, MutexGuard}; use std::time::Duration; use std::time::Instant; use std::{fs, io}; @@ -90,6 +92,20 @@ pub trait WalRedoManager: Send + Sync { ) -> Result; } +struct ProcessInput { + child: NoLeakChild, + stdin: ChildStdin, + stderr_fd: RawFd, + stdout_fd: RawFd, + n_requests: usize, +} + +struct ProcessOutput { + stdout: ChildStdout, + pending_responses: VecDeque>, + n_processed_responses: usize, +} + /// /// This is the 
real implementation that uses a Postgres process to /// perform WAL replay. Only one thread can use the process at a time, @@ -101,7 +117,9 @@ pub struct PostgresRedoManager { tenant_id: TenantId, conf: &'static PageServerConf, - process: Mutex<Option<PostgresRedoProcess>>, + stdout: Mutex<Option<ProcessOutput>>, + stdin: Mutex<Option<ProcessInput>>, + stderr: Mutex<Option<ChildStderr>>, } /// Can this request be served by neon redo functions @@ -209,16 +227,17 @@ impl PostgresRedoManager { PostgresRedoManager { tenant_id, conf, - process: Mutex::new(None), + stdin: Mutex::new(None), + stdout: Mutex::new(None), + stderr: Mutex::new(None), } } /// Launch process pre-emptively. Should not be needed except for benchmarking. - pub fn launch_process(&mut self, pg_version: u32) -> anyhow::Result<()> { - let inner = self.process.get_mut().unwrap(); - if inner.is_none() { - let p = PostgresRedoProcess::launch(self.conf, self.tenant_id, pg_version)?; - *inner = Some(p); + pub fn launch_process(&self, pg_version: u32) -> anyhow::Result<()> { + let mut proc = self.stdin.lock().unwrap(); + if proc.is_none() { + self.launch(&mut proc, pg_version)?; } Ok(()) } @@ -241,22 +260,19 @@ impl PostgresRedoManager { let start_time = Instant::now(); - let mut process_guard = self.process.lock().unwrap(); + let mut proc = self.stdin.lock().unwrap(); let lock_time = Instant::now(); // launch the WAL redo process on first use - if process_guard.is_none() { - let p = PostgresRedoProcess::launch(self.conf, self.tenant_id, pg_version)?; - *process_guard = Some(p); + if proc.is_none() { + self.launch(&mut proc, pg_version)?; } - let process = process_guard.as_mut().unwrap(); - WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64()); // Relational WAL records are applied using wal-redo-postgres let buf_tag = BufferTag { rel, blknum }; - let result = process - .apply_wal_records(buf_tag, base_img, records, wal_redo_timeout) + let result = self + .apply_wal_records(proc, buf_tag, base_img, records, wal_redo_timeout) .map_err(WalRedoError::IoError); let end_time = Instant::now(); @@ -295,8 +311,22 @@ impl PostgresRedoManager { base_img_lsn, lsn ); - let process = process_guard.take().unwrap(); - process.kill(); + // self.stdin holds the child's stdin plus only the raw fds of its stdout and stderr. + // Dropping it as part of take() doesn't close them. + // The owning objects (ChildStdout and ChildStderr) are stored in + // self.stdout and self.stderr, respectively. + // We intentionally keep them open here to avoid a race between + // currently running `apply_wal_records()` and a `launch()` call + // after we return here. + // The currently running `apply_wal_records()` must not read from + // the newly launched process. + // By keeping self.stdout and self.stderr open here, `launch()` will + // get different file descriptors for the new child's stdout and stderr, + // and hence the current `apply_wal_records()` calls will observe + // `output.stdout.as_raw_fd() != stdout_fd`. + if let Some(proc) = self.stdin.lock().unwrap().take() { + proc.child.kill_and_wait(); + } } result } @@ -595,32 +625,23 @@ impl<C: CommandExt> CloseFileDescriptors for C { } } -/// -/// Handle to the Postgres WAL redo process -/// -struct PostgresRedoProcess { - tenant_id: TenantId, - child: NoLeakChild, - stdin: ChildStdin, - stdout: ChildStdout, - stderr: ChildStderr, -} - -impl PostgresRedoProcess { +impl PostgresRedoManager { // // Start postgres binary in special WAL redo mode.
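// A minimal, self-contained sketch (illustrative names, not the pageserver API)
// of the crash-detection idea described in the comment above: a reader records
// the raw fd of the child's stdout at the time it sends its request, and a
// relaunch is detected later because the newly created pipe gets a different fd
// while the old ChildStdout is still held open.
use std::io::{Error, ErrorKind, Read};
use std::os::unix::io::{AsRawFd, RawFd};
use std::process::ChildStdout;
use std::sync::Mutex;

fn read_page(
    stdout_slot: &Mutex<Option<ChildStdout>>,
    fd_at_send_time: RawFd,
    buf: &mut [u8],
) -> Result<usize, Error> {
    let mut guard = stdout_slot.lock().unwrap();
    let stdout = guard
        .as_mut()
        .ok_or_else(|| Error::new(ErrorKind::BrokenPipe, "WAL redo process not running"))?;
    if stdout.as_raw_fd() != fd_at_send_time {
        // The child was relaunched after our request was sent; its new stdout
        // carries some other request's data, so bail out instead of reading it.
        return Err(Error::new(ErrorKind::BrokenPipe, "WAL redo process restarted"));
    }
    stdout.read(buf)
}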
// - #[instrument(skip_all,fields(tenant_id=%tenant_id, pg_version=pg_version))] + #[instrument(skip_all,fields(tenant_id=%self.tenant_id, pg_version=pg_version))] fn launch( - conf: &PageServerConf, - tenant_id: TenantId, + &self, + input: &mut MutexGuard>, pg_version: u32, - ) -> Result { + ) -> Result<(), Error> { // FIXME: We need a dummy Postgres cluster to run the process in. Currently, we // just create one with constant name. That fails if you try to launch more than // one WAL redo manager concurrently. let datadir = path_with_suffix_extension( - conf.tenant_path(&tenant_id).join("wal-redo-datadir"), + self.conf + .tenant_path(&self.tenant_id) + .join("wal-redo-datadir"), TEMP_FILE_SUFFIX, ); @@ -634,10 +655,12 @@ impl PostgresRedoProcess { ) })?; } - let pg_bin_dir_path = conf + let pg_bin_dir_path = self + .conf .pg_bin_dir(pg_version) .map_err(|e| Error::new(ErrorKind::Other, format!("incorrect pg_bin_dir path: {e}")))?; - let pg_lib_dir_path = conf + let pg_lib_dir_path = self + .conf .pg_lib_dir(pg_version) .map_err(|e| Error::new(ErrorKind::Other, format!("incorrect pg_lib_dir path: {e}")))?; @@ -723,27 +746,31 @@ impl PostgresRedoProcess { // all fallible operations post-spawn are complete, so get rid of the guard let child = scopeguard::ScopeGuard::into_inner(child); - Ok(PostgresRedoProcess { - tenant_id, + **input = Some(ProcessInput { child, + stdout_fd: stdout.as_raw_fd(), + stderr_fd: stderr.as_raw_fd(), stdin, + n_requests: 0, + }); + + *self.stdout.lock().unwrap() = Some(ProcessOutput { stdout, - stderr, - }) + pending_responses: VecDeque::new(), + n_processed_responses: 0, + }); + *self.stderr.lock().unwrap() = Some(stderr); + + Ok(()) } - #[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%self.child.id()))] - fn kill(self) { - self.child.kill_and_wait(); - } - - // // Apply given WAL records ('records') over an old page image. Returns // new page image. // - #[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%self.child.id()))] + #[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%input.as_ref().unwrap().child.id()))] fn apply_wal_records( - &mut self, + &self, + mut input: MutexGuard>, tag: BufferTag, base_img: Option, records: &[(Lsn, NeonWalRecord)], @@ -780,33 +807,23 @@ impl PostgresRedoProcess { build_get_page_msg(tag, &mut writebuf); WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64); - // The input is now in 'writebuf'. Do a blind write first, writing as much as - // we can, before calling poll(). That skips one call to poll() if the stdin is - // already available for writing, which it almost certainly is because the - // process is idle. - let mut nwrite = self.stdin.write(&writebuf)?; - - // We expect the WAL redo process to respond with an 8k page image. We read it - // into this buffer. 
- let mut resultbuf = vec![0; BLCKSZ.into()]; - let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far + let proc = input.as_mut().unwrap(); + let mut nwrite = 0usize; + let stdout_fd = proc.stdout_fd; // Prepare for calling poll() let mut pollfds = [ - PollFd::new(self.stdout.as_raw_fd(), PollFlags::POLLIN), - PollFd::new(self.stderr.as_raw_fd(), PollFlags::POLLIN), - PollFd::new(self.stdin.as_raw_fd(), PollFlags::POLLOUT), + PollFd::new(proc.stdin.as_raw_fd(), PollFlags::POLLOUT), + PollFd::new(proc.stderr_fd, PollFlags::POLLIN), + PollFd::new(stdout_fd, PollFlags::POLLIN), ]; - // We do three things simultaneously: send the old base image and WAL records to - // the child process's stdin, read the result from child's stdout, and forward any logging + // We do two things simultaneously: send the old base image and WAL records to + // the child process's stdin and forward any logging // information that the child writes to its stderr to the page server's log. - while nresult < BLCKSZ.into() { - // If we have more data to write, wake up if 'stdin' becomes writeable or - // we have data to read. Otherwise only wake up if there's data to read. - let nfds = if nwrite < writebuf.len() { 3 } else { 2 }; + while nwrite < writebuf.len() { let n = loop { - match nix::poll::poll(&mut pollfds[0..nfds], wal_redo_timeout.as_millis() as i32) { + match nix::poll::poll(&mut pollfds[0..2], wal_redo_timeout.as_millis() as i32) { Err(e) if e == nix::errno::Errno::EINTR => continue, res => break res, } @@ -820,14 +837,16 @@ impl PostgresRedoProcess { let err_revents = pollfds[1].revents().unwrap(); if err_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() { let mut errbuf: [u8; 16384] = [0; 16384]; - let n = self.stderr.read(&mut errbuf)?; + let mut stderr_guard = self.stderr.lock().unwrap(); + let stderr = stderr_guard.as_mut().unwrap(); + let len = stderr.read(&mut errbuf)?; // The message might not be split correctly into lines here. But this is // good enough, the important thing is to get the message to the log. - if n > 0 { + if len > 0 { error!( "wal-redo-postgres: {}", - String::from_utf8_lossy(&errbuf[0..n]) + String::from_utf8_lossy(&errbuf[0..len]) ); // To make sure we capture all log from the process if it fails, keep @@ -841,33 +860,157 @@ impl PostgresRedoProcess { )); } - // If we have more data to write and 'stdin' is writeable, do write. - if nwrite < writebuf.len() { - let in_revents = pollfds[2].revents().unwrap(); - if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() { - nwrite += self.stdin.write(&writebuf[nwrite..])?; - } else if in_revents.contains(PollFlags::POLLHUP) { - // We still have more data to write, but the process closed the pipe. - return Err(Error::new( - ErrorKind::BrokenPipe, - "WAL redo process closed its stdin unexpectedly", - )); - } - } - - // If we have some data in stdout, read it to the result buffer. - let out_revents = pollfds[0].revents().unwrap(); - if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() { - nresult += self.stdout.read(&mut resultbuf[nresult..])?; - } else if out_revents.contains(PollFlags::POLLHUP) { + // If 'stdin' is writeable, do write. + let in_revents = pollfds[0].revents().unwrap(); + if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() { + nwrite += proc.stdin.write(&writebuf[nwrite..])?; + } else if in_revents.contains(PollFlags::POLLHUP) { + // We still have more data to write, but the process closed the pipe. 
return Err(Error::new( ErrorKind::BrokenPipe, - "WAL redo process closed its stdout unexpectedly", + "WAL redo process closed its stdin unexpectedly", )); } } + let request_no = proc.n_requests; + proc.n_requests += 1; + drop(input); - Ok(Bytes::from(resultbuf)) + // To improve walredo performance we separate sending requests and receiving + // responses. They are protected by different mutexes (input and output). + // If threads T1, T2, T3 send requests D1, D2, D3 to the walredo process, + // there is no guarantee that T1 will be the first to acquire the output mutex. + // To address this we maintain the number of sent requests, the number of + // processed responses, and a ring buffer of pending responses. After sending + // a request (under the input mutex), a thread remembers its request number. + // It then releases the input mutex, locks the output mutex, and reads responses + // into the ring buffer until its own request number is covered. It then takes + // the corresponding element from the pending-responses ring buffer and truncates + // any empty elements from the front, advancing the processed-responses counter. + + let mut output_guard = self.stdout.lock().unwrap(); + let output = output_guard.as_mut().unwrap(); + if output.stdout.as_raw_fd() != stdout_fd { + // If the stdout file descriptor has changed, the walredo process has crashed and been restarted. + // Because ProcessInput and ProcessOutput are protected by different mutexes, + // it could happen that we send a request to one process and wait for the response from another. + // To prevent that situation we compare stdout file descriptors. + // Since the old stdout pipe is destroyed only after the new one is created, + // the new pipe cannot reuse the same file descriptor, so this check is safe. + // + // Cross-read this with the comment in apply_batch_postgres if result.is_err(). + // That's where we kill the child process. + return Err(Error::new( + ErrorKind::BrokenPipe, + "WAL redo process closed its stdout unexpectedly", + )); + } + let n_processed_responses = output.n_processed_responses; + while n_processed_responses + output.pending_responses.len() <= request_no { + // We expect the WAL redo process to respond with an 8k page image. We read it + // into this buffer. + let mut resultbuf = vec![0; BLCKSZ.into()]; + let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far + while nresult < BLCKSZ.into() { + // We do two things simultaneously: read the response from stdout + // and forward any logging information that the child writes to its stderr to the page server's log. + let n = loop { + match nix::poll::poll(&mut pollfds[1..3], wal_redo_timeout.as_millis() as i32) { + Err(e) if e == nix::errno::Errno::EINTR => continue, + res => break res, + } + }?; + + if n == 0 { + return Err(Error::new(ErrorKind::Other, "WAL redo timed out")); + } + + // If we have some messages in stderr, forward them to the log. + let err_revents = pollfds[1].revents().unwrap(); + if err_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() { + let mut errbuf: [u8; 16384] = [0; 16384]; + let mut stderr_guard = self.stderr.lock().unwrap(); + let stderr = stderr_guard.as_mut().unwrap(); + let len = stderr.read(&mut errbuf)?; + + // The message might not be split correctly into lines here. But this is + // good enough, the important thing is to get the message to the log.
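// A standalone sketch of the request/response pipelining scheme described in the
// comment above, reduced to just the bookkeeping (no child process, no polling).
// `Input`, `Output`, `Pipeline` and `read_next` are illustrative stand-ins, not
// the actual pageserver types; responses are Strings instead of page images.
use std::collections::VecDeque;
use std::sync::Mutex;

#[derive(Default)]
struct Input {
    n_requests: usize,
}

#[derive(Default)]
struct Output {
    pending_responses: VecDeque<Option<String>>,
    n_processed_responses: usize,
}

struct Pipeline {
    input: Mutex<Input>,
    output: Mutex<Output>,
}

impl Pipeline {
    // Send phase: under the input mutex, claim the next request number.
    // (The real code also writes the serialized request to the child's stdin here.)
    fn send(&self) -> usize {
        let mut input = self.input.lock().unwrap();
        let request_no = input.n_requests;
        input.n_requests += 1;
        request_no
    }

    // Receive phase: under the output mutex, pull responses into the ring buffer
    // until our own request number is covered, take our slot, then drain any
    // leading `None`s so the buffer does not grow without bound.
    fn receive(&self, request_no: usize, mut read_next: impl FnMut() -> String) -> String {
        let mut output = self.output.lock().unwrap();
        while output.n_processed_responses + output.pending_responses.len() <= request_no {
            let response = read_next(); // real code: poll() + read the child's stdout
            output.pending_responses.push_back(Some(response));
        }
        let idx = request_no - output.n_processed_responses;
        let res = output.pending_responses[idx]
            .take()
            .expect("we own this request_no, nobody else takes it");
        while let Some(front) = output.pending_responses.front() {
            if front.is_none() {
                output.pending_responses.pop_front();
                output.n_processed_responses += 1;
            } else {
                break;
            }
        }
        res
    }
}
// Because a response slot is taken exactly once, by the thread that owns the
// request number, a thread that grabs the output mutex "too early" simply reads
// and buffers other threads' responses without consuming them.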
+ if len > 0 { + error!( + "wal-redo-postgres: {}", + String::from_utf8_lossy(&errbuf[0..len]) + ); + + // To make sure we capture all log output from the process if it fails, keep + // reading from the stderr, before checking the stdout. + continue; + } + } else if err_revents.contains(PollFlags::POLLHUP) { + return Err(Error::new( + ErrorKind::BrokenPipe, + "WAL redo process closed its stderr unexpectedly", + )); + } + + // If we have some data in stdout, read it to the result buffer. + let out_revents = pollfds[2].revents().unwrap(); + if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() { + nresult += output.stdout.read(&mut resultbuf[nresult..])?; + } else if out_revents.contains(PollFlags::POLLHUP) { + return Err(Error::new( + ErrorKind::BrokenPipe, + "WAL redo process closed its stdout unexpectedly", + )); + } + } + output + .pending_responses + .push_back(Some(Bytes::from(resultbuf))); + } + // Replace our request's response with None in `pending_responses`. + // Then make space in the ring buffer by clearing out any sequence of contiguous + // `None`'s from the front of `pending_responses`. + // NB: We can't pop_front() other requests' responses, because another + // requester might have grabbed the output mutex before us: + // T1: grab input mutex + // T1: send request_no 23 + // T1: release input mutex + // T2: grab input mutex + // T2: send request_no 24 + // T2: release input mutex + // T2: grab output mutex + // T2: n_processed_responses + output.pending_responses.len() <= request_no + // 23 0 24 + // T2: enters poll loop that reads stdout + // T2: put response for 23 into pending_responses + // T2: put response for 24 into pending_responses + // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back + // T2: takes its response_24 + // pending_responses now looks like this: Front Some(response_23) None Back + // T2: does the while loop below + // pending_responses now looks like this: Front Some(response_23) None Back + // T2: releases output mutex + // T1: grabs output mutex + // T1: n_processed_responses + output.pending_responses.len() > request_no + // 23 2 23 + // T1: skips poll loop that reads stdout + // T1: takes its response_23 + // pending_responses now looks like this: Front None None Back + // T1: does the while loop below + // pending_responses now looks like this: Front Back + // n_processed_responses now has value 25 + let res = output.pending_responses[request_no - n_processed_responses] + .take() + .expect("we own this request_no, nobody else is supposed to take it"); + while let Some(front) = output.pending_responses.front() { + if front.is_none() { + output.pending_responses.pop_front(); + output.n_processed_responses += 1; + } else { + break; + } + } + Ok(res) } } diff --git a/poetry.lock b/poetry.lock index edbcddd576..fc37124184 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,3 +1,21 @@ +[[package]] +name = "aiohttp" +version = "3.7.0" +description = "Async http client/server framework (asyncio)" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +async-timeout = ">=3.0,<4.0" +attrs = ">=17.3.0" +chardet = ">=2.0,<4.0" +multidict = ">=4.5,<7.0" +yarl = ">=1.0,<2.0" + +[package.extras] +speedups = ["aiodns", "brotlipy", "cchardet"] + [[package]] name = "aiopg" version = "1.3.4" @@ -41,11 +59,11 @@ six = ">=1.9.0" [[package]] name = "async-timeout" -version = "4.0.2" +version = "3.0.1" description = "Timeout context manager for asyncio programs" category = 
"main" optional = false -python-versions = ">=3.6" +python-versions = ">=3.5.3" [[package]] name = "asyncpg" @@ -560,6 +578,14 @@ networkx = ">=2.4,<3.0" pyyaml = ">5.4" sarif-om = ">=1.0.4,<1.1.0" +[[package]] +name = "chardet" +version = "3.0.4" +description = "Universal encoding detector for Python 2 and 3" +category = "main" +optional = false +python-versions = "*" + [[package]] name = "charset-normalizer" version = "2.1.0" @@ -939,6 +965,14 @@ server = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.4.0)" ssm = ["PyYAML (>=5.1)", "dataclasses"] xray = ["aws-xray-sdk (>=0.93,!=0.96)", "setuptools"] +[[package]] +name = "multidict" +version = "6.0.4" +description = "multidict implementation" +category = "main" +optional = false +python-versions = ">=3.7" + [[package]] name = "mypy" version = "0.991" @@ -1580,6 +1614,18 @@ category = "main" optional = false python-versions = ">=3.4" +[[package]] +name = "yarl" +version = "1.8.2" +description = "Yet another URL library" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +idna = ">=2.0" +multidict = ">=4.0" + [[package]] name = "zipp" version = "3.8.1" @@ -1595,9 +1641,44 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>= [metadata] lock-version = "1.1" python-versions = "^3.9" -content-hash = "af44b269c235a6fd59dacb4ff9e05cbc13a79b57254a8d5d4bde934bd5691a70" +content-hash = "0f7289ef9439d1d7cd36b07efb53741b773669b0f860189c800270b7def0c241" [metadata.files] +aiohttp = [ + {file = "aiohttp-3.7.0-cp36-cp36m-macosx_10_14_x86_64.whl", hash = "sha256:72fe89f7e14939e896d984c4b592580f8cdfa7497feb1c0c24639a9c60be3eb9"}, + {file = "aiohttp-3.7.0-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:fdf778d4c4bf976e69a37213fe8083613d0851976ddcf485bd7c0650a43d3852"}, + {file = "aiohttp-3.7.0-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:fee7b5e68939ffc09f9b29f167ed49c8b50de3eee0a1d8108b439ddd9963af46"}, + {file = "aiohttp-3.7.0-cp36-cp36m-manylinux2014_i686.whl", hash = "sha256:dd64634713be409202058f2ea267dfbcdd74b387b8793425f21ef0266d45d0e9"}, + {file = "aiohttp-3.7.0-cp36-cp36m-manylinux2014_ppc64le.whl", hash = "sha256:713dd7fd70ddda9dc8d014c49dd0e55b58afe4e0cddb8722c7501f53edf30c3f"}, + {file = "aiohttp-3.7.0-cp36-cp36m-manylinux2014_s390x.whl", hash = "sha256:d31c43f7c4948ce01957f9a1ceee0784e067778477557ebccdf805398331c1a1"}, + {file = "aiohttp-3.7.0-cp36-cp36m-manylinux2014_x86_64.whl", hash = "sha256:5e26d6003eb6df304608d9fd9c9437065a8532d869a3ffcbd8113a3d710f8239"}, + {file = "aiohttp-3.7.0-cp36-cp36m-win_amd64.whl", hash = "sha256:bf08462cddd10ddd8ffe5cb5c1638bfa051290909ebedb31c06e46578b9b7529"}, + {file = "aiohttp-3.7.0-cp37-cp37m-macosx_10_14_x86_64.whl", hash = "sha256:07bacf6721db51a4c6160ed3031a2a97910647969dafd7c653f600f3b542f463"}, + {file = "aiohttp-3.7.0-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:245b58e30bc889d18b783db2f09ef1d814f466e15c84325410827451297003a0"}, + {file = "aiohttp-3.7.0-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:b392e5c3e122586c49cd8b9426f577bf4d51958933b839d158d28b69515af74e"}, + {file = "aiohttp-3.7.0-cp37-cp37m-manylinux2014_i686.whl", hash = "sha256:5b5c320621a171aa85f96909af28fbb5286bd6842066db3062b083ba92261256"}, + {file = "aiohttp-3.7.0-cp37-cp37m-manylinux2014_ppc64le.whl", hash = "sha256:97d2341d1360dbe2c5b1d94922f7d68f9ce2ded1daab88b9bdeb49ce419cdc1b"}, + {file = "aiohttp-3.7.0-cp37-cp37m-manylinux2014_s390x.whl", hash = "sha256:beda23f292716887532661dc19abb9db2302ccfbd671a080cd8f4be7463d0841"}, + 
{file = "aiohttp-3.7.0-cp37-cp37m-manylinux2014_x86_64.whl", hash = "sha256:cbcaae9a6f14f762348d19b2dce8162772c0b0a1739314e18492a308a22caf96"}, + {file = "aiohttp-3.7.0-cp37-cp37m-win_amd64.whl", hash = "sha256:7a49ef7b691babc83db126db874fbf26ba2f781899b91399f9ff8b235f059245"}, + {file = "aiohttp-3.7.0-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:f56892f57310415cf6a179eec3ea6c7a82a9d37fbc00894943ea3154011a6d2a"}, + {file = "aiohttp-3.7.0-cp38-cp38-manylinux1_i686.whl", hash = "sha256:df1274b7620c32d3b15bfb0a8fb3165dd6cdc9c39f4db74d162f051c80826542"}, + {file = "aiohttp-3.7.0-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:a04ba359dc5f2e21b96bfc90c4a7665441441ba61b52e992b7799493889a3419"}, + {file = "aiohttp-3.7.0-cp38-cp38-manylinux2014_i686.whl", hash = "sha256:f548d7976d168f0f45ac5909ca5f606ae3f6f7aa1725b22504004a053b29a7d0"}, + {file = "aiohttp-3.7.0-cp38-cp38-manylinux2014_ppc64le.whl", hash = "sha256:deef02e2a9f5095463098c7c22d5566f20a6e4e14fc0996c0c2efc74d461b680"}, + {file = "aiohttp-3.7.0-cp38-cp38-manylinux2014_s390x.whl", hash = "sha256:fe44c96bc380588d36729392b602470d88a7c18e646e95dd4348cafe3900d91d"}, + {file = "aiohttp-3.7.0-cp38-cp38-manylinux2014_x86_64.whl", hash = "sha256:9210532e6e95b40d22a33415bb84423eef3f633b2d2339b97f3b26438eebc466"}, + {file = "aiohttp-3.7.0-cp38-cp38-win_amd64.whl", hash = "sha256:a586e476a251483d222c73dfb2f27df90bc4ea1b8c7da9396236510e0d4046c8"}, + {file = "aiohttp-3.7.0-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:900012c5f12ff72b1453229afe288ddc9135176df8b3b3cc5b8f6cfde912aaa4"}, + {file = "aiohttp-3.7.0-cp39-cp39-manylinux1_i686.whl", hash = "sha256:064d5f0738bcbab3e0c0ecf85c93b5ee1e07e124f994eaa03bf73687f3ecd9da"}, + {file = "aiohttp-3.7.0-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:0a2edf27865e66a33f64fa793cd14d0aae8127ce20a858539e97c25b600556dc"}, + {file = "aiohttp-3.7.0-cp39-cp39-manylinux2014_i686.whl", hash = "sha256:eaa8ae734639d5a0a3b5e33a154b8bfef384cdc090706f95c387cae8b21af764"}, + {file = "aiohttp-3.7.0-cp39-cp39-manylinux2014_ppc64le.whl", hash = "sha256:a8a42f05491d9c04a77806875a68f84fea9af7a59d47b7897cb166632f74606c"}, + {file = "aiohttp-3.7.0-cp39-cp39-manylinux2014_s390x.whl", hash = "sha256:b19ded3f6957693b97ba8372aacb5b0021639bbd5e77b1e960796bcef5431969"}, + {file = "aiohttp-3.7.0-cp39-cp39-manylinux2014_x86_64.whl", hash = "sha256:cefbd7ce7d1f1db43749a077e4970e29e2b631f367c9eff3862c3c886b4218dd"}, + {file = "aiohttp-3.7.0-cp39-cp39-win_amd64.whl", hash = "sha256:7d64f7dfd4e326d9b0d11b07fcd5ebf78844ba3c8f7699f38b50b0e0db0ae68f"}, + {file = "aiohttp-3.7.0.tar.gz", hash = "sha256:176f1d2b2bc07044f4ed583216578a72a2bd35dffdeb92e0517d0aaa29d29549"}, +] aiopg = [ {file = "aiopg-1.3.4-py3-none-any.whl", hash = "sha256:b5b74a124831aad71608c3c203479db90bac4a7eb3f8982bc48c3d3e6f1e57bf"}, {file = "aiopg-1.3.4.tar.gz", hash = "sha256:23f9e4cd9f28e9d91a6de3b4fb517e8bed25511cd954acccba9fe3a702d9b7d0"}, @@ -1611,8 +1692,8 @@ allure-python-commons = [ {file = "allure_python_commons-2.10.0-py3-none-any.whl", hash = "sha256:2a717e8ca8d296bf89cd57f38fc3c21893bd7ea8cd02a6ae5420e6d1a6eda5d0"}, ] async-timeout = [ - {file = "async-timeout-4.0.2.tar.gz", hash = "sha256:2163e1640ddb52b7a8c80d0a67a08587e5d245cc9c553a74a847056bc2976b15"}, - {file = "async_timeout-4.0.2-py3-none-any.whl", hash = "sha256:8ca1e4fcf50d07413d66d1a5e416e42cfdf5851c981d679a09851a6853383b3c"}, + {file = "async-timeout-3.0.1.tar.gz", hash = "sha256:0c3c816a028d47f659d6ff5c745cb2acf1f966da1fe5c19c77a70282b25f4c5f"}, + {file = 
"async_timeout-3.0.1-py3-none-any.whl", hash = "sha256:4291ca197d287d274d0b6cb5d6f8f8f82d434ed288f962539ff18cc9012f9ea3"}, ] asyncpg = [ {file = "asyncpg-0.27.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:fca608d199ffed4903dce1bcd97ad0fe8260f405c1c225bdf0002709132171c2"}, @@ -1787,6 +1868,10 @@ cfn-lint = [ {file = "cfn-lint-0.61.3.tar.gz", hash = "sha256:3806e010d77901f5e935496df690c10e39676434a738fce1a1161cf9c7bd36a2"}, {file = "cfn_lint-0.61.3-py3-none-any.whl", hash = "sha256:8e9522fad0c7c98b31ecbdd4724f8d8a5787457cc0f71e62ae0d11104d6e52ab"}, ] +chardet = [ + {file = "chardet-3.0.4-py2.py3-none-any.whl", hash = "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"}, + {file = "chardet-3.0.4.tar.gz", hash = "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae"}, +] charset-normalizer = [ {file = "charset-normalizer-2.1.0.tar.gz", hash = "sha256:575e708016ff3a5e3681541cb9d79312c416835686d054a23accb873b254f413"}, {file = "charset_normalizer-2.1.0-py3-none-any.whl", hash = "sha256:5189b6f22b01957427f35b6a08d9a0bc45b46d3788ef5a92e978433c7a35f8a5"}, @@ -1960,6 +2045,82 @@ moto = [ {file = "moto-3.1.18-py3-none-any.whl", hash = "sha256:b6eb096e7880c46ac44d6d90988c0043e31462115cfdc913a0ee8f470bd9555c"}, {file = "moto-3.1.18.tar.gz", hash = "sha256:1e05276a62aa5a4aa821b441647c2cbaa2ea175388980b10d5de88d41b327cf7"}, ] +multidict = [ + {file = "multidict-6.0.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:0b1a97283e0c85772d613878028fec909f003993e1007eafa715b24b377cb9b8"}, + {file = "multidict-6.0.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:eeb6dcc05e911516ae3d1f207d4b0520d07f54484c49dfc294d6e7d63b734171"}, + {file = "multidict-6.0.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d6d635d5209b82a3492508cf5b365f3446afb65ae7ebd755e70e18f287b0adf7"}, + {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c048099e4c9e9d615545e2001d3d8a4380bd403e1a0578734e0d31703d1b0c0b"}, + {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ea20853c6dbbb53ed34cb4d080382169b6f4554d394015f1bef35e881bf83547"}, + {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:16d232d4e5396c2efbbf4f6d4df89bfa905eb0d4dc5b3549d872ab898451f569"}, + {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:36c63aaa167f6c6b04ef2c85704e93af16c11d20de1d133e39de6a0e84582a93"}, + {file = "multidict-6.0.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:64bdf1086b6043bf519869678f5f2757f473dee970d7abf6da91ec00acb9cb98"}, + {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:43644e38f42e3af682690876cff722d301ac585c5b9e1eacc013b7a3f7b696a0"}, + {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:7582a1d1030e15422262de9f58711774e02fa80df0d1578995c76214f6954988"}, + {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:ddff9c4e225a63a5afab9dd15590432c22e8057e1a9a13d28ed128ecf047bbdc"}, + {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:ee2a1ece51b9b9e7752e742cfb661d2a29e7bcdba2d27e66e28a99f1890e4fa0"}, + {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a2e4369eb3d47d2034032a26c7a80fcb21a2cb22e1173d761a162f11e562caa5"}, + {file = "multidict-6.0.4-cp310-cp310-win32.whl", hash = 
"sha256:574b7eae1ab267e5f8285f0fe881f17efe4b98c39a40858247720935b893bba8"}, + {file = "multidict-6.0.4-cp310-cp310-win_amd64.whl", hash = "sha256:4dcbb0906e38440fa3e325df2359ac6cb043df8e58c965bb45f4e406ecb162cc"}, + {file = "multidict-6.0.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:0dfad7a5a1e39c53ed00d2dd0c2e36aed4650936dc18fd9a1826a5ae1cad6f03"}, + {file = "multidict-6.0.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:64da238a09d6039e3bd39bb3aee9c21a5e34f28bfa5aa22518581f910ff94af3"}, + {file = "multidict-6.0.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ff959bee35038c4624250473988b24f846cbeb2c6639de3602c073f10410ceba"}, + {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:01a3a55bd90018c9c080fbb0b9f4891db37d148a0a18722b42f94694f8b6d4c9"}, + {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c5cb09abb18c1ea940fb99360ea0396f34d46566f157122c92dfa069d3e0e982"}, + {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:666daae833559deb2d609afa4490b85830ab0dfca811a98b70a205621a6109fe"}, + {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:11bdf3f5e1518b24530b8241529d2050014c884cf18b6fc69c0c2b30ca248710"}, + {file = "multidict-6.0.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7d18748f2d30f94f498e852c67d61261c643b349b9d2a581131725595c45ec6c"}, + {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:458f37be2d9e4c95e2d8866a851663cbc76e865b78395090786f6cd9b3bbf4f4"}, + {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:b1a2eeedcead3a41694130495593a559a668f382eee0727352b9a41e1c45759a"}, + {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:7d6ae9d593ef8641544d6263c7fa6408cc90370c8cb2bbb65f8d43e5b0351d9c"}, + {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:5979b5632c3e3534e42ca6ff856bb24b2e3071b37861c2c727ce220d80eee9ed"}, + {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:dcfe792765fab89c365123c81046ad4103fcabbc4f56d1c1997e6715e8015461"}, + {file = "multidict-6.0.4-cp311-cp311-win32.whl", hash = "sha256:3601a3cece3819534b11d4efc1eb76047488fddd0c85a3948099d5da4d504636"}, + {file = "multidict-6.0.4-cp311-cp311-win_amd64.whl", hash = "sha256:81a4f0b34bd92df3da93315c6a59034df95866014ac08535fc819f043bfd51f0"}, + {file = "multidict-6.0.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:67040058f37a2a51ed8ea8f6b0e6ee5bd78ca67f169ce6122f3e2ec80dfe9b78"}, + {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:853888594621e6604c978ce2a0444a1e6e70c8d253ab65ba11657659dcc9100f"}, + {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:39ff62e7d0f26c248b15e364517a72932a611a9b75f35b45be078d81bdb86603"}, + {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:af048912e045a2dc732847d33821a9d84ba553f5c5f028adbd364dd4765092ac"}, + {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b1e8b901e607795ec06c9e42530788c45ac21ef3aaa11dbd0c69de543bfb79a9"}, + {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:62501642008a8b9871ddfccbf83e4222cf8ac0d5aeedf73da36153ef2ec222d2"}, + {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:99b76c052e9f1bc0721f7541e5e8c05db3941eb9ebe7b8553c625ef88d6eefde"}, + {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:509eac6cf09c794aa27bcacfd4d62c885cce62bef7b2c3e8b2e49d365b5003fe"}, + {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:21a12c4eb6ddc9952c415f24eef97e3e55ba3af61f67c7bc388dcdec1404a067"}, + {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:5cad9430ab3e2e4fa4a2ef4450f548768400a2ac635841bc2a56a2052cdbeb87"}, + {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:ab55edc2e84460694295f401215f4a58597f8f7c9466faec545093045476327d"}, + {file = "multidict-6.0.4-cp37-cp37m-win32.whl", hash = "sha256:5a4dcf02b908c3b8b17a45fb0f15b695bf117a67b76b7ad18b73cf8e92608775"}, + {file = "multidict-6.0.4-cp37-cp37m-win_amd64.whl", hash = "sha256:6ed5f161328b7df384d71b07317f4d8656434e34591f20552c7bcef27b0ab88e"}, + {file = "multidict-6.0.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5fc1b16f586f049820c5c5b17bb4ee7583092fa0d1c4e28b5239181ff9532e0c"}, + {file = "multidict-6.0.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1502e24330eb681bdaa3eb70d6358e818e8e8f908a22a1851dfd4e15bc2f8161"}, + {file = "multidict-6.0.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b692f419760c0e65d060959df05f2a531945af31fda0c8a3b3195d4efd06de11"}, + {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45e1ecb0379bfaab5eef059f50115b54571acfbe422a14f668fc8c27ba410e7e"}, + {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ddd3915998d93fbcd2566ddf9cf62cdb35c9e093075f862935573d265cf8f65d"}, + {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:59d43b61c59d82f2effb39a93c48b845efe23a3852d201ed2d24ba830d0b4cf2"}, + {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cc8e1d0c705233c5dd0c5e6460fbad7827d5d36f310a0fadfd45cc3029762258"}, + {file = "multidict-6.0.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d6aa0418fcc838522256761b3415822626f866758ee0bc6632c9486b179d0b52"}, + {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:6748717bb10339c4760c1e63da040f5f29f5ed6e59d76daee30305894069a660"}, + {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:4d1a3d7ef5e96b1c9e92f973e43aa5e5b96c659c9bc3124acbbd81b0b9c8a951"}, + {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:4372381634485bec7e46718edc71528024fcdc6f835baefe517b34a33c731d60"}, + {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:fc35cb4676846ef752816d5be2193a1e8367b4c1397b74a565a9d0389c433a1d"}, + {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:4b9d9e4e2b37daddb5c23ea33a3417901fa7c7b3dee2d855f63ee67a0b21e5b1"}, + {file = "multidict-6.0.4-cp38-cp38-win32.whl", hash = "sha256:e41b7e2b59679edfa309e8db64fdf22399eec4b0b24694e1b2104fb789207779"}, + {file = "multidict-6.0.4-cp38-cp38-win_amd64.whl", hash = "sha256:d6c254ba6e45d8e72739281ebc46ea5eb5f101234f3ce171f0e9f5cc86991480"}, + {file = "multidict-6.0.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:16ab77bbeb596e14212e7bab8429f24c1579234a3a462105cda4a66904998664"}, + 
{file = "multidict-6.0.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:bc779e9e6f7fda81b3f9aa58e3a6091d49ad528b11ed19f6621408806204ad35"}, + {file = "multidict-6.0.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:4ceef517eca3e03c1cceb22030a3e39cb399ac86bff4e426d4fc6ae49052cc60"}, + {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:281af09f488903fde97923c7744bb001a9b23b039a909460d0f14edc7bf59706"}, + {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:52f2dffc8acaba9a2f27174c41c9e57f60b907bb9f096b36b1a1f3be71c6284d"}, + {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b41156839806aecb3641f3208c0dafd3ac7775b9c4c422d82ee2a45c34ba81ca"}, + {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d5e3fc56f88cc98ef8139255cf8cd63eb2c586531e43310ff859d6bb3a6b51f1"}, + {file = "multidict-6.0.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8316a77808c501004802f9beebde51c9f857054a0c871bd6da8280e718444449"}, + {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:f70b98cd94886b49d91170ef23ec5c0e8ebb6f242d734ed7ed677b24d50c82cf"}, + {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:bf6774e60d67a9efe02b3616fee22441d86fab4c6d335f9d2051d19d90a40063"}, + {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:e69924bfcdda39b722ef4d9aa762b2dd38e4632b3641b1d9a57ca9cd18f2f83a"}, + {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:6b181d8c23da913d4ff585afd1155a0e1194c0b50c54fcfe286f70cdaf2b7176"}, + {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:52509b5be062d9eafc8170e53026fbc54cf3b32759a23d07fd935fb04fc22d95"}, + {file = "multidict-6.0.4-cp39-cp39-win32.whl", hash = "sha256:27c523fbfbdfd19c6867af7346332b62b586eed663887392cff78d614f9ec313"}, + {file = "multidict-6.0.4-cp39-cp39-win_amd64.whl", hash = "sha256:33029f5734336aa0d4c0384525da0387ef89148dc7191aae00ca5fb23d7aafc2"}, + {file = "multidict-6.0.4.tar.gz", hash = "sha256:3666906492efb76453c0e7b97f2cf459b0682e7402c0489a95484965dbc1da49"}, +] mypy = [ {file = "mypy-0.991-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7d17e0a9707d0772f4a7b878f04b4fd11f6f5bcb9b3813975a9b13c9332153ab"}, {file = "mypy-0.991-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0714258640194d75677e86c786e80ccf294972cc76885d3ebbb560f11db0003d"}, @@ -2412,6 +2573,82 @@ xmltodict = [ {file = "xmltodict-0.13.0-py2.py3-none-any.whl", hash = "sha256:aa89e8fd76320154a40d19a0df04a4695fb9dc5ba977cbb68ab3e4eb225e7852"}, {file = "xmltodict-0.13.0.tar.gz", hash = "sha256:341595a488e3e01a85a9d8911d8912fd922ede5fecc4dce437eb4b6c8d037e56"}, ] +yarl = [ + {file = "yarl-1.8.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:bb81f753c815f6b8e2ddd2eef3c855cf7da193b82396ac013c661aaa6cc6b0a5"}, + {file = "yarl-1.8.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:47d49ac96156f0928f002e2424299b2c91d9db73e08c4cd6742923a086f1c863"}, + {file = "yarl-1.8.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3fc056e35fa6fba63248d93ff6e672c096f95f7836938241ebc8260e062832fe"}, + {file = "yarl-1.8.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:58a3c13d1c3005dbbac5c9f0d3210b60220a65a999b1833aa46bd6677c69b08e"}, + {file = 
"yarl-1.8.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:10b08293cda921157f1e7c2790999d903b3fd28cd5c208cf8826b3b508026996"}, + {file = "yarl-1.8.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:de986979bbd87272fe557e0a8fcb66fd40ae2ddfe28a8b1ce4eae22681728fef"}, + {file = "yarl-1.8.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c4fcfa71e2c6a3cb568cf81aadc12768b9995323186a10827beccf5fa23d4f8"}, + {file = "yarl-1.8.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ae4d7ff1049f36accde9e1ef7301912a751e5bae0a9d142459646114c70ecba6"}, + {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:bf071f797aec5b96abfc735ab97da9fd8f8768b43ce2abd85356a3127909d146"}, + {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:74dece2bfc60f0f70907c34b857ee98f2c6dd0f75185db133770cd67300d505f"}, + {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:df60a94d332158b444301c7f569659c926168e4d4aad2cfbf4bce0e8fb8be826"}, + {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:63243b21c6e28ec2375f932a10ce7eda65139b5b854c0f6b82ed945ba526bff3"}, + {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:cfa2bbca929aa742b5084fd4663dd4b87c191c844326fcb21c3afd2d11497f80"}, + {file = "yarl-1.8.2-cp310-cp310-win32.whl", hash = "sha256:b05df9ea7496df11b710081bd90ecc3a3db6adb4fee36f6a411e7bc91a18aa42"}, + {file = "yarl-1.8.2-cp310-cp310-win_amd64.whl", hash = "sha256:24ad1d10c9db1953291f56b5fe76203977f1ed05f82d09ec97acb623a7976574"}, + {file = "yarl-1.8.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:2a1fca9588f360036242f379bfea2b8b44cae2721859b1c56d033adfd5893634"}, + {file = "yarl-1.8.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f37db05c6051eff17bc832914fe46869f8849de5b92dc4a3466cd63095d23dfd"}, + {file = "yarl-1.8.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:77e913b846a6b9c5f767b14dc1e759e5aff05502fe73079f6f4176359d832581"}, + {file = "yarl-1.8.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0978f29222e649c351b173da2b9b4665ad1feb8d1daa9d971eb90df08702668a"}, + {file = "yarl-1.8.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:388a45dc77198b2460eac0aca1efd6a7c09e976ee768b0d5109173e521a19daf"}, + {file = "yarl-1.8.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2305517e332a862ef75be8fad3606ea10108662bc6fe08509d5ca99503ac2aee"}, + {file = "yarl-1.8.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:42430ff511571940d51e75cf42f1e4dbdded477e71c1b7a17f4da76c1da8ea76"}, + {file = "yarl-1.8.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3150078118f62371375e1e69b13b48288e44f6691c1069340081c3fd12c94d5b"}, + {file = "yarl-1.8.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:c15163b6125db87c8f53c98baa5e785782078fbd2dbeaa04c6141935eb6dab7a"}, + {file = "yarl-1.8.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4d04acba75c72e6eb90745447d69f84e6c9056390f7a9724605ca9c56b4afcc6"}, + {file = "yarl-1.8.2-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:e7fd20d6576c10306dea2d6a5765f46f0ac5d6f53436217913e952d19237efc4"}, + {file = "yarl-1.8.2-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:75c16b2a900b3536dfc7014905a128a2bea8fb01f9ee26d2d7d8db0a08e7cb2c"}, + {file = 
"yarl-1.8.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:6d88056a04860a98341a0cf53e950e3ac9f4e51d1b6f61a53b0609df342cc8b2"}, + {file = "yarl-1.8.2-cp311-cp311-win32.whl", hash = "sha256:fb742dcdd5eec9f26b61224c23baea46c9055cf16f62475e11b9b15dfd5c117b"}, + {file = "yarl-1.8.2-cp311-cp311-win_amd64.whl", hash = "sha256:8c46d3d89902c393a1d1e243ac847e0442d0196bbd81aecc94fcebbc2fd5857c"}, + {file = "yarl-1.8.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:ceff9722e0df2e0a9e8a79c610842004fa54e5b309fe6d218e47cd52f791d7ef"}, + {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3f6b4aca43b602ba0f1459de647af954769919c4714706be36af670a5f44c9c1"}, + {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1684a9bd9077e922300ecd48003ddae7a7474e0412bea38d4631443a91d61077"}, + {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ebb78745273e51b9832ef90c0898501006670d6e059f2cdb0e999494eb1450c2"}, + {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3adeef150d528ded2a8e734ebf9ae2e658f4c49bf413f5f157a470e17a4a2e89"}, + {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:57a7c87927a468e5a1dc60c17caf9597161d66457a34273ab1760219953f7f4c"}, + {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:efff27bd8cbe1f9bd127e7894942ccc20c857aa8b5a0327874f30201e5ce83d0"}, + {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:a783cd344113cb88c5ff7ca32f1f16532a6f2142185147822187913eb989f739"}, + {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:705227dccbe96ab02c7cb2c43e1228e2826e7ead880bb19ec94ef279e9555b5b"}, + {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:34c09b43bd538bf6c4b891ecce94b6fa4f1f10663a8d4ca589a079a5018f6ed7"}, + {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:a48f4f7fea9a51098b02209d90297ac324241bf37ff6be6d2b0149ab2bd51b37"}, + {file = "yarl-1.8.2-cp37-cp37m-win32.whl", hash = "sha256:0414fd91ce0b763d4eadb4456795b307a71524dbacd015c657bb2a39db2eab89"}, + {file = "yarl-1.8.2-cp37-cp37m-win_amd64.whl", hash = "sha256:d881d152ae0007809c2c02e22aa534e702f12071e6b285e90945aa3c376463c5"}, + {file = "yarl-1.8.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5df5e3d04101c1e5c3b1d69710b0574171cc02fddc4b23d1b2813e75f35a30b1"}, + {file = "yarl-1.8.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:7a66c506ec67eb3159eea5096acd05f5e788ceec7b96087d30c7d2865a243918"}, + {file = "yarl-1.8.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:2b4fa2606adf392051d990c3b3877d768771adc3faf2e117b9de7eb977741229"}, + {file = "yarl-1.8.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1e21fb44e1eff06dd6ef971d4bdc611807d6bd3691223d9c01a18cec3677939e"}, + {file = "yarl-1.8.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:93202666046d9edadfe9f2e7bf5e0782ea0d497b6d63da322e541665d65a044e"}, + {file = "yarl-1.8.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fc77086ce244453e074e445104f0ecb27530d6fd3a46698e33f6c38951d5a0f1"}, + {file = "yarl-1.8.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:64dd68a92cab699a233641f5929a40f02a4ede8c009068ca8aa1fe87b8c20ae3"}, + {file = 
"yarl-1.8.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1b372aad2b5f81db66ee7ec085cbad72c4da660d994e8e590c997e9b01e44901"}, + {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:e6f3515aafe0209dd17fb9bdd3b4e892963370b3de781f53e1746a521fb39fc0"}, + {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:dfef7350ee369197106805e193d420b75467b6cceac646ea5ed3049fcc950a05"}, + {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:728be34f70a190566d20aa13dc1f01dc44b6aa74580e10a3fb159691bc76909d"}, + {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:ff205b58dc2929191f68162633d5e10e8044398d7a45265f90a0f1d51f85f72c"}, + {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:baf211dcad448a87a0d9047dc8282d7de59473ade7d7fdf22150b1d23859f946"}, + {file = "yarl-1.8.2-cp38-cp38-win32.whl", hash = "sha256:272b4f1599f1b621bf2aabe4e5b54f39a933971f4e7c9aa311d6d7dc06965165"}, + {file = "yarl-1.8.2-cp38-cp38-win_amd64.whl", hash = "sha256:326dd1d3caf910cd26a26ccbfb84c03b608ba32499b5d6eeb09252c920bcbe4f"}, + {file = "yarl-1.8.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:f8ca8ad414c85bbc50f49c0a106f951613dfa5f948ab69c10ce9b128d368baf8"}, + {file = "yarl-1.8.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:418857f837347e8aaef682679f41e36c24250097f9e2f315d39bae3a99a34cbf"}, + {file = "yarl-1.8.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ae0eec05ab49e91a78700761777f284c2df119376e391db42c38ab46fd662b77"}, + {file = "yarl-1.8.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:009a028127e0a1755c38b03244c0bea9d5565630db9c4cf9572496e947137a87"}, + {file = "yarl-1.8.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3edac5d74bb3209c418805bda77f973117836e1de7c000e9755e572c1f7850d0"}, + {file = "yarl-1.8.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:da65c3f263729e47351261351b8679c6429151ef9649bba08ef2528ff2c423b2"}, + {file = "yarl-1.8.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0ef8fb25e52663a1c85d608f6dd72e19bd390e2ecaf29c17fb08f730226e3a08"}, + {file = "yarl-1.8.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bcd7bb1e5c45274af9a1dd7494d3c52b2be5e6bd8d7e49c612705fd45420b12d"}, + {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:44ceac0450e648de86da8e42674f9b7077d763ea80c8ceb9d1c3e41f0f0a9951"}, + {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:97209cc91189b48e7cfe777237c04af8e7cc51eb369004e061809bcdf4e55220"}, + {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:48dd18adcf98ea9cd721a25313aef49d70d413a999d7d89df44f469edfb38a06"}, + {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:e59399dda559688461762800d7fb34d9e8a6a7444fd76ec33220a926c8be1516"}, + {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:d617c241c8c3ad5c4e78a08429fa49e4b04bedfc507b34b4d8dceb83b4af3588"}, + {file = "yarl-1.8.2-cp39-cp39-win32.whl", hash = "sha256:cb6d48d80a41f68de41212f3dfd1a9d9898d7841c8f7ce6696cf2fd9cb57ef83"}, + {file = "yarl-1.8.2-cp39-cp39-win_amd64.whl", hash = "sha256:6604711362f2dbf7160df21c416f81fac0de6dbcf0b5445a2ef25478ecc4c778"}, + {file = "yarl-1.8.2.tar.gz", hash = "sha256:49d43402c6e3013ad0978602bf6bf5328535c48d192304b91b97a3c6790b1562"}, +] zipp = [ {file = 
"zipp-3.8.1-py3-none-any.whl", hash = "sha256:47c40d7fe183a6f21403a199b3e4192cca5774656965b0a4988ad2f8feb5f009"}, {file = "zipp-3.8.1.tar.gz", hash = "sha256:05b45f1ee8f807d0cc928485ca40a07cb491cf092ff587c0df9cb1fd154848d2"}, diff --git a/proxy/src/main.rs b/proxy/src/main.rs index 5d44774df9..1b61ab108f 100644 --- a/proxy/src/main.rs +++ b/proxy/src/main.rs @@ -30,7 +30,7 @@ use std::{borrow::Cow, future::Future, net::SocketAddr}; use tokio::{net::TcpListener, task::JoinError}; use tracing::{info, info_span, Instrument}; use utils::project_git_version; -use utils::sentry_init::{init_sentry, release_name}; +use utils::sentry_init::init_sentry; project_git_version!(GIT_VERSION); @@ -49,7 +49,7 @@ async fn main() -> anyhow::Result<()> { .init(); // initialize sentry if SENTRY_DSN is provided - let _sentry_guard = init_sentry(release_name!(), &[]); + let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); let arg_matches = cli().get_matches(); diff --git a/pyproject.toml b/pyproject.toml index b4fb7a9e7d..a817e9dda5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,6 +33,7 @@ psutil = "^5.9.4" types-psutil = "^5.9.5.4" types-toml = "^0.10.8" pytest-httpserver = "^1.0.6" +aiohttp = "3.7" [tool.poetry.dev-dependencies] flake8 = "^5.0.4" diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index b130ea86bd..1a068412c8 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -38,7 +38,7 @@ use utils::{ id::NodeId, logging::{self, LogFormat}, project_git_version, - sentry_init::{init_sentry, release_name}, + sentry_init::init_sentry, signals, tcp_listener, }; @@ -173,7 +173,10 @@ fn main() -> anyhow::Result<()> { }; // initialize sentry if SENTRY_DSN is provided - let _sentry_guard = init_sentry(release_name!(), &[("node_id", &conf.my_id.to_string())]); + let _sentry_guard = init_sentry( + Some(GIT_VERSION.into()), + &[("node_id", &conf.my_id.to_string())], + ); start_safekeeper(conf) } diff --git a/scripts/force_layer_download.py b/scripts/force_layer_download.py new file mode 100644 index 0000000000..5472d86d8f --- /dev/null +++ b/scripts/force_layer_download.py @@ -0,0 +1,324 @@ +import argparse +import asyncio +import json +import logging +import signal +import sys +from collections import defaultdict +from dataclasses import dataclass +from typing import Any, Awaitable, Dict, List, Tuple + +import aiohttp + + +class ClientException(Exception): + pass + + +class Client: + def __init__(self, pageserver_api_endpoint: str, max_concurrent_layer_downloads: int): + self.endpoint = pageserver_api_endpoint + self.max_concurrent_layer_downloads = max_concurrent_layer_downloads + self.sess = aiohttp.ClientSession() + + async def close(self): + await self.sess.close() + + async def __aenter__(self): + return self + + async def __aexit__(self, exc_t, exc_v, exc_tb): + await self.close() + + async def parse_response(self, resp, expected_type): + body = await resp.json() + if not resp.ok: + raise ClientException(f"Response: {resp} Body: {body}") + + if not isinstance(body, expected_type): + raise ClientException(f"expecting {expected_type.__name__}") + return body + + async def get_tenant_ids(self): + resp = await self.sess.get(f"{self.endpoint}/v1/tenant") + payload = await self.parse_response(resp=resp, expected_type=list) + return [t["id"] for t in payload] + + async def get_timeline_ids(self, tenant_id): + resp = await self.sess.get(f"{self.endpoint}/v1/tenant/{tenant_id}/timeline") + payload = await self.parse_response(resp=resp, 
expected_type=list) + return [t["timeline_id"] for t in payload] + + async def timeline_spawn_download_remote_layers(self, tenant_id, timeline_id, ongoing_ok=False): + resp = await self.sess.post( + f"{self.endpoint}/v1/tenant/{tenant_id}/timeline/{timeline_id}/download_remote_layers", + json={"max_concurrent_downloads": self.max_concurrent_layer_downloads}, + ) + body = await resp.json() + if resp.status == 409: + if not ongoing_ok: + raise ClientException("download already ongoing") + # response body has same shape for ongoing and newly created + elif not resp.ok: + raise ClientException(f"Response: {resp} Body: {body}") + + if not isinstance(body, dict): + raise ClientException("expecting dict") + + return body + + async def timeline_poll_download_remote_layers_status( + self, + tenant_id, + timeline_id, + ): + resp = await self.sess.get( + f"{self.endpoint}/v1/tenant/{tenant_id}/timeline/{timeline_id}/download_remote_layers", + ) + body = await resp.json() + + if resp.status == 404: + return None + elif not resp.ok: + raise ClientException(f"Response: {resp} Body: {body}") + + return body + + +@dataclass +class Completed: + """The status dict returned by the API""" + + status: Dict[str, Any] + + +sigint_received = asyncio.Event() + + +async def do_timeline(client: Client, tenant_id, timeline_id): + """ + Spawn download_remote_layers task for given timeline, + then poll until the download has reached a terminal state. + + If the terminal state is not 'Completed', the method raises an exception. + The caller is responsible for inspecting `failed_download_count`. + + If there is already a task going on when this method is invoked, + it raises an exception. + """ + + # Don't start new downloads if user pressed SIGINT. + # This task will show up as "raised_exception" in the report. + if sigint_received.is_set(): + raise Exception("not starting because SIGINT received") + + # run downloads to completion + + status = await client.timeline_poll_download_remote_layers_status(tenant_id, timeline_id) + if status is not None and status["state"] == "Running": + raise Exception("download is already running") + + spawned = await client.timeline_spawn_download_remote_layers( + tenant_id, timeline_id, ongoing_ok=False + ) + + while True: + st = await client.timeline_poll_download_remote_layers_status(tenant_id, timeline_id) + logging.info(f"{tenant_id}:{timeline_id} state is: {st}") + + if spawned["task_id"] != st["task_id"]: + raise ClientException("download task ids changed while polling") + + if st["state"] == "Running": + await asyncio.sleep(10) + continue + + if st["state"] != "Completed": + raise ClientException( + f"download task reached terminal state != Completed: {st['state']}" + ) + + return Completed(st) + + +def handle_sigint(): + logging.info("SIGINT received, asyncio event set. 
Will not start new downloads.") + global sigint_received + sigint_received.set() + + +async def main(args): + async with Client(args.pageserver_http_endpoint, args.max_concurrent_layer_downloads) as client: + exit_code = await main_impl(args, args.report_output, client) + + return exit_code + + +async def taskq_handler(task_q, result_q): + while True: + try: + (id, fut) = task_q.get_nowait() + except asyncio.QueueEmpty: + logging.debug("taskq_handler observed empty task_q, returning") + return + logging.info(f"starting task {id}") + try: + res = await fut + except Exception as e: + res = e + result_q.put_nowait((id, res)) + + +async def print_progress(result_q, tasks): + while True: + await asyncio.sleep(10) + logging.info(f"{result_q.qsize()} / {len(tasks)} tasks done") + + +async def main_impl(args, report_out, client: Client): + """ + Returns OS exit status. + """ + tenant_and_timline_ids: List[Tuple[str, str]] = [] + # fill tenant_and_timline_ids based on spec + for spec in args.what: + comps = spec.split(":") + if comps == ["ALL"]: + logging.info("get tenant list") + tenant_ids = await client.get_tenant_ids() + get_timeline_id_coros = [client.get_timeline_ids(tenant_id) for tenant_id in tenant_ids] + gathered = await asyncio.gather(*get_timeline_id_coros, return_exceptions=True) + assert len(tenant_ids) == len(gathered) + tenant_and_timline_ids = [] + for tid, tlids in zip(tenant_ids, gathered): + for tlid in tlids: + tenant_and_timline_ids.append((tid, tlid)) + elif len(comps) == 1: + tid = comps[0] + tlids = await client.get_timeline_ids(tid) + for tlid in tlids: + tenant_and_timline_ids.append((tid, tlid)) + elif len(comps) == 2: + tenant_and_timline_ids.append((comps[0], comps[1])) + else: + raise ValueError(f"invalid what-spec: {spec}") + + logging.info("expanded spec:") + for tid, tlid in tenant_and_timline_ids: + logging.info(f"{tid}:{tlid}") + + logging.info("remove duplicates after expanding spec") + tmp = list(set(tenant_and_timline_ids)) + assert len(tmp) <= len(tenant_and_timline_ids) + if len(tmp) != len(tenant_and_timline_ids): + logging.info(f"spec had {len(tenant_and_timline_ids) - len(tmp)} duplicates") + tenant_and_timline_ids = tmp + + logging.info("create tasks and process them at specified concurrency") + task_q: asyncio.Queue[Tuple[str, Awaitable[Any]]] = asyncio.Queue() + tasks = { + f"{tid}:{tlid}": do_timeline(client, tid, tlid) for tid, tlid in tenant_and_timline_ids + } + for task in tasks.items(): + task_q.put_nowait(task) + + result_q: asyncio.Queue[Tuple[str, Any]] = asyncio.Queue() + taskq_handlers = [] + for _ in range(0, args.concurrent_tasks): + taskq_handlers.append(taskq_handler(task_q, result_q)) + + print_progress_task = asyncio.create_task(print_progress(result_q, tasks)) + + await asyncio.gather(*taskq_handlers) + print_progress_task.cancel() + + logging.info("all tasks handled, generating report") + + results = [] + while True: + try: + results.append(result_q.get_nowait()) + except asyncio.QueueEmpty: + break + assert task_q.empty() + + report = defaultdict(list) + for id, result in results: + logging.info(f"result for {id}: {result}") + if isinstance(result, Completed): + if result.status["failed_download_count"] == 0: + report["completed_without_errors"].append(id) + else: + report["completed_with_download_errors"].append(id) + elif isinstance(result, Exception): + report["raised_exception"].append(id) + else: + raise ValueError("unexpected result type") + json.dump(report, report_out) + + 
logging.info("--------------------------------------------------------------------------------") + + report_success = len(report["completed_without_errors"]) == len(tenant_and_timline_ids) + if not report_success: + logging.error("One or more tasks encountered errors.") + else: + logging.info("All tasks reported success.") + logging.info("Inspect log for details and report file for JSON summary.") + + return report_success + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--report-output", + type=argparse.FileType("w"), + default="-", + help="where to write report output (default: stdout)", + ) + parser.add_argument( + "--pageserver-http-endpoint", + default="http://localhost:9898", + help="pageserver http endpoint, (default http://localhost:9898)", + ) + parser.add_argument( + "--concurrent-tasks", + required=False, + default=5, + type=int, + help="Max concurrent download tasks created & polled by this script", + ) + parser.add_argument( + "--max-concurrent-layer-downloads", + dest="max_concurrent_layer_downloads", + required=False, + default=8, + type=int, + help="Max concurrent download tasks spawned by pageserver. Each layer is a separate task.", + ) + + parser.add_argument( + "what", + nargs="+", + help="what to download: ALL|tenant_id|tenant_id:timeline_id", + ) + parser.add_argument( + "--verbose", + action="store_true", + help="enable verbose logging", + ) + args = parser.parse_args() + + level = logging.INFO + if args.verbose: + level = logging.DEBUG + logging.basicConfig( + format="%(asctime)s,%(msecs)03d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s", + datefmt="%Y-%m-%d:%H:%M:%S", + level=level, + ) + + loop = asyncio.get_event_loop() + + loop.add_signal_handler(signal.SIGINT, handle_sigint) + sys.exit(asyncio.run(main(args))) diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index 6d80e96bf1..e33369bbb1 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -45,7 +45,7 @@ use storage_broker::{ use utils::id::TenantTimelineId; use utils::logging::{self, LogFormat}; use utils::project_git_version; -use utils::sentry_init::{init_sentry, release_name}; +use utils::sentry_init::init_sentry; project_git_version!(GIT_VERSION); @@ -425,7 +425,7 @@ async fn http1_handler( #[tokio::main] async fn main() -> Result<(), Box> { // initialize sentry if SENTRY_DSN is provided - let _sentry_guard = init_sentry(release_name!(), &[]); + let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); let args = Args::parse(); diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 8b78e06c22..bdaaa95216 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -46,6 +46,12 @@ PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS: Tuple[str, ...] = ( "pageserver_remote_physical_size", ) +PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = ( + "pageserver_storage_operations_seconds_global_count", + "pageserver_storage_operations_seconds_global_sum", + "pageserver_storage_operations_seconds_global_bucket", +) + PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = ( "pageserver_current_logical_size", "pageserver_resident_physical_size", @@ -61,13 +67,13 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] 
= ( "pageserver_smgr_query_seconds_bucket", "pageserver_smgr_query_seconds_count", "pageserver_smgr_query_seconds_sum", - "pageserver_storage_operations_seconds_bucket", - "pageserver_storage_operations_seconds_count", - "pageserver_storage_operations_seconds_sum", + "pageserver_storage_operations_seconds_count_total", + "pageserver_storage_operations_seconds_sum_total", "pageserver_wait_lsn_seconds_bucket", "pageserver_wait_lsn_seconds_count", "pageserver_wait_lsn_seconds_sum", "pageserver_created_persistent_files_total", "pageserver_written_persistent_bytes_total", + "pageserver_tenant_states_count", *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS, ) diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index 29cdcb18ce..cbbf01a285 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -2,7 +2,15 @@ from contextlib import closing import psycopg2.extras from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.neon_fixtures import ( + LocalFsStorage, + NeonEnvBuilder, + RemoteStorageKind, + assert_tenant_status, + wait_for_upload, +) +from fixtures.types import Lsn +from fixtures.utils import wait_until def test_tenant_config(neon_env_builder: NeonEnvBuilder): @@ -57,7 +65,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" "compaction_period": 20, "compaction_threshold": 10, "gc_horizon": 67108864, - "gc_period": 100, + "gc_period": 60 * 60, "image_creation_threshold": 3, "pitr_interval": 604800, # 7 days }.items() @@ -158,3 +166,46 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" "pitr_interval": 60, }.items() ) + + +def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder): + neon_env_builder.enable_remote_storage( + remote_storage_kind=RemoteStorageKind.LOCAL_FS, + test_name="test_creating_tenant_conf_after_attach", + ) + + env = neon_env_builder.init_start() + assert isinstance(env.remote_storage, LocalFsStorage) + + # tenant is created with defaults, as in without config file + (tenant_id, timeline_id) = env.neon_cli.create_tenant() + config_path = env.repo_dir / "tenants" / str(tenant_id) / "config" + assert config_path.exists(), "config file is always initially created" + + http_client = env.pageserver.http_client() + + detail = http_client.timeline_detail(tenant_id, timeline_id) + last_record_lsn = Lsn(detail["last_record_lsn"]) + assert last_record_lsn.lsn_int != 0, "initdb must have executed" + + wait_for_upload(http_client, tenant_id, timeline_id, last_record_lsn) + + http_client.tenant_detach(tenant_id) + + assert not config_path.exists(), "detach did not remove config file" + + http_client.tenant_attach(tenant_id) + wait_until( + number_of_iterations=5, + interval=1, + func=lambda: assert_tenant_status(http_client, tenant_id, "Active"), + ) + + env.neon_cli.config_tenant(tenant_id, {"gc_horizon": "1000000"}) + contents_first = config_path.read_text() + env.neon_cli.config_tenant(tenant_id, {"gc_horizon": "0"}) + contents_later = config_path.read_text() + + # dont test applying the setting here, we have that another test case to show it + # we just care about being able to create the file + assert len(contents_first) > len(contents_later) diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index db5bb679f2..6c3454b79b 100644 --- a/test_runner/regress/test_tenant_detach.py +++ 
b/test_runner/regress/test_tenant_detach.py @@ -6,6 +6,7 @@ from threading import Thread import asyncpg import pytest from fixtures.log_helper import log +from fixtures.metrics import parse_metrics from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, @@ -59,11 +60,11 @@ def test_tenant_reattach( # create new tenant tenant_id, timeline_id = env.neon_cli.create_tenant() - pg = env.postgres.create_start("main", tenant_id=tenant_id) - with pg.cursor() as cur: - cur.execute("CREATE TABLE t(key int primary key, value text)") - cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") - current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + with env.postgres.create_start("main", tenant_id=tenant_id) as pg: + with pg.cursor() as cur: + cur.execute("CREATE TABLE t(key int primary key, value text)") + cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") + current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) # Wait for all the data to be processed by the pageserver and uploaded in remote storage wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn) @@ -78,15 +79,34 @@ def test_tenant_reattach( ".*failed to perform remote task UploadMetadata.*, will retry.*" ) + ps_metrics = parse_metrics(pageserver_http.get_metrics(), "pageserver") + tenant_metric_filter = { + "tenant_id": str(tenant_id), + "timeline_id": str(timeline_id), + } + pageserver_last_record_lsn_before_detach = int( + ps_metrics.query_one("pageserver_last_record_lsn", filter=tenant_metric_filter).value + ) + pageserver_http.tenant_detach(tenant_id) pageserver_http.tenant_attach(tenant_id) - with pg.cursor() as cur: - assert query_scalar(cur, "SELECT count(*) FROM t") == 100000 + time.sleep(1) # for metrics propagation - # Check that we had to retry the downloads - assert env.pageserver.log_contains(".*list prefixes.*failed, will retry.*") - assert env.pageserver.log_contains(".*download.*failed, will retry.*") + ps_metrics = parse_metrics(pageserver_http.get_metrics(), "pageserver") + pageserver_last_record_lsn = int( + ps_metrics.query_one("pageserver_last_record_lsn", filter=tenant_metric_filter).value + ) + + assert pageserver_last_record_lsn_before_detach == pageserver_last_record_lsn + + with env.postgres.create_start("main", tenant_id=tenant_id) as pg: + with pg.cursor() as cur: + assert query_scalar(cur, "SELECT count(*) FROM t") == 100000 + + # Check that we had to retry the downloads + assert env.pageserver.log_contains(".*list prefixes.*failed, will retry.*") + assert env.pageserver.log_contains(".*download.*failed, will retry.*") num_connections = 10 @@ -237,7 +257,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() - env.pageserver.allowed_errors.append(".*NotFound\\(Tenant .* not found") + env.pageserver.allowed_errors.append(".*NotFound: Tenant .* not found") # first check for non existing tenant tenant_id = TenantId.generate() @@ -272,8 +292,10 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): bogus_timeline_id = TimelineId.generate() pageserver_http.timeline_gc(tenant_id, bogus_timeline_id, 0) - # the error will be printed to the log too + # the error will be printed to the log too env.pageserver.allowed_errors.append(".*gc target timeline does not exist.*") + # Timelines get stopped during detach; ignore the gc calls that fail because the timeline is stopping +
env.pageserver.allowed_errors.append(".*InternalServerError\\(timeline is Stopping.*") # Detach while running manual GC. # It should wait for manual GC to finish because it runs in a task associated with the tenant. diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 9477ae3c25..e56bb1b469 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -1,5 +1,6 @@ import os import shutil +import time from contextlib import closing from datetime import datetime from pathlib import Path @@ -8,6 +9,7 @@ from typing import List import pytest from fixtures.log_helper import log from fixtures.metrics import ( + PAGESERVER_GLOBAL_METRICS, PAGESERVER_PER_TENANT_METRICS, PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS, parse_metrics, @@ -160,6 +162,14 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder): f"process_start_time_seconds (UTC): {datetime.fromtimestamp(metrics.query_one('process_start_time_seconds').value)}" ) + # Test (a subset of) pageserver global metrics + for metric in PAGESERVER_GLOBAL_METRICS: + ps_samples = ps_metrics.query_all(metric, {}) + assert len(ps_samples) > 0 + for sample in ps_samples: + labels = ",".join([f'{key}="{value}"' for key, value in sample.labels.items()]) + log.info(f"{sample.name}{{{labels}}} {sample.value}") + @pytest.mark.parametrize( "remote_storage_kind", @@ -259,7 +269,7 @@ def test_pageserver_with_empty_tenants( files_in_timelines_dir == 0 ), f"Tenant {tenant_with_empty_timelines_dir} should have an empty timelines/ directory" - # Trigger timeline reinitialization after pageserver restart + # Trigger timeline re-initialization after pageserver restart env.postgres.stop_all() env.pageserver.stop() @@ -278,7 +288,51 @@ def test_pageserver_with_empty_tenants( broken_tenant["state"] == "Broken" ), f"Tenant {tenant_without_timelines_dir} without timelines dir should be broken" + broken_tenant_status = client.tenant_status(tenant_without_timelines_dir) + assert ( + broken_tenant_status["state"] == "Broken" + ), f"Tenant {tenant_without_timelines_dir} without timelines dir should be broken" + + assert env.pageserver.log_contains(".*Setting tenant as Broken state, reason:.*") + [loaded_tenant] = [t for t in tenants if t["id"] == str(tenant_with_empty_timelines_dir)] assert ( loaded_tenant["state"] == "Active" ), "Tenant {tenant_with_empty_timelines_dir} with empty timelines dir should be active and ready for timeline creation" + + loaded_tenant_status = client.tenant_status(tenant_with_empty_timelines_dir) + assert ( + loaded_tenant_status["state"] == "Active" + ), f"Tenant {tenant_with_empty_timelines_dir} without timelines dir should be active" + + time.sleep(1) # to allow metrics propagation + + ps_metrics = parse_metrics(client.get_metrics(), "pageserver") + broken_tenants_metric_filter = { + "tenant_id": str(tenant_without_timelines_dir), + "state": "broken", + } + active_tenants_metric_filter = { + "tenant_id": str(tenant_with_empty_timelines_dir), + "state": "active", + } + + tenant_active_count = int( + ps_metrics.query_one( + "pageserver_tenant_states_count", filter=active_tenants_metric_filter + ).value + ) + + assert ( + tenant_active_count == 1 + ), f"Tenant {tenant_with_empty_timelines_dir} should have metric as active" + + tenant_broken_count = int( + ps_metrics.query_one( + "pageserver_tenant_states_count", filter=broken_tenants_metric_filter + ).value + ) + + assert ( + tenant_broken_count == 1 + ), f"Tenant {tenant_without_timelines_dir} should have 
metric as broken" diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index f4b71ae9b7..3a852b2207 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -20,7 +20,9 @@ clap = { version = "4", features = ["derive", "string"] } crossbeam-utils = { version = "0.8" } either = { version = "1" } fail = { version = "0.5", default-features = false, features = ["failpoints"] } +futures = { version = "0.3" } futures-channel = { version = "0.3", features = ["sink"] } +futures-executor = { version = "0.3" } futures-task = { version = "0.3", default-features = false, features = ["std"] } futures-util = { version = "0.3", features = ["channel", "io", "sink"] } indexmap = { version = "1", default-features = false, features = ["std"] } @@ -31,17 +33,21 @@ memchr = { version = "2" } nom = { version = "7" } num-bigint = { version = "0.4" } num-integer = { version = "0.1", features = ["i128"] } -num-traits = { version = "0.2", features = ["i128", "libm"] } +num-traits = { version = "0.2", features = ["i128"] } prost = { version = "0.11" } rand = { version = "0.8", features = ["small_rng"] } regex = { version = "1" } regex-syntax = { version = "0.6" } +reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] } +ring = { version = "0.16", features = ["std"] } +rustls = { version = "0.20", features = ["dangerous_configuration"] } scopeguard = { version = "1" } serde = { version = "1", features = ["alloc", "derive"] } serde_json = { version = "1", features = ["raw_value"] } socket2 = { version = "0.4", default-features = false, features = ["all"] } tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "sync", "time"] } tokio-util = { version = "0.7", features = ["codec", "io"] } +tonic = { version = "0.8", features = ["tls-roots"] } tower = { version = "0.4", features = ["balance", "buffer", "limit", "retry", "timeout", "util"] } tracing = { version = "0.1", features = ["log"] } tracing-core = { version = "0.1" }