diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml
index 990c7e25a9..29b04a3478 100644
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -123,8 +123,8 @@ runs:
exit 1
fi
if [[ "${{ inputs.run_in_parallel }}" == "true" ]]; then
- # -n4 uses four processes to run tests via pytest-xdist
- EXTRA_PARAMS="-n4 $EXTRA_PARAMS"
+ # -n16 uses sixteen processes to run tests via pytest-xdist
+ EXTRA_PARAMS="-n16 $EXTRA_PARAMS"
# --dist=loadgroup points tests marked with @pytest.mark.xdist_group
# to the same worker to make @pytest.mark.order work with xdist
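
A minimal sketch (hypothetical test module, assuming pytest-xdist and pytest-order are installed, as in the suite this action runs) of why --dist=loadgroup matters: tests marked with the same @pytest.mark.xdist_group name are routed to one worker, so @pytest.mark.order markers between them are still honored even with -n16.

# hypothetical module: test_xdist_group_sketch.py
import pytest

@pytest.mark.xdist_group(name="tenant_lifecycle")
@pytest.mark.order(1)
def test_create_tenant():
    # scheduled first on whichever worker owns the "tenant_lifecycle" group
    assert True

@pytest.mark.xdist_group(name="tenant_lifecycle")
@pytest.mark.order(2)
def test_delete_tenant():
    # --dist=loadgroup places this on the same worker as test_create_tenant,
    # so the order(2) marker is respected despite -n16 parallelism
    assert True

# run with: pytest -n16 --dist=loadgroup test_xdist_group_sketch.py
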
diff --git a/.github/ansible/deploy.yaml b/.github/ansible/deploy.yaml
index 4adc685684..a17dc9c78f 100644
--- a/.github/ansible/deploy.yaml
+++ b/.github/ansible/deploy.yaml
@@ -117,7 +117,8 @@
shell:
cmd: |
INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)
- curl -sfS -d '{"version": {{ current_version }} }' -X PATCH {{ console_mgmt_base_url }}/api/v1/pageservers/$INSTANCE_ID
+ curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/pageservers/$INSTANCE_ID | jq '.version = {{ current_version }}' > /tmp/new_version
+ curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" -X POST -d@/tmp/new_version {{ console_mgmt_base_url }}/management/api/v2/pageservers
tags:
- pageserver
@@ -186,6 +187,7 @@
shell:
cmd: |
INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)
- curl -sfS -d '{"version": {{ current_version }} }' -X PATCH {{ console_mgmt_base_url }}/api/v1/safekeepers/$INSTANCE_ID
+ curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/safekeepers/$INSTANCE_ID | jq '.version = {{ current_version }}' > /tmp/new_version
+ curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" -X POST -d@/tmp/new_version {{ console_mgmt_base_url }}/management/api/v2/safekeepers
tags:
- safekeeper
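
The two-step flow above (GET the node record, set .version with jq, POST the full object back) replaces the old single-field PATCH. A rough Python sketch of the same flow, assuming the requests library; the base URL, token, kind ('pageservers' or 'safekeepers'), and version mirror the playbook variables and are passed in as parameters:

# hypothetical helper mirroring the curl | jq | curl pipeline above
import requests

def register_node_version(base_url: str, kind: str, instance_id: str,
                          token: str, version: int) -> None:
    """kind is 'pageservers' or 'safekeepers', matching the v2 API paths."""
    headers = {"Authorization": f"Bearer {token}"}
    # GET the current record for this instance
    resp = requests.get(f"{base_url}/management/api/v2/{kind}/{instance_id}",
                        headers=headers, timeout=30)
    resp.raise_for_status()
    record = resp.json()
    # equivalent of jq '.version = {{ current_version }}'
    record["version"] = version
    # POST the whole updated record back, as the playbook does with -d@/tmp/new_version
    requests.post(f"{base_url}/management/api/v2/{kind}",
                  json=record, headers=headers, timeout=30).raise_for_status()
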
diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml
index cb062f705d..157ae66ed1 100644
--- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml
+++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml
@@ -8,6 +8,7 @@ settings:
authBackend: "link"
authEndpoint: "https://console.stage.neon.tech/authenticate_proxy_request/"
uri: "https://console.stage.neon.tech/psql_session/"
+ domain: "pg.neon.build"
sentryEnvironment: "staging"
metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events"
metricCollectionInterval: "1min"
diff --git a/.github/helm-values/production.proxy.yaml b/.github/helm-values/prod-us-east-2-delta.neon-proxy-link.yaml
similarity index 80%
rename from .github/helm-values/production.proxy.yaml
rename to .github/helm-values/prod-us-east-2-delta.neon-proxy-link.yaml
index dbaf3cd096..eff24302bb 100644
--- a/.github/helm-values/production.proxy.yaml
+++ b/.github/helm-values/prod-us-east-2-delta.neon-proxy-link.yaml
@@ -1,37 +1,37 @@
+# Helm chart values for neon-proxy-link.
+# This is a YAML-formatted file.
+
+image:
+ repository: neondatabase/neon
+
settings:
authBackend: "link"
authEndpoint: "https://console.neon.tech/authenticate_proxy_request/"
uri: "https://console.neon.tech/psql_session/"
+ domain: "pg.neon.tech"
sentryEnvironment: "production"
# -- Additional labels for zenith-proxy pods
podLabels:
zenith_service: proxy
zenith_env: production
- zenith_region: us-west-2
- zenith_region_slug: oregon
+ zenith_region: us-east-2
+ zenith_region_slug: us-east-2
service:
+ type: LoadBalancer
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internal
- external-dns.alpha.kubernetes.io/hostname: proxy-release.local
- type: LoadBalancer
+ external-dns.alpha.kubernetes.io/hostname: neon-proxy-link-mgmt.delta.us-east-2.aws.neon.tech
exposedService:
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
- external-dns.alpha.kubernetes.io/hostname: connect.neon.tech,pg.neon.tech
-
-metrics:
- enabled: true
- serviceMonitor:
- enabled: true
- selector:
- release: kube-prometheus-stack
+ external-dns.alpha.kubernetes.io/hostname: neon-proxy-link.delta.us-east-2.aws.neon.tech
extraManifests:
- apiVersion: operator.victoriametrics.com/v1beta1
diff --git a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml
new file mode 100644
index 0000000000..3a5cde4b01
--- /dev/null
+++ b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml
@@ -0,0 +1,61 @@
+# Helm chart values for neon-proxy-scram.
+# This is a YAML-formatted file.
+
+image:
+ repository: neondatabase/neon
+
+settings:
+ authBackend: "console"
+ authEndpoint: "http://console-release.local/management/api/v2"
+ domain: "*.cloud.neon.tech"
+ sentryEnvironment: "production"
+ wssPort: 8443
+ metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events"
+ metricCollectionInterval: "10min"
+
+# -- Additional labels for neon-proxy pods
+podLabels:
+ zenith_service: proxy-scram
+ zenith_env: prod
+ zenith_region: us-west-2
+ zenith_region_slug: us-west-2
+
+exposedService:
+ annotations:
+ service.beta.kubernetes.io/aws-load-balancer-type: external
+ service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+ service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
+ external-dns.alpha.kubernetes.io/hostname: neon-proxy-scram-legacy.eta.us-west-2.aws.neon.tech
+ httpsPort: 443
+
+#metrics:
+# enabled: true
+# serviceMonitor:
+# enabled: true
+# selector:
+# release: kube-prometheus-stack
+
+extraManifests:
+ - apiVersion: operator.victoriametrics.com/v1beta1
+ kind: VMServiceScrape
+ metadata:
+ name: "{{ include \"neon-proxy.fullname\" . }}"
+ labels:
+ helm.sh/chart: neon-proxy-{{ .Chart.Version }}
+ app.kubernetes.io/name: neon-proxy
+ app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
+ app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
+ app.kubernetes.io/managed-by: Helm
+ namespace: "{{ .Release.Namespace }}"
+ spec:
+ selector:
+ matchLabels:
+ app.kubernetes.io/name: "neon-proxy"
+ endpoints:
+ - port: http
+ path: /metrics
+ interval: 10s
+ scrapeTimeout: 10s
+ namespaceSelector:
+ matchNames:
+ - "{{ .Release.Namespace }}"
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 918e568e27..89e12360f9 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -1,4 +1,4 @@
-name: Test and Deploy
+name: Build and Test
on:
push:
@@ -19,10 +19,12 @@ concurrency:
env:
RUST_BACKTRACE: 1
COPT: '-Werror'
+ AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
+ AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
jobs:
tag:
- runs-on: [ self-hosted, dev, x64 ]
+ runs-on: [ self-hosted, gen3, small ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
outputs:
build-tag: ${{steps.build-tag.outputs.tag}}
@@ -50,7 +52,7 @@ jobs:
id: build-tag
check-codestyle-python:
- runs-on: [ self-hosted, dev, x64 ]
+ runs-on: [ self-hosted, gen3, small ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cloud:pinned
options: --init
@@ -85,7 +87,7 @@ jobs:
run: poetry run mypy .
check-codestyle-rust:
- runs-on: [ self-hosted, dev, x64 ]
+ runs-on: [ self-hosted, gen3, large ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
@@ -97,16 +99,16 @@ jobs:
submodules: true
fetch-depth: 1
- - name: Restore cargo deps cache
- id: cache_cargo
- uses: actions/cache@v3
- with:
- path: |
- ~/.cargo/registry/
- !~/.cargo/registry/src
- ~/.cargo/git/
- target/
- key: v1-${{ runner.os }}-cargo-clippy-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }}
+# Disabled for now
+# - name: Restore cargo deps cache
+# id: cache_cargo
+# uses: actions/cache@v3
+# with:
+# path: |
+# !~/.cargo/registry/src
+# ~/.cargo/git/
+# target/
+# key: v1-${{ runner.os }}-cargo-clippy-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }}
# Some of our rust modules use FFI and need those to be checked
- name: Get postgres headers
@@ -133,7 +135,7 @@ jobs:
run: cargo deny check
build-neon:
- runs-on: [ self-hosted, dev, x64 ]
+ runs-on: [ self-hosted, gen3, large ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
@@ -141,7 +143,6 @@ jobs:
fail-fast: false
matrix:
build_type: [ debug, release ]
-
env:
BUILD_TYPE: ${{ matrix.build_type }}
GIT_VERSION: ${{ github.sha }}
@@ -194,24 +195,26 @@ jobs:
echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV
echo "CARGO_FEATURES=${CARGO_FEATURES}" >> $GITHUB_ENV
echo "CARGO_FLAGS=${CARGO_FLAGS}" >> $GITHUB_ENV
+ echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo" >> $GITHUB_ENV
+ # Disabled for now
# Don't include the ~/.cargo/registry/src directory. It contains just
# uncompressed versions of the crates in ~/.cargo/registry/cache
# directory, and it's faster to let 'cargo' to rebuild it from the
# compressed crates.
- - name: Cache cargo deps
- id: cache_cargo
- uses: actions/cache@v3
- with:
- path: |
- ~/.cargo/registry/
- !~/.cargo/registry/src
- ~/.cargo/git/
- target/
- # Fall back to older versions of the key, if no cache for current Cargo.lock was found
- key: |
- v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }}
- v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-
+# - name: Cache cargo deps
+# id: cache_cargo
+# uses: actions/cache@v3
+# with:
+# path: |
+# ~/.cargo/registry/
+# !~/.cargo/registry/src
+# ~/.cargo/git/
+# target/
+# # Fall back to older versions of the key, if no cache for current Cargo.lock was found
+# key: |
+# v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }}
+# v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-
- name: Cache postgres v14 build
id: cache_pg_14
@@ -301,7 +304,7 @@ jobs:
uses: ./.github/actions/save-coverage-data
regress-tests:
- runs-on: [ self-hosted, dev, x64 ]
+ runs-on: [ self-hosted, gen3, large ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
@@ -334,7 +337,7 @@ jobs:
uses: ./.github/actions/save-coverage-data
benchmarks:
- runs-on: [ self-hosted, dev, x64 ]
+ runs-on: [ self-hosted, gen3, small ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
@@ -365,7 +368,7 @@ jobs:
# while coverage is currently collected for the debug ones
merge-allure-report:
- runs-on: [ self-hosted, dev, x64 ]
+ runs-on: [ self-hosted, gen3, small ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
@@ -402,7 +405,7 @@ jobs:
DATABASE_URL="$TEST_RESULT_CONNSTR" poetry run python3 scripts/ingest_regress_test_result.py --revision ${SHA} --reference ${GITHUB_REF} --build-type ${BUILD_TYPE} --ingest suites.json
coverage-report:
- runs-on: [ self-hosted, dev, x64 ]
+ runs-on: [ self-hosted, gen3, small ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
@@ -418,16 +421,17 @@ jobs:
submodules: true
fetch-depth: 1
- - name: Restore cargo deps cache
- id: cache_cargo
- uses: actions/cache@v3
- with:
- path: |
- ~/.cargo/registry/
- !~/.cargo/registry/src
- ~/.cargo/git/
- target/
- key: v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }}
+# Disabled for now
+# - name: Restore cargo deps cache
+# id: cache_cargo
+# uses: actions/cache@v3
+# with:
+# path: |
+# ~/.cargo/registry/
+# !~/.cargo/registry/src
+# ~/.cargo/git/
+# target/
+# key: v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }}
- name: Get Neon artifact
uses: ./.github/actions/download
@@ -477,7 +481,7 @@ jobs:
}"
trigger-e2e-tests:
- runs-on: [ self-hosted, dev, x64 ]
+ runs-on: [ self-hosted, gen3, small ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
options: --init
@@ -522,9 +526,10 @@ jobs:
}"
neon-image:
- runs-on: [ self-hosted, dev, x64 ]
+ runs-on: [ self-hosted, gen3, large ]
needs: [ tag ]
- container: gcr.io/kaniko-project/executor:v1.9.0-debug
+ # https://github.com/GoogleContainerTools/kaniko/issues/2005
+ container: gcr.io/kaniko-project/executor:v1.7.0-debug
defaults:
run:
shell: sh -eu {0}
@@ -540,12 +545,16 @@ jobs:
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
- name: Kaniko build neon
- run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
+ run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
+
+ # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
+ - name: Cleanup ECR folder
+ run: rm -rf ~/.ecr
compute-tools-image:
- runs-on: [ self-hosted, dev, x64 ]
+ runs-on: [ self-hosted, gen3, large ]
needs: [ tag ]
- container: gcr.io/kaniko-project/executor:v1.9.0-debug
+ container: gcr.io/kaniko-project/executor:v1.7.0-debug
defaults:
run:
shell: sh -eu {0}
@@ -558,11 +567,14 @@ jobs:
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
- name: Kaniko build compute tools
- run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
+ run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
+
+ - name: Cleanup ECR folder
+ run: rm -rf ~/.ecr
compute-node-image:
- runs-on: [ self-hosted, dev, x64 ]
- container: gcr.io/kaniko-project/executor:v1.9.0-debug
+ runs-on: [ self-hosted, gen3, large ]
+ container: gcr.io/kaniko-project/executor:v1.7.0-debug
needs: [ tag ]
strategy:
fail-fast: false
@@ -583,10 +595,13 @@ jobs:
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
- name: Kaniko build compute node with extensions
- run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-${{ matrix.version }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
+ run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --build-arg PG_VERSION=${{ matrix.version }} --dockerfile Dockerfile.compute-node --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
+
+ - name: Cleanup ECR folder
+ run: rm -rf ~/.ecr
vm-compute-node-image:
- runs-on: [ self-hosted, dev, x64 ]
+ runs-on: [ self-hosted, gen3, large ]
needs: [ tag, compute-node-image ]
strategy:
fail-fast: false
@@ -631,7 +646,7 @@ jobs:
test-images:
needs: [ tag, neon-image, compute-node-image, compute-tools-image ]
- runs-on: [ self-hosted, dev, x64 ]
+ runs-on: [ self-hosted, gen3, small ]
steps:
- name: Checkout
@@ -673,20 +688,39 @@ jobs:
docker compose -f ./docker-compose/docker-compose.yml down
promote-images:
- runs-on: [ self-hosted, dev, x64 ]
+ runs-on: [ self-hosted, gen3, small ]
needs: [ tag, test-images, vm-compute-node-image ]
+ container: golang:1.19-bullseye
if: github.event_name != 'workflow_dispatch'
- container: amazon/aws-cli
- strategy:
- fail-fast: false
- matrix:
- name: [ neon, compute-node-v14, vm-compute-node-v14, compute-node-v15, vm-compute-node-v15, compute-tools]
steps:
- - name: Promote image to latest
+ - name: Install Crane & ECR helper
+ if: |
+ (github.ref_name == 'main' || github.ref_name == 'release') &&
+ github.event_name != 'workflow_dispatch'
run: |
- export MANIFEST=$(aws ecr batch-get-image --repository-name ${{ matrix.name }} --image-ids imageTag=${{needs.tag.outputs.build-tag}} --query 'images[].imageManifest' --output text)
- aws ecr put-image --repository-name ${{ matrix.name }} --image-tag latest --image-manifest "$MANIFEST"
+ go install github.com/google/go-containerregistry/cmd/crane@31786c6cbb82d6ec4fb8eb79cd9387905130534e # v0.11.0
+ go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0
+
+ - name: Configure ECR login
+ run: |
+ mkdir /github/home/.docker/
+ echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
+
+ - name: Add latest tag to images
+ if: |
+ (github.ref_name == 'main' || github.ref_name == 'release') &&
+ github.event_name != 'workflow_dispatch'
+ run: |
+ crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} latest
+ crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest
+ crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
+ crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
+ crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
+ crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
+
+ - name: Cleanup ECR folder
+ run: rm -rf ~/.ecr
push-docker-hub:
runs-on: [ self-hosted, dev, x64 ]
@@ -776,114 +810,11 @@ jobs:
crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
- calculate-deploy-targets:
- runs-on: [ self-hosted, dev, x64 ]
- if: |
- github.ref_name == 'release' &&
- github.event_name != 'workflow_dispatch'
- outputs:
- matrix-include: ${{ steps.set-matrix.outputs.include }}
- steps:
- - id: set-matrix
- run: |
- if [[ "$GITHUB_REF_NAME" == "release" ]]; then
- PRODUCTION='{"env_name": "production", "proxy_job": "neon-proxy", "proxy_config": "production.proxy", "storage_broker_ns": "neon-storage-broker", "storage_broker_config": "production.neon-storage-broker", "kubeconfig_secret": "PRODUCTION_KUBECONFIG_DATA", "console_api_key_secret": "NEON_PRODUCTION_API_KEY"}'
- echo "include=[$PRODUCTION]" >> $GITHUB_OUTPUT
- else
- echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to 'release'"
- exit 1
- fi
-
- deploy:
- runs-on: [ self-hosted, dev, x64 ]
- container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
- # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
- # If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
- needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
- if: |
- github.ref_name == 'release' &&
- github.event_name != 'workflow_dispatch'
- defaults:
- run:
- shell: bash
- strategy:
- matrix:
- include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}}
- environment:
- name: prod-old
- steps:
- - name: Checkout
- uses: actions/checkout@v3
- with:
- submodules: true
- fetch-depth: 0
-
- - name: Redeploy
- run: |
- export DOCKER_TAG=${{needs.tag.outputs.build-tag}}
- cd "$(pwd)/.github/ansible"
-
- if [[ "$GITHUB_REF_NAME" == "main" ]]; then
- ./get_binaries.sh
- elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
- RELEASE=true ./get_binaries.sh
- else
- echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
- exit 1
- fi
-
- eval $(ssh-agent)
- echo "${{ secrets.TELEPORT_SSH_KEY }}" | tr -d '\n'| base64 --decode >ssh-key
- echo "${{ secrets.TELEPORT_SSH_CERT }}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub
- chmod 0600 ssh-key
- ssh-add ssh-key
- rm -f ssh-key ssh-key-cert.pub
- ANSIBLE_CONFIG=./ansible.cfg ansible-galaxy collection install sivel.toiletwater
- ANSIBLE_CONFIG=./ansible.cfg ansible-playbook deploy.yaml -i ${{ matrix.env_name }}.hosts.yaml -e CONSOLE_API_TOKEN=${{ secrets[matrix.console_api_key_secret] }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
- rm -f neon_install.tar.gz .neon_current_version
-
- deploy-new:
- runs-on: [ self-hosted, dev, x64 ]
- container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
- # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
- # If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
- needs: [ push-docker-hub, tag, regress-tests ]
- if: |
- (github.ref_name == 'main') &&
- github.event_name != 'workflow_dispatch'
- defaults:
- run:
- shell: bash
- strategy:
- matrix:
- target_region: [ eu-west-1, us-east-2 ]
- environment:
- name: dev-${{ matrix.target_region }}
- steps:
- - name: Checkout
- uses: actions/checkout@v3
- with:
- submodules: true
- fetch-depth: 0
-
- - name: Redeploy
- run: |
- export DOCKER_TAG=${{needs.tag.outputs.build-tag}}
- cd "$(pwd)/.github/ansible"
- if [[ "$GITHUB_REF_NAME" == "main" ]]; then
- ./get_binaries.sh
- elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
- RELEASE=true ./get_binaries.sh
- else
- echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
- exit 1
- fi
- ansible-galaxy collection install sivel.toiletwater
- ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_STAGING_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
- rm -f neon_install.tar.gz .neon_current_version
+ - name: Cleanup ECR folder
+ run: rm -rf ~/.ecr
deploy-pr-test-new:
- runs-on: [ self-hosted, dev, x64 ]
+ runs-on: [ self-hosted, gen3, small ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
# We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
# If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
@@ -915,311 +846,40 @@ jobs:
ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_STAGING_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
rm -f neon_install.tar.gz .neon_current_version
- deploy-prod-new:
- runs-on: prod
- container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
- # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
- # If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
+ - name: Cleanup ansible folder
+ run: rm -rf ~/.ansible
+
+ deploy:
+ runs-on: [ self-hosted, gen3, small ]
+ container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
needs: [ push-docker-hub, tag, regress-tests ]
- if: |
- (github.ref_name == 'release') &&
- github.event_name != 'workflow_dispatch'
- defaults:
- run:
- shell: bash
- strategy:
- matrix:
- target_region: [ us-east-2, us-west-2, eu-central-1, ap-southeast-1 ]
- environment:
- name: prod-${{ matrix.target_region }}
+ if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch'
steps:
- name: Checkout
uses: actions/checkout@v3
with:
- submodules: true
+ submodules: false
fetch-depth: 0
- - name: Redeploy
+ - name: Trigger deploy workflow
+ env:
+ GH_TOKEN: ${{ github.token }}
run: |
- export DOCKER_TAG=${{needs.tag.outputs.build-tag}}
- cd "$(pwd)/.github/ansible"
-
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
- ./get_binaries.sh
+ gh workflow run deploy-dev.yml --ref main -f branch=${{ github.sha }} -f dockerTag=${{needs.tag.outputs.build-tag}}
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
- RELEASE=true ./get_binaries.sh
+ gh workflow run deploy-prod.yml --ref release -f branch=${{ github.sha }} -f dockerTag=${{needs.tag.outputs.build-tag}}
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
exit 1
fi
- ansible-galaxy collection install sivel.toiletwater
- ansible-playbook deploy.yaml -i prod.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_PRODUCTION_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
- rm -f neon_install.tar.gz .neon_current_version
-
- deploy-proxy:
- runs-on: [ self-hosted, dev, x64 ]
- container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
- # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
- needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
- if: |
- github.ref_name == 'release' &&
- github.event_name != 'workflow_dispatch'
- defaults:
- run:
- shell: bash
- strategy:
- matrix:
- include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}}
- environment:
- name: prod-old
- env:
- KUBECONFIG: .kubeconfig
- steps:
- - name: Checkout
- uses: actions/checkout@v3
- with:
- submodules: true
- fetch-depth: 0
-
- - name: Add curl
- run: apt update && apt install curl -y
-
- - name: Store kubeconfig file
- run: |
- echo "${{ secrets[matrix.kubeconfig_secret] }}" | base64 --decode > ${KUBECONFIG}
- chmod 0600 ${KUBECONFIG}
-
- - name: Setup helm v3
- run: |
- curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
- helm repo add neondatabase https://neondatabase.github.io/helm-charts
-
- - name: Re-deploy proxy
- run: |
- DOCKER_TAG=${{needs.tag.outputs.build-tag}}
- helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace neon-proxy --install --atomic -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
- helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install --atomic -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
-
- deploy-storage-broker:
- name: deploy storage broker on old staging and old prod
- runs-on: [ self-hosted, dev, x64 ]
- container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
- # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
- needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
- if: |
- github.ref_name == 'release' &&
- github.event_name != 'workflow_dispatch'
- defaults:
- run:
- shell: bash
- strategy:
- matrix:
- include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}}
- environment:
- name: prod-old
- env:
- KUBECONFIG: .kubeconfig
- steps:
- - name: Checkout
- uses: actions/checkout@v3
- with:
- submodules: true
- fetch-depth: 0
-
- - name: Add curl
- run: apt update && apt install curl -y
-
- - name: Store kubeconfig file
- run: |
- echo "${{ secrets[matrix.kubeconfig_secret] }}" | base64 --decode > ${KUBECONFIG}
- chmod 0600 ${KUBECONFIG}
-
- - name: Setup helm v3
- run: |
- curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
- helm repo add neondatabase https://neondatabase.github.io/helm-charts
-
- - name: Deploy storage-broker
- run:
- helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace ${{ matrix.storage_broker_ns }} --create-namespace --install --atomic -f .github/helm-values/${{ matrix.storage_broker_config }}.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s
-
- deploy-proxy-new:
- runs-on: [ self-hosted, dev, x64 ]
- container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
- # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
- needs: [ push-docker-hub, tag, regress-tests ]
- if: |
- (github.ref_name == 'main') &&
- github.event_name != 'workflow_dispatch'
- defaults:
- run:
- shell: bash
- strategy:
- matrix:
- include:
- - target_region: us-east-2
- target_cluster: dev-us-east-2-beta
- deploy_link_proxy: true
- deploy_legacy_scram_proxy: true
- - target_region: eu-west-1
- target_cluster: dev-eu-west-1-zeta
- deploy_link_proxy: false
- deploy_legacy_scram_proxy: false
- environment:
- name: dev-${{ matrix.target_region }}
- steps:
- - name: Checkout
- uses: actions/checkout@v3
- with:
- submodules: true
- fetch-depth: 0
-
- - name: Configure environment
- run: |
- helm repo add neondatabase https://neondatabase.github.io/helm-charts
- aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }}
-
- - name: Re-deploy scram proxy
- run: |
- DOCKER_TAG=${{needs.tag.outputs.build-tag}}
- helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
-
- - name: Re-deploy link proxy
- if: matrix.deploy_link_proxy
- run: |
- DOCKER_TAG=${{needs.tag.outputs.build-tag}}
- helm upgrade neon-proxy-link neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-link.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
-
- - name: Re-deploy legacy scram proxy
- if: matrix.deploy_legacy_scram_proxy
- run: |
- DOCKER_TAG=${{needs.tag.outputs.build-tag}}
- helm upgrade neon-proxy-scram-legacy neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram-legacy.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
-
- deploy-storage-broker-dev-new:
- runs-on: [ self-hosted, dev, x64 ]
- container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
- # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
- needs: [ push-docker-hub, tag, regress-tests ]
- if: |
- (github.ref_name == 'main') &&
- github.event_name != 'workflow_dispatch'
- defaults:
- run:
- shell: bash
- strategy:
- matrix:
- include:
- - target_region: us-east-2
- target_cluster: dev-us-east-2-beta
- - target_region: eu-west-1
- target_cluster: dev-eu-west-1-zeta
- environment:
- name: dev-${{ matrix.target_region }}
- steps:
- - name: Checkout
- uses: actions/checkout@v3
- with:
- submodules: true
- fetch-depth: 0
-
- - name: Configure environment
- run: |
- helm repo add neondatabase https://neondatabase.github.io/helm-charts
- aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }}
-
- - name: Deploy storage-broker
- run:
- helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s
-
- deploy-proxy-prod-new:
- runs-on: prod
- container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
- # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
- needs: [ push-docker-hub, tag, regress-tests ]
- if: |
- (github.ref_name == 'release') &&
- github.event_name != 'workflow_dispatch'
- defaults:
- run:
- shell: bash
- strategy:
- matrix:
- include:
- - target_region: us-east-2
- target_cluster: prod-us-east-2-delta
- - target_region: us-west-2
- target_cluster: prod-us-west-2-eta
- - target_region: eu-central-1
- target_cluster: prod-eu-central-1-gamma
- - target_region: ap-southeast-1
- target_cluster: prod-ap-southeast-1-epsilon
- environment:
- name: prod-${{ matrix.target_region }}
- steps:
- - name: Checkout
- uses: actions/checkout@v3
- with:
- submodules: true
- fetch-depth: 0
-
- - name: Configure environment
- run: |
- helm repo add neondatabase https://neondatabase.github.io/helm-charts
- aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }}
-
- - name: Re-deploy proxy
- run: |
- DOCKER_TAG=${{needs.tag.outputs.build-tag}}
- helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
-
- deploy-storage-broker-prod-new:
- runs-on: prod
- container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
- # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
- needs: [ push-docker-hub, tag, regress-tests ]
- if: |
- (github.ref_name == 'release') &&
- github.event_name != 'workflow_dispatch'
- defaults:
- run:
- shell: bash
- strategy:
- matrix:
- include:
- - target_region: us-east-2
- target_cluster: prod-us-east-2-delta
- - target_region: us-west-2
- target_cluster: prod-us-west-2-eta
- - target_region: eu-central-1
- target_cluster: prod-eu-central-1-gamma
- - target_region: ap-southeast-1
- target_cluster: prod-ap-southeast-1-epsilon
- environment:
- name: prod-${{ matrix.target_region }}
- steps:
- - name: Checkout
- uses: actions/checkout@v3
- with:
- submodules: true
- fetch-depth: 0
-
- - name: Configure environment
- run: |
- helm repo add neondatabase https://neondatabase.github.io/helm-charts
- aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }}
-
- - name: Deploy storage-broker
- run:
- helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s
-
promote-compatibility-data:
- runs-on: [ self-hosted, dev, x64 ]
+ runs-on: [ self-hosted, gen3, small ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
- needs: [ deploy, deploy-proxy ]
+ needs: [ push-docker-hub, tag, regress-tests ]
if: github.ref_name == 'release' && github.event_name != 'workflow_dispatch'
steps:
- name: Promote compatibility snapshot for the release
diff --git a/.github/workflows/deploy-dev.yml b/.github/workflows/deploy-dev.yml
new file mode 100644
index 0000000000..409517bf63
--- /dev/null
+++ b/.github/workflows/deploy-dev.yml
@@ -0,0 +1,179 @@
+name: Neon Deploy dev
+
+on:
+ workflow_dispatch:
+ inputs:
+ dockerTag:
+ description: 'Docker tag to deploy'
+ required: true
+ type: string
+ branch:
+ description: 'Branch or commit used for deploy scripts and configs'
+ required: true
+ type: string
+ default: 'main'
+ deployStorage:
+ description: 'Deploy storage'
+ required: true
+ type: boolean
+ default: true
+ deployProxy:
+ description: 'Deploy proxy'
+ required: true
+ type: boolean
+ default: true
+ deployStorageBroker:
+ description: 'Deploy storage-broker'
+ required: true
+ type: boolean
+ default: true
+
+env:
+ AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
+ AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
+
+concurrency:
+ group: deploy-dev
+ cancel-in-progress: false
+
+jobs:
+ deploy-storage-new:
+ runs-on: [ self-hosted, gen3, small ]
+ container:
+ image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
+ options: --user root --privileged
+ if: inputs.deployStorage
+ defaults:
+ run:
+ shell: bash
+ strategy:
+ matrix:
+ target_region: [ eu-west-1, us-east-2 ]
+ environment:
+ name: dev-${{ matrix.target_region }}
+ steps:
+ - name: Checkout
+ uses: actions/checkout@v3
+ with:
+ submodules: true
+ fetch-depth: 0
+ ref: ${{ inputs.branch }}
+
+ - name: Redeploy
+ run: |
+ export DOCKER_TAG=${{ inputs.dockerTag }}
+ cd "$(pwd)/.github/ansible"
+
+ ./get_binaries.sh
+
+ ansible-galaxy collection install sivel.toiletwater
+ ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_STAGING_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
+ rm -f neon_install.tar.gz .neon_current_version
+
+ - name: Cleanup ansible folder
+ run: rm -rf ~/.ansible
+
+ deploy-proxy-new:
+ runs-on: [ self-hosted, gen3, small ]
+ container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
+ if: inputs.deployProxy
+ defaults:
+ run:
+ shell: bash
+ strategy:
+ matrix:
+ include:
+ - target_region: us-east-2
+ target_cluster: dev-us-east-2-beta
+ deploy_link_proxy: true
+ deploy_legacy_scram_proxy: true
+ - target_region: eu-west-1
+ target_cluster: dev-eu-west-1-zeta
+ deploy_link_proxy: false
+ deploy_legacy_scram_proxy: false
+ environment:
+ name: dev-${{ matrix.target_region }}
+ steps:
+ - name: Checkout
+ uses: actions/checkout@v3
+ with:
+ submodules: true
+ fetch-depth: 0
+ ref: ${{ inputs.branch }}
+
+ - name: Configure AWS Credentials
+ uses: aws-actions/configure-aws-credentials@v1-node16
+ with:
+ role-to-assume: arn:aws:iam::369495373322:role/github-runner
+ aws-region: eu-central-1
+ role-skip-session-tagging: true
+ role-duration-seconds: 1800
+
+ - name: Configure environment
+ run: |
+ helm repo add neondatabase https://neondatabase.github.io/helm-charts
+ aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }}
+
+ - name: Re-deploy scram proxy
+ run: |
+ DOCKER_TAG=${{ inputs.dockerTag }}
+ helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
+
+ - name: Re-deploy link proxy
+ if: matrix.deploy_link_proxy
+ run: |
+ DOCKER_TAG=${{ inputs.dockerTag }}
+ helm upgrade neon-proxy-link neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-link.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
+
+ - name: Re-deploy legacy scram proxy
+ if: matrix.deploy_legacy_scram_proxy
+ run: |
+ DOCKER_TAG=${{ inputs.dockerTag }}
+ helm upgrade neon-proxy-scram-legacy neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram-legacy.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
+
+ - name: Cleanup helm folder
+ run: rm -rf ~/.cache
+
+ deploy-storage-broker-new:
+ runs-on: [ self-hosted, gen3, small ]
+ container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
+ if: inputs.deployStorageBroker
+ defaults:
+ run:
+ shell: bash
+ strategy:
+ matrix:
+ include:
+ - target_region: us-east-2
+ target_cluster: dev-us-east-2-beta
+ - target_region: eu-west-1
+ target_cluster: dev-eu-west-1-zeta
+ environment:
+ name: dev-${{ matrix.target_region }}
+ steps:
+ - name: Checkout
+ uses: actions/checkout@v3
+ with:
+ submodules: true
+ fetch-depth: 0
+ ref: ${{ inputs.branch }}
+
+ - name: Configure AWS Credentials
+ uses: aws-actions/configure-aws-credentials@v1-node16
+ with:
+ role-to-assume: arn:aws:iam::369495373322:role/github-runner
+ aws-region: eu-central-1
+ role-skip-session-tagging: true
+ role-duration-seconds: 1800
+
+ - name: Configure environment
+ run: |
+ helm repo add neondatabase https://neondatabase.github.io/helm-charts
+ aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }}
+
+ - name: Deploy storage-broker
+ run:
+ helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s
+
+ - name: Cleanup helm folder
+ run: rm -rf ~/.cache
diff --git a/.github/workflows/deploy-prod.yml b/.github/workflows/deploy-prod.yml
new file mode 100644
index 0000000000..e1954b5540
--- /dev/null
+++ b/.github/workflows/deploy-prod.yml
@@ -0,0 +1,277 @@
+name: Neon Deploy prod
+
+on:
+ workflow_dispatch:
+ inputs:
+ dockerTag:
+ description: 'Docker tag to deploy'
+ required: true
+ type: string
+ branch:
+ description: 'Branch or commit used for deploy scripts and configs'
+ required: true
+ type: string
+ default: 'main'
+ deployStorage:
+ description: 'Deploy storage'
+ required: true
+ type: boolean
+ default: true
+ deployProxy:
+ description: 'Deploy proxy'
+ required: true
+ type: boolean
+ default: true
+ deployStorageBroker:
+ description: 'Deploy storage-broker'
+ required: true
+ type: boolean
+ default: true
+
+concurrency:
+ group: deploy-prod
+ cancel-in-progress: false
+
+jobs:
+ deploy-prod-new:
+ runs-on: prod
+ container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
+ if: inputs.deployStorage
+ defaults:
+ run:
+ shell: bash
+ strategy:
+ matrix:
+ target_region: [ us-east-2, us-west-2, eu-central-1, ap-southeast-1 ]
+ environment:
+ name: prod-${{ matrix.target_region }}
+ steps:
+ - name: Checkout
+ uses: actions/checkout@v3
+ with:
+ submodules: true
+ fetch-depth: 0
+ ref: ${{ inputs.branch }}
+
+ - name: Redeploy
+ run: |
+ export DOCKER_TAG=${{ inputs.dockerTag }}
+ cd "$(pwd)/.github/ansible"
+
+ ./get_binaries.sh
+
+ ansible-galaxy collection install sivel.toiletwater
+ ansible-playbook deploy.yaml -i prod.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_PRODUCTION_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
+ rm -f neon_install.tar.gz .neon_current_version
+
+ deploy-proxy-prod-new:
+ runs-on: prod
+ container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
+ if: inputs.deployProxy
+ defaults:
+ run:
+ shell: bash
+ strategy:
+ matrix:
+ include:
+ - target_region: us-east-2
+ target_cluster: prod-us-east-2-delta
+ deploy_link_proxy: true
+ deploy_legacy_scram_proxy: false
+ - target_region: us-west-2
+ target_cluster: prod-us-west-2-eta
+ deploy_link_proxy: false
+ deploy_legacy_scram_proxy: true
+ - target_region: eu-central-1
+ target_cluster: prod-eu-central-1-gamma
+ deploy_link_proxy: false
+ deploy_legacy_scram_proxy: false
+ - target_region: ap-southeast-1
+ target_cluster: prod-ap-southeast-1-epsilon
+ deploy_link_proxy: false
+ deploy_legacy_scram_proxy: false
+ environment:
+ name: prod-${{ matrix.target_region }}
+ steps:
+ - name: Checkout
+ uses: actions/checkout@v3
+ with:
+ submodules: true
+ fetch-depth: 0
+ ref: ${{ inputs.branch }}
+
+ - name: Configure environment
+ run: |
+ helm repo add neondatabase https://neondatabase.github.io/helm-charts
+ aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }}
+
+ - name: Re-deploy scram proxy
+ run: |
+ DOCKER_TAG=${{ inputs.dockerTag }}
+ helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
+
+ - name: Re-deploy link proxy
+ if: matrix.deploy_link_proxy
+ run: |
+ DOCKER_TAG=${{ inputs.dockerTag }}
+ helm upgrade neon-proxy-link neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-link.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
+
+ - name: Re-deploy legacy scram proxy
+ if: matrix.deploy_legacy_scram_proxy
+ run: |
+ DOCKER_TAG=${{ inputs.dockerTag }}
+ helm upgrade neon-proxy-scram-legacy neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram-legacy.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
+
+ deploy-storage-broker-prod-new:
+ runs-on: prod
+ container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
+ if: inputs.deployStorageBroker
+ defaults:
+ run:
+ shell: bash
+ strategy:
+ matrix:
+ include:
+ - target_region: us-east-2
+ target_cluster: prod-us-east-2-delta
+ - target_region: us-west-2
+ target_cluster: prod-us-west-2-eta
+ - target_region: eu-central-1
+ target_cluster: prod-eu-central-1-gamma
+ - target_region: ap-southeast-1
+ target_cluster: prod-ap-southeast-1-epsilon
+ environment:
+ name: prod-${{ matrix.target_region }}
+ steps:
+ - name: Checkout
+ uses: actions/checkout@v3
+ with:
+ submodules: true
+ fetch-depth: 0
+ ref: ${{ inputs.branch }}
+
+ - name: Configure environment
+ run: |
+ helm repo add neondatabase https://neondatabase.github.io/helm-charts
+ aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }}
+
+ - name: Deploy storage-broker
+ run:
+ helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s
+
+ # Deploy to old account below
+
+ deploy:
+ runs-on: [ self-hosted, gen3, small ]
+ container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
+ if: inputs.deployStorage
+ defaults:
+ run:
+ shell: bash
+ environment:
+ name: prod-old
+ steps:
+ - name: Checkout
+ uses: actions/checkout@v3
+ with:
+ submodules: true
+ fetch-depth: 0
+ ref: ${{ inputs.branch }}
+
+ - name: Redeploy
+ run: |
+ export DOCKER_TAG=${{ inputs.dockerTag }}
+ cd "$(pwd)/.github/ansible"
+
+ if [[ "$GITHUB_REF_NAME" == "main" ]]; then
+ ./get_binaries.sh
+ elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
+ RELEASE=true ./get_binaries.sh
+ else
+ echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
+ exit 1
+ fi
+
+ eval $(ssh-agent)
+ echo "${{ secrets.TELEPORT_SSH_KEY }}" | tr -d '\n'| base64 --decode >ssh-key
+ echo "${{ secrets.TELEPORT_SSH_CERT }}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub
+ chmod 0600 ssh-key
+ ssh-add ssh-key
+ rm -f ssh-key ssh-key-cert.pub
+ ANSIBLE_CONFIG=./ansible.cfg ansible-galaxy collection install sivel.toiletwater
+ ANSIBLE_CONFIG=./ansible.cfg ansible-playbook deploy.yaml -i production.hosts.yaml -e CONSOLE_API_TOKEN=${{ secrets.NEON_PRODUCTION_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
+ rm -f neon_install.tar.gz .neon_current_version
+
+ # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ansible/collections': Permission denied
+ - name: Cleanup ansible folder
+ run: rm -rf ~/.ansible
+
+ deploy-proxy:
+ runs-on: [ self-hosted, gen3, small ]
+ container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
+ if: inputs.deployProxy
+ defaults:
+ run:
+ shell: bash
+ environment:
+ name: prod-old
+ env:
+ KUBECONFIG: .kubeconfig
+ steps:
+ - name: Checkout
+ uses: actions/checkout@v3
+ with:
+ submodules: true
+ fetch-depth: 0
+ ref: ${{ inputs.branch }}
+
+ - name: Store kubeconfig file
+ run: |
+ echo "${{ secrets.PRODUCTION_KUBECONFIG_DATA }}" | base64 --decode > ${KUBECONFIG}
+ chmod 0600 ${KUBECONFIG}
+
+ - name: Add neon helm chart
+ run: helm repo add neondatabase https://neondatabase.github.io/helm-charts
+
+ - name: Re-deploy proxy
+ run: |
+ DOCKER_TAG=${{ inputs.dockerTag }}
+ helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --install --atomic -f .github/helm-values/production.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
+
+ - name: Cleanup helm folder
+ run: rm -rf ~/.cache
+
+ deploy-storage-broker:
+ name: deploy storage broker on old staging and old prod
+ runs-on: [ self-hosted, gen3, small ]
+ container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
+ if: inputs.deployStorageBroker
+ defaults:
+ run:
+ shell: bash
+ environment:
+ name: prod-old
+ env:
+ KUBECONFIG: .kubeconfig
+ steps:
+ - name: Checkout
+ uses: actions/checkout@v3
+ with:
+ submodules: true
+ fetch-depth: 0
+ ref: ${{ inputs.branch }}
+
+ - name: Store kubeconfig file
+ run: |
+ echo "${{ secrets.PRODUCTION_KUBECONFIG_DATA }}" | base64 --decode > ${KUBECONFIG}
+ chmod 0600 ${KUBECONFIG}
+
+ - name: Add neon helm chart
+ run: helm repo add neondatabase https://neondatabase.github.io/helm-charts
+
+ - name: Deploy storage-broker
+ run:
+ helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace neon-storage-broker --create-namespace --install --atomic -f .github/helm-values/production.neon-storage-broker.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s
+
+ - name: Cleanup helm folder
+ run: rm -rf ~/.cache
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
new file mode 100644
index 0000000000..49e04ee001
--- /dev/null
+++ b/.github/workflows/release.yml
@@ -0,0 +1,33 @@
+name: Create Release Branch
+
+on:
+ schedule:
+ - cron: '0 10 * * 2'
+
+jobs:
+ create_release_branch:
+ runs-on: [ubuntu-latest]
+
+ steps:
+ - name: Check out code
+ uses: actions/checkout@v3
+ with:
+ ref: main
+
+ - name: Get current date
+ id: date
+      run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT
+
+ - name: Create release branch
+ run: git checkout -b release/${{ steps.date.outputs.date }}
+
+ - name: Push new branch
+ run: git push origin release/${{ steps.date.outputs.date }}
+
+ - name: Create pull request into release
+ uses: thomaseizinger/create-pull-request@e3972219c86a56550fb70708d96800d8e24ba862 # 1.3.0
+ with:
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ head: release/${{ steps.date.outputs.date }}
+ base: release
+ title: Release ${{ steps.date.outputs.date }}
diff --git a/Cargo.lock b/Cargo.lock
index d8aba9ba68..2985a654f3 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -37,11 +37,6 @@ dependencies = [
"memchr",
]
-[[package]]
-name = "amplify_num"
-version = "0.4.1"
-source = "git+https://github.com/rust-amplify/rust-amplify.git?tag=v4.0.0-beta.1#3ad006cf2804e1862ec7725a7684a493f3023523"
-
[[package]]
name = "android_system_properties"
version = "0.1.5"
@@ -66,6 +61,15 @@ dependencies = [
"backtrace",
]
+[[package]]
+name = "archery"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0a8da9bc4c4053ee067669762bcaeea6e241841295a2b6c948312dad6ef4cc02"
+dependencies = [
+ "static_assertions",
+]
+
[[package]]
name = "asn1-rs"
version = "0.5.1"
@@ -137,15 +141,6 @@ dependencies = [
"syn",
]
-[[package]]
-name = "atomic-polyfill"
-version = "0.1.11"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e3ff7eb3f316534d83a8a2c3d1674ace8a5a71198eba31e2e2b597833f699b28"
-dependencies = [
- "critical-section",
-]
-
[[package]]
name = "atty"
version = "0.2.14"
@@ -629,9 +624,9 @@ dependencies = [
[[package]]
name = "bumpalo"
-version = "3.11.1"
+version = "3.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "572f695136211188308f16ad2ca5c851a712c464060ae6974944458eb83880ba"
+checksum = "0d261e256854913907f67ed06efbc3338dfe6179796deefc1ff763fc1aee5535"
[[package]]
name = "byteorder"
@@ -750,13 +745,13 @@ dependencies = [
[[package]]
name = "clap"
-version = "4.0.32"
+version = "4.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a7db700bc935f9e43e88d00b0850dae18a63773cfbec6d8e070fccf7fef89a39"
+checksum = "4ec7a4128863c188deefe750ac1d1dfe66c236909f845af04beed823638dc1b2"
dependencies = [
"bitflags",
"clap_derive",
- "clap_lex 0.3.0",
+ "clap_lex 0.3.1",
"is-terminal",
"once_cell",
"strsim",
@@ -765,9 +760,9 @@ dependencies = [
[[package]]
name = "clap_derive"
-version = "4.0.21"
+version = "4.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0177313f9f02afc995627906bbd8967e2be069f5261954222dac78290c2b9014"
+checksum = "684a277d672e91966334af371f1a7b5833f9aa00b07c84e92fbce95e00208ce8"
dependencies = [
"heck",
"proc-macro-error",
@@ -787,9 +782,9 @@ dependencies = [
[[package]]
name = "clap_lex"
-version = "0.3.0"
+version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0d4198f73e42b4936b35b5bb248d81d2b595ecb170da0bac7655c54eedfa8da8"
+checksum = "783fe232adfca04f90f56201b26d79682d4cd2625e0bc7290b95123afe558ade"
dependencies = [
"os_str_bytes",
]
@@ -832,10 +827,11 @@ version = "0.1.0"
dependencies = [
"anyhow",
"chrono",
- "clap 4.0.32",
+ "clap 4.1.1",
"futures",
"hyper",
"notify",
+ "opentelemetry",
"postgres",
"regex",
"serde",
@@ -844,7 +840,9 @@ dependencies = [
"tokio",
"tokio-postgres",
"tracing",
+ "tracing-opentelemetry",
"tracing-subscriber",
+ "tracing-utils",
"url",
"workspace_hack",
]
@@ -887,7 +885,7 @@ name = "control_plane"
version = "0.1.0"
dependencies = [
"anyhow",
- "clap 4.0.32",
+ "clap 4.1.1",
"comfy-table",
"git-version",
"nix",
@@ -988,12 +986,6 @@ dependencies = [
"itertools",
]
-[[package]]
-name = "critical-section"
-version = "1.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6548a0ad5d2549e111e1f6a11a6c2e2d00ce6a3dafe22948d67c2b443f775e52"
-
[[package]]
name = "crossbeam-channel"
version = "0.5.6"
@@ -1030,12 +1022,11 @@ dependencies = [
[[package]]
name = "crossbeam-utils"
-version = "0.8.11"
+version = "0.8.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "51887d4adc7b564537b15adcfb307936f8075dfcd5f00dde9a9f1d29383682bc"
+checksum = "4fb766fa798726286dbbb842f174001dab8abc7b627a1dd86e0b7222a95d929f"
dependencies = [
"cfg-if",
- "once_cell",
]
[[package]]
@@ -1152,6 +1143,19 @@ dependencies = [
"syn",
]
+[[package]]
+name = "dashmap"
+version = "5.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "907076dfda823b0b36d2a1bb5f90c96660a5bbcd7729e10727f07858f22c4edc"
+dependencies = [
+ "cfg-if",
+ "hashbrown 0.12.3",
+ "lock_api",
+ "once_cell",
+ "parking_lot_core",
+]
+
[[package]]
name = "data-encoding"
version = "2.3.3"
@@ -1506,15 +1510,6 @@ version = "1.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7"
-[[package]]
-name = "hash32"
-version = "0.2.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b0c35f58762feb77d74ebe43bdbc3210f09be9fe6742234d573bacc26ed92b67"
-dependencies = [
- "byteorder",
-]
-
[[package]]
name = "hashbrown"
version = "0.12.3"
@@ -1530,19 +1525,6 @@ dependencies = [
"ahash",
]
-[[package]]
-name = "heapless"
-version = "0.7.16"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "db04bc24a18b9ea980628ecf00e6c0264f3c1426dac36c00cb49b6fbad8b0743"
-dependencies = [
- "atomic-polyfill",
- "hash32",
- "rustc_version",
- "spin 0.9.4",
- "stable_deref_trait",
-]
-
[[package]]
name = "heck"
version = "0.4.0"
@@ -1804,9 +1786,9 @@ dependencies = [
[[package]]
name = "io-lifetimes"
-version = "1.0.3"
+version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "46112a93252b123d31a119a8d1a1ac19deac4fac6e0e8b0df58f0d4e5870e63c"
+checksum = "e7d6c6f8c91b4b9ed43484ad1a938e393caf35960fce7f82a040497207bd8e9e"
dependencies = [
"libc",
"windows-sys",
@@ -1916,12 +1898,6 @@ dependencies = [
"winapi",
]
-[[package]]
-name = "libm"
-version = "0.2.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "348108ab3fba42ec82ff6e9564fc4ca0247bdccdc68dd8af9764bbc79c3c8ffb"
-
[[package]]
name = "link-cplusplus"
version = "1.0.8"
@@ -2067,9 +2043,9 @@ checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a"
[[package]]
name = "nix"
-version = "0.26.1"
+version = "0.26.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "46a58d1d356c6597d08cde02c2f09d785b09e28711837b1ed667dc652c08a694"
+checksum = "bfdda3d196821d6af13126e40375cdf7da646a96114af134d5f417a9a1dc8e1a"
dependencies = [
"bitflags",
"cfg-if",
@@ -2081,9 +2057,9 @@ dependencies = [
[[package]]
name = "nom"
-version = "7.1.2"
+version = "7.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e5507769c4919c998e69e49c839d9dc6e693ede4cc4290d6ad8b41d4f09c548c"
+checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a"
dependencies = [
"memchr",
"minimal-lexical",
@@ -2154,7 +2130,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd"
dependencies = [
"autocfg",
- "libm",
]
[[package]]
@@ -2203,6 +2178,108 @@ version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"
+[[package]]
+name = "opentelemetry"
+version = "0.18.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "69d6c3d7288a106c0a363e4b0e8d308058d56902adefb16f4936f417ffef086e"
+dependencies = [
+ "opentelemetry_api",
+ "opentelemetry_sdk",
+]
+
+[[package]]
+name = "opentelemetry-http"
+version = "0.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1edc79add46364183ece1a4542592ca593e6421c60807232f5b8f7a31703825d"
+dependencies = [
+ "async-trait",
+ "bytes",
+ "http",
+ "opentelemetry_api",
+ "reqwest",
+]
+
+[[package]]
+name = "opentelemetry-otlp"
+version = "0.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d1c928609d087790fc936a1067bdc310ae702bdf3b090c3f281b713622c8bbde"
+dependencies = [
+ "async-trait",
+ "futures",
+ "futures-util",
+ "http",
+ "opentelemetry",
+ "opentelemetry-http",
+ "opentelemetry-proto",
+ "prost",
+ "reqwest",
+ "thiserror",
+]
+
+[[package]]
+name = "opentelemetry-proto"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d61a2f56df5574508dd86aaca016c917489e589ece4141df1b5e349af8d66c28"
+dependencies = [
+ "futures",
+ "futures-util",
+ "opentelemetry",
+ "prost",
+ "tonic",
+ "tonic-build",
+]
+
+[[package]]
+name = "opentelemetry-semantic-conventions"
+version = "0.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9b02e0230abb0ab6636d18e2ba8fa02903ea63772281340ccac18e0af3ec9eeb"
+dependencies = [
+ "opentelemetry",
+]
+
+[[package]]
+name = "opentelemetry_api"
+version = "0.18.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c24f96e21e7acc813c7a8394ee94978929db2bcc46cf6b5014fc612bf7760c22"
+dependencies = [
+ "fnv",
+ "futures-channel",
+ "futures-util",
+ "indexmap",
+ "js-sys",
+ "once_cell",
+ "pin-project-lite",
+ "thiserror",
+]
+
+[[package]]
+name = "opentelemetry_sdk"
+version = "0.18.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1ca41c4933371b61c2a2f214bf16931499af4ec90543604ec828f7a625c09113"
+dependencies = [
+ "async-trait",
+ "crossbeam-channel",
+ "dashmap",
+ "fnv",
+ "futures-channel",
+ "futures-executor",
+ "futures-util",
+ "once_cell",
+ "opentelemetry_api",
+ "percent-encoding",
+ "rand",
+ "thiserror",
+ "tokio",
+ "tokio-stream",
+]
+
[[package]]
name = "os_info"
version = "3.5.1"
@@ -2230,14 +2307,13 @@ checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
name = "pageserver"
version = "0.1.0"
dependencies = [
- "amplify_num",
"anyhow",
"async-stream",
"async-trait",
"byteorder",
"bytes",
"chrono",
- "clap 4.0.32",
+ "clap 4.1.1",
"close_fds",
"const_format",
"consumption_metrics",
@@ -2269,7 +2345,7 @@ dependencies = [
"regex",
"remote_storage",
"reqwest",
- "rstar",
+ "rpds",
"scopeguard",
"serde",
"serde_json",
@@ -2581,9 +2657,9 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068"
[[package]]
name = "proc-macro2"
-version = "1.0.49"
+version = "1.0.50"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "57a8eca9f9c4ffde41714334dee777596264c7825420f521abc92b5b5deb63a5"
+checksum = "6ef7d57beacfaf2d8aee5937dab7b7f28de3cb8b1828479bb5de2a7106f2bae2"
dependencies = [
"unicode-ident",
]
@@ -2683,7 +2759,7 @@ dependencies = [
"bstr",
"bytes",
"chrono",
- "clap 4.0.32",
+ "clap 4.1.1",
"consumption_metrics",
"futures",
"git-version",
@@ -2742,14 +2818,13 @@ dependencies = [
[[package]]
name = "rand"
-version = "0.8.4"
+version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2e7573632e6454cf6b99d7aac4ccca54be06da05aca2ef7423d22d27d4d4bcd8"
+checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
dependencies = [
"libc",
"rand_chacha",
"rand_core",
- "rand_hc",
]
[[package]]
@@ -2771,15 +2846,6 @@ dependencies = [
"getrandom",
]
-[[package]]
-name = "rand_hc"
-version = "0.3.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d51e9f596de227fda2ea6c84607f5558e196eeaf43c986b724ba4fb8fdf497e7"
-dependencies = [
- "rand_core",
-]
-
[[package]]
name = "rayon"
version = "1.6.1"
@@ -2930,7 +2996,7 @@ dependencies = [
"cc",
"libc",
"once_cell",
- "spin 0.5.2",
+ "spin",
"untrusted",
"web-sys",
"winapi",
@@ -2950,14 +3016,12 @@ dependencies = [
]
[[package]]
-name = "rstar"
-version = "0.9.3"
+name = "rpds"
+version = "0.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b40f1bfe5acdab44bc63e6699c28b74f75ec43afb59f3eda01e145aff86a25fa"
+checksum = "66262ea963eff99163e6b741fbc3417a52cc13074728c1047e9911789df9b000"
dependencies = [
- "heapless",
- "num-traits",
- "smallvec",
+ "archery",
]
[[package]]
@@ -3018,9 +3082,9 @@ dependencies = [
[[package]]
name = "rustix"
-version = "0.36.6"
+version = "0.36.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4feacf7db682c6c329c4ede12649cd36ecab0f3be5b7d74e6a20304725db4549"
+checksum = "d4fdebc4b395b7fbb9ab11e462e20ed9051e7b16e42d24042c776eca0ac81b03"
dependencies = [
"bitflags",
"errno",
@@ -3093,7 +3157,7 @@ dependencies = [
"async-trait",
"byteorder",
"bytes",
- "clap 4.0.32",
+ "clap 4.1.1",
"const_format",
"crc32c",
"fs2",
@@ -3479,21 +3543,6 @@ version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d"
-[[package]]
-name = "spin"
-version = "0.9.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7f6002a767bff9e83f8eeecf883ecb8011875a21ae8da43bffb817a57e78cc09"
-dependencies = [
- "lock_api",
-]
-
-[[package]]
-name = "stable_deref_trait"
-version = "1.2.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3"
-
[[package]]
name = "static_assertions"
version = "1.1.0"
@@ -3507,7 +3556,7 @@ dependencies = [
"anyhow",
"async-stream",
"bytes",
- "clap 4.0.32",
+ "clap 4.1.1",
"const_format",
"futures",
"futures-core",
@@ -3639,9 +3688,9 @@ dependencies = [
[[package]]
name = "termcolor"
-version = "1.1.3"
+version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bab24d30b911b2376f3a13cc2cd443142f0c81dda04c118693e35b3835757755"
+checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6"
dependencies = [
"winapi-util",
]
@@ -3749,9 +3798,9 @@ dependencies = [
[[package]]
name = "tokio"
-version = "1.24.1"
+version = "1.24.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1d9f76183f91ecfb55e1d7d5602bd1d979e38a3a522fe900241cf195624d67ae"
+checksum = "597a12a59981d9e3c38d216785b0c37399f6e415e8d0712047620f189371b0bb"
dependencies = [
"autocfg",
"bytes",
@@ -4071,6 +4120,20 @@ dependencies = [
"tracing-core",
]
+[[package]]
+name = "tracing-opentelemetry"
+version = "0.18.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "21ebb87a95ea13271332df069020513ab70bdb5637ca42d6e492dc3bbbad48de"
+dependencies = [
+ "once_cell",
+ "opentelemetry",
+ "tracing",
+ "tracing-core",
+ "tracing-log",
+ "tracing-subscriber",
+]
+
[[package]]
name = "tracing-serde"
version = "0.1.3"
@@ -4102,6 +4165,22 @@ dependencies = [
"tracing-serde",
]
+[[package]]
+name = "tracing-utils"
+version = "0.1.0"
+dependencies = [
+ "hyper",
+ "opentelemetry",
+ "opentelemetry-otlp",
+ "opentelemetry-semantic-conventions",
+ "reqwest",
+ "tokio",
+ "tracing",
+ "tracing-opentelemetry",
+ "tracing-subscriber",
+ "workspace_hack",
+]
+
[[package]]
name = "try-lock"
version = "0.2.4"
@@ -4183,9 +4262,9 @@ checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a"
[[package]]
name = "ureq"
-version = "2.6.1"
+version = "2.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "733b5ad78377302af52c0dbcb2623d78fe50e4b3bf215948ff29e9ee031d8566"
+checksum = "338b31dd1314f68f3aabf3ed57ab922df95ffcd902476ca7ba3c4ce7b908c46d"
dependencies = [
"base64 0.13.1",
"log",
@@ -4226,6 +4305,7 @@ version = "0.1.0"
dependencies = [
"anyhow",
"async-trait",
+ "atty",
"bincode",
"byteorder",
"bytes",
@@ -4287,7 +4367,7 @@ name = "wal_craft"
version = "0.1.0"
dependencies = [
"anyhow",
- "clap 4.0.32",
+ "clap 4.1.1",
"env_logger",
"log",
"once_cell",
@@ -4534,11 +4614,13 @@ dependencies = [
"anyhow",
"bytes",
"chrono",
- "clap 4.0.32",
+ "clap 4.1.1",
"crossbeam-utils",
"either",
"fail",
+ "futures",
"futures-channel",
+ "futures-executor",
"futures-task",
"futures-util",
"indexmap",
@@ -4554,6 +4636,9 @@ dependencies = [
"rand",
"regex",
"regex-syntax",
+ "reqwest",
+ "ring",
+ "rustls",
"scopeguard",
"serde",
"serde_json",
@@ -4561,6 +4646,7 @@ dependencies = [
"syn",
"tokio",
"tokio-util",
+ "tonic",
"tower",
"tracing",
"tracing-core",
diff --git a/Cargo.toml b/Cargo.toml
index 74cc16d690..e6695c4246 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -61,6 +61,10 @@ nix = "0.26"
notify = "5.0.0"
num-traits = "0.2.15"
once_cell = "1.13"
+opentelemetry = "0.18.0"
+opentelemetry-otlp = { version = "0.11.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
+opentelemetry-semantic-conventions = "0.10.0"
+tracing-opentelemetry = "0.18.0"
parking_lot = "0.12"
pin-project-lite = "0.2"
prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
@@ -69,7 +73,7 @@ rand = "0.8"
regex = "1.4"
reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
routerify = "3"
-rstar = "0.9.3"
+rpds = "0.12.0"
rustls = "0.20"
rustls-pemfile = "1"
rustls-split = "0.3"
@@ -107,9 +111,6 @@ x509-parser = "0.14"
env_logger = "0.10"
log = "0.4"
-## TODO switch when the new release is made
-amplify_num = { git = "https://github.com/rust-amplify/rust-amplify.git", tag = "v4.0.0-beta.1" }
-
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
@@ -128,6 +129,7 @@ remote_storage = { version = "0.1", path = "./libs/remote_storage/" }
safekeeper_api = { version = "0.1", path = "./libs/safekeeper_api" }
storage_broker = { version = "0.1", path = "./storage_broker/" } # Note: main broker code is inside the binary crate, so linking with the library shouldn't be heavy.
tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" }
+tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" }
utils = { version = "0.1", path = "./libs/utils/" }
## Common library dependency
diff --git a/Dockerfile.compute-node-v14 b/Dockerfile.compute-node
similarity index 86%
rename from Dockerfile.compute-node-v14
rename to Dockerfile.compute-node
index 2deb95a93f..936f368833 100644
--- a/Dockerfile.compute-node-v14
+++ b/Dockerfile.compute-node
@@ -1,8 +1,5 @@
-#
-# This file is identical to the Dockerfile.compute-node-v15 file
-# except for the version of Postgres that is built.
-#
-
+ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
+ARG IMAGE=rust
ARG TAG=pinned
#########################################################################################
@@ -22,7 +19,8 @@ RUN apt update && \
#
#########################################################################################
FROM build-deps AS pg-build
-COPY vendor/postgres-v14 postgres
+ARG PG_VERSION
+COPY vendor/postgres-${PG_VERSION} postgres
RUN cd postgres && \
./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp && \
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
@@ -135,6 +133,27 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3
echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3_postgis.control
+#########################################################################################
+#
+# Layer "unit-pg-build"
+# compile unit extension
+#
+#########################################################################################
+FROM build-deps AS unit-pg-build
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
+
+RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz && \
+ tar xvzf 7.7.tar.gz && \
+ cd postgresql-unit-7.7 && \
+ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
+ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
+ # unit extension's "create extension" script relies on absolute install path to fill some reference tables.
+    # We move the extension from '/usr/local/pgsql/' to '/usr/local/' after it is built, so we need to adjust the path.
+    # This one-liner removes the pgsql/ part of the path.
+    # NOTE: Other extensions that rely on the MODULEDIR variable after the build phase will need the same fix.
+ find /usr/local/pgsql/share/extension/ -name "unit*.sql" -print0 | xargs -0 sed -i "s|pgsql/||g" && \
+ echo 'trusted = true' >> /usr/local/pgsql/share/extension/unit.control
+
#########################################################################################
#
# Layer "neon-pg-ext-build"
@@ -146,6 +165,7 @@ COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=h3-pg-build /h3/usr /
+COPY --from=unit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY pgxn/ pgxn/
RUN make -j $(getconf _NPROCESSORS_ONLN) \
@@ -158,7 +178,7 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \
# Compile and run the Neon-specific `compute_ctl` binary
#
#########################################################################################
-FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:$TAG AS compute-tools
+FROM $REPOSITORY/$IMAGE:$TAG AS compute-tools
USER nonroot
# Copy entire project to get Cargo.* files with proper dependencies for the whole project
COPY --chown=nonroot . .
diff --git a/Dockerfile.compute-node-v15 b/Dockerfile.compute-node-v15
deleted file mode 100644
index 8647ce2bf4..0000000000
--- a/Dockerfile.compute-node-v15
+++ /dev/null
@@ -1,220 +0,0 @@
-#
-# This file is identical to the Dockerfile.compute-node-v14 file
-# except for the version of Postgres that is built.
-#
-
-ARG TAG=pinned
-
-#########################################################################################
-#
-# Layer "build-deps"
-#
-#########################################################################################
-FROM debian:bullseye-slim AS build-deps
-RUN apt update && \
- apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
- zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev
-
-#########################################################################################
-#
-# Layer "pg-build"
-# Build Postgres from the neon postgres repository.
-#
-#########################################################################################
-FROM build-deps AS pg-build
-COPY vendor/postgres-v15 postgres
-RUN cd postgres && \
- ./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp && \
- make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
- make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
- # Install headers
- make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \
- make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install && \
- # Enable some of contrib extensions
- echo 'trusted = true' >> /usr/local/pgsql/share/extension/bloom.control && \
- echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \
- echo 'trusted = true' >> /usr/local/pgsql/share/extension/intagg.control && \
- echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control && \
- echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control
-
-#########################################################################################
-#
-# Layer "postgis-build"
-# Build PostGIS from the upstream PostGIS mirror.
-#
-#########################################################################################
-FROM build-deps AS postgis-build
-COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
-RUN apt update && \
- apt install -y gdal-bin libgdal-dev libprotobuf-c-dev protobuf-c-compiler xsltproc
-
-RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \
- tar xvzf postgis-3.3.1.tar.gz && \
- cd postgis-3.3.1 && \
- ./autogen.sh && \
- export PATH="/usr/local/pgsql/bin:$PATH" && \
- ./configure && \
- make -j $(getconf _NPROCESSORS_ONLN) install && \
- cd extensions/postgis && \
- make clean && \
- make -j $(getconf _NPROCESSORS_ONLN) install && \
- echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis.control && \
- echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_raster.control && \
- echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \
- echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control && \
- echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer.control && \
- echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer_data_us.control
-
-#########################################################################################
-#
-# Layer "plv8-build"
-# Build plv8
-#
-#########################################################################################
-FROM build-deps AS plv8-build
-COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
-RUN apt update && \
- apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5 binutils
-
-# https://github.com/plv8/plv8/issues/475:
-# v8 uses gold for linking and sets `--thread-count=4` which breaks
-# gold version <= 1.35 (https://sourceware.org/bugzilla/show_bug.cgi?id=23607)
-# Install newer gold version manually as debian-testing binutils version updates
-# libc version, which in turn breaks other extension built against non-testing libc.
-RUN wget https://ftp.gnu.org/gnu/binutils/binutils-2.38.tar.gz && \
- tar xvzf binutils-2.38.tar.gz && \
- cd binutils-2.38 && \
- cd libiberty && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && \
- cd ../bfd && ./configure && make bfdver.h && \
- cd ../gold && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && make install && \
- cp /usr/local/bin/ld.gold /usr/bin/gold
-
-# Sed is used to patch for https://github.com/plv8/plv8/issues/503
-RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \
- tar xvzf v3.1.4.tar.gz && \
- cd plv8-3.1.4 && \
- export PATH="/usr/local/pgsql/bin:$PATH" && \
- sed -i 's/MemoryContextAlloc(/MemoryContextAllocZero(/' plv8.cc && \
- make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \
- rm -rf /plv8-* && \
- echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control
-
-#########################################################################################
-#
-# Layer "h3-pg-build"
-# Build h3_pg
-#
-#########################################################################################
-FROM build-deps AS h3-pg-build
-COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
-
-# packaged cmake is too old
-RUN wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2-linux-x86_64.sh \
- -q -O /tmp/cmake-install.sh \
- && chmod u+x /tmp/cmake-install.sh \
- && /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \
- && rm /tmp/cmake-install.sh
-
-RUN wget https://github.com/uber/h3/archive/refs/tags/v4.0.1.tar.gz -O h3.tgz && \
- tar xvzf h3.tgz && \
- cd h3-4.0.1 && \
- mkdir build && \
- cd build && \
- cmake .. -DCMAKE_BUILD_TYPE=Release && \
- make -j $(getconf _NPROCESSORS_ONLN) && \
- DESTDIR=/h3 make install && \
- cp -R /h3/usr / && \
- rm -rf build
-
-RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3-pg.tgz && \
- tar xvzf h3-pg.tgz && \
- cd h3-pg-4.0.1 && \
- export PATH="/usr/local/pgsql/bin:$PATH" && \
- make -j $(getconf _NPROCESSORS_ONLN) && \
- make -j $(getconf _NPROCESSORS_ONLN) install && \
- echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \
- echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3_postgis.control
-
-#########################################################################################
-#
-# Layer "neon-pg-ext-build"
-# compile neon extensions
-#
-#########################################################################################
-FROM build-deps AS neon-pg-ext-build
-COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
-COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/
-COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/
-COPY --from=h3-pg-build /h3/usr /
-COPY pgxn/ pgxn/
-
-RUN make -j $(getconf _NPROCESSORS_ONLN) \
- PG_CONFIG=/usr/local/pgsql/bin/pg_config \
- -C pgxn/neon \
- -s install
-
-#########################################################################################
-#
-# Compile and run the Neon-specific `compute_ctl` binary
-#
-#########################################################################################
-FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:$TAG AS compute-tools
-USER nonroot
-# Copy entire project to get Cargo.* files with proper dependencies for the whole project
-COPY --chown=nonroot . .
-RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto
-
-#########################################################################################
-#
-# Clean up postgres folder before inclusion
-#
-#########################################################################################
-FROM neon-pg-ext-build AS postgres-cleanup-layer
-COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql
-
-# Remove binaries from /bin/ that we won't use (or would manually copy & install otherwise)
-RUN cd /usr/local/pgsql/bin && rm ecpg raster2pgsql shp2pgsql pgtopo_export pgtopo_import pgsql2shp
-
-# Remove headers that we won't need anymore - we've completed installation of all extensions
-RUN rm -r /usr/local/pgsql/include
-
-# Remove static postgresql libraries - all compilation is finished, so we
-# can now remove these files - they must be included in other binaries by now
-# if they were to be used by other libraries.
-RUN rm /usr/local/pgsql/lib/lib*.a
-
-#########################################################################################
-#
-# Final layer
-# Put it all together into the final image
-#
-#########################################################################################
-FROM debian:bullseye-slim
-# Add user postgres
-RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
- echo "postgres:test_console_pass" | chpasswd && \
- mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \
- chown -R postgres:postgres /var/db/postgres && \
- chmod 0750 /var/db/postgres/compute && \
- echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig
-
-COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local
-COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
-
-# Install:
-# libreadline8 for psql
-# libossp-uuid16 for extension ossp-uuid
-# libgeos, libgdal, libproj and libprotobuf-c1 for PostGIS
-RUN apt update && \
- apt install --no-install-recommends -y \
- libreadline8 \
- libossp-uuid16 \
- libgeos-c1v5 \
- libgdal28 \
- libproj19 \
- libprotobuf-c1 \
- gdb && \
- rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
-
-USER postgres
-ENTRYPOINT ["/usr/local/bin/compute_ctl"]
diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml
index 4536604bdf..f8c3481f57 100644
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -11,6 +11,7 @@ clap.workspace = true
futures.workspace = true
hyper = { workspace = true, features = ["full"] }
notify.workspace = true
+opentelemetry.workspace = true
postgres.workspace = true
regex.workspace = true
serde.workspace = true
@@ -19,7 +20,9 @@ tar.workspace = true
tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
tokio-postgres.workspace = true
tracing.workspace = true
+tracing-opentelemetry.workspace = true
tracing-subscriber.workspace = true
+tracing-utils.workspace = true
url.workspace = true
workspace_hack.workspace = true
diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs
index e5ab8eb153..2c42662020 100644
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -53,7 +53,7 @@ use compute_tools::spec::*;
use url::Url;
fn main() -> Result<()> {
- init_logger(DEFAULT_LOG_LEVEL)?;
+ init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
let matches = cli().get_matches();
@@ -84,6 +84,29 @@ fn main() -> Result<()> {
}
};
+ // Extract OpenTelemetry context for the startup actions from the spec, and
+ // attach it to the current tracing context.
+ //
+ // This is used to propagate the context for the 'start_compute' operation
+ // from the neon control plane. This allows linking together the wider
+ // 'start_compute' operation that creates the compute container, with the
+ // startup actions here within the container.
+ //
+ // Switch to the startup context here, and exit it once the startup has
+ // completed and Postgres is up and running.
+ //
+ // NOTE: This is supposed to only cover the *startup* actions. Once
+ // postgres is configured and up-and-running, we exit this span. Any other
+ // actions that are performed on incoming HTTP requests, for example, are
+ // performed in separate spans.
+ let startup_context_guard = if let Some(ref carrier) = spec.startup_tracing_context {
+ use opentelemetry::propagation::TextMapPropagator;
+ use opentelemetry::sdk::propagation::TraceContextPropagator;
+ Some(TraceContextPropagator::new().extract(carrier).attach())
+ } else {
+ None
+ };
+
let pageserver_connstr = spec
.cluster
.settings
@@ -140,6 +163,9 @@ fn main() -> Result<()> {
// Wait for the child Postgres process forever. In this state Ctrl+C will
// propagate to Postgres and it will be shut down as well.
if let Some(mut pg) = pg {
+ // Startup is finished, exit the startup tracing span
+ drop(startup_context_guard);
+
let ecode = pg
.wait()
.expect("failed to start waiting on Postgres process");
@@ -159,6 +185,10 @@ fn main() -> Result<()> {
info!("shutting down");
}
+ // Shutdown trace pipeline gracefully, so that it has a chance to send any
+ // pending traces before we exit.
+ tracing_utils::shutdown_tracing();
+
exit(exit_code.unwrap_or(1))
}
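
The hunk above only covers the consumer side: compute_ctl extracts a W3C trace context from the spec's `startup_tracing_context` map and attaches it. For illustration, here is a minimal sketch of what a producer would do to fill that carrier, assuming an OpenTelemetry 0.18 Rust producer (the real control plane may do this differently; the helper name is hypothetical):

use std::collections::HashMap;

use opentelemetry::propagation::TextMapPropagator;
use opentelemetry::sdk::propagation::TraceContextPropagator;
use opentelemetry::Context;

// Hypothetical control-plane helper: serialize the current OpenTelemetry
// context into the `traceparent`/`tracestate` key-value pairs that end up
// in the spec as `startup_tracing_context`.
fn current_startup_tracing_context() -> HashMap<String, String> {
    let mut carrier = HashMap::new();
    // `HashMap<String, String>` implements `Injector`, so the propagator can
    // write the standard W3C headers straight into it. A tracing-based caller
    // would obtain the context from the current span via
    // `OpenTelemetrySpanExt::context()` instead of `Context::current()`.
    TraceContextPropagator::new().inject_context(&Context::current(), &mut carrier);
    carrier
}
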
diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs
index f2a49f332c..589a8e1434 100644
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -3,16 +3,21 @@ use std::net::SocketAddr;
use std::sync::Arc;
use std::thread;
+use crate::compute::ComputeNode;
use anyhow::Result;
use hyper::service::{make_service_fn, service_fn};
use hyper::{Body, Method, Request, Response, Server, StatusCode};
use serde_json;
use tracing::{error, info};
-
-use crate::compute::ComputeNode;
+use tracing_utils::http::OtelName;
// Service function to handle all available routes.
-async fn routes(req: Request<Body>, compute: Arc<ComputeNode>) -> Response<Body> {
+async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body> {
+ //
+ // NOTE: The URI path is currently included in traces. That's OK because
+ // it doesn't contain any variable parts or sensitive information. But
+ // please keep that in mind if you change the routing here.
+ //
match (req.method(), req.uri().path()) {
// Serialized compute state.
(&Method::GET, "/status") => {
@@ -30,7 +35,7 @@ async fn routes(req: Request, compute: Arc) -> Response
(&Method::POST, "/check_writability") => {
info!("serving /check_writability POST request");
- let res = crate::checker::check_writability(&compute).await;
+ let res = crate::checker::check_writability(compute).await;
match res {
Ok(_) => Response::new(Body::from("true")),
Err(e) => Response::new(Body::from(e.to_string())),
@@ -56,7 +61,19 @@ async fn serve(state: Arc) {
async move {
Ok::<_, Infallible>(service_fn(move |req: Request<Body>| {
let state = state.clone();
- async move { Ok::<_, Infallible>(routes(req, state).await) }
+ async move {
+ Ok::<_, Infallible>(
+ // NOTE: We include the URI path in the string. It
+ // doesn't contain any variable parts or sensitive
+ // information in this API.
+ tracing_utils::http::tracing_handler(
+ req,
+ |req| routes(req, &state),
+ OtelName::UriPath,
+ )
+ .await,
+ )
+ }
}))
}
});
diff --git a/compute_tools/src/logger.rs b/compute_tools/src/logger.rs
index 57e5496e86..1b5cf647b0 100644
--- a/compute_tools/src/logger.rs
+++ b/compute_tools/src/logger.rs
@@ -1,21 +1,37 @@
-use anyhow::Result;
+use tracing_opentelemetry::OpenTelemetryLayer;
use tracing_subscriber::layer::SubscriberExt;
use tracing_subscriber::prelude::*;
-/// Initialize `env_logger` using either `default_level` or
+/// Initialize logging to stderr, and OpenTelemetry tracing and exporter.
+///
+/// Logging is configured using either `default_log_level` or
/// `RUST_LOG` environment variable as default log level.
-pub fn init_logger(default_level: &str) -> Result<()> {
+///
+/// OpenTelemetry is configured with OTLP/HTTP exporter. It picks up
+/// configuration from environment variables. For example, to change the destination,
+/// set `OTEL_EXPORTER_OTLP_ENDPOINT=http://jaeger:4318`. See
+/// `tracing-utils` package description.
+///
+pub fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> {
+ // Initialize Logging
let env_filter = tracing_subscriber::EnvFilter::try_from_default_env()
- .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_level));
+ .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_log_level));
let fmt_layer = tracing_subscriber::fmt::layer()
.with_target(false)
.with_writer(std::io::stderr);
+ // Initialize OpenTelemetry
+ let otlp_layer =
+ tracing_utils::init_tracing_without_runtime("compute_ctl").map(OpenTelemetryLayer::new);
+
+ // Put it all together
tracing_subscriber::registry()
.with(env_filter)
+ .with(otlp_layer)
.with(fmt_layer)
.init();
+ tracing::info!("logging and tracing started");
Ok(())
}
diff --git a/compute_tools/src/params.rs b/compute_tools/src/params.rs
index 925a2f8ef3..0ce01ff478 100644
--- a/compute_tools/src/params.rs
+++ b/compute_tools/src/params.rs
@@ -1,3 +1,9 @@
pub const DEFAULT_LOG_LEVEL: &str = "info";
-pub const DEFAULT_CONNSTRING: &str = "host=localhost user=postgres";
+// From Postgres docs:
+// To ease transition from the md5 method to the newer SCRAM method, if md5 is specified
+// as a method in pg_hba.conf but the user's password on the server is encrypted for SCRAM
+// (see below), then SCRAM-based authentication will automatically be chosen instead.
+// https://www.postgresql.org/docs/15/auth-password.html
+//
+// So it's safe to set md5 here, as `control-plane` anyway uses SCRAM for all roles.
pub const PG_HBA_ALL_MD5: &str = "host\tall\t\tall\t\t0.0.0.0/0\t\tmd5";
diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs
index 97cd623052..bbd0ec21ed 100644
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -1,3 +1,4 @@
+use std::collections::HashMap;
use std::path::Path;
use std::str::FromStr;
@@ -22,6 +23,8 @@ pub struct ComputeSpec {
/// Expected cluster state at the end of transition process.
pub cluster: Cluster,
pub delta_operations: Option>,
+
+    pub startup_tracing_context: Option<HashMap<String, String>>,
}
/// Cluster state seen from the perspective of the external tools
@@ -152,8 +155,20 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
{
RoleAction::Update
} else if let Some(pg_pwd) = &r.encrypted_password {
- // Check whether password changed or not (trim 'md5:' prefix first)
- if pg_pwd[3..] != *role.encrypted_password.as_ref().unwrap() {
+ // Check whether password changed or not (trim 'md5' prefix first if any)
+ //
+ // This is a backward compatibility hack, which comes from the times when we were using
+ // md5 for everyone and hashes were stored in the console db without md5 prefix. So when
+ // role comes from the control-plane (json spec) `Role.encrypted_password` doesn't have md5 prefix,
+ // but when role comes from Postgres (`get_existing_roles` / `existing_roles`) it has this prefix.
+ // Here is the only place so far where we compare hashes, so it seems to be the best candidate
+ // to place this compatibility layer.
+ let pg_pwd = if let Some(stripped) = pg_pwd.strip_prefix("md5") {
+ stripped
+ } else {
+ pg_pwd
+ };
+ if pg_pwd != *role.encrypted_password.as_ref().unwrap() {
RoleAction::Update
} else {
RoleAction::None
@@ -372,13 +387,13 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
name.pg_quote(),
db.owner.pg_quote()
);
- let _ = info_span!("executing", query).entered();
+ let _guard = info_span!("executing", query).entered();
client.execute(query.as_str(), &[])?;
}
DatabaseAction::Create => {
let mut query: String = format!("CREATE DATABASE {} ", name.pg_quote());
query.push_str(&db.to_pg_options());
- let _ = info_span!("executing", query).entered();
+ let _guard = info_span!("executing", query).entered();
client.execute(query.as_str(), &[])?;
}
};
diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs
index 880ab0e83c..07d220195b 100644
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -8,6 +8,7 @@ pub use prometheus::opts;
pub use prometheus::register;
pub use prometheus::{core, default_registry, proto};
pub use prometheus::{exponential_buckets, linear_buckets};
+pub use prometheus::{register_counter_vec, Counter, CounterVec};
pub use prometheus::{register_gauge, Gauge};
pub use prometheus::{register_gauge_vec, GaugeVec};
pub use prometheus::{register_histogram, Histogram};
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index b5027cb331..0d7aa2db55 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -29,6 +29,14 @@ pub enum TenantState {
Broken,
}
+pub mod state {
+ pub const LOADING: &str = "loading";
+ pub const ATTACHING: &str = "attaching";
+ pub const ACTIVE: &str = "active";
+ pub const STOPPING: &str = "stopping";
+ pub const BROKEN: &str = "broken";
+}
+
impl TenantState {
pub fn has_in_progress_downloads(&self) -> bool {
match self {
@@ -39,23 +47,32 @@ impl TenantState {
Self::Broken => false,
}
}
+
+ pub fn as_str(&self) -> &'static str {
+ match self {
+ TenantState::Loading => state::LOADING,
+ TenantState::Attaching => state::ATTACHING,
+ TenantState::Active => state::ACTIVE,
+ TenantState::Stopping => state::STOPPING,
+ TenantState::Broken => state::BROKEN,
+ }
+ }
}
/// A state of a timeline in pageserver's memory.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub enum TimelineState {
- /// Timeline is fully operational. If the containing Tenant is Active, the timeline's
- /// background jobs are running otherwise they will be launched when the tenant is activated.
+ /// The timeline is recognized by the pageserver but is not yet operational.
+ /// In particular, the walreceiver connection loop is not running for this timeline.
+ /// It will eventually transition to state Active or Broken.
+ Loading,
+ /// The timeline is fully operational.
+ /// It can be queried, and the walreceiver connection loop is running.
Active,
- /// A timeline is recognized by pageserver, but not yet ready to operate.
- /// The status indicates, that the timeline could eventually go back to Active automatically:
- /// for example, if the owning tenant goes back to Active again.
- Suspended,
- /// A timeline is recognized by pageserver, but not yet ready to operate and not allowed to
- /// automatically become Active after certain events: only a management call can change this status.
+ /// The timeline was previously Loading or Active but is shutting down.
+ /// It cannot transition back into any other state.
Stopping,
- /// A timeline is recognized by the pageserver, but can no longer be used for
- /// any operations, because it failed to be activated.
+ /// The timeline is broken and not operational (previous states: Loading or Active).
Broken,
}
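
The string constants and `TenantState::as_str()` above give stable label values; together with the `register_counter_vec`/`CounterVec` re-exports added to `libs/metrics` earlier in this diff, they can back a per-state metric. A hypothetical sketch (the metric name and call site are illustrative only, not part of this change):

use metrics::{register_counter_vec, CounterVec};
use once_cell::sync::Lazy;
use pageserver_api::models::TenantState;

// Hypothetical counter, labelled with the stable state names from `models::state`.
static TENANT_STATE_TRANSITIONS: Lazy<CounterVec> = Lazy::new(|| {
    register_counter_vec!(
        "pageserver_tenant_state_transitions_total",
        "Number of tenant state transitions, labelled by the new state",
        &["new_state"]
    )
    .expect("failed to register counter")
});

fn record_transition(new_state: TenantState) {
    TENANT_STATE_TRANSITIONS
        .with_label_values(&[new_state.as_str()])
        .inc();
}
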
diff --git a/libs/tracing-utils/Cargo.toml b/libs/tracing-utils/Cargo.toml
new file mode 100644
index 0000000000..8c3d3f9063
--- /dev/null
+++ b/libs/tracing-utils/Cargo.toml
@@ -0,0 +1,17 @@
+[package]
+name = "tracing-utils"
+version = "0.1.0"
+edition.workspace = true
+license.workspace = true
+
+[dependencies]
+hyper.workspace = true
+opentelemetry = { workspace = true, features=["rt-tokio"] }
+opentelemetry-otlp = { workspace = true, default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
+opentelemetry-semantic-conventions.workspace = true
+reqwest = { workspace = true, default-features = false, features = ["rustls-tls"] }
+tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
+tracing.workspace = true
+tracing-opentelemetry.workspace = true
+tracing-subscriber.workspace = true
+workspace_hack = { version = "0.1", path = "../../workspace_hack" }
diff --git a/libs/tracing-utils/src/http.rs b/libs/tracing-utils/src/http.rs
new file mode 100644
index 0000000000..3f80f49de1
--- /dev/null
+++ b/libs/tracing-utils/src/http.rs
@@ -0,0 +1,96 @@
+//! Tracing wrapper for Hyper HTTP server
+
+use hyper::HeaderMap;
+use hyper::{Body, Request, Response};
+use std::future::Future;
+use tracing::Instrument;
+use tracing_opentelemetry::OpenTelemetrySpanExt;
+
+/// Configuration option for what to use as the "otel.name" field in the traces.
+pub enum OtelName<'a> {
+ /// Use a constant string
+ Constant(&'a str),
+
+ /// Use the path from the request.
+ ///
+ /// That's very useful information, but is not appropriate if the
+    /// path contains parameters that differ on every request, or worse,
+ /// sensitive information like usernames or email addresses.
+ ///
+ /// See
+ UriPath,
+}
+
+/// Handle an incoming HTTP request using the given handler function,
+/// with OpenTelemetry tracing.
+///
+/// This runs 'handler' on the request in a new span, with fields filled in
+/// from the request. Notably, if the request contains tracing information,
+/// it is propagated to the span, so that this request is traced as part of
+/// the same trace.
+///
+/// XXX: Usually, this is handled by existing libraries, or built
+/// directly into HTTP servers. However, I couldn't find one for Hyper,
+/// so I had to write my own. The OpenTelemetry website has a registry of
+/// instrumentation libraries at:
+/// https://opentelemetry.io/registry/?language=rust&component=instrumentation
+/// If a Hyper crate appears, consider switching to that.
+pub async fn tracing_handler<F, R>(
+    req: Request<Body>,
+    handler: F,
+    otel_name: OtelName<'_>,
+) -> Response<Body>
+where
+    F: Fn(Request<Body>) -> R,
+    R: Future<Output = Response<Body>>,
+{
+ // Create a tracing span, with context propagated from the incoming
+ // request if any.
+ //
+ // See list of standard fields defined for HTTP requests at
+ // https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/trace/semantic_conventions/http.md
+ // We only fill in a few of the most useful ones here.
+ let otel_name = match otel_name {
+ OtelName::Constant(s) => s,
+ OtelName::UriPath => req.uri().path(),
+ };
+
+ let span = tracing::info_span!(
+ "http request",
+ otel.name= %otel_name,
+ http.method = %req.method(),
+ http.status_code = tracing::field::Empty,
+ );
+ let parent_ctx = extract_remote_context(req.headers());
+ span.set_parent(parent_ctx);
+
+ // Handle the request within the span
+ let response = handler(req).instrument(span.clone()).await;
+
+ // Fill in the fields from the response code
+ let status = response.status();
+ span.record("http.status_code", status.as_str());
+ span.record(
+ "otel.status_code",
+ if status.is_success() { "OK" } else { "ERROR" },
+ );
+
+ response
+}
+
+// Extract remote tracing context from the HTTP headers
+fn extract_remote_context(headers: &HeaderMap) -> opentelemetry::Context {
+ struct HeaderExtractor<'a>(&'a HeaderMap);
+
+ impl<'a> opentelemetry::propagation::Extractor for HeaderExtractor<'a> {
+ fn get(&self, key: &str) -> Option<&str> {
+ self.0.get(key).and_then(|value| value.to_str().ok())
+ }
+
+ fn keys(&self) -> Vec<&str> {
+ self.0.keys().map(|value| value.as_str()).collect()
+ }
+ }
+ let extractor = HeaderExtractor(headers);
+ opentelemetry::global::get_text_map_propagator(|propagator| propagator.extract(&extractor))
+}
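
A minimal usage sketch for this wrapper, mirroring how compute_tools calls it above; the `hello` handler and the constant span name are made up for illustration:

use hyper::{Body, Request, Response};
use tracing_utils::http::{tracing_handler, OtelName};

// A trivial handler; every request is traced under the constant span name
// "hello" because the URI path is not interesting for this endpoint.
async fn hello(_req: Request<Body>) -> Response<Body> {
    Response::new(Body::from("hello world"))
}

async fn handle(req: Request<Body>) -> Response<Body> {
    // Wraps `hello` in an "http request" span, propagating any incoming
    // W3C trace context and recording the method and status code fields.
    tracing_handler(req, hello, OtelName::Constant("hello")).await
}
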
diff --git a/libs/tracing-utils/src/lib.rs b/libs/tracing-utils/src/lib.rs
new file mode 100644
index 0000000000..de0e2ad799
--- /dev/null
+++ b/libs/tracing-utils/src/lib.rs
@@ -0,0 +1,168 @@
+//! Helper functions to set up OpenTelemetry tracing.
+//!
+//! This comes in two variants, depending on whether you have a Tokio runtime available.
+//! If you do, call `init_tracing()`. It sets up the trace processor and exporter to use
+//! the current tokio runtime. If you don't have a runtime available, or you don't want
+//! to share the runtime with the tracing tasks, call `init_tracing_without_runtime()`
+//! instead. It sets up a dedicated single-threaded Tokio runtime for the tracing tasks.
+//!
+//! Example:
+//!
+//! ```rust,no_run
+//! use tracing_subscriber::prelude::*;
+//! use tracing_opentelemetry::OpenTelemetryLayer;
+//!
+//! #[tokio::main]
+//! async fn main() {
+//! // Set up logging to stderr
+//! let env_filter = tracing_subscriber::EnvFilter::try_from_default_env()
+//! .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info"));
+//! let fmt_layer = tracing_subscriber::fmt::layer()
+//! .with_target(false)
+//! .with_writer(std::io::stderr);
+//!
+//! // Initialize OpenTelemetry. Exports tracing spans as OpenTelemetry traces
+//! let otlp_layer = tracing_utils::init_tracing("my_application").await.map(OpenTelemetryLayer::new);
+//!
+//! // Put it all together
+//! tracing_subscriber::registry()
+//! .with(env_filter)
+//! .with(otlp_layer)
+//! .with(fmt_layer)
+//! .init();
+//! }
+//! ```
+
+use opentelemetry::sdk::Resource;
+use opentelemetry::KeyValue;
+use opentelemetry_otlp::WithExportConfig;
+use opentelemetry_otlp::{OTEL_EXPORTER_OTLP_ENDPOINT, OTEL_EXPORTER_OTLP_TRACES_ENDPOINT};
+
+pub use tracing_opentelemetry::OpenTelemetryLayer;
+
+pub mod http;
+
+/// Set up OpenTelemetry exporter, using configuration from environment variables.
+///
+/// `service_name` is set as the OpenTelemetry 'service.name' resource (see
+/// )
+///
+/// We try to follow the conventions for the environment variables specified in
+///
+///
+/// However, we only support a subset of those options:
+///
+/// - OTEL_SDK_DISABLED is supported. The default is "false", meaning tracing
+/// is enabled by default. Set it to "true" to disable.
+///
+/// - We use the OTLP exporter, with HTTP protocol. Most of the OTEL_EXPORTER_OTLP_*
+/// settings specified in
+///
+/// are supported, as they are handled by the `opentelemetry-otlp` crate.
+/// Settings related to other exporters have no effect.
+///
+/// - Some other settings are supported by the `opentelemetry` crate.
+///
+/// If you need some other setting, please test if it works first. And perhaps
+/// add a comment in the list above to save the effort of testing for the next
+/// person.
+///
+/// This doesn't block, but is marked as 'async' to hint that this must be called in
+/// asynchronous execution context.
+pub async fn init_tracing(service_name: &str) -> Option<opentelemetry::sdk::trace::Tracer> {
+ if std::env::var("OTEL_SDK_DISABLED") == Ok("true".to_string()) {
+ return None;
+ };
+ Some(init_tracing_internal(service_name.to_string()))
+}
+
+/// Like `init_tracing`, but creates a separate tokio Runtime for the tracing
+/// tasks.
+pub fn init_tracing_without_runtime(
+ service_name: &str,
+) -> Option<opentelemetry::sdk::trace::Tracer> {
+ if std::env::var("OTEL_SDK_DISABLED") == Ok("true".to_string()) {
+ return None;
+ };
+
+ // The opentelemetry batch processor and the OTLP exporter needs a Tokio
+ // runtime. Create a dedicated runtime for them. One thread should be
+ // enough.
+ //
+ // (Alternatively, instead of batching, we could use the "simple
+ // processor", which doesn't need Tokio, and use "reqwest-blocking"
+ // feature for the OTLP exporter, which also doesn't need Tokio. However,
+ // batching is considered best practice, and also I have the feeling that
+ // the non-Tokio codepaths in the opentelemetry crate are less used and
+ // might be more buggy, so better to stay on the well-beaten path.)
+ //
+ // We leak the runtime so that it keeps running after we exit the
+ // function.
+ let runtime = Box::leak(Box::new(
+ tokio::runtime::Builder::new_multi_thread()
+ .enable_all()
+ .thread_name("otlp runtime thread")
+ .worker_threads(1)
+ .build()
+ .unwrap(),
+ ));
+ let _guard = runtime.enter();
+
+ Some(init_tracing_internal(service_name.to_string()))
+}
+
+fn init_tracing_internal(service_name: String) -> opentelemetry::sdk::trace::Tracer {
+ // Set up exporter from the OTEL_EXPORTER_* environment variables
+ let mut exporter = opentelemetry_otlp::new_exporter().http().with_env();
+
+ // XXX opentelemetry-otlp v0.18.0 has a bug in how it uses the
+ // OTEL_EXPORTER_OTLP_ENDPOINT env variable. According to the
+ // OpenTelemetry spec at
+ // ,
+ // the full exporter URL is formed by appending "/v1/traces" to the value
+ // of OTEL_EXPORTER_OTLP_ENDPOINT. However, opentelemetry-otlp only does
+ // that with the grpc-tonic exporter. Other exporters, like the HTTP
+ // exporter, use the URL from OTEL_EXPORTER_OTLP_ENDPOINT as is, without
+ // appending "/v1/traces".
+ //
+ // See https://github.com/open-telemetry/opentelemetry-rust/pull/950
+ //
+ // Work around that by checking OTEL_EXPORTER_OTLP_ENDPOINT, and setting
+ // the endpoint url with the "/v1/traces" path ourselves. If the bug is
+ // fixed in a later version, we can remove this code. But if we don't
+ // remember to remove this, it won't do any harm either, as the crate will
+ // just ignore the OTEL_EXPORTER_OTLP_ENDPOINT setting when the endpoint
+ // is set directly with `with_endpoint`.
+ if std::env::var(OTEL_EXPORTER_OTLP_TRACES_ENDPOINT).is_err() {
+ if let Ok(mut endpoint) = std::env::var(OTEL_EXPORTER_OTLP_ENDPOINT) {
+ if !endpoint.ends_with('/') {
+ endpoint.push('/');
+ }
+ endpoint.push_str("v1/traces");
+ exporter = exporter.with_endpoint(endpoint);
+ }
+ }
+
+ // Propagate trace information in the standard W3C TraceContext format.
+ opentelemetry::global::set_text_map_propagator(
+ opentelemetry::sdk::propagation::TraceContextPropagator::new(),
+ );
+
+ opentelemetry_otlp::new_pipeline()
+ .tracing()
+ .with_exporter(exporter)
+ .with_trace_config(
+ opentelemetry::sdk::trace::config().with_resource(Resource::new(vec![KeyValue::new(
+ opentelemetry_semantic_conventions::resource::SERVICE_NAME,
+ service_name,
+ )])),
+ )
+ .install_batch(opentelemetry::runtime::Tokio)
+ .expect("could not initialize opentelemetry exporter")
+}
+
+// Shutdown trace pipeline gracefully, so that it has a chance to send any
+// pending traces before we exit.
+pub fn shutdown_tracing() {
+ opentelemetry::global::shutdown_tracer_provider();
+}
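
Putting the pieces together for a binary that does not have a Tokio runtime of its own (this is essentially what compute_tools/src/logger.rs does above; shown here as a compact, self-contained sketch with an arbitrary service name):

use tracing_subscriber::prelude::*;
use tracing_utils::OpenTelemetryLayer;

fn main() {
    // Exporter configuration comes from the OTEL_EXPORTER_OTLP_* environment
    // variables; this returns None when OTEL_SDK_DISABLED=true.
    let otlp_layer =
        tracing_utils::init_tracing_without_runtime("my_service").map(OpenTelemetryLayer::new);

    tracing_subscriber::registry()
        .with(otlp_layer)
        .with(tracing_subscriber::fmt::layer().with_writer(std::io::stderr))
        .init();

    tracing::info!("doing some traced work");

    // Flush any pending spans before the process exits.
    tracing_utils::shutdown_tracing();
}
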
diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml
index 020e4d9dd7..1f6c96bdbe 100644
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -5,6 +5,7 @@ edition.workspace = true
license.workspace = true
[dependencies]
+atty.workspace = true
sentry.workspace = true
async-trait.workspace = true
anyhow.workspace = true
diff --git a/libs/utils/src/http/error.rs b/libs/utils/src/http/error.rs
index b0ecb746d9..1ba0422993 100644
--- a/libs/utils/src/http/error.rs
+++ b/libs/utils/src/http/error.rs
@@ -1,6 +1,7 @@
use hyper::{header, Body, Response, StatusCode};
use serde::{Deserialize, Serialize};
use thiserror::Error;
+use tracing::error;
#[derive(Debug, Error)]
pub enum ApiError {
@@ -76,8 +77,16 @@ impl HttpErrorBody {
}
pub async fn handler(err: routerify::RouteError) -> Response<Body> {
- tracing::error!("Error processing HTTP request: {:?}", err);
-    err.downcast::<ApiError>()
- .expect("handler should always return api error")
- .into_response()
+ let api_error = err
+        .downcast::<ApiError>()
+ .expect("handler should always return api error");
+
+ // Print a stack trace for Internal Server errors
+ if let ApiError::InternalServerError(_) = api_error.as_ref() {
+ error!("Error processing HTTP request: {api_error:?}");
+ } else {
+ error!("Error processing HTTP request: {api_error:#}");
+ }
+
+ api_error.into_response()
}
diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs
index 82c9267f4a..02684d3d16 100644
--- a/libs/utils/src/logging.rs
+++ b/libs/utils/src/logging.rs
@@ -34,7 +34,7 @@ pub fn init(log_format: LogFormat) -> anyhow::Result<()> {
let base_logger = tracing_subscriber::fmt()
.with_env_filter(env_filter)
.with_target(false)
- .with_ansi(false)
+ .with_ansi(atty::is(atty::Stream::Stdout))
.with_writer(std::io::stdout);
match log_format {
diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml
index cb9e4478bf..66c25e8576 100644
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -11,7 +11,6 @@ default = []
testing = ["fail/failpoints"]
[dependencies]
-amplify_num.workspace = true
anyhow.workspace = true
async-stream.workspace = true
async-trait.workspace = true
@@ -41,7 +40,6 @@ postgres-protocol.workspace = true
postgres-types.workspace = true
rand.workspace = true
regex.workspace = true
-rstar.workspace = true
scopeguard.workspace = true
serde.workspace = true
serde_json = { workspace = true, features = ["raw_value"] }
@@ -68,6 +66,7 @@ tenant_size_model.workspace = true
utils.workspace = true
workspace_hack.workspace = true
reqwest.workspace = true
+rpds.workspace = true
[dev-dependencies]
criterion.workspace = true
diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs
index 6a01fdfc6f..e18c00da96 100644
--- a/pageserver/benches/bench_layer_map.rs
+++ b/pageserver/benches/bench_layer_map.rs
@@ -1,13 +1,12 @@
-use anyhow::Result;
+use pageserver::keyspace::{KeyPartitioning, KeySpace};
use pageserver::repository::Key;
use pageserver::tenant::layer_map::LayerMap;
-use pageserver::tenant::storage_layer::{DeltaFileName, ImageFileName, ValueReconstructState};
-use pageserver::tenant::storage_layer::{Layer, ValueReconstructResult};
+use pageserver::tenant::storage_layer::Layer;
+use pageserver::tenant::storage_layer::{DeltaFileName, ImageFileName, LayerDescriptor};
use rand::prelude::{SeedableRng, SliceRandom, StdRng};
use std::cmp::{max, min};
use std::fs::File;
use std::io::{BufRead, BufReader};
-use std::ops::Range;
use std::path::PathBuf;
use std::str::FromStr;
use std::sync::Arc;
@@ -17,102 +16,35 @@ use utils::lsn::Lsn;
use criterion::{criterion_group, criterion_main, Criterion};
-struct DummyDelta {
-    key_range: Range<Key>,
-    lsn_range: Range<Lsn>,
-}
-
-impl Layer for DummyDelta {
-    fn get_key_range(&self) -> Range<Key> {
- self.key_range.clone()
- }
-
-    fn get_lsn_range(&self) -> Range<Lsn> {
- self.lsn_range.clone()
- }
- fn get_value_reconstruct_data(
- &self,
- _key: Key,
-        _lsn_range: Range<Lsn>,
-        _reconstruct_data: &mut ValueReconstructState,
-    ) -> Result<ValueReconstructResult> {
- panic!()
- }
-
- fn is_incremental(&self) -> bool {
- true
- }
-
- fn dump(&self, _verbose: bool) -> Result<()> {
- unimplemented!()
- }
-
- fn short_id(&self) -> String {
- unimplemented!()
- }
-}
-
-struct DummyImage {
-    key_range: Range<Key>,
- lsn: Lsn,
-}
-
-impl Layer for DummyImage {
-    fn get_key_range(&self) -> Range<Key> {
- self.key_range.clone()
- }
-
-    fn get_lsn_range(&self) -> Range<Lsn> {
- // End-bound is exclusive
- self.lsn..(self.lsn + 1)
- }
-
- fn get_value_reconstruct_data(
- &self,
- _key: Key,
-        _lsn_range: Range<Lsn>,
-        _reconstruct_data: &mut ValueReconstructState,
-    ) -> Result<ValueReconstructResult> {
- panic!()
- }
-
- fn is_incremental(&self) -> bool {
- false
- }
-
- fn dump(&self, _verbose: bool) -> Result<()> {
- unimplemented!()
- }
-
- fn short_id(&self) -> String {
- unimplemented!()
- }
-}
-
-fn build_layer_map(filename_dump: PathBuf) -> LayerMap<dyn Layer> {
-    let mut layer_map = LayerMap::<dyn Layer>::default();
+fn build_layer_map(filename_dump: PathBuf) -> LayerMap<LayerDescriptor> {
+    let mut layer_map = LayerMap::<LayerDescriptor>::default();
let mut min_lsn = Lsn(u64::MAX);
let mut max_lsn = Lsn(0);
let filenames = BufReader::new(File::open(filename_dump).unwrap()).lines();
+ let mut updates = layer_map.batch_update();
for fname in filenames {
let fname = &fname.unwrap();
if let Some(imgfilename) = ImageFileName::parse_str(fname) {
- let layer = DummyImage {
- key_range: imgfilename.key_range,
- lsn: imgfilename.lsn,
+ let layer = LayerDescriptor {
+ key: imgfilename.key_range,
+ lsn: imgfilename.lsn..(imgfilename.lsn + 1),
+ is_incremental: false,
+ short_id: fname.to_string(),
};
- layer_map.insert_historic(Arc::new(layer));
+ updates.insert_historic(Arc::new(layer));
min_lsn = min(min_lsn, imgfilename.lsn);
max_lsn = max(max_lsn, imgfilename.lsn);
} else if let Some(deltafilename) = DeltaFileName::parse_str(fname) {
- let layer = DummyDelta {
- key_range: deltafilename.key_range,
- lsn_range: deltafilename.lsn_range.clone(),
+ let layer = LayerDescriptor {
+ key: deltafilename.key_range.clone(),
+ lsn: deltafilename.lsn_range.clone(),
+ is_incremental: true,
+ short_id: fname.to_string(),
};
- layer_map.insert_historic(Arc::new(layer));
+ updates.insert_historic(Arc::new(layer));
min_lsn = min(min_lsn, deltafilename.lsn_range.start);
max_lsn = max(max_lsn, deltafilename.lsn_range.end);
} else {
@@ -122,11 +54,12 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap {
println!("min: {min_lsn}, max: {max_lsn}");
+ updates.flush();
layer_map
}
/// Construct a layer map query pattern for benchmarks
-fn uniform_query_pattern(layer_map: &LayerMap<dyn Layer>) -> Vec<(Key, Lsn)> {
+fn uniform_query_pattern(layer_map: &LayerMap<LayerDescriptor>) -> Vec<(Key, Lsn)> {
// For each image layer we query one of the pages contained, at LSN right
// before the image layer was created. This gives us a somewhat uniform
// coverage of both the lsn and key space because image layers have
@@ -150,6 +83,41 @@ fn uniform_query_pattern(layer_map: &LayerMap) -> Vec<(Key, Lsn)> {
.collect()
}
+// Construct a partitioning for testing get_difficulty map when we
+// don't have an exact result of `collect_keyspace` to work with.
+fn uniform_key_partitioning(layer_map: &LayerMap<LayerDescriptor>, _lsn: Lsn) -> KeyPartitioning {
+ let mut parts = Vec::new();
+
+ // We add a partition boundary at the start of each image layer,
+ // no matter what lsn range it covers. This is just the easiest
+ // thing to do. A better thing to do would be to get a real
+ // partitioning from some database. Even better, remove the need
+ // for key partitions by deciding where to create image layers
+ // directly based on a coverage-based difficulty map.
+ let mut keys: Vec<_> = layer_map
+ .iter_historic_layers()
+ .filter_map(|l| {
+ if l.is_incremental() {
+ None
+ } else {
+ let kr = l.get_key_range();
+ Some(kr.start.next())
+ }
+ })
+ .collect();
+ keys.sort();
+
+ let mut current_key = Key::from_hex("000000000000000000000000000000000000").unwrap();
+ for key in keys {
+ parts.push(KeySpace {
+ ranges: vec![current_key..key],
+ });
+ current_key = key;
+ }
+
+ KeyPartitioning { parts }
+}
+
// Benchmark using metadata extracted from our performance test environment, from
// a project where we have run pgbench many times. The pgbench database was initialized
// between each test run.
@@ -183,24 +151,68 @@ fn bench_from_captest_env(c: &mut Criterion) {
// Benchmark using metadata extracted from a real project that was taking
// too long processing layer map queries.
fn bench_from_real_project(c: &mut Criterion) {
- // TODO consider compressing this file
+ // Init layer map
+ let now = Instant::now();
let layer_map = build_layer_map(PathBuf::from("benches/odd-brook-layernames.txt"));
+ println!("Finished layer map init in {:?}", now.elapsed());
+
+ // Choose uniformly distributed queries
let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map);
- // Test with uniform query pattern
- c.bench_function("real_map_uniform_queries", |b| {
+ // Choose inputs for get_difficulty_map
+ let latest_lsn = layer_map
+ .iter_historic_layers()
+ .map(|l| l.get_lsn_range().end)
+ .max()
+ .unwrap();
+ let partitioning = uniform_key_partitioning(&layer_map, latest_lsn);
+
+ // Check correctness of get_difficulty_map
+ // TODO put this in a dedicated test outside of this mod
+ {
+ println!("running correctness check");
+
+ let now = Instant::now();
+ let result_bruteforce = layer_map.get_difficulty_map_bruteforce(latest_lsn, &partitioning);
+ assert!(result_bruteforce.len() == partitioning.parts.len());
+ println!("Finished bruteforce in {:?}", now.elapsed());
+
+ let now = Instant::now();
+ let result_fast = layer_map.get_difficulty_map(latest_lsn, &partitioning, None);
+ assert!(result_fast.len() == partitioning.parts.len());
+ println!("Finished fast in {:?}", now.elapsed());
+
+ // Assert results are equal. Manually iterate for easier debugging.
+ let zip = std::iter::zip(
+ &partitioning.parts,
+ std::iter::zip(result_bruteforce, result_fast),
+ );
+ for (_part, (bruteforce, fast)) in zip {
+ assert_eq!(bruteforce, fast);
+ }
+
+ println!("No issues found");
+ }
+
+ // Define and name the benchmark function
+ let mut group = c.benchmark_group("real_map");
+ group.bench_function("uniform_queries", |b| {
b.iter(|| {
for q in queries.clone().into_iter() {
layer_map.search(q.0, q.1);
}
});
});
+ group.bench_function("get_difficulty_map", |b| {
+ b.iter(|| {
+ layer_map.get_difficulty_map(latest_lsn, &partitioning, Some(3));
+ });
+ });
+ group.finish();
}
// Benchmark using synthetic data. Arrange image layers on stacked diagonal lines.
fn bench_sequential(c: &mut Criterion) {
- let mut layer_map: LayerMap<dyn Layer> = LayerMap::default();
-
// Init layer map. Create 100_000 layers arranged in 1000 diagonal lines.
//
// TODO This code is pretty slow and runs even if we're only running other
@@ -208,39 +220,39 @@ fn bench_sequential(c: &mut Criterion) {
// Putting it inside the `bench_function` closure is not a solution
// because then it runs multiple times during warmup.
let now = Instant::now();
+ let mut layer_map = LayerMap::default();
+ let mut updates = layer_map.batch_update();
for i in 0..100_000 {
- // TODO try inserting a super-wide layer in between every 10 to reflect
- // what often happens with L1 layers that include non-rel changes.
- // Maybe do that as a separate test.
let i32 = (i as u32) % 100;
let zero = Key::from_hex("000000000000000000000000000000000000").unwrap();
- let layer = DummyImage {
- key_range: zero.add(10 * i32)..zero.add(10 * i32 + 1),
- lsn: Lsn(10 * i),
+ let layer = LayerDescriptor {
+ key: zero.add(10 * i32)..zero.add(10 * i32 + 1),
+ lsn: Lsn(i)..Lsn(i + 1),
+ is_incremental: false,
+ short_id: format!("Layer {}", i),
};
- layer_map.insert_historic(Arc::new(layer));
+ updates.insert_historic(Arc::new(layer));
}
-
- // Manually measure runtime without criterion because criterion
- // has a minimum sample size of 10 and I don't want to run it 10 times.
- println!("Finished init in {:?}", now.elapsed());
+ updates.flush();
+ println!("Finished layer map init in {:?}", now.elapsed());
// Choose 100 uniformly random queries
let rng = &mut StdRng::seed_from_u64(1);
let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map)
- .choose_multiple(rng, 1)
+ .choose_multiple(rng, 100)
.copied()
.collect();
// Define and name the benchmark function
- c.bench_function("sequential_uniform_queries", |b| {
- // Run the search queries
+ let mut group = c.benchmark_group("sequential");
+ group.bench_function("uniform_queries", |b| {
b.iter(|| {
for q in queries.clone().into_iter() {
layer_map.search(q.0, q.1);
}
});
});
+ group.finish();
}
criterion_group!(group_1, bench_from_captest_env);
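
The hunks above replace the hand-rolled `DummyImage`/`DummyDelta` layer types with plain `LayerDescriptor` values and route all insertions through a batched update. A minimal sketch of that pattern, assuming only the names visible in this diff (`LayerDescriptor`, `batch_update`, `insert_historic`, `flush`) and the field layout shown above:

```rust
use std::sync::Arc;

use pageserver::repository::Key;
use pageserver::tenant::layer_map::LayerMap;
use pageserver::tenant::storage_layer::LayerDescriptor;
use utils::lsn::Lsn;

fn tiny_layer_map() -> LayerMap<LayerDescriptor> {
    let mut layer_map = LayerMap::default();
    let mut updates = layer_map.batch_update();

    let zero = Key::from_hex("000000000000000000000000000000000000").unwrap();
    // One image layer covering a single key; the LSN end bound is exclusive.
    updates.insert_historic(Arc::new(LayerDescriptor {
        key: zero..zero.add(1),
        lsn: Lsn(10)..Lsn(11),
        is_incremental: false,
        short_id: "example image layer".to_string(),
    }));

    // Inserts become visible to readers only after the batch is flushed.
    updates.flush();
    layer_map
}
```

Queries then go through `layer_map.search(key, lsn)` exactly as in the benchmark loops above.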
diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs
index f1d92ac36b..06d4853274 100644
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -27,6 +27,7 @@ use tracing::*;
///
use tokio_tar::{Builder, EntryType, Header};
+use crate::context::RequestContext;
use crate::tenant::Timeline;
use pageserver_api::reltag::{RelTag, SlruKind};
@@ -52,6 +53,7 @@ pub async fn send_basebackup_tarball<'a, W>(
req_lsn: Option<Lsn>,
prev_lsn: Option<Lsn>,
full_backup: bool,
+ ctx: &'a RequestContext,
) -> anyhow::Result<()>
where
W: AsyncWrite + Send + Sync + Unpin,
@@ -110,6 +112,7 @@ where
lsn: backup_lsn,
prev_record_lsn: prev_lsn,
full_backup,
+ ctx,
};
basebackup
.send_tarball()
@@ -129,6 +132,7 @@ where
lsn: Lsn,
prev_record_lsn: Lsn,
full_backup: bool,
+ ctx: &'a RequestContext,
}
impl<'a, W> Basebackup<'a, W>
@@ -171,23 +175,37 @@ where
SlruKind::MultiXactOffsets,
SlruKind::MultiXactMembers,
] {
- for segno in self.timeline.list_slru_segments(kind, self.lsn).await? {
+ for segno in self
+ .timeline
+ .list_slru_segments(kind, self.lsn, self.ctx)
+ .await?
+ {
self.add_slru_segment(kind, segno).await?;
}
}
// Create tablespace directories
- for ((spcnode, dbnode), has_relmap_file) in self.timeline.list_dbdirs(self.lsn).await? {
+ for ((spcnode, dbnode), has_relmap_file) in
+ self.timeline.list_dbdirs(self.lsn, self.ctx).await?
+ {
self.add_dbdir(spcnode, dbnode, has_relmap_file).await?;
// Gather and send relational files in each database if full backup is requested.
if self.full_backup {
- for rel in self.timeline.list_rels(spcnode, dbnode, self.lsn).await? {
+ for rel in self
+ .timeline
+ .list_rels(spcnode, dbnode, self.lsn, self.ctx)
+ .await?
+ {
self.add_rel(rel).await?;
}
}
}
- for xid in self.timeline.list_twophase_files(self.lsn).await? {
+ for xid in self
+ .timeline
+ .list_twophase_files(self.lsn, self.ctx)
+ .await?
+ {
self.add_twophase_file(xid).await?;
}
@@ -203,7 +221,10 @@ where
}
async fn add_rel(&mut self, tag: RelTag) -> anyhow::Result<()> {
- let nblocks = self.timeline.get_rel_size(tag, self.lsn, false).await?;
+ let nblocks = self
+ .timeline
+ .get_rel_size(tag, self.lsn, false, self.ctx)
+ .await?;
// If the relation is empty, create an empty file
if nblocks == 0 {
@@ -223,7 +244,7 @@ where
for blknum in startblk..endblk {
let img = self
.timeline
- .get_rel_page_at_lsn(tag, blknum, self.lsn, false)
+ .get_rel_page_at_lsn(tag, blknum, self.lsn, false, self.ctx)
.await?;
segment_data.extend_from_slice(&img[..]);
}
@@ -245,14 +266,14 @@ where
async fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> {
let nblocks = self
.timeline
- .get_slru_segment_size(slru, segno, self.lsn)
+ .get_slru_segment_size(slru, segno, self.lsn, self.ctx)
.await?;
let mut slru_buf: Vec<u8> = Vec::with_capacity(nblocks as usize * BLCKSZ as usize);
for blknum in 0..nblocks {
let img = self
.timeline
- .get_slru_page_at_lsn(slru, segno, blknum, self.lsn)
+ .get_slru_page_at_lsn(slru, segno, blknum, self.lsn, self.ctx)
.await?;
if slru == SlruKind::Clog {
@@ -287,7 +308,7 @@ where
let relmap_img = if has_relmap_file {
let img = self
.timeline
- .get_relmap_file(spcnode, dbnode, self.lsn)
+ .get_relmap_file(spcnode, dbnode, self.lsn, self.ctx)
.await?;
ensure!(img.len() == 512);
Some(img)
@@ -323,7 +344,7 @@ where
if !has_relmap_file
&& self
.timeline
- .list_rels(spcnode, dbnode, self.lsn)
+ .list_rels(spcnode, dbnode, self.lsn, self.ctx)
.await?
.is_empty()
{
@@ -356,7 +377,10 @@ where
// Extract twophase state files
//
async fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> {
- let img = self.timeline.get_twophase_file(xid, self.lsn).await?;
+ let img = self
+ .timeline
+ .get_twophase_file(xid, self.lsn, self.ctx)
+ .await?;
let mut buf = BytesMut::new();
buf.extend_from_slice(&img[..]);
@@ -394,12 +418,12 @@ where
let checkpoint_bytes = self
.timeline
- .get_checkpoint(self.lsn)
+ .get_checkpoint(self.lsn, self.ctx)
.await
.context("failed to get checkpoint bytes")?;
let pg_control_bytes = self
.timeline
- .get_control_file(self.lsn)
+ .get_control_file(self.lsn, self.ctx)
.await
.context("failed get control bytes")?;
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index 5de6e4def5..f2cd93bd3a 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -13,6 +13,7 @@ use tracing::*;
use metrics::set_build_info_metric;
use pageserver::{
config::{defaults::*, PageServerConf},
+ context::{DownloadBehavior, RequestContext},
http, page_cache, page_service, task_mgr,
task_mgr::TaskKind,
task_mgr::{
@@ -26,7 +27,7 @@ use utils::{
logging,
postgres_backend::AuthType,
project_git_version,
- sentry_init::{init_sentry, release_name},
+ sentry_init::init_sentry,
signals::{self, Signal},
tcp_listener,
};
@@ -85,7 +86,10 @@ fn main() -> anyhow::Result<()> {
};
// initialize sentry if SENTRY_DSN is provided
- let _sentry_guard = init_sentry(release_name!(), &[("node_id", &conf.id.to_string())]);
+ let _sentry_guard = init_sentry(
+ Some(GIT_VERSION.into()),
+ &[("node_id", &conf.id.to_string())],
+ );
let tenants_path = conf.tenants_path();
if !tenants_path.exists() {
@@ -246,7 +250,7 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
let signals = signals::install_shutdown_handlers()?;
// Launch broker client
- WALRECEIVER_RUNTIME.block_on(pageserver::walreceiver::init_broker_client(conf))?;
+ WALRECEIVER_RUNTIME.block_on(pageserver::broker_client::init_broker_client(conf))?;
// Initialize authentication for incoming connections
let auth = match &conf.auth_type {
@@ -325,6 +329,13 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
);
if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint {
+ let metrics_ctx = RequestContext::todo_child(
+ TaskKind::MetricsCollection,
+ // This task itself shouldn't download anything.
+ // The actual size calculation does need downloads, and
+ // creates a child context with the right DownloadBehavior.
+ DownloadBehavior::Error,
+ );
task_mgr::spawn(
MGMT_REQUEST_RUNTIME.handle(),
TaskKind::MetricsCollection,
@@ -338,6 +349,7 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
conf.metric_collection_interval,
conf.synthetic_size_calculation_interval,
conf.id,
+ metrics_ctx,
)
.instrument(info_span!("metrics_collection"))
.await?;
@@ -349,17 +361,34 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
// Spawn a task to listen for libpq connections. It will spawn further tasks
// for each connection. We created the listener earlier already.
- task_mgr::spawn(
- COMPUTE_REQUEST_RUNTIME.handle(),
- TaskKind::LibpqEndpointListener,
- None,
- None,
- "libpq endpoint listener",
- true,
- async move {
- page_service::libpq_listener_main(conf, auth, pageserver_listener, conf.auth_type).await
- },
- );
+ {
+ let libpq_ctx = RequestContext::todo_child(
+ TaskKind::LibpqEndpointListener,
+ // listener task shouldn't need to download anything. (We will
+ // create a separate sub-context for each connection, with its
+ // own download behavior. This context is used only to listen and
+ // accept connections.)
+ DownloadBehavior::Error,
+ );
+ task_mgr::spawn(
+ COMPUTE_REQUEST_RUNTIME.handle(),
+ TaskKind::LibpqEndpointListener,
+ None,
+ None,
+ "libpq endpoint listener",
+ true,
+ async move {
+ page_service::libpq_listener_main(
+ conf,
+ auth,
+ pageserver_listener,
+ conf.auth_type,
+ libpq_ctx,
+ )
+ .await
+ },
+ );
+ }
// All started up! Now just sit and wait for shutdown signal.
signals.handle(|signal| match signal {
diff --git a/pageserver/src/broker_client.rs b/pageserver/src/broker_client.rs
new file mode 100644
index 0000000000..6c92967ca3
--- /dev/null
+++ b/pageserver/src/broker_client.rs
@@ -0,0 +1,48 @@
+//! The broker client instance of the pageserver, created during pageserver startup.
+//! Used by each timeline's [`walreceiver`].
+
+use crate::config::PageServerConf;
+
+use anyhow::Context;
+use once_cell::sync::OnceCell;
+use storage_broker::BrokerClientChannel;
+use tracing::*;
+
+static BROKER_CLIENT: OnceCell<BrokerClientChannel> = OnceCell::new();
+
+///
+/// Initialize the broker client. This must be called once at page server startup.
+///
+pub async fn init_broker_client(conf: &'static PageServerConf) -> anyhow::Result<()> {
+ let broker_endpoint = conf.broker_endpoint.clone();
+
+ // Note: we do not attempt to connect here (but we do validate the endpoint's sanity).
+ let broker_client =
+ storage_broker::connect(broker_endpoint.clone(), conf.broker_keepalive_interval).context(
+ format!(
+ "Failed to create broker client to {}",
+ &conf.broker_endpoint
+ ),
+ )?;
+
+ if BROKER_CLIENT.set(broker_client).is_err() {
+ panic!("broker already initialized");
+ }
+
+ info!(
+ "Initialized broker client with endpoints: {}",
+ broker_endpoint
+ );
+ Ok(())
+}
+
+///
+/// Get a handle to the broker client
+///
+pub fn get_broker_client() -> &'static BrokerClientChannel {
+ BROKER_CLIENT.get().expect("broker client not initialized")
+}
+
+pub fn is_broker_client_initialized() -> bool {
+ BROKER_CLIENT.get().is_some()
+}
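
As a quick orientation for the new module, here is a hedged sketch of the intended call pattern (one-time initialization at startup, later lookups from walreceiver code); the `startup` function is illustrative only:

```rust
use pageserver::broker_client::{
    get_broker_client, init_broker_client, is_broker_client_initialized,
};
use pageserver::config::PageServerConf;

async fn startup(conf: &'static PageServerConf) -> anyhow::Result<()> {
    // Must be called exactly once; a second call would hit the `panic!` above.
    init_broker_client(conf).await?;
    assert!(is_broker_client_initialized());

    // Later callers (e.g. a timeline's walreceiver) just grab the shared handle.
    let _client = get_broker_client();
    Ok(())
}
```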
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 51d1664e52..a3b051279d 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -158,6 +158,8 @@ pub struct PageServerConf {
pub synthetic_size_calculation_interval: Duration,
pub test_remote_failures: u64,
+
+ pub ondemand_download_behavior_treat_error_as_warn: bool,
}
/// We do not want to store this in a PageServerConf because the latter may be logged
@@ -222,6 +224,8 @@ struct PageServerConfigBuilder {
synthetic_size_calculation_interval: BuilderValue<Duration>,
test_remote_failures: BuilderValue<u64>,
+
+ ondemand_download_behavior_treat_error_as_warn: BuilderValue<bool>,
}
impl Default for PageServerConfigBuilder {
@@ -267,6 +271,8 @@ impl Default for PageServerConfigBuilder {
metric_collection_endpoint: Set(DEFAULT_METRIC_COLLECTION_ENDPOINT),
test_remote_failures: Set(0),
+
+ ondemand_download_behavior_treat_error_as_warn: Set(false),
}
}
}
@@ -363,6 +369,14 @@ impl PageServerConfigBuilder {
self.test_remote_failures = BuilderValue::Set(fail_first);
}
+ pub fn ondemand_download_behavior_treat_error_as_warn(
+ &mut self,
+ ondemand_download_behavior_treat_error_as_warn: bool,
+ ) {
+ self.ondemand_download_behavior_treat_error_as_warn =
+ BuilderValue::Set(ondemand_download_behavior_treat_error_as_warn);
+ }
+
pub fn build(self) -> anyhow::Result<PageServerConf> {
Ok(PageServerConf {
listen_pg_addr: self
@@ -422,6 +436,11 @@ impl PageServerConfigBuilder {
test_remote_failures: self
.test_remote_failures
.ok_or(anyhow!("missing test_remote_failuers"))?,
+ ondemand_download_behavior_treat_error_as_warn: self
+ .ondemand_download_behavior_treat_error_as_warn
+ .ok_or(anyhow!(
+ "missing ondemand_download_behavior_treat_error_as_warn"
+ ))?,
})
}
}
@@ -600,6 +619,7 @@ impl PageServerConf {
"synthetic_size_calculation_interval" =>
builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?),
"test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?),
+ "ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?),
_ => bail!("unrecognized pageserver option '{key}'"),
}
}
@@ -724,6 +744,7 @@ impl PageServerConf {
metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
synthetic_size_calculation_interval: Duration::from_secs(60),
test_remote_failures: 0,
+ ondemand_download_behavior_treat_error_as_warn: false,
}
}
}
@@ -749,6 +770,11 @@ fn parse_toml_u64(name: &str, item: &Item) -> Result {
Ok(i as u64)
}
+fn parse_toml_bool(name: &str, item: &Item) -> Result<bool> {
+ item.as_bool()
+ .with_context(|| format!("configure option {name} is not a bool"))
+}
+
fn parse_toml_duration(name: &str, item: &Item) -> Result<Duration> {
let s = item
.as_str()
@@ -907,6 +933,7 @@ log_format = 'json'
defaults::DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL
)?,
test_remote_failures: 0,
+ ondemand_download_behavior_treat_error_as_warn: false,
},
"Correct defaults should be used when no config values are provided"
);
@@ -954,6 +981,7 @@ log_format = 'json'
metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?),
synthetic_size_calculation_interval: Duration::from_secs(333),
test_remote_failures: 0,
+ ondemand_download_behavior_treat_error_as_warn: false,
},
"Should be able to parse all basic config values correctly"
);
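
To make the new knob concrete: it is a plain boolean in `pageserver.toml`, parsed by the `parse_toml_bool` helper added above. A self-contained sketch, assuming the `Item` in the hunks above is `toml_edit::Item` (which matches the `as_bool` call); the helper is inlined here for illustration:

```rust
use anyhow::Context;
use toml_edit::{Document, Item};

fn parse_toml_bool(name: &str, item: &Item) -> anyhow::Result<bool> {
    item.as_bool()
        .with_context(|| format!("configure option {name} is not a bool"))
}

fn main() -> anyhow::Result<()> {
    // The new option as it would appear in pageserver.toml.
    let doc: Document = "ondemand_download_behavior_treat_error_as_warn = true".parse()?;
    // Mirror the key/item loop of the config parser above.
    for (key, item) in doc.iter() {
        if key == "ondemand_download_behavior_treat_error_as_warn" {
            assert!(parse_toml_bool(key, item)?);
        }
    }
    Ok(())
}
```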
diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs
index c07026261d..d848ec5ee5 100644
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -3,6 +3,7 @@
//! and push them to an HTTP endpoint.
//! Cache metrics to send only the updated ones.
//!
+use crate::context::{DownloadBehavior, RequestContext};
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
use crate::tenant::mgr;
use anyhow;
@@ -47,12 +48,15 @@ pub async fn collect_metrics(
metric_collection_interval: Duration,
synthetic_size_calculation_interval: Duration,
node_id: NodeId,
+ ctx: RequestContext,
) -> anyhow::Result<()> {
let mut ticker = tokio::time::interval(metric_collection_interval);
info!("starting collect_metrics");
// spin up background worker that calculates tenant sizes
+ let worker_ctx =
+ ctx.detached_child(TaskKind::CalculateSyntheticSize, DownloadBehavior::Download);
task_mgr::spawn(
BACKGROUND_RUNTIME.handle(),
TaskKind::CalculateSyntheticSize,
@@ -61,7 +65,7 @@ pub async fn collect_metrics(
"synthetic size calculation",
false,
async move {
- calculate_synthetic_size_worker(synthetic_size_calculation_interval)
+ calculate_synthetic_size_worker(synthetic_size_calculation_interval, &worker_ctx)
.instrument(info_span!("synthetic_size_worker"))
.await?;
Ok(())
@@ -79,7 +83,7 @@ pub async fn collect_metrics(
return Ok(());
},
_ = ticker.tick() => {
- if let Err(err) = collect_metrics_iteration(&client, &mut cached_metrics, metric_collection_endpoint, node_id).await
+ if let Err(err) = collect_metrics_iteration(&client, &mut cached_metrics, metric_collection_endpoint, node_id, &ctx).await
{
error!("metrics collection failed: {err:?}");
}
@@ -102,6 +106,7 @@ pub async fn collect_metrics_iteration(
cached_metrics: &mut HashMap<PageserverConsumptionMetricsKey, u64>,
metric_collection_endpoint: &reqwest::Url,
node_id: NodeId,
+ ctx: &RequestContext,
) -> anyhow::Result<()> {
let mut current_metrics: Vec<(PageserverConsumptionMetricsKey, u64)> = Vec::new();
trace!(
@@ -110,7 +115,7 @@ pub async fn collect_metrics_iteration(
);
// get list of tenants
- let tenants = mgr::list_tenants().await;
+ let tenants = mgr::list_tenants().await?;
// iterate through list of Active tenants and collect metrics
for (tenant_id, tenant_state) in tenants {
@@ -137,7 +142,7 @@ pub async fn collect_metrics_iteration(
timeline_written_size,
));
- let (timeline_logical_size, is_exact) = timeline.get_current_logical_size()?;
+ let (timeline_logical_size, is_exact) = timeline.get_current_logical_size(ctx)?;
// Only send timeline logical size when it is fully calculated.
if is_exact {
current_metrics.push((
@@ -258,6 +263,7 @@ pub async fn collect_metrics_iteration(
/// Calculate synthetic size for each active tenant
pub async fn calculate_synthetic_size_worker(
synthetic_size_calculation_interval: Duration,
+ ctx: &RequestContext,
) -> anyhow::Result<()> {
info!("starting calculate_synthetic_size_worker");
@@ -270,7 +276,13 @@ pub async fn calculate_synthetic_size_worker(
},
_ = ticker.tick() => {
- let tenants = mgr::list_tenants().await;
+ let tenants = match mgr::list_tenants().await {
+ Ok(tenants) => tenants,
+ Err(e) => {
+ warn!("cannot get tenant list: {e:#}");
+ continue;
+ }
+ };
// iterate through list of Active tenants and collect metrics
for (tenant_id, tenant_state) in tenants {
@@ -280,7 +292,7 @@ pub async fn calculate_synthetic_size_worker(
if let Ok(tenant) = mgr::get_tenant(tenant_id, true).await
{
- if let Err(e) = tenant.calculate_synthetic_size().await {
+ if let Err(e) = tenant.calculate_synthetic_size(ctx).await {
error!("failed to calculate synthetic size for tenant {}: {}", tenant_id, e);
}
}
diff --git a/pageserver/src/context.rs b/pageserver/src/context.rs
new file mode 100644
index 0000000000..e826d28e6d
--- /dev/null
+++ b/pageserver/src/context.rs
@@ -0,0 +1,199 @@
+//! This module defines `RequestContext`, a structure that we use throughout
+//! the pageserver to propagate high-level context from places
+//! that _originate_ activity down to the shared code paths at the
+//! heart of the pageserver. It's inspired by Golang's `context.Context`.
+//!
+//! For example, in `Timeline::get(page_nr, lsn)` we need to answer the following questions:
+//! 1. What high-level activity ([`TaskKind`]) needs this page?
+//! We need that information as a categorical dimension for page access
+//! statistics, which we, in turn, need to guide layer eviction policy design.
+//! 2. How should we behave if, to produce the page image, we need to
+//! on-demand download a layer file ([`DownloadBehavior`]).
+//!
+//! [`RequestContext`] satisfies those needs.
+//! The current implementation is a small `struct` that is passed through
+//! the call chain by reference.
+//!
+//! ### Future Work
+//!
+//! However, we do not intend to stop here, since there are other needs that
+//! require carrying information from high to low levels of the app.
+//!
+//! Most importantly, **cancellation signaling** in response to
+//! 1. timeouts (page_service max response time) and
+//! 2. lifecycle requests (detach tenant, delete timeline).
+//!
+//! Related to that, there is sometimes a need to ensure that all tokio tasks spawned
+//! by the transitive callees of a request have finished. The keyword here
+//! is **Structured Concurrency**, and right now, we use `task_mgr` in most places,
+//! `TaskHandle` in some places, and careful code review around `FuturesUnordered`
+//! or `JoinSet` in other places.
+//!
+//! We do not yet have a systematic cancellation story in pageserver, and it is
+//! pretty clear that [`RequestContext`] will be responsible for that.
+//! So, the API already prepares for this role through the
+//! [`RequestContext::detached_child`] and [`RequestContext::attached_child`] methods.
+//! See their doc comments for details on how we will use them in the future.
+//!
+//! It is not clear whether or how we will enforce Structured Concurrency, and
+//! what role [`RequestContext`] will play there.
+//! So, the API doesn't prepare us for this topic.
+//!
+//! Other future uses of `RequestContext`:
+//! - Communicate compute & IO priorities (user-initiated request vs. background-loop)
+//! - Request IDs for distributed tracing
+//! - Request/Timeline/Tenant-scoped log levels
+//!
+//! RequestContext might look quite different once it supports those features.
+//! Likely, it will have a shape similar to Golang's `context.Context`.
+//!
+//! ### Why A Struct Instead Of Method Parameters
+//!
+//! What's typical about such information is that it needs to be passed down
+//! along the call chain from high level to low level, but few of the functions
+//! in the middle need to understand it.
+//! Further, it is to be expected that we will need to propagate more data
+//! in the future (see the earlier section on future work).
+//! Hence, for functions in the middle of the call chain, we have the following
+//! requirements:
+//! 1. It should be easy to forward the context to callees.
+//! 2. To propagate more data from high-level to low-level code, the functions in
+//! the middle should not need to be modified.
+//! The solution is to have a container structure ([`RequestContext`]) that
+//! carries the information. Functions that don't care about what's in it
+//! pass it along to callees.
+//!
+//! ### Why Not Task-Local Variables
+//!
+//! One could use task-local variables (the equivalent of thread-local variables)
+//! to address the immediate needs outlined above.
+//! However, we reject task-local variables because:
+//! 1. they are implicit, thereby making it harder to trace the data flow in code
+//! reviews and during debugging,
+//! 2. they can be mutable, which enables implicit return data flow,
+//! 3. they are restrictive in that code which fans out into multiple tasks,
+//! or even threads, needs to carefully propagate the state.
+//!
+//! In contrast, information flow with [`RequestContext`] is
+//! 1. always explicit,
+//! 2. strictly uni-directional because RequestContext is immutable,
+//! 3. tangible because a [`RequestContext`] is just a value.
+//! When creating child activities, regardless of whether it's a task,
+//! thread, or even an RPC to another service, the value can
+//! be used like any other argument.
+//!
+//! The solution is that all code paths are infected with precisely one
+//! [`RequestContext`] argument. Functions in the middle of the call chain
+//! only need to pass it on.
+use crate::task_mgr::TaskKind;
+
+// The main structure of this module, see module-level comment.
+pub struct RequestContext {
+ task_kind: TaskKind,
+ download_behavior: DownloadBehavior,
+}
+
+/// Desired behavior if the operation requires an on-demand download
+/// to proceed.
+#[derive(Clone, Copy, PartialEq, Eq)]
+pub enum DownloadBehavior {
+ /// Download the layer file. It can take a while.
+ Download,
+
+ /// Download the layer file, but print a warning to the log. This should be used
+ /// in code where the layer file is expected to already exist locally.
+ Warn,
+
+ /// Return a PageReconstructError::NeedsDownload error
+ Error,
+}
+
+impl RequestContext {
+ /// Create a new RequestContext that has no parent.
+ ///
+ /// The function is called `new` because, once we add children
+ /// to it using `detached_child` or `attached_child`, the context
+ /// form a tree (not implemented yet since cancellation will be
+ /// the first feature that requires a tree).
+ ///
+ /// # Future: Cancellation
+ ///
+ /// The only reason why a context like this one can be canceled is
+ /// because someone explicitly canceled it.
+ /// It has no parent, so it cannot inherit cancellation from there.
+ pub fn new(task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self {
+ RequestContext {
+ task_kind,
+ download_behavior,
+ }
+ }
+
+ /// Create a detached child context for a task that may outlive `self`.
+ ///
+ /// Use this when spawning new background activity that should complete
+ /// even if the current request is canceled.
+ ///
+ /// # Future: Cancellation
+ ///
+ /// Cancellation of `self` will not propagate to the child context returned
+ /// by this method.
+ ///
+ /// # Future: Structured Concurrency
+ ///
+ /// We could add the Future as a parameter to this function, spawn it as a task,
+ /// and pass the child context to the new task as an argument.
+ /// That would be an ergonomic improvement.
+ ///
+ /// We could make new calls to this function fail if `self` is already canceled.
+ pub fn detached_child(&self, task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self {
+ self.child_impl(task_kind, download_behavior)
+ }
+
+ /// Create a child of context `self` for a task that shall not outlive `self`.
+ ///
+ /// Use this when fanning-out work to other async tasks.
+ ///
+ /// # Future: Cancellation
+ ///
+ /// Cancelling a context will propagate to its attached children.
+ ///
+ /// # Future: Structured Concurrency
+ ///
+ /// We could add the Future as a parameter to this function, spawn it as a task,
+ /// and track its `JoinHandle` inside the `RequestContext`.
+ ///
+ /// We could then provide another method to allow waiting for all child tasks
+ /// to finish.
+ ///
+ /// We could make new calls to this function fail if `self` is already canceled.
+ /// Alternatively, we could allow the creation but not spawn the task.
+ /// The method to wait for child tasks would return an error, indicating
+ /// that the child task was not started because the context was canceled.
+ pub fn attached_child(&self) -> Self {
+ self.child_impl(self.task_kind(), self.download_behavior())
+ }
+
+ /// Use this function when you should be creating a child context using
+ /// [`attached_child`] or [`detached_child`], but your caller doesn't provide
+ /// a context and you are unwilling to change all callers to provide one.
+ ///
+ /// Before we add cancellation, we should get rid of this method.
+ pub fn todo_child(task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self {
+ Self::new(task_kind, download_behavior)
+ }
+
+ fn child_impl(&self, task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self {
+ RequestContext {
+ task_kind,
+ download_behavior,
+ }
+ }
+
+ pub fn task_kind(&self) -> TaskKind {
+ self.task_kind
+ }
+
+ pub fn download_behavior(&self) -> DownloadBehavior {
+ self.download_behavior
+ }
+}
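
To see how the new type is meant to be threaded through the code, a minimal sketch; the function names below are made up for illustration, and only the `RequestContext` API from this file plus `TaskKind` variants already used elsewhere in this diff are assumed:

```rust
use pageserver::context::{DownloadBehavior, RequestContext};
use pageserver::task_mgr::TaskKind;

// A leaf function deep in the call chain: it only reads or forwards the context.
fn do_some_work(ctx: &RequestContext) {
    // Low-level code can inspect the policy when it matters, e.g. before an
    // on-demand download.
    let _behavior = ctx.download_behavior();
    let _kind = ctx.task_kind();
}

fn handle_request() {
    // The place that originates the activity creates the root context.
    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);

    // Everything below just takes `&RequestContext` and passes it on.
    do_some_work(&ctx);

    // Fan-out that must not outlive the request keeps the same kind and behavior.
    let _attached = ctx.attached_child();

    // Background work that may outlive the request gets a detached child with its
    // own task kind and download policy.
    let _detached =
        ctx.detached_child(TaskKind::CalculateSyntheticSize, DownloadBehavior::Download);
}
```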
diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml
index f9b8a81dad..23faff7ace 100644
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -430,6 +430,13 @@ paths:
schema:
type: string
format: hex
+ - name: inputs_only
+ in: query
+ required: false
+ schema:
+ type: boolean
+ description: |
+ When true, skip calculation and only provide the model inputs (for debugging). Defaults to false.
get:
description: |
Calculate tenant's size, which is a mixture of WAL (bytes) and logical_size (bytes).
@@ -449,8 +456,9 @@ paths:
format: hex
size:
type: integer
+ nullable: true
description: |
- Size metric in bytes.
+ Size metric in bytes or null if inputs_only=true was given.
"401":
description: Unauthorized Error
content:
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 1eb24c1507..a7802f3cbe 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -12,8 +12,11 @@ use super::models::{
StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo,
TimelineCreateRequest, TimelineInfo,
};
+use crate::context::{DownloadBehavior, RequestContext};
use crate::pgdatadir_mapping::LsnForTimestamp;
+use crate::task_mgr::TaskKind;
use crate::tenant::config::TenantConfOpt;
+use crate::tenant::mgr::TenantMapInsertError;
use crate::tenant::{PageReconstructError, Timeline};
use crate::{config::PageServerConf, tenant::mgr};
use utils::{
@@ -81,18 +84,39 @@ fn check_permission(request: &Request, tenant_id: Option) -> Res
fn apierror_from_prerror(err: PageReconstructError) -> ApiError {
match err {
PageReconstructError::Other(err) => ApiError::InternalServerError(err),
+ PageReconstructError::NeedsDownload(_, _) => {
+ // This shouldn't happen, because we use a RequestContext that requests to
+ // download any missing layer files on-demand.
+ ApiError::InternalServerError(anyhow::anyhow!("need to download remote layer file"))
+ }
+ PageReconstructError::Cancelled => {
+ ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
+ }
PageReconstructError::WalRedo(err) => {
ApiError::InternalServerError(anyhow::Error::new(err))
}
}
}
+fn apierror_from_tenant_map_insert_error(e: TenantMapInsertError) -> ApiError {
+ match e {
+ TenantMapInsertError::StillInitializing | TenantMapInsertError::ShuttingDown => {
+ ApiError::InternalServerError(anyhow::Error::new(e))
+ }
+ TenantMapInsertError::TenantAlreadyExists(id, state) => {
+ ApiError::Conflict(format!("tenant {id} already exists, state: {state:?}"))
+ }
+ TenantMapInsertError::Closure(e) => ApiError::InternalServerError(e),
+ }
+}
+
// Helper function to construct a TimelineInfo struct for a timeline
async fn build_timeline_info(
timeline: &Arc<Timeline>,
include_non_incremental_logical_size: bool,
+ ctx: &RequestContext,
) -> anyhow::Result<TimelineInfo> {
- let mut info = build_timeline_info_common(timeline)?;
+ let mut info = build_timeline_info_common(timeline, ctx)?;
if include_non_incremental_logical_size {
// XXX we should be using spawn_ondemand_logical_size_calculation here.
// Otherwise, if someone deletes the timeline / detaches the tenant while
@@ -102,6 +126,7 @@ async fn build_timeline_info(
.get_current_logical_size_non_incremental(
info.last_record_lsn,
CancellationToken::new(),
+ ctx,
)
.await?,
);
@@ -109,7 +134,10 @@ async fn build_timeline_info(
Ok(info)
}
-fn build_timeline_info_common(timeline: &Arc<Timeline>) -> anyhow::Result<TimelineInfo> {
+fn build_timeline_info_common(
+ timeline: &Arc<Timeline>,
+ ctx: &RequestContext,
+) -> anyhow::Result<TimelineInfo> {
let last_record_lsn = timeline.get_last_record_lsn();
let (wal_source_connstr, last_received_msg_lsn, last_received_msg_ts) = {
let guard = timeline.last_received_wal.lock().unwrap();
@@ -129,7 +157,7 @@ fn build_timeline_info_common(timeline: &Arc<Timeline>) -> anyhow::Result<TimelineInfo> {
Lsn(0) => None,
lsn @ Lsn(_) => Some(lsn),
};
- let current_logical_size = match timeline.get_current_logical_size() {
+ let current_logical_size = match timeline.get_current_logical_size(ctx) {
Ok((size, _)) => Some(size),
Err(err) => {
error!("Timeline info creation failed to get current logical size: {err:?}");
@@ -180,6 +208,8 @@ async fn timeline_create_handler(mut request: Request) -> Result) -> Result {
// Created. Construct a TimelineInfo for it.
- let timeline_info = build_timeline_info_common(&new_timeline)
+ let timeline_info = build_timeline_info_common(&new_timeline, &ctx)
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::CREATED, timeline_info)
}
@@ -208,6 +239,8 @@ async fn timeline_list_handler(request: Request) -> Result,
query_param_present(&request, "include-non-incremental-logical-size");
check_permission(&request, Some(tenant_id))?;
+ let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
+
let response_data = async {
let tenant = mgr::get_tenant(tenant_id, true)
.await
@@ -217,7 +250,7 @@ async fn timeline_list_handler(request: Request) -> Result,
let mut response_data = Vec::with_capacity(timelines.len());
for timeline in timelines {
let timeline_info =
- build_timeline_info(&timeline, include_non_incremental_logical_size)
+ build_timeline_info(&timeline, include_non_incremental_logical_size, &ctx)
.await
.context(
"Failed to convert tenant timeline {timeline_id} into the local one: {e:?}",
@@ -239,11 +272,7 @@ fn query_param_present(request: &Request, param: &str) -> bool {
request
.uri()
.query()
- .map(|v| {
- url::form_urlencoded::parse(v.as_bytes())
- .into_owned()
- .any(|(p, _)| p == param)
- })
+ .map(|v| url::form_urlencoded::parse(v.as_bytes()).any(|(p, _)| p == param))
.unwrap_or(false)
}
@@ -252,13 +281,12 @@ fn get_query_param(request: &Request, param_name: &str) -> Result) -> Result) -> Result(timeline_info)
}
@@ -304,12 +336,13 @@ async fn get_lsn_by_timestamp_handler(request: Request) -> Result) -> Result,
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
+ let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
+
info!("Handling tenant attach {tenant_id}");
let state = get_state(&request);
if let Some(remote_storage) = &state.remote_storage {
- // FIXME: distinguish between "Tenant already exists" and other errors
- mgr::attach_tenant(state.conf, tenant_id, remote_storage.clone())
+ mgr::attach_tenant(state.conf, tenant_id, remote_storage.clone(), &ctx)
.instrument(info_span!("tenant_attach", tenant = %tenant_id))
.await
- .map_err(ApiError::InternalServerError)?;
+ .map_err(apierror_from_tenant_map_insert_error)?;
} else {
return Err(ApiError::BadRequest(anyhow!(
"attach_tenant is not possible because pageserver was configured without remote storage"
@@ -351,7 +385,9 @@ async fn timeline_delete_handler(request: Request) -> Result) -> Result, A
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
+ let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
+
let state = get_state(&request);
- mgr::load_tenant(state.conf, tenant_id, state.remote_storage.clone())
+ mgr::load_tenant(state.conf, tenant_id, state.remote_storage.clone(), &ctx)
.instrument(info_span!("load", tenant = %tenant_id))
.await
- .map_err(ApiError::InternalServerError)?;
+ .map_err(apierror_from_tenant_map_insert_error)?;
json_response(StatusCode::ACCEPTED, ())
}
@@ -413,6 +451,8 @@ async fn tenant_list_handler(request: Request) -> Result, A
let response_data = mgr::list_tenants()
.instrument(info_span!("tenant_list"))
.await
+ .map_err(anyhow::Error::new)
+ .map_err(ApiError::InternalServerError)?
.iter()
.map(|(id, state)| TenantInfo {
id: *id,
@@ -453,21 +493,40 @@ async fn tenant_status(request: Request) -> Result, ApiErro
json_response(StatusCode::OK, tenant_info)
}
+/// HTTP endpoint to query the current tenant_size of a tenant.
+///
+/// This is not used by consumption metrics under [`crate::consumption_metrics`], but can be used
+/// to debug any of the calculations. Requires the `tenant_id` request parameter and supports
+/// `inputs_only=true|false` (default false), which helps debug failures to calculate model
+/// values.
async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
+ let inputs_only = if query_param_present(&request, "inputs_only") {
+ get_query_param(&request, "inputs_only")?
+ .parse()
+ .map_err(|_| ApiError::BadRequest(anyhow!("failed to parse inputs_only")))?
+ } else {
+ false
+ };
+
+ let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let tenant = mgr::get_tenant(tenant_id, true)
.await
.map_err(ApiError::InternalServerError)?;
- // this can be long operation, it currently is not backed by any request coalescing or similar
+ // this can be a long operation
let inputs = tenant
- .gather_size_inputs()
+ .gather_size_inputs(&ctx)
.await
.map_err(ApiError::InternalServerError)?;
- let size = inputs.calculate().map_err(ApiError::InternalServerError)?;
+ let size = if !inputs_only {
+ Some(inputs.calculate().map_err(ApiError::InternalServerError)?)
+ } else {
+ None
+ };
/// Private response type with the additional "unstable" `inputs` field.
///
@@ -479,7 +538,9 @@ async fn tenant_size_handler(request: Request) -> Result, A
#[serde_as(as = "serde_with::DisplayFromStr")]
id: TenantId,
/// Size is a mixture of WAL and logical size, so the unit is bytes.
- size: u64,
+ ///
+ /// Will be none if `?inputs_only=true` was given.
+ size: Option<u64>,
inputs: crate::tenant::size::ModelInputs,
}
@@ -506,6 +567,8 @@ fn bad_duration<'a>(field_name: &'static str, value: &'a str) -> impl 'a + Fn()
async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permission(&request, None)?;
+ let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
+
let request_data: TenantCreateRequest = json_request(&mut request).await?;
let mut tenant_conf = TenantConfOpt::default();
@@ -583,34 +646,28 @@ async fn tenant_create_handler(mut request: Request) -> Result {
- // We created the tenant. Existing API semantics are that the tenant
- // is Active when this function returns.
- if let res @ Err(_) = tenant.wait_to_become_active().await {
- // This shouldn't happen because we just created the tenant directory
- // in tenant::mgr::create_tenant, and there aren't any remote timelines
- // to load, so, nothing can really fail during load.
- // Don't do cleanup because we don't know how we got here.
- // The tenant will likely be in `Broken` state and subsequent
- // calls will fail.
- res.context("created tenant failed to become active")
- .map_err(ApiError::InternalServerError)?;
- }
- json_response(
- StatusCode::CREATED,
- TenantCreateResponse(tenant.tenant_id()),
- )?
- }
- None => json_response(StatusCode::CONFLICT, ())?,
- })
+ // We created the tenant. Existing API semantics are that the tenant
+ // is Active when this function returns.
+ if let res @ Err(_) = new_tenant.wait_to_become_active().await {
+ // This shouldn't happen because we just created the tenant directory
+ // in tenant::mgr::create_tenant, and there aren't any remote timelines
+ // to load, so, nothing can really fail during load.
+ // Don't do cleanup because we don't know how we got here.
+ // The tenant will likely be in `Broken` state and subsequent
+ // calls will fail.
+ res.context("created tenant failed to become active")
+ .map_err(ApiError::InternalServerError)?;
+ }
+ json_response(
+ StatusCode::CREATED,
+ TenantCreateResponse(new_tenant.tenant_id()),
+ )
}
async fn tenant_config_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -732,7 +789,8 @@ async fn timeline_gc_handler(mut request: Request) -> Result) -> Result) -> Result) -> Result Result<()> {
let mut pg_control: Option<ControlFileData> = None;
@@ -69,7 +71,7 @@ pub async fn import_timeline_from_postgres_datadir(
let mut file = tokio::fs::File::open(absolute_path).await?;
let len = metadata.len() as usize;
if let Some(control_file) =
- import_file(&mut modification, relative_path, &mut file, len).await?
+ import_file(&mut modification, relative_path, &mut file, len, ctx).await?
{
pg_control = Some(control_file);
}
@@ -99,6 +101,7 @@ pub async fn import_timeline_from_postgres_datadir(
tline,
Lsn(pg_control.checkPointCopy.redo),
pgdata_lsn,
+ ctx,
)
.await?;
@@ -113,6 +116,7 @@ async fn import_rel(
dboid: Oid,
reader: &mut (impl AsyncRead + Send + Sync + Unpin),
len: usize,
+ ctx: &RequestContext,
) -> anyhow::Result<()> {
// Does it look like a relation file?
trace!("importing rel file {}", path.display());
@@ -147,7 +151,10 @@ async fn import_rel(
// FIXME: use proper error type for this, instead of parsing the error message.
// Or better yet, keep track of which relations we've already created
// https://github.com/neondatabase/neon/issues/3309
- if let Err(e) = modification.put_rel_creation(rel, nblocks as u32).await {
+ if let Err(e) = modification
+ .put_rel_creation(rel, nblocks as u32, ctx)
+ .await
+ {
if e.to_string().contains("already exists") {
debug!("relation {} already exists. we must be extending it", rel);
} else {
@@ -182,7 +189,7 @@ async fn import_rel(
//
// If we process rel segments out of order,
// put_rel_extend will skip the update.
- modification.put_rel_extend(rel, blknum).await?;
+ modification.put_rel_extend(rel, blknum, ctx).await?;
Ok(())
}
@@ -195,6 +202,7 @@ async fn import_slru(
path: &Path,
reader: &mut (impl AsyncRead + Send + Sync + Unpin),
len: usize,
+ ctx: &RequestContext,
) -> anyhow::Result<()> {
info!("importing slru file {path:?}");
@@ -211,7 +219,7 @@ async fn import_slru(
ensure!(nblocks <= pg_constants::SLRU_PAGES_PER_SEGMENT as usize);
modification
- .put_slru_segment_creation(slru, segno, nblocks as u32)
+ .put_slru_segment_creation(slru, segno, nblocks as u32, ctx)
.await?;
let mut rpageno = 0;
@@ -252,15 +260,15 @@ async fn import_wal(
tline: &Timeline,
startpoint: Lsn,
endpoint: Lsn,
+ ctx: &RequestContext,
) -> anyhow::Result<()> {
- use std::io::Read;
let mut waldecoder = WalStreamDecoder::new(startpoint, tline.pg_version);
let mut segno = startpoint.segment_number(WAL_SEGMENT_SIZE);
let mut offset = startpoint.segment_offset(WAL_SEGMENT_SIZE);
let mut last_lsn = startpoint;
- let mut walingest = WalIngest::new(tline, startpoint).await?;
+ let mut walingest = WalIngest::new(tline, startpoint, ctx).await?;
while last_lsn <= endpoint {
// FIXME: assume postgresql tli 1 for now
@@ -283,6 +291,7 @@ async fn import_wal(
file.seek(std::io::SeekFrom::Start(offset as u64))?;
}
+ use std::io::Read;
let nread = file.read_to_end(&mut buf)?;
if nread != WAL_SEGMENT_SIZE - offset {
// Maybe allow this for .partial files?
@@ -297,7 +306,7 @@ async fn import_wal(
while last_lsn <= endpoint {
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
walingest
- .ingest_record(recdata, lsn, &mut modification, &mut decoded)
+ .ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx)
.await?;
last_lsn = lsn;
@@ -326,6 +335,7 @@ pub async fn import_basebackup_from_tar(
tline: &Timeline,
reader: &mut (impl AsyncRead + Send + Sync + Unpin),
base_lsn: Lsn,
+ ctx: &RequestContext,
) -> Result<()> {
info!("importing base at {base_lsn}");
let mut modification = tline.begin_modification(base_lsn);
@@ -344,7 +354,7 @@ pub async fn import_basebackup_from_tar(
match header.entry_type() {
tokio_tar::EntryType::Regular => {
if let Some(res) =
- import_file(&mut modification, file_path.as_ref(), &mut entry, len).await?
+ import_file(&mut modification, file_path.as_ref(), &mut entry, len, ctx).await?
{
// We found the pg_control file.
pg_control = Some(res);
@@ -376,13 +386,14 @@ pub async fn import_wal_from_tar(
reader: &mut (impl AsyncRead + Send + Sync + Unpin),
start_lsn: Lsn,
end_lsn: Lsn,
+ ctx: &RequestContext,
) -> Result<()> {
// Set up walingest mutable state
let mut waldecoder = WalStreamDecoder::new(start_lsn, tline.pg_version);
let mut segno = start_lsn.segment_number(WAL_SEGMENT_SIZE);
let mut offset = start_lsn.segment_offset(WAL_SEGMENT_SIZE);
let mut last_lsn = start_lsn;
- let mut walingest = WalIngest::new(tline, start_lsn).await?;
+ let mut walingest = WalIngest::new(tline, start_lsn, ctx).await?;
// Ingest wal until end_lsn
info!("importing wal until {}", end_lsn);
@@ -431,7 +442,7 @@ pub async fn import_wal_from_tar(
while last_lsn <= end_lsn {
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
walingest
- .ingest_record(recdata, lsn, &mut modification, &mut decoded)
+ .ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx)
.await?;
last_lsn = lsn;
@@ -466,6 +477,7 @@ async fn import_file(
file_path: &Path,
reader: &mut (impl AsyncRead + Send + Sync + Unpin),
len: usize,
+ ctx: &RequestContext,
) -> Result<Option<ControlFileData>> {
let file_name = match file_path.file_name() {
Some(name) => name.to_string_lossy(),
@@ -498,14 +510,16 @@ async fn import_file(
}
"pg_filenode.map" => {
let bytes = read_all_bytes(reader).await?;
- modification.put_relmap_file(spcnode, dbnode, bytes).await?;
+ modification
+ .put_relmap_file(spcnode, dbnode, bytes, ctx)
+ .await?;
debug!("imported relmap file")
}
"PG_VERSION" => {
debug!("ignored PG_VERSION file");
}
_ => {
- import_rel(modification, file_path, spcnode, dbnode, reader, len).await?;
+ import_rel(modification, file_path, spcnode, dbnode, reader, len, ctx).await?;
debug!("imported rel creation");
}
}
@@ -521,38 +535,40 @@ async fn import_file(
match file_name.as_ref() {
"pg_filenode.map" => {
let bytes = read_all_bytes(reader).await?;
- modification.put_relmap_file(spcnode, dbnode, bytes).await?;
+ modification
+ .put_relmap_file(spcnode, dbnode, bytes, ctx)
+ .await?;
debug!("imported relmap file")
}
"PG_VERSION" => {
debug!("ignored PG_VERSION file");
}
_ => {
- import_rel(modification, file_path, spcnode, dbnode, reader, len).await?;
+ import_rel(modification, file_path, spcnode, dbnode, reader, len, ctx).await?;
debug!("imported rel creation");
}
}
} else if file_path.starts_with("pg_xact") {
let slru = SlruKind::Clog;
- import_slru(modification, slru, file_path, reader, len).await?;
+ import_slru(modification, slru, file_path, reader, len, ctx).await?;
debug!("imported clog slru");
} else if file_path.starts_with("pg_multixact/offsets") {
let slru = SlruKind::MultiXactOffsets;
- import_slru(modification, slru, file_path, reader, len).await?;
+ import_slru(modification, slru, file_path, reader, len, ctx).await?;
debug!("imported multixact offsets slru");
} else if file_path.starts_with("pg_multixact/members") {
let slru = SlruKind::MultiXactMembers;
- import_slru(modification, slru, file_path, reader, len).await?;
+ import_slru(modification, slru, file_path, reader, len, ctx).await?;
debug!("imported multixact members slru");
} else if file_path.starts_with("pg_twophase") {
let xid = u32::from_str_radix(file_name.as_ref(), 16)?;
let bytes = read_all_bytes(reader).await?;
modification
- .put_twophase_file(xid, Bytes::copy_from_slice(&bytes[..]))
+ .put_twophase_file(xid, Bytes::copy_from_slice(&bytes[..]), ctx)
.await?;
debug!("imported twophase file");
} else if file_path.starts_with("pg_wal") {
diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs
index 91cde477ad..09e21ae755 100644
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -1,7 +1,9 @@
mod auth;
pub mod basebackup;
+pub mod broker_client;
pub mod config;
pub mod consumption_metrics;
+pub mod context;
pub mod http;
pub mod import_datadir;
pub mod keyspace;
@@ -15,7 +17,6 @@ pub mod tenant;
pub mod trace;
pub mod virtual_file;
pub mod walingest;
-pub mod walreceiver;
pub mod walrecord;
pub mod walredo;
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index b61e64048b..6bd0eddbb5 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1,10 +1,12 @@
use metrics::core::{AtomicU64, GenericCounter};
use metrics::{
- register_histogram, register_histogram_vec, register_int_counter, register_int_counter_vec,
- register_int_gauge, register_int_gauge_vec, register_uint_gauge_vec, Histogram, HistogramVec,
- IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
+ register_counter_vec, register_histogram, register_histogram_vec, register_int_counter,
+ register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge_vec,
+ Counter, CounterVec, Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec,
+ UIntGauge, UIntGaugeVec,
};
use once_cell::sync::Lazy;
+use pageserver_api::models::state;
use utils::id::{TenantId, TimelineId};
/// Prometheus histogram buckets (in seconds) that capture the majority of
@@ -35,11 +37,29 @@ const STORAGE_TIME_OPERATIONS: &[&str] = &[
"gc",
];
-pub static STORAGE_TIME: Lazy<HistogramVec> = Lazy::new(|| {
- register_histogram_vec!(
- "pageserver_storage_operations_seconds",
- "Time spent on storage operations",
+pub static STORAGE_TIME_SUM_PER_TIMELINE: Lazy<CounterVec> = Lazy::new(|| {
+ register_counter_vec!(
+ "pageserver_storage_operations_seconds_sum",
+ "Total time spent on storage operations with operation, tenant and timeline dimensions",
&["operation", "tenant_id", "timeline_id"],
+ )
+ .expect("failed to define a metric")
+});
+
+pub static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
+ register_int_counter_vec!(
+ "pageserver_storage_operations_seconds_count",
+ "Count of storage operations with operation, tenant and timeline dimensions",
+ &["operation", "tenant_id", "timeline_id"],
+ )
+ .expect("failed to define a metric")
+});
+
+pub static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
+ register_histogram_vec!(
+ "pageserver_storage_operations_seconds_global",
+ "Time spent on storage operations",
+ &["operation"],
get_buckets_for_critical_operations(),
)
.expect("failed to define a metric")
@@ -112,6 +132,24 @@ static CURRENT_LOGICAL_SIZE: Lazy = Lazy::new(|| {
.expect("failed to define current logical size metric")
});
+// Metrics collected on tenant states.
+const TENANT_STATE_OPTIONS: &[&str] = &[
+ state::LOADING,
+ state::ATTACHING,
+ state::ACTIVE,
+ state::STOPPING,
+ state::BROKEN,
+];
+
+pub static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
+ register_uint_gauge_vec!(
+ "pageserver_tenant_states_count",
+ "Count of tenants per state",
+ &["tenant_id", "state"]
+ )
+ .expect("Failed to register pageserver_tenant_states_count metric")
+});
+
// Metrics for cloud upload. These metrics reflect data uploaded to cloud storage,
// or in testing they estimate how much we would upload if we did.
static NUM_PERSISTENT_FILES_CREATED: Lazy<IntCounter> = Lazy::new(|| {
@@ -375,18 +413,81 @@ pub static WAL_REDO_RECORD_COUNTER: Lazy = Lazy::new(|| {
.unwrap()
});
+/// Similar to [`prometheus::HistogramTimer`] but does not record on drop.
+pub struct StorageTimeMetricsTimer {
+ metrics: StorageTimeMetrics,
+ start: Instant,
+}
+
+impl StorageTimeMetricsTimer {
+ fn new(metrics: StorageTimeMetrics) -> Self {
+ Self {
+ metrics,
+ start: Instant::now(),
+ }
+ }
+
+ /// Record the time from creation to now.
+ pub fn stop_and_record(self) {
+ let duration = self.start.elapsed().as_secs_f64();
+ self.metrics.timeline_sum.inc_by(duration);
+ self.metrics.timeline_count.inc();
+ self.metrics.global_histogram.observe(duration);
+ }
+}
+
+/// Timing facilities for a globally histogrammed metric, which is supported by per-tenant
+/// and per-timeline total sum and count.
+#[derive(Clone, Debug)]
+pub struct StorageTimeMetrics {
+ /// Sum of f64 seconds, per operation, tenant_id and timeline_id
+ timeline_sum: Counter,
+ /// Number of operations, per operation, tenant_id and timeline_id
+ timeline_count: IntCounter,
+ /// Global histogram having only the "operation" label.
+ global_histogram: Histogram,
+}
+
+impl StorageTimeMetrics {
+ pub fn new(operation: &str, tenant_id: &str, timeline_id: &str) -> Self {
+ let timeline_sum = STORAGE_TIME_SUM_PER_TIMELINE
+ .get_metric_with_label_values(&[operation, tenant_id, timeline_id])
+ .unwrap();
+ let timeline_count = STORAGE_TIME_COUNT_PER_TIMELINE
+ .get_metric_with_label_values(&[operation, tenant_id, timeline_id])
+ .unwrap();
+ let global_histogram = STORAGE_TIME_GLOBAL
+ .get_metric_with_label_values(&[operation])
+ .unwrap();
+
+ StorageTimeMetrics {
+ timeline_sum,
+ timeline_count,
+ global_histogram,
+ }
+ }
+
+ /// Starts timing a new operation.
+ ///
+ /// Note: unlike [`prometheus::HistogramTimer`] the returned timer does not record on drop.
+ pub fn start_timer(&self) -> StorageTimeMetricsTimer {
+ StorageTimeMetricsTimer::new(self.clone())
+ }
+}
+
#[derive(Debug)]
pub struct TimelineMetrics {
tenant_id: String,
timeline_id: String,
pub reconstruct_time_histo: Histogram,
pub materialized_page_cache_hit_counter: GenericCounter<AtomicU64>,
- pub flush_time_histo: Histogram,
- pub compact_time_histo: Histogram,
- pub create_images_time_histo: Histogram,
- pub init_logical_size_histo: Histogram,
- pub logical_size_histo: Histogram,
- pub load_layer_map_histo: Histogram,
+ pub flush_time_histo: StorageTimeMetrics,
+ pub compact_time_histo: StorageTimeMetrics,
+ pub create_images_time_histo: StorageTimeMetrics,
+ pub init_logical_size_histo: StorageTimeMetrics,
+ pub logical_size_histo: StorageTimeMetrics,
+ pub load_layer_map_histo: StorageTimeMetrics,
+ pub garbage_collect_histo: StorageTimeMetrics,
pub last_record_gauge: IntGauge,
pub wait_lsn_time_histo: Histogram,
pub resident_physical_size_gauge: UIntGauge,
@@ -406,24 +507,16 @@ impl TimelineMetrics {
let materialized_page_cache_hit_counter = MATERIALIZED_PAGE_CACHE_HIT
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
- let flush_time_histo = STORAGE_TIME
- .get_metric_with_label_values(&["layer flush", &tenant_id, &timeline_id])
- .unwrap();
- let compact_time_histo = STORAGE_TIME
- .get_metric_with_label_values(&["compact", &tenant_id, &timeline_id])
- .unwrap();
- let create_images_time_histo = STORAGE_TIME
- .get_metric_with_label_values(&["create images", &tenant_id, &timeline_id])
- .unwrap();
- let init_logical_size_histo = STORAGE_TIME
- .get_metric_with_label_values(&["init logical size", &tenant_id, &timeline_id])
- .unwrap();
- let logical_size_histo = STORAGE_TIME
- .get_metric_with_label_values(&["logical size", &tenant_id, &timeline_id])
- .unwrap();
- let load_layer_map_histo = STORAGE_TIME
- .get_metric_with_label_values(&["load layer map", &tenant_id, &timeline_id])
- .unwrap();
+ let flush_time_histo = StorageTimeMetrics::new("layer flush", &tenant_id, &timeline_id);
+ let compact_time_histo = StorageTimeMetrics::new("compact", &tenant_id, &timeline_id);
+ let create_images_time_histo =
+ StorageTimeMetrics::new("create images", &tenant_id, &timeline_id);
+ let init_logical_size_histo =
+ StorageTimeMetrics::new("init logical size", &tenant_id, &timeline_id);
+ let logical_size_histo = StorageTimeMetrics::new("logical size", &tenant_id, &timeline_id);
+ let load_layer_map_histo =
+ StorageTimeMetrics::new("load layer map", &tenant_id, &timeline_id);
+ let garbage_collect_histo = StorageTimeMetrics::new("gc", &tenant_id, &timeline_id);
let last_record_gauge = LAST_RECORD_LSN
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
@@ -453,6 +546,7 @@ impl TimelineMetrics {
create_images_time_histo,
init_logical_size_histo,
logical_size_histo,
+ garbage_collect_histo,
load_layer_map_histo,
last_record_gauge,
wait_lsn_time_histo,
@@ -478,7 +572,10 @@ impl Drop for TimelineMetrics {
let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]);
for op in STORAGE_TIME_OPERATIONS {
- let _ = STORAGE_TIME.remove_label_values(&[op, tenant_id, timeline_id]);
+ let _ =
+ STORAGE_TIME_SUM_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]);
+ let _ =
+ STORAGE_TIME_COUNT_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]);
}
for op in STORAGE_IO_TIME_OPERATIONS {
let _ = STORAGE_IO_TIME.remove_label_values(&[op, tenant_id, timeline_id]);
@@ -495,7 +592,10 @@ impl Drop for TimelineMetrics {
}
pub fn remove_tenant_metrics(tenant_id: &TenantId) {
- let _ = STORAGE_TIME.remove_label_values(&["gc", &tenant_id.to_string(), "-"]);
+ let tid = tenant_id.to_string();
+ for state in TENANT_STATE_OPTIONS {
+ let _ = TENANT_STATE_METRIC.remove_label_values(&[&tid, state]);
+ }
}
use futures::Future;
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index 344a8d1c00..878928ae06 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -13,6 +13,7 @@ use anyhow::Context;
use bytes::Buf;
use bytes::Bytes;
use futures::{Stream, StreamExt};
+use pageserver_api::models::TenantState;
use pageserver_api::models::{
PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse,
PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse,
@@ -30,19 +31,19 @@ use std::sync::Arc;
use std::time::Duration;
use tracing::*;
use utils::id::ConnectionId;
-use utils::postgres_backend_async::QueryError;
use utils::{
auth::{Claims, JwtAuth, Scope},
id::{TenantId, TimelineId},
lsn::Lsn,
postgres_backend::AuthType,
- postgres_backend_async::{self, PostgresBackend},
+ postgres_backend_async::{self, is_expected_io_error, PostgresBackend, QueryError},
simple_rcu::RcuReadGuard,
};
use crate::auth::check_permission;
use crate::basebackup;
use crate::config::PageServerConf;
+use crate::context::{DownloadBehavior, RequestContext};
use crate::import_datadir::import_wal_from_tar;
use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME};
use crate::task_mgr;
@@ -123,6 +124,7 @@ pub async fn libpq_listener_main(
auth: Option>,
listener: TcpListener,
auth_type: AuthType,
+ listener_ctx: RequestContext,
) -> anyhow::Result<()> {
listener.set_nonblocking(true)?;
let tokio_listener = tokio::net::TcpListener::from_std(listener)?;
@@ -146,6 +148,9 @@ pub async fn libpq_listener_main(
debug!("accepted connection from {}", peer_addr);
let local_auth = auth.clone();
+ let connection_ctx = listener_ctx
+ .detached_child(TaskKind::PageRequestHandler, DownloadBehavior::Download);
+
// PageRequestHandler tasks are not associated with any particular
// timeline in the task manager. In practice most connections will
// only deal with a particular timeline, but we don't know which one
@@ -157,7 +162,7 @@ pub async fn libpq_listener_main(
None,
"serving compute connection task",
false,
- page_service_conn_main(conf, local_auth, socket, auth_type),
+ page_service_conn_main(conf, local_auth, socket, auth_type, connection_ctx),
);
}
Err(err) => {
@@ -177,6 +182,7 @@ async fn page_service_conn_main(
auth: Option>,
socket: tokio::net::TcpStream,
auth_type: AuthType,
+ connection_ctx: RequestContext,
) -> anyhow::Result<()> {
// Immediately increment the gauge, then create a job to decrement it on task exit.
// One of the pros of `defer!` is that this will *most probably*
@@ -191,24 +197,24 @@ async fn page_service_conn_main(
.set_nodelay(true)
.context("could not set TCP_NODELAY")?;
- let mut conn_handler = PageServerHandler::new(conf, auth);
+ // XXX: pgbackend.run() should take the connection_ctx,
+ // and create a child per-query context when it invokes process_query.
+ // But it's in a shared crate, so we store connection_ctx inside PageServerHandler
+ // and create the per-query context in process_query ourselves.
+ let mut conn_handler = PageServerHandler::new(conf, auth, connection_ctx);
let pgbackend = PostgresBackend::new(socket, auth_type, None)?;
- let result = pgbackend
+ match pgbackend
.run(&mut conn_handler, task_mgr::shutdown_watcher)
- .await;
- match result {
+ .await
+ {
Ok(()) => {
// we've been requested to shut down
Ok(())
}
Err(QueryError::Disconnected(ConnectionError::Socket(io_error))) => {
- // `ConnectionReset` error happens when the Postgres client closes the connection.
- // As this disconnection happens quite often and is expected,
- // we decided to downgrade the logging level to `INFO`.
- // See: https://github.com/neondatabase/neon/issues/1683.
- if io_error.kind() == io::ErrorKind::ConnectionReset {
- info!("Postgres client disconnected");
+ if is_expected_io_error(&io_error) {
+ info!("Postgres client disconnected ({io_error})");
Ok(())
} else {
Err(io_error).context("Postgres connection error")
@@ -255,30 +261,42 @@ struct PageServerHandler {
_conf: &'static PageServerConf,
auth: Option>,
claims: Option,
+
+ /// The context created for the lifetime of the connection
+ /// serviced by this PageServerHandler.
+ /// For each query received over the connection,
+ /// `process_query` creates a child context from this one.
+ connection_ctx: RequestContext,
}
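
A sketch of the context hierarchy this field enables, assuming `attached_child` returns a `RequestContext` as used at the top of `process_query` below (the method itself is illustrative, not part of the patch):

impl PageServerHandler {
    // Connection-lifetime parent: created per accepted connection via
    //   listener_ctx.detached_child(TaskKind::PageRequestHandler, DownloadBehavior::Download)
    // Per-query child: one per query received over the connection.
    fn example_per_query_ctx(&self) -> RequestContext {
        self.connection_ctx.attached_child()
    }
}
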
impl PageServerHandler {
- pub fn new(conf: &'static PageServerConf, auth: Option>) -> Self {
+ pub fn new(
+ conf: &'static PageServerConf,
+ auth: Option>,
+ connection_ctx: RequestContext,
+ ) -> Self {
PageServerHandler {
_conf: conf,
auth,
claims: None,
+ connection_ctx,
}
}
- #[instrument(skip(self, pgb))]
+ #[instrument(skip(self, pgb, ctx))]
async fn handle_pagerequests(
&self,
pgb: &mut PostgresBackend,
tenant_id: TenantId,
timeline_id: TimelineId,
+ ctx: RequestContext,
) -> anyhow::Result<()> {
// NOTE: pagerequests handler exits when connection is closed,
// so there is no need to reset the association
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
// Make request tracer if needed
- let tenant = get_active_tenant_with_timeout(tenant_id).await?;
+ let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
let mut tracer = if tenant.get_trace_read_requests() {
let connection_id = ConnectionId::generate();
let path = tenant
@@ -329,22 +347,27 @@ impl PageServerHandler {
let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?;
+ // TODO: We could create a new per-request context here, with unique ID.
+ // Currently we use the same per-timeline context for all requests
+
let response = match neon_fe_msg {
PagestreamFeMessage::Exists(req) => {
let _timer = metrics.get_rel_exists.start_timer();
- self.handle_get_rel_exists_request(&timeline, &req).await
+ self.handle_get_rel_exists_request(&timeline, &req, &ctx)
+ .await
}
PagestreamFeMessage::Nblocks(req) => {
let _timer = metrics.get_rel_size.start_timer();
- self.handle_get_nblocks_request(&timeline, &req).await
+ self.handle_get_nblocks_request(&timeline, &req, &ctx).await
}
PagestreamFeMessage::GetPage(req) => {
let _timer = metrics.get_page_at_lsn.start_timer();
- self.handle_get_page_at_lsn_request(&timeline, &req).await
+ self.handle_get_page_at_lsn_request(&timeline, &req, &ctx)
+ .await
}
PagestreamFeMessage::DbSize(req) => {
let _timer = metrics.get_db_size.start_timer();
- self.handle_db_size_request(&timeline, &req).await
+ self.handle_db_size_request(&timeline, &req, &ctx).await
}
};
@@ -363,7 +386,8 @@ impl PageServerHandler {
Ok(())
}
- #[instrument(skip(self, pgb))]
+ #[allow(clippy::too_many_arguments)]
+ #[instrument(skip(self, pgb, ctx))]
async fn handle_import_basebackup(
&self,
pgb: &mut PostgresBackend,
@@ -372,12 +396,13 @@ impl PageServerHandler {
base_lsn: Lsn,
_end_lsn: Lsn,
pg_version: u32,
+ ctx: RequestContext,
) -> Result<(), QueryError> {
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
// Create empty timeline
info!("creating new timeline");
- let tenant = get_active_tenant_with_timeout(tenant_id).await?;
- let timeline = tenant.create_empty_timeline(timeline_id, base_lsn, pg_version)?;
+ let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
+ let timeline = tenant.create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)?;
// TODO mark timeline as not ready until it reaches end_lsn.
// We might have some wal to import as well, and we should prevent compute
@@ -396,7 +421,7 @@ impl PageServerHandler {
let mut copyin_stream = Box::pin(copyin_stream(pgb));
timeline
- .import_basebackup_from_tar(&mut copyin_stream, base_lsn)
+ .import_basebackup_from_tar(&mut copyin_stream, base_lsn, &ctx)
.await?;
// Drain the rest of the Copy data
@@ -418,7 +443,7 @@ impl PageServerHandler {
Ok(())
}
- #[instrument(skip(self, pgb))]
+ #[instrument(skip(self, pgb, ctx))]
async fn handle_import_wal(
&self,
pgb: &mut PostgresBackend,
@@ -426,10 +451,11 @@ impl PageServerHandler {
timeline_id: TimelineId,
start_lsn: Lsn,
end_lsn: Lsn,
+ ctx: RequestContext,
) -> Result<(), QueryError> {
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
- let timeline = get_active_timeline_with_timeout(tenant_id, timeline_id).await?;
+ let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
let last_record_lsn = timeline.get_last_record_lsn();
if last_record_lsn != start_lsn {
return Err(QueryError::Other(
@@ -446,7 +472,7 @@ impl PageServerHandler {
pgb.flush().await?;
let mut copyin_stream = Box::pin(copyin_stream(pgb));
let mut reader = tokio_util::io::StreamReader::new(&mut copyin_stream);
- import_wal_from_tar(&timeline, &mut reader, start_lsn, end_lsn).await?;
+ import_wal_from_tar(&timeline, &mut reader, start_lsn, end_lsn, &ctx).await?;
info!("wal import complete");
// Drain the rest of the Copy data
@@ -492,6 +518,7 @@ impl PageServerHandler {
mut lsn: Lsn,
latest: bool,
latest_gc_cutoff_lsn: &RcuReadGuard,
+ ctx: &RequestContext,
) -> anyhow::Result {
if latest {
// Latest page version was requested. If LSN is given, it is a hint
@@ -515,7 +542,7 @@ impl PageServerHandler {
if lsn <= last_record_lsn {
lsn = last_record_lsn;
} else {
- timeline.wait_lsn(lsn).await?;
+ timeline.wait_lsn(lsn, ctx).await?;
// Since we waited for 'lsn' to arrive, that is now the last
// record LSN. (Or close enough for our purposes; the
// last-record LSN can advance immediately after we return
@@ -525,7 +552,7 @@ impl PageServerHandler {
if lsn == Lsn(0) {
anyhow::bail!("invalid LSN(0) in request");
}
- timeline.wait_lsn(lsn).await?;
+ timeline.wait_lsn(lsn, ctx).await?;
}
anyhow::ensure!(
lsn >= **latest_gc_cutoff_lsn,
@@ -535,52 +562,60 @@ impl PageServerHandler {
Ok(lsn)
}
- #[instrument(skip(self, timeline, req), fields(rel = %req.rel, req_lsn = %req.lsn))]
+ #[instrument(skip(self, timeline, req, ctx), fields(rel = %req.rel, req_lsn = %req.lsn))]
async fn handle_get_rel_exists_request(
&self,
timeline: &Timeline,
req: &PagestreamExistsRequest,
+ ctx: &RequestContext,
) -> anyhow::Result {
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
- let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)
- .await?;
+ let lsn =
+ Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
+ .await?;
- let exists = timeline.get_rel_exists(req.rel, lsn, req.latest).await?;
+ let exists = timeline
+ .get_rel_exists(req.rel, lsn, req.latest, ctx)
+ .await?;
Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse {
exists,
}))
}
- #[instrument(skip(self, timeline, req), fields(rel = %req.rel, req_lsn = %req.lsn))]
+ #[instrument(skip(self, timeline, req, ctx), fields(rel = %req.rel, req_lsn = %req.lsn))]
async fn handle_get_nblocks_request(
&self,
timeline: &Timeline,
req: &PagestreamNblocksRequest,
+ ctx: &RequestContext,
) -> anyhow::Result {
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
- let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)
- .await?;
+ let lsn =
+ Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
+ .await?;
- let n_blocks = timeline.get_rel_size(req.rel, lsn, req.latest).await?;
+ let n_blocks = timeline.get_rel_size(req.rel, lsn, req.latest, ctx).await?;
Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse {
n_blocks,
}))
}
- #[instrument(skip(self, timeline, req), fields(dbnode = %req.dbnode, req_lsn = %req.lsn))]
+ #[instrument(skip(self, timeline, req, ctx), fields(dbnode = %req.dbnode, req_lsn = %req.lsn))]
async fn handle_db_size_request(
&self,
timeline: &Timeline,
req: &PagestreamDbSizeRequest,
+ ctx: &RequestContext,
) -> anyhow::Result {
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
- let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)
- .await?;
+ let lsn =
+ Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
+ .await?;
let total_blocks = timeline
- .get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest)
+ .get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest, ctx)
.await?;
let db_size = total_blocks as i64 * BLCKSZ as i64;
@@ -589,15 +624,17 @@ impl PageServerHandler {
}))
}
- #[instrument(skip(self, timeline, req), fields(rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn))]
+ #[instrument(skip(self, timeline, req, ctx), fields(rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn))]
async fn handle_get_page_at_lsn_request(
&self,
timeline: &Timeline,
req: &PagestreamGetPageRequest,
+ ctx: &RequestContext,
) -> anyhow::Result {
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
- let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)
- .await?;
+ let lsn =
+ Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
+ .await?;
/*
// Add a 1s delay to some requests. The delay helps the requests to
// hit the race condition from github issue #1047 more easily.
@@ -608,7 +645,7 @@ impl PageServerHandler {
*/
let page = timeline
- .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest)
+ .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
.await?;
Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
@@ -616,23 +653,25 @@ impl PageServerHandler {
}))
}
- #[instrument(skip(self, pgb))]
+ #[allow(clippy::too_many_arguments)]
+ #[instrument(skip(self, pgb, ctx))]
async fn handle_basebackup_request(
- &self,
+ &mut self,
pgb: &mut PostgresBackend,
tenant_id: TenantId,
timeline_id: TimelineId,
lsn: Option,
prev_lsn: Option,
full_backup: bool,
+ ctx: RequestContext,
) -> anyhow::Result<()> {
// check that the timeline exists
- let timeline = get_active_timeline_with_timeout(tenant_id, timeline_id).await?;
+ let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
if let Some(lsn) = lsn {
// Backup was requested at a particular LSN. Wait for it to arrive.
info!("waiting for {}", lsn);
- timeline.wait_lsn(lsn).await?;
+ timeline.wait_lsn(lsn, &ctx).await?;
timeline
.check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn)
.context("invalid basebackup lsn")?;
@@ -645,8 +684,15 @@ impl PageServerHandler {
// Send a tarball of the latest layer on the timeline
{
let mut writer = pgb.copyout_writer();
- basebackup::send_basebackup_tarball(&mut writer, &timeline, lsn, prev_lsn, full_backup)
- .await?;
+ basebackup::send_basebackup_tarball(
+ &mut writer,
+ &timeline,
+ lsn,
+ prev_lsn,
+ full_backup,
+ &ctx,
+ )
+ .await?;
}
pgb.write_message(&BeMessage::CopyDone)?;
@@ -717,6 +763,7 @@ impl postgres_backend_async::Handler for PageServerHandler {
pgb: &mut PostgresBackend,
query_string: &str,
) -> Result<(), QueryError> {
+ let ctx = self.connection_ctx.attached_child();
debug!("process query {query_string:?}");
if query_string.starts_with("pagestream ") {
@@ -734,7 +781,7 @@ impl postgres_backend_async::Handler for PageServerHandler {
self.check_permission(Some(tenant_id))?;
- self.handle_pagerequests(pgb, tenant_id, timeline_id)
+ self.handle_pagerequests(pgb, tenant_id, timeline_id, ctx)
.await?;
} else if query_string.starts_with("basebackup ") {
let (_, params_raw) = query_string.split_at("basebackup ".len());
@@ -763,7 +810,7 @@ impl postgres_backend_async::Handler for PageServerHandler {
};
// Check that the timeline exists
- self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, None, false)
+ self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, None, false, ctx)
.await?;
pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
}
@@ -784,7 +831,7 @@ impl postgres_backend_async::Handler for PageServerHandler {
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
self.check_permission(Some(tenant_id))?;
- let timeline = get_active_timeline_with_timeout(tenant_id, timeline_id).await?;
+ let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
let end_of_timeline = timeline.get_last_record_rlsn();
@@ -835,7 +882,7 @@ impl postgres_backend_async::Handler for PageServerHandler {
self.check_permission(Some(tenant_id))?;
// Check that the timeline exists
- self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, prev_lsn, true)
+ self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, prev_lsn, true, ctx)
.await?;
pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
} else if query_string.starts_with("import basebackup ") {
@@ -878,6 +925,7 @@ impl postgres_backend_async::Handler for PageServerHandler {
base_lsn,
end_lsn,
pg_version,
+ ctx,
)
.await
{
@@ -914,7 +962,7 @@ impl postgres_backend_async::Handler for PageServerHandler {
self.check_permission(Some(tenant_id))?;
match self
- .handle_import_wal(pgb, tenant_id, timeline_id, start_lsn, end_lsn)
+ .handle_import_wal(pgb, tenant_id, timeline_id, start_lsn, end_lsn, ctx)
.await
{
Ok(()) => pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?,
@@ -944,7 +992,7 @@ impl postgres_backend_async::Handler for PageServerHandler {
self.check_permission(Some(tenant_id))?;
- let tenant = get_active_tenant_with_timeout(tenant_id).await?;
+ let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
pgb.write_message(&BeMessage::RowDescription(&[
RowDescriptor::int8_col(b"checkpoint_distance"),
RowDescriptor::int8_col(b"checkpoint_timeout"),
@@ -990,27 +1038,66 @@ impl postgres_backend_async::Handler for PageServerHandler {
}
}
+#[derive(thiserror::Error, Debug)]
+enum GetActiveTenantError {
+ #[error(
+ "Timed out waiting {wait_time:?} for tenant active state. Latest state: {latest_state:?}"
+ )]
+ WaitForActiveTimeout {
+ latest_state: TenantState,
+ wait_time: Duration,
+ },
+ #[error(transparent)]
+ Other(#[from] anyhow::Error),
+}
+
+impl From<GetActiveTenantError> for QueryError {
+ fn from(e: GetActiveTenantError) -> Self {
+ match e {
+ GetActiveTenantError::WaitForActiveTimeout { .. } => QueryError::Disconnected(
+ ConnectionError::Socket(io::Error::new(io::ErrorKind::TimedOut, e.to_string())),
+ ),
+ GetActiveTenantError::Other(e) => QueryError::Other(e),
+ }
+ }
+}
+
/// Get active tenant.
///
/// If the tenant is Loading, waits for it to become Active, for up to 30 s. That
/// ensures that queries don't fail immediately after pageserver startup, because
/// all tenants are still loading.
-async fn get_active_tenant_with_timeout(tenant_id: TenantId) -> anyhow::Result<Arc<Tenant>> {
+async fn get_active_tenant_with_timeout(
+ tenant_id: TenantId,
+ _ctx: &RequestContext, /* takes a context now so that cancellation can be supported in the future */
+) -> Result<Arc<Tenant>, GetActiveTenantError> {
let tenant = mgr::get_tenant(tenant_id, false).await?;
- match tokio::time::timeout(Duration::from_secs(30), tenant.wait_to_become_active()).await {
- Ok(wait_result) => wait_result
- // no .context(), the error message is good enough and some tests depend on it
- .map(move |()| tenant),
- Err(_) => anyhow::bail!("Timeout waiting for tenant {tenant_id} to become Active"),
+ let wait_time = Duration::from_secs(30);
+ match tokio::time::timeout(wait_time, tenant.wait_to_become_active()).await {
+ Ok(Ok(())) => Ok(tenant),
+ // no .context(), the error message is good enough and some tests depend on it
+ Ok(Err(wait_error)) => Err(GetActiveTenantError::Other(wait_error)),
+ Err(_) => {
+ let latest_state = tenant.current_state();
+ if latest_state == TenantState::Active {
+ Ok(tenant)
+ } else {
+ Err(GetActiveTenantError::WaitForActiveTimeout {
+ latest_state,
+ wait_time,
+ })
+ }
+ }
}
}
/// Shorthand for getting a reference to a Timeline of an Active tenant.
-async fn get_active_timeline_with_timeout(
+async fn get_active_tenant_timeline(
tenant_id: TenantId,
timeline_id: TimelineId,
-) -> anyhow::Result<Arc<Timeline>> {
- get_active_tenant_with_timeout(tenant_id)
- .await
- .and_then(|tenant| tenant.get_timeline(timeline_id, true))
+ ctx: &RequestContext,
+) -> Result<Arc<Timeline>, GetActiveTenantError> {
+ let tenant = get_active_tenant_with_timeout(tenant_id, ctx).await?;
+ let timeline = tenant.get_timeline(timeline_id, true)?;
+ Ok(timeline)
}
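
A hedged sketch of how a handler is expected to use these helpers (the enclosing function is illustrative): the `?` operator routes a 30 s activation timeout through the `From<GetActiveTenantError> for QueryError` impl above, so it reaches the client as a timed-out connection error rather than a generic failure.

async fn example_timeline_query(
    tenant_id: TenantId,
    timeline_id: TimelineId,
    ctx: &RequestContext,
) -> Result<(), QueryError> {
    let timeline = get_active_tenant_timeline(tenant_id, timeline_id, ctx).await?;
    let _last_record_lsn = timeline.get_last_record_lsn(); // e.g. inspect the timeline
    Ok(())
}
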
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index cc521c5e35..6f9035305d 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -7,6 +7,7 @@
//! Clarify that)
//!
use super::tenant::{PageReconstructError, Timeline};
+use crate::context::RequestContext;
use crate::keyspace::{KeySpace, KeySpaceAccum};
use crate::repository::*;
use crate::walrecord::NeonWalRecord;
@@ -97,6 +98,7 @@ impl Timeline {
blknum: BlockNumber,
lsn: Lsn,
latest: bool,
+ ctx: &RequestContext,
) -> Result {
if tag.relnode == 0 {
return Err(PageReconstructError::Other(anyhow::anyhow!(
@@ -104,7 +106,7 @@ impl Timeline {
)));
}
- let nblocks = self.get_rel_size(tag, lsn, latest).await?;
+ let nblocks = self.get_rel_size(tag, lsn, latest, ctx).await?;
if blknum >= nblocks {
debug!(
"read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page",
@@ -114,7 +116,7 @@ impl Timeline {
}
let key = rel_block_to_key(tag, blknum);
- self.get(key, lsn).await
+ self.get(key, lsn, ctx).await
}
// Get size of a database in blocks
@@ -124,13 +126,14 @@ impl Timeline {
dbnode: Oid,
lsn: Lsn,
latest: bool,
+ ctx: &RequestContext,
) -> Result {
let mut total_blocks = 0;
- let rels = self.list_rels(spcnode, dbnode, lsn).await?;
+ let rels = self.list_rels(spcnode, dbnode, lsn, ctx).await?;
for rel in rels {
- let n_blocks = self.get_rel_size(rel, lsn, latest).await?;
+ let n_blocks = self.get_rel_size(rel, lsn, latest, ctx).await?;
total_blocks += n_blocks as usize;
}
Ok(total_blocks)
@@ -142,6 +145,7 @@ impl Timeline {
tag: RelTag,
lsn: Lsn,
latest: bool,
+ ctx: &RequestContext,
) -> Result {
if tag.relnode == 0 {
return Err(PageReconstructError::Other(anyhow::anyhow!(
@@ -154,7 +158,7 @@ impl Timeline {
}
if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM)
- && !self.get_rel_exists(tag, lsn, latest).await?
+ && !self.get_rel_exists(tag, lsn, latest, ctx).await?
{
// FIXME: Postgres sometimes calls smgrcreate() to create
// FSM, and smgrnblocks() on it immediately afterwards,
@@ -164,7 +168,7 @@ impl Timeline {
}
let key = rel_size_to_key(tag);
- let mut buf = self.get(key, lsn).await?;
+ let mut buf = self.get(key, lsn, ctx).await?;
let nblocks = buf.get_u32_le();
if latest {
@@ -186,6 +190,7 @@ impl Timeline {
tag: RelTag,
lsn: Lsn,
_latest: bool,
+ ctx: &RequestContext,
) -> Result {
if tag.relnode == 0 {
return Err(PageReconstructError::Other(anyhow::anyhow!(
@@ -199,7 +204,7 @@ impl Timeline {
}
// fetch directory listing
let key = rel_dir_to_key(tag.spcnode, tag.dbnode);
- let buf = self.get(key, lsn).await?;
+ let buf = self.get(key, lsn, ctx).await?;
match RelDirectory::des(&buf).context("deserialization failure") {
Ok(dir) => {
@@ -216,10 +221,11 @@ impl Timeline {
spcnode: Oid,
dbnode: Oid,
lsn: Lsn,
+ ctx: &RequestContext,
) -> Result, PageReconstructError> {
// fetch directory listing
let key = rel_dir_to_key(spcnode, dbnode);
- let buf = self.get(key, lsn).await?;
+ let buf = self.get(key, lsn, ctx).await?;
match RelDirectory::des(&buf).context("deserialization failure") {
Ok(dir) => {
@@ -244,9 +250,10 @@ impl Timeline {
segno: u32,
blknum: BlockNumber,
lsn: Lsn,
+ ctx: &RequestContext,
) -> Result {
let key = slru_block_to_key(kind, segno, blknum);
- self.get(key, lsn).await
+ self.get(key, lsn, ctx).await
}
/// Get size of an SLRU segment
@@ -255,9 +262,10 @@ impl Timeline {
kind: SlruKind,
segno: u32,
lsn: Lsn,
+ ctx: &RequestContext,
) -> Result {
let key = slru_segment_size_to_key(kind, segno);
- let mut buf = self.get(key, lsn).await?;
+ let mut buf = self.get(key, lsn, ctx).await?;
Ok(buf.get_u32_le())
}
@@ -267,10 +275,11 @@ impl Timeline {
kind: SlruKind,
segno: u32,
lsn: Lsn,
+ ctx: &RequestContext,
) -> Result {
// fetch directory listing
let key = slru_dir_to_key(kind);
- let buf = self.get(key, lsn).await?;
+ let buf = self.get(key, lsn, ctx).await?;
match SlruSegmentDirectory::des(&buf).context("deserialization failure") {
Ok(dir) => {
@@ -291,6 +300,7 @@ impl Timeline {
pub async fn find_lsn_for_timestamp(
&self,
search_timestamp: TimestampTz,
+ ctx: &RequestContext,
) -> Result {
let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn();
let min_lsn = *gc_cutoff_lsn_guard;
@@ -313,6 +323,7 @@ impl Timeline {
Lsn(mid * 8),
&mut found_smaller,
&mut found_larger,
+ ctx,
)
.await?;
@@ -362,14 +373,18 @@ impl Timeline {
probe_lsn: Lsn,
found_smaller: &mut bool,
found_larger: &mut bool,
+ ctx: &RequestContext,
) -> Result {
- for segno in self.list_slru_segments(SlruKind::Clog, probe_lsn).await? {
+ for segno in self
+ .list_slru_segments(SlruKind::Clog, probe_lsn, ctx)
+ .await?
+ {
let nblocks = self
- .get_slru_segment_size(SlruKind::Clog, segno, probe_lsn)
+ .get_slru_segment_size(SlruKind::Clog, segno, probe_lsn, ctx)
.await?;
for blknum in (0..nblocks).rev() {
let clog_page = self
- .get_slru_page_at_lsn(SlruKind::Clog, segno, blknum, probe_lsn)
+ .get_slru_page_at_lsn(SlruKind::Clog, segno, blknum, probe_lsn, ctx)
.await?;
if clog_page.len() == BLCKSZ as usize + 8 {
@@ -394,11 +409,12 @@ impl Timeline {
&self,
kind: SlruKind,
lsn: Lsn,
+ ctx: &RequestContext,
) -> Result, PageReconstructError> {
// fetch directory entry
let key = slru_dir_to_key(kind);
- let buf = self.get(key, lsn).await?;
+ let buf = self.get(key, lsn, ctx).await?;
match SlruSegmentDirectory::des(&buf).context("deserialization failure") {
Ok(dir) => Ok(dir.segments),
Err(e) => Err(PageReconstructError::from(e)),
@@ -410,18 +426,21 @@ impl Timeline {
spcnode: Oid,
dbnode: Oid,
lsn: Lsn,
+ ctx: &RequestContext,
) -> Result {
let key = relmap_file_key(spcnode, dbnode);
- self.get(key, lsn).await
+ let buf = self.get(key, lsn, ctx).await?;
+ Ok(buf)
}
pub async fn list_dbdirs(
&self,
lsn: Lsn,
+ ctx: &RequestContext,
) -> Result, PageReconstructError> {
// fetch directory entry
- let buf = self.get(DBDIR_KEY, lsn).await?;
+ let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
match DbDirectory::des(&buf).context("deserialization failure") {
Ok(dir) => Ok(dir.dbdirs),
@@ -433,18 +452,20 @@ impl Timeline {
&self,
xid: TransactionId,
lsn: Lsn,
+ ctx: &RequestContext,
) -> Result {
let key = twophase_file_key(xid);
- let buf = self.get(key, lsn).await?;
+ let buf = self.get(key, lsn, ctx).await?;
Ok(buf)
}
pub async fn list_twophase_files(
&self,
lsn: Lsn,
+ ctx: &RequestContext,
) -> Result, PageReconstructError> {
// fetch directory entry
- let buf = self.get(TWOPHASEDIR_KEY, lsn).await?;
+ let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?;
match TwoPhaseDirectory::des(&buf).context("deserialization failure") {
Ok(dir) => Ok(dir.xids),
@@ -452,12 +473,20 @@ impl Timeline {
}
}
- pub async fn get_control_file(&self, lsn: Lsn) -> Result {
- self.get(CONTROLFILE_KEY, lsn).await
+ pub async fn get_control_file(
+ &self,
+ lsn: Lsn,
+ ctx: &RequestContext,
+ ) -> Result {
+ self.get(CONTROLFILE_KEY, lsn, ctx).await
}
- pub async fn get_checkpoint(&self, lsn: Lsn) -> Result {
- self.get(CHECKPOINT_KEY, lsn).await
+ pub async fn get_checkpoint(
+ &self,
+ lsn: Lsn,
+ ctx: &RequestContext,
+ ) -> Result {
+ self.get(CHECKPOINT_KEY, lsn, ctx).await
}
/// Does the same as get_current_logical_size but counted on demand.
@@ -469,15 +498,16 @@ impl Timeline {
&self,
lsn: Lsn,
cancel: CancellationToken,
+ ctx: &RequestContext,
) -> Result {
// Fetch list of database dirs and iterate them
- let buf = self.get(DBDIR_KEY, lsn).await.context("read dbdir")?;
+ let buf = self.get(DBDIR_KEY, lsn, ctx).await.context("read dbdir")?;
let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?;
let mut total_size: u64 = 0;
for (spcnode, dbnode) in dbdir.dbdirs.keys() {
for rel in self
- .list_rels(*spcnode, *dbnode, lsn)
+ .list_rels(*spcnode, *dbnode, lsn, ctx)
.await
.context("list rels")?
{
@@ -486,9 +516,9 @@ impl Timeline {
}
let relsize_key = rel_size_to_key(rel);
let mut buf = self
- .get(relsize_key, lsn)
+ .get(relsize_key, lsn, ctx)
.await
- .context("read relation size of {rel:?}")?;
+ .with_context(|| format!("read relation size of {rel:?}"))?;
let relsize = buf.get_u32_le();
total_size += relsize as u64;
@@ -501,7 +531,11 @@ impl Timeline {
/// Get a KeySpace that covers all the Keys that are in use at the given LSN.
/// Anything that's not listed maybe removed from the underlying storage (from
/// that LSN forwards).
- pub async fn collect_keyspace(&self, lsn: Lsn) -> anyhow::Result {
+ pub async fn collect_keyspace(
+ &self,
+ lsn: Lsn,
+ ctx: &RequestContext,
+ ) -> anyhow::Result {
// Iterate through key ranges, greedily packing them into partitions
let mut result = KeySpaceAccum::new();
@@ -509,7 +543,7 @@ impl Timeline {
result.add_key(DBDIR_KEY);
// Fetch list of database dirs and iterate them
- let buf = self.get(DBDIR_KEY, lsn).await?;
+ let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
let dbdir = DbDirectory::des(&buf).context("deserialization failure")?;
let mut dbs: Vec<(Oid, Oid)> = dbdir.dbdirs.keys().cloned().collect();
@@ -519,14 +553,14 @@ impl Timeline {
result.add_key(rel_dir_to_key(spcnode, dbnode));
let mut rels: Vec = self
- .list_rels(spcnode, dbnode, lsn)
+ .list_rels(spcnode, dbnode, lsn, ctx)
.await?
.into_iter()
.collect();
rels.sort_unstable();
for rel in rels {
let relsize_key = rel_size_to_key(rel);
- let mut buf = self.get(relsize_key, lsn).await?;
+ let mut buf = self.get(relsize_key, lsn, ctx).await?;
let relsize = buf.get_u32_le();
result.add_range(rel_block_to_key(rel, 0)..rel_block_to_key(rel, relsize));
@@ -542,13 +576,13 @@ impl Timeline {
] {
let slrudir_key = slru_dir_to_key(kind);
result.add_key(slrudir_key);
- let buf = self.get(slrudir_key, lsn).await?;
+ let buf = self.get(slrudir_key, lsn, ctx).await?;
let dir = SlruSegmentDirectory::des(&buf).context("deserialization failure")?;
let mut segments: Vec = dir.segments.iter().cloned().collect();
segments.sort_unstable();
for segno in segments {
let segsize_key = slru_segment_size_to_key(kind, segno);
- let mut buf = self.get(segsize_key, lsn).await?;
+ let mut buf = self.get(segsize_key, lsn, ctx).await?;
let segsize = buf.get_u32_le();
result.add_range(
@@ -560,7 +594,7 @@ impl Timeline {
// Then pg_twophase
result.add_key(TWOPHASEDIR_KEY);
- let buf = self.get(TWOPHASEDIR_KEY, lsn).await?;
+ let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?;
let twophase_dir = TwoPhaseDirectory::des(&buf).context("deserialization failure")?;
let mut xids: Vec = twophase_dir.xids.iter().cloned().collect();
xids.sort_unstable();
@@ -723,9 +757,10 @@ impl<'a> DatadirModification<'a> {
spcnode: Oid,
dbnode: Oid,
img: Bytes,
+ ctx: &RequestContext,
) -> anyhow::Result<()> {
// Add it to the directory (if it doesn't exist already)
- let buf = self.get(DBDIR_KEY).await?;
+ let buf = self.get(DBDIR_KEY, ctx).await?;
let mut dbdir = DbDirectory::des(&buf)?;
let r = dbdir.dbdirs.insert((spcnode, dbnode), true);
@@ -755,9 +790,10 @@ impl<'a> DatadirModification<'a> {
&mut self,
xid: TransactionId,
img: Bytes,
+ ctx: &RequestContext,
) -> anyhow::Result<()> {
// Add it to the directory entry
- let buf = self.get(TWOPHASEDIR_KEY).await?;
+ let buf = self.get(TWOPHASEDIR_KEY, ctx).await?;
let mut dir = TwoPhaseDirectory::des(&buf)?;
if !dir.xids.insert(xid) {
anyhow::bail!("twophase file for xid {} already exists", xid);
@@ -781,16 +817,21 @@ impl<'a> DatadirModification<'a> {
Ok(())
}
- pub async fn drop_dbdir(&mut self, spcnode: Oid, dbnode: Oid) -> anyhow::Result<()> {
+ pub async fn drop_dbdir(
+ &mut self,
+ spcnode: Oid,
+ dbnode: Oid,
+ ctx: &RequestContext,
+ ) -> anyhow::Result<()> {
let req_lsn = self.tline.get_last_record_lsn();
let total_blocks = self
.tline
- .get_db_size(spcnode, dbnode, req_lsn, true)
+ .get_db_size(spcnode, dbnode, req_lsn, true, ctx)
.await?;
// Remove entry from dbdir
- let buf = self.get(DBDIR_KEY).await?;
+ let buf = self.get(DBDIR_KEY, ctx).await?;
let mut dir = DbDirectory::des(&buf)?;
if dir.dbdirs.remove(&(spcnode, dbnode)).is_some() {
let buf = DbDirectory::ser(&dir)?;
@@ -817,11 +858,12 @@ impl<'a> DatadirModification<'a> {
&mut self,
rel: RelTag,
nblocks: BlockNumber,
+ ctx: &RequestContext,
) -> anyhow::Result<()> {
anyhow::ensure!(rel.relnode != 0, "invalid relnode");
// It's possible that this is the first rel for this db in this
// tablespace. Create the reldir entry for it if so.
- let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY).await?)?;
+ let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await?)?;
let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
let mut rel_dir = if dbdir.dbdirs.get(&(rel.spcnode, rel.dbnode)).is_none() {
// Didn't exist. Update dbdir
@@ -833,7 +875,7 @@ impl<'a> DatadirModification<'a> {
RelDirectory::default()
} else {
// reldir already exists, fetch it
- RelDirectory::des(&self.get(rel_dir_key).await?)?
+ RelDirectory::des(&self.get(rel_dir_key, ctx).await?)?
};
// Add the new relation to the rel directory entry, and write it back
@@ -865,13 +907,14 @@ impl<'a> DatadirModification<'a> {
&mut self,
rel: RelTag,
nblocks: BlockNumber,
+ ctx: &RequestContext,
) -> anyhow::Result<()> {
anyhow::ensure!(rel.relnode != 0, "invalid relnode");
let last_lsn = self.tline.get_last_record_lsn();
- if self.tline.get_rel_exists(rel, last_lsn, true).await? {
+ if self.tline.get_rel_exists(rel, last_lsn, true, ctx).await? {
let size_key = rel_size_to_key(rel);
// Fetch the old size first
- let old_size = self.get(size_key).await?.get_u32_le();
+ let old_size = self.get(size_key, ctx).await?.get_u32_le();
// Update the entry with the new size.
let buf = nblocks.to_le_bytes();
@@ -895,12 +938,13 @@ impl<'a> DatadirModification<'a> {
&mut self,
rel: RelTag,
nblocks: BlockNumber,
+ ctx: &RequestContext,
) -> anyhow::Result<()> {
anyhow::ensure!(rel.relnode != 0, "invalid relnode");
// Put size
let size_key = rel_size_to_key(rel);
- let old_size = self.get(size_key).await?.get_u32_le();
+ let old_size = self.get(size_key, ctx).await?.get_u32_le();
// only extend relation here. never decrease the size
if nblocks > old_size {
@@ -916,12 +960,12 @@ impl<'a> DatadirModification<'a> {
}
/// Drop a relation.
- pub async fn put_rel_drop(&mut self, rel: RelTag) -> anyhow::Result<()> {
+ pub async fn put_rel_drop(&mut self, rel: RelTag, ctx: &RequestContext) -> anyhow::Result<()> {
anyhow::ensure!(rel.relnode != 0, "invalid relnode");
// Remove it from the directory entry
let dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
- let buf = self.get(dir_key).await?;
+ let buf = self.get(dir_key, ctx).await?;
let mut dir = RelDirectory::des(&buf)?;
if dir.rels.remove(&(rel.relnode, rel.forknum)) {
@@ -932,7 +976,7 @@ impl<'a> DatadirModification<'a> {
// update logical size
let size_key = rel_size_to_key(rel);
- let old_size = self.get(size_key).await?.get_u32_le();
+ let old_size = self.get(size_key, ctx).await?.get_u32_le();
self.pending_nblocks -= old_size as i64;
// Remove enty from relation size cache
@@ -949,10 +993,11 @@ impl<'a> DatadirModification<'a> {
kind: SlruKind,
segno: u32,
nblocks: BlockNumber,
+ ctx: &RequestContext,
) -> anyhow::Result<()> {
// Add it to the directory entry
let dir_key = slru_dir_to_key(kind);
- let buf = self.get(dir_key).await?;
+ let buf = self.get(dir_key, ctx).await?;
let mut dir = SlruSegmentDirectory::des(&buf)?;
if !dir.segments.insert(segno) {
@@ -988,10 +1033,15 @@ impl<'a> DatadirModification<'a> {
}
/// This method is used for marking truncated SLRU files
- pub async fn drop_slru_segment(&mut self, kind: SlruKind, segno: u32) -> anyhow::Result<()> {
+ pub async fn drop_slru_segment(
+ &mut self,
+ kind: SlruKind,
+ segno: u32,
+ ctx: &RequestContext,
+ ) -> anyhow::Result<()> {
// Remove it from the directory entry
let dir_key = slru_dir_to_key(kind);
- let buf = self.get(dir_key).await?;
+ let buf = self.get(dir_key, ctx).await?;
let mut dir = SlruSegmentDirectory::des(&buf)?;
if !dir.segments.remove(&segno) {
@@ -1015,9 +1065,13 @@ impl<'a> DatadirModification<'a> {
}
/// This method is used for marking truncated SLRU files
- pub async fn drop_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> {
+ pub async fn drop_twophase_file(
+ &mut self,
+ xid: TransactionId,
+ ctx: &RequestContext,
+ ) -> anyhow::Result<()> {
// Remove it from the directory entry
- let buf = self.get(TWOPHASEDIR_KEY).await?;
+ let buf = self.get(TWOPHASEDIR_KEY, ctx).await?;
let mut dir = TwoPhaseDirectory::des(&buf)?;
if !dir.xids.remove(&xid) {
@@ -1111,7 +1165,7 @@ impl<'a> DatadirModification<'a> {
// Internal helper functions to batch the modifications
- async fn get(&self, key: Key) -> Result {
+ async fn get(&self, key: Key, ctx: &RequestContext) -> Result {
// Have we already updated the same key? Read the pending updated
// version in that case.
//
@@ -1132,7 +1186,7 @@ impl<'a> DatadirModification<'a> {
}
} else {
let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
- self.tline.get(key, lsn).await
+ self.tline.get(key, lsn, ctx).await
}
}
@@ -1542,10 +1596,11 @@ pub fn create_test_timeline(
tenant: &crate::tenant::Tenant,
timeline_id: utils::id::TimelineId,
pg_version: u32,
+ ctx: &RequestContext,
) -> anyhow::Result> {
let tline = tenant
- .create_empty_timeline(timeline_id, Lsn(8), pg_version)?
- .initialize()?;
+ .create_empty_timeline(timeline_id, Lsn(8), pg_version, ctx)?
+ .initialize(ctx)?;
let mut m = tline.begin_modification(Lsn(8));
m.init_empty()?;
m.commit()?;
@@ -1598,7 +1653,7 @@ mod tests {
assert!(tline.list_rels(0, TESTDB, Lsn(0x30))?.contains(&TESTREL_A));
// Create a branch, check that the relation is visible there
- repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x30))?;
+ repo.branch_timeline(&tline, NEW_TIMELINE_ID, Lsn(0x30))?;
let newtline = match repo.get_timeline(NEW_TIMELINE_ID)?.local_timeline() {
Some(timeline) => timeline,
None => panic!("Should have a local timeline"),
diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs
index 586fd20886..092503b7c5 100644
--- a/pageserver/src/repository.rs
+++ b/pageserver/src/repository.rs
@@ -37,6 +37,17 @@ impl Key {
| self.field6 as i128
}
+ pub fn from_i128(x: i128) -> Self {
+ Key {
+ field1: ((x >> 120) & 0xf) as u8,
+ field2: ((x >> 104) & 0xFFFF) as u32,
+ field3: (x >> 72) as u32,
+ field4: (x >> 40) as u32,
+ field5: (x >> 32) as u8,
+ field6: x as u32,
+ }
+ }
+
pub fn next(&self) -> Key {
self.add(1)
}
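
A sketch of the invariant `from_i128` is meant to uphold (illustrative test, not part of the patch), assuming each field fits within the bit widths of the packed layout:

#[cfg(test)]
mod key_i128_round_trip_sketch {
    use super::*;

    #[test]
    fn round_trips_through_i128() {
        let key = Key { field1: 1, field2: 2, field3: 3, field4: 4, field5: 5, field6: 6 };
        // Compare the packed representations so the check only relies on to_i128.
        assert_eq!(Key::from_i128(key.to_i128()).to_i128(), key.to_i128());
    }
}
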
diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs
index 02e2e2ee14..09716ba0e0 100644
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -171,6 +171,9 @@ task_local! {
///
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub enum TaskKind {
+ // Pageserver startup, i.e., `main`
+ Startup,
+
// libpq listener task. It just accepts connection and spawns a
// PageRequestHandler task for each connection.
LibpqEndpointListener,
@@ -183,13 +186,37 @@ pub enum TaskKind {
// associated with one later, after receiving a command from the client.
PageRequestHandler,
- // Manages the WAL receiver connection for one timeline. It subscribes to
- // events from storage_broker, decides which safekeeper to connect to. It spawns a
- // separate WalReceiverConnection task to handle each connection.
+ /// Manages the WAL receiver connection for one timeline.
+ /// It subscribes to events from storage_broker and decides which safekeeper to connect to.
+ /// Once the decision has been made, it establishes the connection using the `tokio-postgres` library.
+ /// There is at most one connection at any given time.
+ ///
+ /// The `tokio-postgres` library represents a connection as two objects: a `Client` and a `Connection`.
+ /// The `Client` object is what library users use to make requests & get responses.
+ /// Internally, `Client` hands over requests to the `Connection` object.
+ /// The `Connection` object is responsible for speaking the wire protocol.
+ ///
+ /// Walreceiver uses its own abstraction called `TaskHandle` to represent the activity of establishing and handling a connection.
+ /// That abstraction doesn't use `task_mgr`.
+ /// The [`WalReceiverManager`] task ensures that this `TaskHandle` task does not outlive it.
+ /// For the `RequestContext` that we hand to the TaskHandle, we use the [`WalReceiverConnectionHandler`] task kind.
+ ///
+ /// Once the connection is established, the `TaskHandle` task creates a
+ /// [`WalReceiverConnectionPoller`] task_mgr task that is responsible for polling
+ /// the `Connection` object.
+ /// A `CancellationToken` created by the `TaskHandle` task ensures
+ /// that the [`WalReceiverConnectionPoller`] task is cancelled soon after the `TaskHandle` is dropped.
WalReceiverManager,
- // Handles a connection to a safekeeper, to stream WAL to a timeline.
- WalReceiverConnection,
+ /// The `TaskHandle` task that executes [`walreceiver_connection::handle_walreceiver_connection`].
+ /// Not a `task_mgr` task, but we use this `TaskKind` for its `RequestContext`.
+ /// See the comment on [`WalReceiverManager`].
+ WalReceiverConnectionHandler,
+
+ /// The task that polls the `tokio-postgres::Connection` object.
+ /// Spawned by task [`WalReceiverConnectionHandler`].
+ /// See the comment on [`WalReceiverManager`].
+ WalReceiverConnectionPoller,
// Garbage collection worker. One per tenant
GarbageCollector,
@@ -200,6 +227,8 @@ pub enum TaskKind {
// Initial logical size calculation
InitialLogicalSizeCalculation,
+ OndemandLogicalSizeCalculation,
+
// Task that flushes frozen in-memory layers to disk
LayerFlushTask,
@@ -222,6 +251,12 @@ pub enum TaskKind {
DownloadAllRemoteLayers,
 // Task that calculates synthetic size for all active tenants
CalculateSyntheticSize,
+
+ // A request that comes in via the pageserver HTTP API.
+ MgmtRequest,
+
+ #[cfg(test)]
+ UnitTest,
}
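
A hedged sketch of how one of the new task kinds pairs with a `RequestContext`, following the `task_mgr::spawn` pattern used for `TaskKind::Attach` and `TaskKind::InitialLoad` elsewhere in this diff (the function, task name, and closure body are illustrative):

fn spawn_example_mgmt_task(parent_ctx: &RequestContext, tenant_id: TenantId) {
    let ctx = parent_ctx.detached_child(TaskKind::MgmtRequest, DownloadBehavior::Warn);
    let _ = task_mgr::spawn(
        &tokio::runtime::Handle::current(),
        TaskKind::MgmtRequest,
        Some(tenant_id),
        None,
        "example management request",
        false,
        async move {
            // ... pass `&ctx` to the pageserver APIs that now take a context ...
            drop(ctx);
            Ok(())
        },
    );
}
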
#[derive(Default)]
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index c18c645e5b..2f45fe0dfc 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -48,9 +48,10 @@ use std::time::{Duration, Instant};
use self::metadata::TimelineMetadata;
use self::remote_timeline_client::RemoteTimelineClient;
use crate::config::PageServerConf;
+use crate::context::{DownloadBehavior, RequestContext};
use crate::import_datadir;
use crate::is_uninit_mark;
-use crate::metrics::{remove_tenant_metrics, STORAGE_TIME};
+use crate::metrics::{remove_tenant_metrics, TENANT_STATE_METRIC};
use crate::repository::GcResult;
use crate::task_mgr;
use crate::task_mgr::TaskKind;
@@ -174,7 +175,7 @@ impl UninitializedTimeline<'_> {
///
/// The new timeline is initialized in Active state, and its background jobs are
/// started
- pub fn initialize(self) -> anyhow::Result> {
+ pub fn initialize(self, _ctx: &RequestContext) -> anyhow::Result> {
let mut timelines = self.owning_tenant.timelines.lock().unwrap();
self.initialize_with_lock(&mut timelines, true, true)
}
@@ -188,7 +189,7 @@ impl UninitializedTimeline<'_> {
mut self,
timelines: &mut HashMap>,
load_layer_map: bool,
- launch_wal_receiver: bool,
+ activate: bool,
) -> anyhow::Result> {
let timeline_id = self.timeline_id;
let tenant_id = self.owning_tenant.tenant_id;
@@ -221,13 +222,12 @@ impl UninitializedTimeline<'_> {
"Failed to remove uninit mark file for timeline {tenant_id}/{timeline_id}"
)
})?;
- new_timeline.set_state(TimelineState::Active);
v.insert(Arc::clone(&new_timeline));
new_timeline.maybe_spawn_flush_loop();
- if launch_wal_receiver {
- new_timeline.launch_wal_receiver();
+ if activate {
+ new_timeline.activate();
}
}
}
@@ -240,11 +240,12 @@ impl UninitializedTimeline<'_> {
self,
copyin_stream: &mut (impl Stream- > + Sync + Send + Unpin),
base_lsn: Lsn,
+ ctx: &RequestContext,
) -> anyhow::Result
> {
let raw_timeline = self.raw_timeline()?;
let mut reader = tokio_util::io::StreamReader::new(copyin_stream);
- import_datadir::import_basebackup_from_tar(raw_timeline, &mut reader, base_lsn)
+ import_datadir::import_basebackup_from_tar(raw_timeline, &mut reader, base_lsn, ctx)
.await
.context("Failed to import basebackup")?;
@@ -262,9 +263,7 @@ impl UninitializedTimeline<'_> {
.await
.context("Failed to flush after basebackup import")?;
- let timeline = self.initialize()?;
-
- Ok(timeline)
+ self.initialize(ctx)
}
fn raw_timeline(&self) -> anyhow::Result<&Arc> {
@@ -450,6 +449,7 @@ impl Tenant {
///
/// If the operation fails, the timeline is left in the tenant's hash map in Broken state. On success,
/// it is marked as Active.
+ #[allow(clippy::too_many_arguments)]
async fn timeline_init_and_sync(
&self,
timeline_id: TimelineId,
@@ -458,6 +458,7 @@ impl Tenant {
local_metadata: Option,
ancestor: Option>,
first_save: bool,
+ _ctx: &RequestContext,
) -> anyhow::Result<()> {
let tenant_id = self.tenant_id;
@@ -573,6 +574,7 @@ impl Tenant {
conf: &'static PageServerConf,
tenant_id: TenantId,
remote_storage: GenericRemoteStorage,
+ ctx: &RequestContext,
) -> Arc {
// XXX: Attach should provide the config, especially during tenant migration.
// See https://github.com/neondatabase/neon/issues/1555
@@ -591,6 +593,7 @@ impl Tenant {
// Do all the hard work in the background
let tenant_clone = Arc::clone(&tenant);
+ let ctx = ctx.detached_child(TaskKind::Attach, DownloadBehavior::Warn);
task_mgr::spawn(
&tokio::runtime::Handle::current(),
TaskKind::Attach,
@@ -599,7 +602,7 @@ impl Tenant {
"attach tenant",
false,
async move {
- match tenant_clone.attach().await {
+ match tenant_clone.attach(ctx).await {
Ok(_) => {}
Err(e) => {
tenant_clone.set_broken(&e.to_string());
@@ -615,8 +618,8 @@ impl Tenant {
///
/// Background task that downloads all data for a tenant and brings it to Active state.
///
- #[instrument(skip(self), fields(tenant_id=%self.tenant_id))]
- async fn attach(self: &Arc) -> anyhow::Result<()> {
+ #[instrument(skip(self, ctx), fields(tenant_id=%self.tenant_id))]
+ async fn attach(self: &Arc, ctx: RequestContext) -> anyhow::Result<()> {
// Create directory with marker file to indicate attaching state.
// The load_local_tenants() function in tenant::mgr relies on the marker file
// to determine whether a tenant has finished attaching.
@@ -716,6 +719,7 @@ impl Tenant {
index_parts.remove(&timeline_id).unwrap(),
remote_metadata,
remote_clients.remove(&timeline_id).unwrap(),
+ &ctx,
)
.await
.with_context(|| {
@@ -765,6 +769,7 @@ impl Tenant {
index_part: IndexPart,
remote_metadata: TimelineMetadata,
remote_client: RemoteTimelineClient,
+ ctx: &RequestContext,
) -> anyhow::Result<()> {
info!("downloading index file for timeline {}", timeline_id);
tokio::fs::create_dir_all(self.conf.timeline_path(&timeline_id, &self.tenant_id))
@@ -799,6 +804,7 @@ impl Tenant {
local_metadata,
ancestor,
true,
+ ctx,
)
.await
}
@@ -827,11 +833,12 @@ impl Tenant {
/// If the loading fails for some reason, the Tenant will go into Broken
/// state.
///
- #[instrument(skip(conf, remote_storage), fields(tenant_id=%tenant_id))]
+ #[instrument(skip(conf, remote_storage, ctx), fields(tenant_id=%tenant_id))]
pub fn spawn_load(
conf: &'static PageServerConf,
tenant_id: TenantId,
remote_storage: Option,
+ ctx: &RequestContext,
) -> Arc {
let tenant_conf = match Self::load_tenant_config(conf, tenant_id) {
Ok(conf) => conf,
@@ -855,6 +862,7 @@ impl Tenant {
// Do all the hard work in a background task
let tenant_clone = Arc::clone(&tenant);
+ let ctx = ctx.detached_child(TaskKind::InitialLoad, DownloadBehavior::Warn);
let _ = task_mgr::spawn(
&tokio::runtime::Handle::current(),
TaskKind::InitialLoad,
@@ -863,7 +871,7 @@ impl Tenant {
"initial tenant load",
false,
async move {
- match tenant_clone.load().await {
+ match tenant_clone.load(&ctx).await {
Ok(()) => {}
Err(err) => {
tenant_clone.set_broken(&err.to_string());
@@ -884,8 +892,8 @@ impl Tenant {
/// Background task to load in-memory data structures for this tenant, from
/// files on disk. Used at pageserver startup.
///
- #[instrument(skip(self), fields(tenant_id=%self.tenant_id))]
- async fn load(self: &Arc) -> anyhow::Result<()> {
+ #[instrument(skip(self, ctx), fields(tenant_id=%self.tenant_id))]
+ async fn load(self: &Arc, ctx: &RequestContext) -> anyhow::Result<()> {
info!("loading tenant task");
utils::failpoint_sleep_millis_async!("before-loading-tenant");
@@ -996,7 +1004,7 @@ impl Tenant {
// 1. "Timeline has no ancestor and no layer files"
for (timeline_id, local_metadata) in sorted_timelines {
- self.load_local_timeline(timeline_id, local_metadata)
+ self.load_local_timeline(timeline_id, local_metadata, ctx)
.await
.with_context(|| format!("load local timeline {timeline_id}"))?;
}
@@ -1013,11 +1021,12 @@ impl Tenant {
/// Subroutine of `load_tenant`, to load an individual timeline
///
/// NB: The parent is assumed to be already loaded!
- #[instrument(skip(self, local_metadata), fields(timeline_id=%timeline_id))]
+ #[instrument(skip(self, local_metadata, ctx), fields(timeline_id=%timeline_id))]
async fn load_local_timeline(
&self,
timeline_id: TimelineId,
local_metadata: TimelineMetadata,
+ ctx: &RequestContext,
) -> anyhow::Result<()> {
let ancestor = if let Some(ancestor_timeline_id) = local_metadata.ancestor_timeline() {
let ancestor_timeline = self.get_timeline(ancestor_timeline_id, false)
@@ -1061,6 +1070,7 @@ impl Tenant {
Some(local_metadata),
ancestor,
false,
+ ctx,
)
.await
}
@@ -1112,6 +1122,7 @@ impl Tenant {
new_timeline_id: TimelineId,
initdb_lsn: Lsn,
pg_version: u32,
+ _ctx: &RequestContext,
) -> anyhow::Result {
anyhow::ensure!(
self.is_active(),
@@ -1153,6 +1164,7 @@ impl Tenant {
ancestor_timeline_id: Option,
mut ancestor_start_lsn: Option,
pg_version: u32,
+ ctx: &RequestContext,
) -> anyhow::Result>> {
anyhow::ensure!(
self.is_active(),
@@ -1190,13 +1202,16 @@ impl Tenant {
// decoding the new WAL might need to look up previous pages, relation
// sizes etc. and that would get confused if the previous page versions
// are not in the repository yet.
- ancestor_timeline.wait_lsn(*lsn).await?;
+ ancestor_timeline.wait_lsn(*lsn, ctx).await?;
}
- self.branch_timeline(ancestor_timeline_id, new_timeline_id, ancestor_start_lsn)
+ self.branch_timeline(&ancestor_timeline, new_timeline_id, ancestor_start_lsn, ctx)
+ .await?
+ }
+ None => {
+ self.bootstrap_timeline(new_timeline_id, pg_version, ctx)
.await?
}
- None => self.bootstrap_timeline(new_timeline_id, pg_version).await?,
};
Ok(Some(loaded_timeline))
@@ -1220,30 +1235,25 @@ impl Tenant {
target_timeline_id: Option,
horizon: u64,
pitr: Duration,
+ ctx: &RequestContext,
) -> anyhow::Result {
anyhow::ensure!(
self.is_active(),
"Cannot run GC iteration on inactive tenant"
);
- let timeline_str = target_timeline_id
- .map(|x| x.to_string())
- .unwrap_or_else(|| "-".to_string());
+ let gc_result = self
+ .gc_iteration_internal(target_timeline_id, horizon, pitr, ctx)
+ .await;
- {
- let _timer = STORAGE_TIME
- .with_label_values(&["gc", &self.tenant_id.to_string(), &timeline_str])
- .start_timer();
- self.gc_iteration_internal(target_timeline_id, horizon, pitr)
- .await
- }
+ gc_result
}
/// Perform one compaction iteration.
/// This function is periodically called by compactor task.
/// Also it can be explicitly requested per timeline through page server
/// api's 'compact' command.
- pub async fn compaction_iteration(&self) -> anyhow::Result<()> {
+ pub async fn compaction_iteration(&self, ctx: &RequestContext) -> anyhow::Result<()> {
anyhow::ensure!(
self.is_active(),
"Cannot run compaction iteration on inactive tenant"
@@ -1265,7 +1275,7 @@ impl Tenant {
for (timeline_id, timeline) in &timelines_to_compact {
timeline
- .compact()
+ .compact(ctx)
.instrument(info_span!("compact_timeline", timeline = %timeline_id))
.await?;
}
@@ -1298,7 +1308,11 @@ impl Tenant {
}
/// Removes timeline-related in-memory data
- pub async fn delete_timeline(&self, timeline_id: TimelineId) -> anyhow::Result<()> {
+ pub async fn delete_timeline(
+ &self,
+ timeline_id: TimelineId,
+ _ctx: &RequestContext,
+ ) -> anyhow::Result<()> {
// Transition the timeline into TimelineState::Stopping.
// This should prevent new operations from starting.
let timeline = {
@@ -1462,8 +1476,7 @@ impl Tenant {
tasks::start_background_loops(self.tenant_id);
for timeline in not_broken_timelines {
- timeline.set_state(TimelineState::Active);
- timeline.launch_wal_receiver();
+ timeline.activate();
}
}
}
@@ -1487,7 +1500,7 @@ impl Tenant {
.values()
.filter(|timeline| timeline.current_state() != TimelineState::Broken);
for timeline in not_broken_timelines {
- timeline.set_state(TimelineState::Suspended);
+ timeline.set_state(TimelineState::Stopping);
}
}
TenantState::Broken => {
@@ -1717,7 +1730,33 @@ impl Tenant {
tenant_id: TenantId,
remote_storage: Option,
) -> Tenant {
- let (state, _) = watch::channel(state);
+ let (state, mut rx) = watch::channel(state);
+
+ tokio::spawn(async move {
+ let mut current_state = *rx.borrow_and_update();
+ let tid = tenant_id.to_string();
+ TENANT_STATE_METRIC
+ .with_label_values(&[&tid, current_state.as_str()])
+ .inc();
+ loop {
+ match rx.changed().await {
+ Ok(()) => {
+ let new_state = *rx.borrow();
+ TENANT_STATE_METRIC
+ .with_label_values(&[&tid, current_state.as_str()])
+ .dec();
+ TENANT_STATE_METRIC
+ .with_label_values(&[&tid, new_state.as_str()])
+ .inc();
+ // Remember the new state so the next transition decrements the correct label.
+ current_state = new_state;
+ }
+ Err(_sender_dropped_error) => {
+ info!("Tenant dropped the state updates sender, quitting waiting for tenant state change");
+ return;
+ }
+ }
+ }
+ });
+
Tenant {
tenant_id,
conf,
@@ -1776,69 +1815,70 @@ impl Tenant {
}
pub(super) fn persist_tenant_config(
+ tenant_id: &TenantId,
target_config_path: &Path,
tenant_conf: TenantConfOpt,
- first_save: bool,
+ creating_tenant: bool,
) -> anyhow::Result<()> {
let _enter = info_span!("saving tenantconf").entered();
- info!("persisting tenantconf to {}", target_config_path.display());
- // TODO this will prepend comments endlessly ?
- let mut conf_content = r#"# This file contains a specific per-tenant's config.
-# It is read in case of pageserver restart.
-
-[tenant_config]
-"#
- .to_string();
-
- // Convert the config to a toml file.
- conf_content += &toml_edit::easy::to_string(&tenant_conf)?;
-
- let mut target_config_file = VirtualFile::open_with_options(
- target_config_path,
- OpenOptions::new()
- .truncate(true) // This needed for overwriting with small config files
- .write(true)
- .create_new(first_save),
- )?;
-
- target_config_file
- .write(conf_content.as_bytes())
- .context("Failed to write toml bytes into file")
- .and_then(|_| {
- target_config_file
- .sync_all()
- .context("Faile to fsync config file")
- })
- .with_context(|| {
+ // imitate a try-block with a closure
+ let do_persist = |target_config_path: &Path| -> anyhow::Result<()> {
+ let target_config_parent = target_config_path.parent().with_context(|| {
format!(
- "Failed to write config file into path '{}'",
+ "Config path does not have a parent: {}",
target_config_path.display()
)
})?;
- // fsync the parent directory to ensure the directory entry is durable
- if first_save {
- target_config_path
- .parent()
- .context("Config file does not have a parent")
- .and_then(|target_config_parent| {
- File::open(target_config_parent).context("Failed to open config parent")
- })
- .and_then(|tenant_dir| {
- tenant_dir
- .sync_all()
- .context("Failed to fsync config parent")
- })
- .with_context(|| {
- format!(
- "Failed to fsync on first save for config {}",
- target_config_path.display()
- )
- })?;
- }
+ info!("persisting tenantconf to {}", target_config_path.display());
- Ok(())
+ let mut conf_content = r#"# This file contains a specific per-tenant's config.
+# It is read in case of pageserver restart.
+
+[tenant_config]
+"#
+ .to_string();
+
+ // Convert the config to a toml file.
+ conf_content += &toml_edit::easy::to_string(&tenant_conf)?;
+
+ let mut target_config_file = VirtualFile::open_with_options(
+ target_config_path,
+ OpenOptions::new()
+                .truncate(true) // This is needed for overwriting with smaller config files
+ .write(true)
+ .create_new(creating_tenant)
+                // when creating a new tenant, `creating_tenant` will be true and `.create(true)`
+                // will be ignored (per rust std docs).
+                //
+                // later, when updating the config of an existing tenant or persisting the config
+                // for the first time for an attached tenant, `.create(true)` is what takes effect.
+ .create(true),
+ )?;
+
+ target_config_file
+ .write(conf_content.as_bytes())
+ .context("write toml bytes into file")
+ .and_then(|_| target_config_file.sync_all().context("fsync config file"))
+ .context("write config file")?;
+
+ // fsync the parent directory to ensure the directory entry is durable.
+        // Before, this was done conditionally on `creating_tenant`, but these management actions
+        // are rare enough to just always fsync.
+
+ crashsafe::fsync(target_config_parent)?;
+ Ok(())
+ };
+
+    // This function is called both when creating the tenant and when updating the tenant config;
+    // both call sites would otherwise have to attach this error context, so keep it here in one place.
+ do_persist(target_config_path).with_context(|| {
+ format!(
+ "write tenant {tenant_id} config to {}",
+ target_config_path.display()
+ )
+ })
}
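
The `do_persist` closure above is the usual Rust way to imitate a try-block: every fallible step lives inside one closure that returns anyhow::Result, and the single call site attaches the error context shared by all callers. A minimal, self-contained sketch of the same idiom follows; the write_note helper and its paths are hypothetical and not part of this patch.

use std::{fs, path::Path};

use anyhow::Context;

// Hypothetical helper, for illustration only: all fallible steps go through
// `do_write`, and the caller attaches one shared error context at the end.
fn write_note(path: &Path, text: &str) -> anyhow::Result<()> {
    let do_write = |path: &Path| -> anyhow::Result<()> {
        let parent = path
            .parent()
            .with_context(|| format!("path has no parent: {}", path.display()))?;
        fs::create_dir_all(parent).context("create parent directory")?;
        fs::write(path, text).context("write file contents")?;
        Ok(())
    };

    // One place to attach the context shared by every caller.
    do_write(path).with_context(|| format!("write note to {}", path.display()))
}
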
//
@@ -1871,12 +1911,13 @@ impl Tenant {
target_timeline_id: Option<TimelineId>,
horizon: u64,
pitr: Duration,
+ ctx: &RequestContext,
) -> anyhow::Result<GcResult> {
let mut totals: GcResult = Default::default();
let now = Instant::now();
let gc_timelines = self
- .refresh_gc_info_internal(target_timeline_id, horizon, pitr)
+ .refresh_gc_info_internal(target_timeline_id, horizon, pitr, ctx)
.await?;
utils::failpoint_sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines");
@@ -1917,7 +1958,10 @@ impl Tenant {
/// [`Tenant::get_gc_horizon`].
///
/// This is usually executed as part of periodic gc, but can now be triggered more often.
-    pub async fn refresh_gc_info(&self) -> anyhow::Result<Vec<Arc<Timeline>>> {
+ pub async fn refresh_gc_info(
+ &self,
+ ctx: &RequestContext,
+    ) -> anyhow::Result<Vec<Arc<Timeline>>> {
// since this method can now be called at different rates than the configured gc loop, it
// might be that these configuration values get applied faster than what it was previously,
// since these were only read from the gc task.
@@ -1927,7 +1971,7 @@ impl Tenant {
// refresh all timelines
let target_timeline_id = None;
- self.refresh_gc_info_internal(target_timeline_id, horizon, pitr)
+ self.refresh_gc_info_internal(target_timeline_id, horizon, pitr, ctx)
.await
}
@@ -1936,6 +1980,7 @@ impl Tenant {
target_timeline_id: Option<TimelineId>,
horizon: u64,
pitr: Duration,
+ ctx: &RequestContext,
) -> anyhow::Result<Vec<Arc<Timeline>>> {
// grab mutex to prevent new timelines from being created here.
let gc_cs = self.gc_cs.lock().await;
@@ -2007,7 +2052,9 @@ impl Tenant {
))
.map(|&x| x.1)
.collect();
- timeline.update_gc_info(branchpoints, cutoff, pitr).await?;
+ timeline
+ .update_gc_info(branchpoints, cutoff, pitr, ctx)
+ .await?;
gc_timelines.push(timeline);
}
@@ -2019,53 +2066,53 @@ impl Tenant {
/// Branch an existing timeline
async fn branch_timeline(
&self,
- src: TimelineId,
- dst: TimelineId,
+        src_timeline: &Arc<Timeline>,
+        dst_id: TimelineId,
start_lsn: Option<Lsn>,
+        _ctx: &RequestContext,
) -> anyhow::Result<Arc<Timeline>> {
- // We need to hold this lock to prevent GC from starting at the same time. GC scans the directory to learn
- // about timelines, so otherwise a race condition is possible, where we create new timeline and GC
- // concurrently removes data that is needed by the new timeline.
- let _gc_cs = self.gc_cs.lock().await;
- let timeline_uninit_mark = {
- let timelines = self.timelines.lock().unwrap();
- self.create_timeline_uninit_mark(dst, &timelines)?
- };
-
- // In order for the branch creation task to not wait for GC/compaction,
- // we need to make sure that the starting LSN of the child branch is not out of scope midway by
- //
- // 1. holding the GC lock to prevent overwritting timeline's GC data
- // 2. checking both the latest GC cutoff LSN and latest GC info of the source timeline
- //
- // Step 2 is to avoid initializing the new branch using data removed by past GC iterations
- // or in-queue GC iterations.
-
- let src_timeline = self.get_timeline(src, false).with_context(|| {
- format!(
- "No ancestor {} found for timeline {}/{}",
- src, self.tenant_id, dst
- )
- })?;
-
- let latest_gc_cutoff_lsn = src_timeline.get_latest_gc_cutoff_lsn();
+ let src_id = src_timeline.timeline_id;
// If no start LSN is specified, we branch the new timeline from the source timeline's last record LSN
let start_lsn = start_lsn.unwrap_or_else(|| {
let lsn = src_timeline.get_last_record_lsn();
- info!("branching timeline {dst} from timeline {src} at last record LSN: {lsn}");
+ info!("branching timeline {dst_id} from timeline {src_id} at last record LSN: {lsn}");
lsn
});
- // Check if the starting LSN is out of scope because it is less than
- // 1. the latest GC cutoff LSN or
- // 2. the planned GC cutoff LSN, which is from an in-queue GC iteration.
+ // First acquire the GC lock so that another task cannot advance the GC
+ // cutoff in 'gc_info', and make 'start_lsn' invalid, while we are
+ // creating the branch.
+ let _gc_cs = self.gc_cs.lock().await;
+
+ // Create a placeholder for the new branch. This will error
+ // out if the new timeline ID is already in use.
+ let timeline_uninit_mark = {
+ let timelines = self.timelines.lock().unwrap();
+ self.create_timeline_uninit_mark(dst_id, &timelines)?
+ };
+
+ // Ensure that `start_lsn` is valid, i.e. the LSN is within the PITR
+ // horizon on the source timeline
+ //
+ // We check it against both the planned GC cutoff stored in 'gc_info',
+ // and the 'latest_gc_cutoff' of the last GC that was performed. The
+ // planned GC cutoff in 'gc_info' is normally larger than
+ // 'latest_gc_cutoff_lsn', but beware of corner cases like if you just
+ // changed the GC settings for the tenant to make the PITR window
+ // larger, but some of the data was already removed by an earlier GC
+ // iteration.
+
+ // check against last actual 'latest_gc_cutoff' first
+ let latest_gc_cutoff_lsn = src_timeline.get_latest_gc_cutoff_lsn();
src_timeline
.check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn)
.context(format!(
"invalid branch start lsn: less than latest GC cutoff {}",
*latest_gc_cutoff_lsn,
))?;
+
+ // and then the planned GC cutoff
{
let gc_info = src_timeline.gc_info.read().unwrap();
let cutoff = min(gc_info.pitr_cutoff, gc_info.horizon_cutoff);
@@ -2076,6 +2123,12 @@ impl Tenant {
}
}
+ //
+ // The branch point is valid, and we are still holding the 'gc_cs' lock
+ // so that GC cannot advance the GC cutoff until we are finished.
+ // Proceed with the branch creation.
+ //
+
// Determine prev-LSN for the new timeline. We can only determine it if
// the timeline was branched at the current end of the source timeline.
let RecordLsn {
@@ -2094,7 +2147,7 @@ impl Tenant {
let metadata = TimelineMetadata::new(
start_lsn,
dst_prev,
- Some(src),
+ Some(src_id),
start_lsn,
*src_timeline.latest_gc_cutoff_lsn.read(), // FIXME: should we hold onto this guard longer?
src_timeline.initdb_lsn,
@@ -2103,15 +2156,15 @@ impl Tenant {
let mut timelines = self.timelines.lock().unwrap();
let new_timeline = self
.prepare_timeline(
- dst,
+ dst_id,
metadata,
timeline_uninit_mark,
false,
- Some(src_timeline),
+ Some(Arc::clone(src_timeline)),
)?
.initialize_with_lock(&mut timelines, true, true)?;
drop(timelines);
- info!("branched timeline {dst} from {src} at {start_lsn}");
+ info!("branched timeline {dst_id} from {src_id} at {start_lsn}");
Ok(new_timeline)
}
@@ -2122,6 +2175,7 @@ impl Tenant {
&self,
timeline_id: TimelineId,
pg_version: u32,
+ ctx: &RequestContext,
) -> anyhow::Result<Arc<Timeline>> {
let timeline_uninit_mark = {
let timelines = self.timelines.lock().unwrap();
@@ -2181,6 +2235,7 @@ impl Tenant {
unfinished_timeline,
pgdata_path,
pgdata_lsn,
+ ctx,
)
.await
.with_context(|| {
@@ -2352,7 +2407,10 @@ impl Tenant {
///
/// Future is cancellation safe. Only one calculation can be running at once per tenant.
#[instrument(skip_all, fields(tenant_id=%self.tenant_id))]
- pub async fn gather_size_inputs(&self) -> anyhow::Result {
+ pub async fn gather_size_inputs(
+ &self,
+ ctx: &RequestContext,
+ ) -> anyhow::Result {
let logical_sizes_at_once = self
.conf
.concurrent_tenant_size_logical_size_queries
@@ -2364,15 +2422,15 @@ impl Tenant {
// See more on issue #2748, condensed out of the initial PR review.
let mut shared_cache = self.cached_logical_sizes.lock().await;
- size::gather_inputs(self, logical_sizes_at_once, &mut shared_cache).await
+ size::gather_inputs(self, logical_sizes_at_once, &mut shared_cache, ctx).await
}
/// Calculate synthetic tenant size
/// This is periodically called by background worker.
/// result is cached in tenant struct
#[instrument(skip_all, fields(tenant_id=%self.tenant_id))]
- pub async fn calculate_synthetic_size(&self) -> anyhow::Result {
- let inputs = self.gather_size_inputs().await?;
+ pub async fn calculate_synthetic_size(&self, ctx: &RequestContext) -> anyhow::Result {
+ let inputs = self.gather_size_inputs(ctx).await?;
let size = inputs.calculate()?;
@@ -2475,26 +2533,19 @@ fn try_create_target_tenant_dir(
target_tenant_directory,
temporary_tenant_dir,
)
- .with_context(|| format!("Failed to resolve tenant {tenant_id} temporary timelines dir"))?;
+ .with_context(|| format!("resolve tenant {tenant_id} temporary timelines dir"))?;
let temporary_tenant_config_path = rebase_directory(
&conf.tenant_config_path(tenant_id),
target_tenant_directory,
temporary_tenant_dir,
)
- .with_context(|| format!("Failed to resolve tenant {tenant_id} temporary config path"))?;
+ .with_context(|| format!("resolve tenant {tenant_id} temporary config path"))?;
+
+ Tenant::persist_tenant_config(&tenant_id, &temporary_tenant_config_path, tenant_conf, true)?;
- Tenant::persist_tenant_config(&temporary_tenant_config_path, tenant_conf, true).with_context(
- || {
- format!(
- "Failed to write tenant {} config to {}",
- tenant_id,
- temporary_tenant_config_path.display()
- )
- },
- )?;
crashsafe::create_dir(&temporary_tenant_timelines_dir).with_context(|| {
format!(
- "could not create tenant {} temporary timelines directory {}",
+ "create tenant {} temporary timelines directory {}",
tenant_id,
temporary_tenant_timelines_dir.display()
)
@@ -2505,7 +2556,7 @@ fn try_create_target_tenant_dir(
fs::rename(temporary_tenant_dir, target_tenant_directory).with_context(|| {
format!(
- "failed to move tenant {} temporary directory {} into the permanent one {}",
+ "move tenant {} temporary directory {} into the permanent one {}",
tenant_id,
temporary_tenant_dir.display(),
target_tenant_directory.display()
@@ -2513,14 +2564,14 @@ fn try_create_target_tenant_dir(
})?;
let target_dir_parent = target_tenant_directory.parent().with_context(|| {
format!(
- "Failed to get tenant {} dir parent for {}",
+ "get tenant {} dir parent for {}",
tenant_id,
target_tenant_directory.display()
)
})?;
crashsafe::fsync(target_dir_parent).with_context(|| {
format!(
- "Failed to fsync renamed directory's parent {} for tenant {}",
+ "fsync renamed directory's parent {} for tenant {}",
target_dir_parent.display(),
tenant_id,
)
@@ -2743,11 +2794,17 @@ pub mod harness {
})
}
-    pub async fn load(&self) -> Arc<Tenant> {
- self.try_load().await.expect("failed to load test tenant")
+    pub async fn load(&self) -> (Arc<Tenant>, RequestContext) {
+ let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
+ (
+ self.try_load(&ctx)
+ .await
+ .expect("failed to load test tenant"),
+ ctx,
+ )
}
-    pub async fn try_load(&self) -> anyhow::Result<Arc<Tenant>> {
+    pub async fn try_load(&self, ctx: &RequestContext) -> anyhow::Result<Arc<Tenant>> {
let walredo_mgr = Arc::new(TestRedoManager);
let tenant = Arc::new(Tenant::new(
@@ -2775,8 +2832,7 @@ pub mod harness {
timelines_to_load.insert(timeline_id, timeline_metadata);
}
// FIXME starts background jobs
- tenant.load().await?;
-
+ tenant.load(ctx).await?;
Ok(tenant)
}
@@ -2833,10 +2889,9 @@ mod tests {
#[tokio::test]
async fn test_basic() -> anyhow::Result<()> {
- let tenant = TenantHarness::create("test_basic")?.load().await;
- let tline = tenant
- .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
- .initialize()?;
+ let (tenant, ctx) = TenantHarness::create("test_basic")?.load().await;
+ let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
+ let tline = tline.initialize(&ctx)?;
let writer = tline.writer();
writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?;
@@ -2849,15 +2904,15 @@ mod tests {
drop(writer);
assert_eq!(
- tline.get(*TEST_KEY, Lsn(0x10)).await?,
+ tline.get(*TEST_KEY, Lsn(0x10), &ctx).await?,
TEST_IMG("foo at 0x10")
);
assert_eq!(
- tline.get(*TEST_KEY, Lsn(0x1f)).await?,
+ tline.get(*TEST_KEY, Lsn(0x1f), &ctx).await?,
TEST_IMG("foo at 0x10")
);
assert_eq!(
- tline.get(*TEST_KEY, Lsn(0x20)).await?,
+ tline.get(*TEST_KEY, Lsn(0x20), &ctx).await?,
TEST_IMG("foo at 0x20")
);
@@ -2866,14 +2921,14 @@ mod tests {
#[tokio::test]
async fn no_duplicate_timelines() -> anyhow::Result<()> {
- let tenant = TenantHarness::create("no_duplicate_timelines")?
+ let (tenant, ctx) = TenantHarness::create("no_duplicate_timelines")?
.load()
.await;
- let _ = tenant
- .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
- .initialize()?;
+ let timeline =
+ tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
+ let _ = timeline.initialize(&ctx)?;
- match tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION) {
+ match tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx) {
Ok(_) => panic!("duplicate timeline creation should fail"),
Err(e) => assert_eq!(
e.to_string(),
@@ -2899,13 +2954,13 @@ mod tests {
///
#[tokio::test]
async fn test_branch() -> anyhow::Result<()> {
- let tenant = TenantHarness::create("test_branch")?.load().await;
- let tline = tenant
- .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
- .initialize()?;
- let writer = tline.writer();
use std::str::from_utf8;
+ let (tenant, ctx) = TenantHarness::create("test_branch")?.load().await;
+ let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
+ let tline = tline.initialize(&ctx)?;
+ let writer = tline.writer();
+
#[allow(non_snake_case)]
let TEST_KEY_A: Key = Key::from_hex("112222222233333333444444445500000001").unwrap();
#[allow(non_snake_case)]
@@ -2925,7 +2980,7 @@ mod tests {
// Branch the history, modify relation differently on the new timeline
tenant
- .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x30)))
+ .branch_timeline(&tline, NEW_TIMELINE_ID, Some(Lsn(0x30)), &ctx)
.await?;
let newtline = tenant
.get_timeline(NEW_TIMELINE_ID, true)
@@ -2936,15 +2991,15 @@ mod tests {
// Check page contents on both branches
assert_eq!(
- from_utf8(&tline.get(TEST_KEY_A, Lsn(0x40)).await?)?,
+ from_utf8(&tline.get(TEST_KEY_A, Lsn(0x40), &ctx).await?)?,
"foo at 0x40"
);
assert_eq!(
- from_utf8(&newtline.get(TEST_KEY_A, Lsn(0x40)).await?)?,
+ from_utf8(&newtline.get(TEST_KEY_A, Lsn(0x40), &ctx).await?)?,
"bar at 0x40"
);
assert_eq!(
- from_utf8(&newtline.get(TEST_KEY_B, Lsn(0x40)).await?)?,
+ from_utf8(&newtline.get(TEST_KEY_B, Lsn(0x40), &ctx).await?)?,
"foobar at 0x20"
);
@@ -2996,13 +3051,12 @@ mod tests {
#[tokio::test]
async fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> {
- let tenant =
+ let (tenant, ctx) =
TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")?
.load()
.await;
- let tline = tenant
- .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
- .initialize()?;
+ let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
+ let tline = tline.initialize(&ctx)?;
make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
// this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50
@@ -3010,12 +3064,12 @@ mod tests {
// and compaction works. But it does set the 'cutoff' point so that the cross check
// below should fail.
tenant
- .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO)
+ .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, &ctx)
.await?;
// try to branch at lsn 25, should fail because we already garbage collected the data
match tenant
- .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25)))
+ .branch_timeline(&tline, NEW_TIMELINE_ID, Some(Lsn(0x25)), &ctx)
.await
{
Ok(_) => panic!("branching should have failed"),
@@ -3034,16 +3088,17 @@ mod tests {
#[tokio::test]
async fn test_prohibit_branch_creation_on_pre_initdb_lsn() -> anyhow::Result<()> {
- let tenant = TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")?
- .load()
- .await;
+ let (tenant, ctx) =
+ TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")?
+ .load()
+ .await;
- tenant
- .create_empty_timeline(TIMELINE_ID, Lsn(0x50), DEFAULT_PG_VERSION)?
- .initialize()?;
+ let tline = tenant
+ .create_empty_timeline(TIMELINE_ID, Lsn(0x50), DEFAULT_PG_VERSION, &ctx)?
+ .initialize(&ctx)?;
// try to branch at lsn 0x25, should fail because initdb lsn is 0x50
match tenant
- .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25)))
+ .branch_timeline(&tline, NEW_TIMELINE_ID, Some(Lsn(0x25)), &ctx)
.await
{
Ok(_) => panic!("branching should have failed"),
@@ -3085,40 +3140,40 @@ mod tests {
#[tokio::test]
async fn test_retain_data_in_parent_which_is_needed_for_child() -> anyhow::Result<()> {
- let tenant = TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")?
- .load()
- .await;
- let tline = tenant
- .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
- .initialize()?;
+ let (tenant, ctx) =
+ TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")?
+ .load()
+ .await;
+ let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
+ let tline = tline.initialize(&ctx)?;
make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
tenant
- .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))
+ .branch_timeline(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx)
.await?;
let newtline = tenant
.get_timeline(NEW_TIMELINE_ID, true)
.expect("Should have a local timeline");
// this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50
tenant
- .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO)
+ .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, &ctx)
.await?;
- assert!(newtline.get(*TEST_KEY, Lsn(0x25)).await.is_ok());
+ assert!(newtline.get(*TEST_KEY, Lsn(0x25), &ctx).await.is_ok());
Ok(())
}
#[tokio::test]
async fn test_parent_keeps_data_forever_after_branching() -> anyhow::Result<()> {
- let tenant = TenantHarness::create("test_parent_keeps_data_forever_after_branching")?
- .load()
- .await;
- let tline = tenant
- .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
- .initialize()?;
+ let (tenant, ctx) =
+ TenantHarness::create("test_parent_keeps_data_forever_after_branching")?
+ .load()
+ .await;
+ let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
+ let tline = tline.initialize(&ctx)?;
make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
tenant
- .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))
+ .branch_timeline(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx)
.await?;
let newtline = tenant
.get_timeline(NEW_TIMELINE_ID, true)
@@ -3128,12 +3183,12 @@ mod tests {
// run gc on parent
tenant
- .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO)
+ .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, &ctx)
.await?;
// Check that the data is still accessible on the branch.
assert_eq!(
- newtline.get(*TEST_KEY, Lsn(0x50)).await?,
+ newtline.get(*TEST_KEY, Lsn(0x50), &ctx).await?,
TEST_IMG(&format!("foo at {}", Lsn(0x40)))
);
@@ -3145,14 +3200,14 @@ mod tests {
const TEST_NAME: &str = "timeline_load";
let harness = TenantHarness::create(TEST_NAME)?;
{
- let tenant = harness.load().await;
- let tline = tenant
- .create_empty_timeline(TIMELINE_ID, Lsn(0x8000), DEFAULT_PG_VERSION)?
- .initialize()?;
+ let (tenant, ctx) = harness.load().await;
+ let tline =
+ tenant.create_empty_timeline(TIMELINE_ID, Lsn(0x8000), DEFAULT_PG_VERSION, &ctx)?;
+ let tline = tline.initialize(&ctx)?;
make_some_layers(tline.as_ref(), Lsn(0x8000)).await?;
}
- let tenant = harness.load().await;
+ let (tenant, _ctx) = harness.load().await;
tenant
.get_timeline(TIMELINE_ID, true)
.expect("cannot load timeline");
@@ -3166,15 +3221,15 @@ mod tests {
let harness = TenantHarness::create(TEST_NAME)?;
// create two timelines
{
- let tenant = harness.load().await;
- let tline = tenant
- .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
- .initialize()?;
+ let (tenant, ctx) = harness.load().await;
+ let tline =
+ tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
+ let tline = tline.initialize(&ctx)?;
make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
tenant
- .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))
+ .branch_timeline(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx)
.await?;
let newtline = tenant
@@ -3185,7 +3240,7 @@ mod tests {
}
// check that both of them are initially unloaded
- let tenant = harness.load().await;
+ let (tenant, _ctx) = harness.load().await;
// check that both, child and ancestor are loaded
let _child_tline = tenant
@@ -3203,11 +3258,11 @@ mod tests {
async fn corrupt_metadata() -> anyhow::Result<()> {
const TEST_NAME: &str = "corrupt_metadata";
let harness = TenantHarness::create(TEST_NAME)?;
- let tenant = harness.load().await;
+ let (tenant, ctx) = harness.load().await;
tenant
- .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
- .initialize()?;
+ .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?
+ .initialize(&ctx)?;
drop(tenant);
let metadata_path = harness.timeline_path(&TIMELINE_ID).join(METADATA_FILE_NAME);
@@ -3219,7 +3274,7 @@ mod tests {
metadata_bytes[8] ^= 1;
std::fs::write(metadata_path, metadata_bytes)?;
- let err = harness.try_load().await.err().expect("should fail");
+ let err = harness.try_load(&ctx).await.err().expect("should fail");
assert!(err
.to_string()
.starts_with("Failed to parse metadata bytes from path"));
@@ -3243,10 +3298,9 @@ mod tests {
#[tokio::test]
async fn test_images() -> anyhow::Result<()> {
- let tenant = TenantHarness::create("test_images")?.load().await;
- let tline = tenant
- .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
- .initialize()?;
+ let (tenant, ctx) = TenantHarness::create("test_images")?.load().await;
+ let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
+ let tline = tline.initialize(&ctx)?;
let writer = tline.writer();
writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?;
@@ -3254,7 +3308,7 @@ mod tests {
drop(writer);
tline.freeze_and_flush().await?;
- tline.compact().await?;
+ tline.compact(&ctx).await?;
let writer = tline.writer();
writer.put(*TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20")))?;
@@ -3262,7 +3316,7 @@ mod tests {
drop(writer);
tline.freeze_and_flush().await?;
- tline.compact().await?;
+ tline.compact(&ctx).await?;
let writer = tline.writer();
writer.put(*TEST_KEY, Lsn(0x30), &Value::Image(TEST_IMG("foo at 0x30")))?;
@@ -3270,7 +3324,7 @@ mod tests {
drop(writer);
tline.freeze_and_flush().await?;
- tline.compact().await?;
+ tline.compact(&ctx).await?;
let writer = tline.writer();
writer.put(*TEST_KEY, Lsn(0x40), &Value::Image(TEST_IMG("foo at 0x40")))?;
@@ -3278,26 +3332,26 @@ mod tests {
drop(writer);
tline.freeze_and_flush().await?;
- tline.compact().await?;
+ tline.compact(&ctx).await?;
assert_eq!(
- tline.get(*TEST_KEY, Lsn(0x10)).await?,
+ tline.get(*TEST_KEY, Lsn(0x10), &ctx).await?,
TEST_IMG("foo at 0x10")
);
assert_eq!(
- tline.get(*TEST_KEY, Lsn(0x1f)).await?,
+ tline.get(*TEST_KEY, Lsn(0x1f), &ctx).await?,
TEST_IMG("foo at 0x10")
);
assert_eq!(
- tline.get(*TEST_KEY, Lsn(0x20)).await?,
+ tline.get(*TEST_KEY, Lsn(0x20), &ctx).await?,
TEST_IMG("foo at 0x20")
);
assert_eq!(
- tline.get(*TEST_KEY, Lsn(0x30)).await?,
+ tline.get(*TEST_KEY, Lsn(0x30), &ctx).await?,
TEST_IMG("foo at 0x30")
);
assert_eq!(
- tline.get(*TEST_KEY, Lsn(0x40)).await?,
+ tline.get(*TEST_KEY, Lsn(0x40), &ctx).await?,
TEST_IMG("foo at 0x40")
);
@@ -3310,10 +3364,9 @@ mod tests {
//
#[tokio::test]
async fn test_bulk_insert() -> anyhow::Result<()> {
- let tenant = TenantHarness::create("test_bulk_insert")?.load().await;
- let tline = tenant
- .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
- .initialize()?;
+ let (tenant, ctx) = TenantHarness::create("test_bulk_insert")?.load().await;
+ let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
+ let tline = tline.initialize(&ctx)?;
let mut lsn = Lsn(0x10);
@@ -3342,10 +3395,10 @@ mod tests {
let cutoff = tline.get_last_record_lsn();
tline
- .update_gc_info(Vec::new(), cutoff, Duration::ZERO)
+ .update_gc_info(Vec::new(), cutoff, Duration::ZERO, &ctx)
.await?;
tline.freeze_and_flush().await?;
- tline.compact().await?;
+ tline.compact(&ctx).await?;
tline.gc().await?;
}
@@ -3354,10 +3407,9 @@ mod tests {
#[tokio::test]
async fn test_random_updates() -> anyhow::Result<()> {
- let tenant = TenantHarness::create("test_random_updates")?.load().await;
- let tline = tenant
- .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
- .initialize()?;
+ let (tenant, ctx) = TenantHarness::create("test_random_updates")?.load().await;
+ let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
+ let tline = tline.initialize(&ctx)?;
const NUM_KEYS: usize = 1000;
@@ -3407,7 +3459,7 @@ mod tests {
for (blknum, last_lsn) in updated.iter().enumerate() {
test_key.field6 = blknum as u32;
assert_eq!(
- tline.get(test_key, lsn).await?,
+ tline.get(test_key, lsn, &ctx).await?,
TEST_IMG(&format!("{} at {}", blknum, last_lsn))
);
}
@@ -3415,10 +3467,10 @@ mod tests {
// Perform a cycle of flush, compact, and GC
let cutoff = tline.get_last_record_lsn();
tline
- .update_gc_info(Vec::new(), cutoff, Duration::ZERO)
+ .update_gc_info(Vec::new(), cutoff, Duration::ZERO, &ctx)
.await?;
tline.freeze_and_flush().await?;
- tline.compact().await?;
+ tline.compact(&ctx).await?;
tline.gc().await?;
}
@@ -3427,12 +3479,12 @@ mod tests {
#[tokio::test]
async fn test_traverse_branches() -> anyhow::Result<()> {
- let tenant = TenantHarness::create("test_traverse_branches")?
+ let (tenant, ctx) = TenantHarness::create("test_traverse_branches")?
.load()
.await;
let mut tline = tenant
- .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
- .initialize()?;
+ .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?
+ .initialize(&ctx)?;
const NUM_KEYS: usize = 1000;
@@ -3462,16 +3514,14 @@ mod tests {
keyspace.add_key(test_key);
}
- let mut tline_id = TIMELINE_ID;
for _ in 0..50 {
let new_tline_id = TimelineId::generate();
tenant
- .branch_timeline(tline_id, new_tline_id, Some(lsn))
+ .branch_timeline(&tline, new_tline_id, Some(lsn), &ctx)
.await?;
tline = tenant
.get_timeline(new_tline_id, true)
.expect("Should have the branched timeline");
- tline_id = new_tline_id;
for _ in 0..NUM_KEYS {
lsn = Lsn(lsn.0 + 0x10);
@@ -3493,7 +3543,7 @@ mod tests {
for (blknum, last_lsn) in updated.iter().enumerate() {
test_key.field6 = blknum as u32;
assert_eq!(
- tline.get(test_key, lsn).await?,
+ tline.get(test_key, lsn, &ctx).await?,
TEST_IMG(&format!("{} at {}", blknum, last_lsn))
);
}
@@ -3501,10 +3551,10 @@ mod tests {
// Perform a cycle of flush, compact, and GC
let cutoff = tline.get_last_record_lsn();
tline
- .update_gc_info(Vec::new(), cutoff, Duration::ZERO)
+ .update_gc_info(Vec::new(), cutoff, Duration::ZERO, &ctx)
.await?;
tline.freeze_and_flush().await?;
- tline.compact().await?;
+ tline.compact(&ctx).await?;
tline.gc().await?;
}
@@ -3513,12 +3563,12 @@ mod tests {
#[tokio::test]
async fn test_traverse_ancestors() -> anyhow::Result<()> {
- let tenant = TenantHarness::create("test_traverse_ancestors")?
+ let (tenant, ctx) = TenantHarness::create("test_traverse_ancestors")?
.load()
.await;
let mut tline = tenant
- .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
- .initialize()?;
+ .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?
+ .initialize(&ctx)?;
const NUM_KEYS: usize = 100;
const NUM_TLINES: usize = 50;
@@ -3528,18 +3578,16 @@ mod tests {
let mut updated = [[Lsn(0); NUM_KEYS]; NUM_TLINES];
let mut lsn = Lsn(0);
- let mut tline_id = TIMELINE_ID;
#[allow(clippy::needless_range_loop)]
for idx in 0..NUM_TLINES {
let new_tline_id = TimelineId::generate();
tenant
- .branch_timeline(tline_id, new_tline_id, Some(lsn))
+ .branch_timeline(&tline, new_tline_id, Some(lsn), &ctx)
.await?;
tline = tenant
.get_timeline(new_tline_id, true)
.expect("Should have the branched timeline");
- tline_id = new_tline_id;
for _ in 0..NUM_KEYS {
lsn = Lsn(lsn.0 + 0x10);
@@ -3568,7 +3616,7 @@ mod tests {
println!("checking [{idx}][{blknum}] at {lsn}");
test_key.field6 = blknum as u32;
assert_eq!(
- tline.get(test_key, *lsn).await?,
+ tline.get(test_key, *lsn, &ctx).await?,
TEST_IMG(&format!("{idx} {blknum} at {lsn}"))
);
}
diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs
index c95a98fbc7..e66ee0ae36 100644
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -28,7 +28,12 @@ pub mod defaults {
pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10;
pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
- pub const DEFAULT_GC_PERIOD: &str = "100 s";
+
+ // Large DEFAULT_GC_PERIOD is fine as long as PITR_INTERVAL is larger.
+ // If there's a need to decrease this value, first make sure that GC
+ // doesn't hold a layer map write lock for non-trivial operations.
+ // Relevant: https://github.com/neondatabase/neon/issues/3394
+ pub const DEFAULT_GC_PERIOD: &str = "1 hr";
pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3;
pub const DEFAULT_PITR_INTERVAL: &str = "7 days";
pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "2 seconds";
diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs
index 01c5359e88..ed1a32c8fd 100644
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -9,24 +9,57 @@
//! are frozen, and it is split up into new image and delta layers and the
//! corresponding files are written to disk.
//!
+//! Design overview:
+//!
+//! The `search` method of the layer map is on the read critical path, so we've
+//! built an efficient data structure for fast reads, stored in `LayerMap::historic`.
+//! Other read methods are less critical but still impact performance of background tasks.
+//!
+//! This data structure relies on a persistent/immutable binary search tree. See the
+//! following lecture for an introduction: https://www.youtube.com/watch?v=WqCWghETNDc&t=581s
+//! Summary: A persistent/immutable BST (and persistent data structures in general) allows
+//! you to modify the tree in such a way that each modification creates a new "version"
+//! of the tree. When you modify it, you get a new version, but all previous versions are
+//! still accessible too. So if someone is still holding a reference to an older version,
+//! they continue to see the tree as it was then. The persistent BST stores all the
+//! different versions in an efficient way.
+//!
+//! Our persistent BST maintains a map of which layer file "covers" each key. It has only
+//! one dimension, the key. See `layer_coverage.rs`. We use the persistent/immutable property
+//! to handle the LSN dimension.
+//!
+//! To build the layer map, we insert each layer into the persistent BST in LSN.start order,
+//! starting from the oldest one. After each insertion, we grab a reference to that "version"
+//! of the tree, and store it in another tree, a BTreeMap keyed by the LSN. See
+//! `historic_layer_coverage.rs`.
+//!
+//! To search for a particular key-LSN pair, you first look up the right "version" in the
+//! BTreeMap. Then you search that version of the BST with the key.
+//!
+//! The persistent BST keeps all the versions, but there is no way to change the old versions
+//! afterwards. We can add layers as long as they have larger LSNs than any previous layer in
+//! the map, but if we need to remove a layer, or insert anything with an older LSN, we need
+//! to throw away most of the persistent BST and build a new one, starting from the oldest
+//! LSN. See `LayerMap::flush_updates()`.
+//!
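
For intuition, here is a toy, self-contained sketch of the versioned-coverage idea described above. It is not the BufferedHistoricLayerCoverage API from historic_layer_coverage.rs; it clones a plain BTreeMap per version instead of sharing structure through a persistent tree, which is enough to show the two-step lookup: pick the snapshot for the requested LSN, then query that snapshot by key.

use std::collections::BTreeMap;
use std::sync::Arc;

/// Toy versioned coverage: key start -> layer name, snapshotted per start LSN.
/// The real structure shares nodes between versions; cloning here only keeps
/// the illustration short.
#[derive(Default)]
struct ToyHistoricCoverage {
    versions: BTreeMap<u64, Arc<BTreeMap<i128, String>>>,
    current: BTreeMap<i128, String>,
}

impl ToyHistoricCoverage {
    /// Layers must be inserted in increasing start-LSN order.
    fn insert(&mut self, lsn_start: u64, key_start: i128, layer: &str) {
        self.current.insert(key_start, layer.to_string());
        self.versions
            .insert(lsn_start, Arc::new(self.current.clone()));
    }

    /// Which layer covers `key` as of `lsn`: find the latest version at or
    /// below `lsn`, then the latest coverage entry at or below `key`.
    fn search(&self, key: i128, lsn: u64) -> Option<&str> {
        let (_, version) = self.versions.range(..=lsn).next_back()?;
        version
            .range(..=key)
            .next_back()
            .map(|(_, layer)| layer.as_str())
    }
}

fn main() {
    let mut coverage = ToyHistoricCoverage::default();
    coverage.insert(10, 0, "delta 0x10");
    coverage.insert(20, 100, "image 0x20");
    assert_eq!(coverage.search(150, 15), Some("delta 0x10"));
    assert_eq!(coverage.search(150, 25), Some("image 0x20"));
}

The real code avoids the per-version clone by sharing structure between versions, which is what makes keeping every historic version affordable.
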
+mod historic_layer_coverage;
+mod layer_coverage;
+
+use crate::keyspace::KeyPartitioning;
use crate::metrics::NUM_ONDISK_LAYERS;
use crate::repository::Key;
-use crate::tenant::storage_layer::{range_eq, range_overlaps};
-use amplify_num::i256;
+use crate::tenant::storage_layer::InMemoryLayer;
+use crate::tenant::storage_layer::Layer;
use anyhow::Result;
-use num_traits::identities::{One, Zero};
-use num_traits::{Bounded, Num, Signed};
-use rstar::{RTree, RTreeObject, AABB};
-use std::cmp::Ordering;
use std::collections::VecDeque;
use std::ops::Range;
-use std::ops::{Add, Div, Mul, Neg, Rem, Sub};
use std::sync::Arc;
-use tracing::*;
use utils::lsn::Lsn;
-use super::storage_layer::{InMemoryLayer, Layer};
+use historic_layer_coverage::BufferedHistoricLayerCoverage;
+
+use super::storage_layer::range_eq;
///
/// LayerMap tracks what layers exist on a timeline.
@@ -51,8 +84,8 @@ pub struct LayerMap {
///
pub frozen_layers: VecDeque<Arc<InMemoryLayer>>,
-    /// All the historic layers are kept here
-    historic_layers: RTree<LayerRTreeObject<L>>,
+    /// Index of the historic layers optimized for search
+    historic: BufferedHistoricLayerCoverage<Arc<L>>,
/// L0 layers have key range Key::MIN..Key::MAX, and locating them using R-Tree search is very inefficient.
/// So L0 layers are held in l0_delta_layers vector, in addition to the R-tree.
@@ -65,177 +98,64 @@ impl Default for LayerMap {
open_layer: None,
next_open_layer_at: None,
frozen_layers: VecDeque::default(),
- historic_layers: RTree::default(),
l0_delta_layers: Vec::default(),
+ historic: BufferedHistoricLayerCoverage::default(),
}
}
}
-struct LayerRTreeObject<L: ?Sized> {
-    layer: Arc<L>,
-
- envelope: AABB<[IntKey; 2]>,
+/// The primary update API for the layer map.
+///
+/// Batching historic layer insertions and removals is good for
+/// performance and this struct helps us do that correctly.
+#[must_use]
+pub struct BatchedUpdates<'a, L: ?Sized + Layer> {
+ // While we hold this exclusive reference to the layer map the type checker
+ // will prevent us from accidentally reading any unflushed updates.
+    layer_map: &'a mut LayerMap<L>,
}
-// Representation of Key as numeric type.
-// We can not use native implementation of i128, because rstar::RTree
-// doesn't handle properly integer overflow during area calculation: sum(Xi*Yi).
-// Overflow will cause panic in debug mode and incorrect area calculation in release mode,
-// which leads to non-optimally balanced R-Tree (but doesn't fit correctness of R-Tree work).
-// By using i256 as the type, even though all the actual values would fit in i128, we can be
-// sure that multiplication doesn't overflow.
-//
-
-#[derive(Clone, PartialEq, Eq, PartialOrd, Debug)]
-struct IntKey(i256);
-
-impl Copy for IntKey {}
-
-impl IntKey {
- fn from(i: i128) -> Self {
- IntKey(i256::from(i))
- }
-}
-
-impl Bounded for IntKey {
- fn min_value() -> Self {
- IntKey(i256::MIN)
- }
- fn max_value() -> Self {
- IntKey(i256::MAX)
- }
-}
-
-impl Signed for IntKey {
- fn is_positive(&self) -> bool {
- self.0 > i256::ZERO
- }
- fn is_negative(&self) -> bool {
- self.0 < i256::ZERO
- }
- fn signum(&self) -> Self {
- match self.0.cmp(&i256::ZERO) {
- Ordering::Greater => IntKey(i256::ONE),
- Ordering::Less => IntKey(-i256::ONE),
- Ordering::Equal => IntKey(i256::ZERO),
- }
- }
- fn abs(&self) -> Self {
- IntKey(self.0.abs())
- }
- fn abs_sub(&self, other: &Self) -> Self {
- if self.0 <= other.0 {
- IntKey(i256::ZERO)
- } else {
- IntKey(self.0 - other.0)
- }
- }
-}
-
-impl Neg for IntKey {
- type Output = Self;
- fn neg(self) -> Self::Output {
- IntKey(-self.0)
- }
-}
-
-impl Rem for IntKey {
- type Output = Self;
- fn rem(self, rhs: Self) -> Self::Output {
- IntKey(self.0 % rhs.0)
- }
-}
-
-impl Div for IntKey {
- type Output = Self;
- fn div(self, rhs: Self) -> Self::Output {
- IntKey(self.0 / rhs.0)
- }
-}
-
-impl Add for IntKey {
- type Output = Self;
- fn add(self, rhs: Self) -> Self::Output {
- IntKey(self.0 + rhs.0)
- }
-}
-
-impl Sub for IntKey {
- type Output = Self;
- fn sub(self, rhs: Self) -> Self::Output {
- IntKey(self.0 - rhs.0)
- }
-}
-
-impl Mul for IntKey {
- type Output = Self;
- fn mul(self, rhs: Self) -> Self::Output {
- IntKey(self.0 * rhs.0)
- }
-}
-
-impl One for IntKey {
- fn one() -> Self {
- IntKey(i256::ONE)
- }
-}
-
-impl Zero for IntKey {
- fn zero() -> Self {
- IntKey(i256::ZERO)
- }
- fn is_zero(&self) -> bool {
- self.0 == i256::ZERO
- }
-}
-
-impl Num for IntKey {
- type FromStrRadixErr = ::FromStrRadixErr;
-    type FromStrRadixErr = <i128 as Num>::FromStrRadixErr;
-    fn from_str_radix(str: &str, radix: u32) -> Result<Self, Self::FromStrRadixErr> {
- }
-}
-
-impl<L: ?Sized> PartialEq for LayerRTreeObject<L> {
- fn eq(&self, other: &Self) -> bool {
- // FIXME: ptr_eq might fail to return true for 'dyn'
- // references. Clippy complains about this. In practice it
- // seems to work, the assertion below would be triggered
- // otherwise but this ought to be fixed.
- #[allow(clippy::vtable_address_comparisons)]
- Arc::ptr_eq(&self.layer, &other.layer)
- }
-}
-
-impl<L> RTreeObject for LayerRTreeObject<L>
-where
- L: ?Sized,
-{
- type Envelope = AABB<[IntKey; 2]>;
- fn envelope(&self) -> Self::Envelope {
- self.envelope
- }
-}
-
-impl<L> LayerRTreeObject<L>
+/// Provide ability to batch more updates while hiding the read
+/// API so we don't accidentally read without flushing.
+impl<L> BatchedUpdates<'_, L>
where
L: ?Sized + Layer,
{
-    fn new(layer: Arc<L>) -> Self {
- let key_range = layer.get_key_range();
- let lsn_range = layer.get_lsn_range();
+ ///
+ /// Insert an on-disk layer.
+ ///
+    pub fn insert_historic(&mut self, layer: Arc<L>) {
+ self.layer_map.insert_historic_noflush(layer)
+ }
- let envelope = AABB::from_corners(
- [
- IntKey::from(key_range.start.to_i128()),
- IntKey::from(lsn_range.start.0 as i128),
- ],
- [
- IntKey::from(key_range.end.to_i128() - 1),
- IntKey::from(lsn_range.end.0 as i128 - 1),
- ], // AABB::upper is inclusive, while `key_range.end` and `lsn_range.end` are exclusive
- );
- LayerRTreeObject { layer, envelope }
+ ///
+ /// Remove an on-disk layer from the map.
+ ///
+ /// This should be called when the corresponding file on disk has been deleted.
+ ///
+    pub fn remove_historic(&mut self, layer: Arc<L>) {
+ self.layer_map.remove_historic_noflush(layer)
+ }
+
+ // We will flush on drop anyway, but this method makes it
+ // more explicit that there is some work being done.
+ /// Apply all updates
+ pub fn flush(self) {
+ // Flush happens on drop
+ }
+}
+
+// Ideally the flush() method should be called explicitly for more
+// controlled execution. But if we forget we'd rather flush on drop
+// than panic later or read without flushing.
+//
+// TODO maybe warn if flush hasn't explicitly been called
+impl<L> Drop for BatchedUpdates<'_, L>
+where
+ L: ?Sized + Layer,
+{
+ fn drop(&mut self) {
+ self.layer_map.flush_updates();
}
}
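
The shape of BatchedUpdates (exclusive borrow of the map while batching, a single rebuild on flush or drop) generalizes beyond layers. Below is a self-contained toy sketch of the pattern; the names are illustrative and not the pageserver API.

// Toy index that is expensive to rebuild: updates are batched, and the
// rebuild happens once, either on an explicit flush() or on drop.
struct Index {
    items: Vec<i64>,
    rebuilt: usize,
}

struct Batch<'a> {
    // Exclusive borrow: while a Batch exists, the index cannot be read,
    // so unflushed updates are never observed.
    index: &'a mut Index,
}

impl Index {
    fn batch_update(&mut self) -> Batch<'_> {
        Batch { index: self }
    }
    fn rebuild(&mut self) {
        self.items.sort_unstable();
        self.rebuilt += 1;
    }
}

impl Batch<'_> {
    fn insert(&mut self, item: i64) {
        self.index.items.push(item);
    }
    // Flush happens on drop; this method only makes the intent explicit.
    fn flush(self) {}
}

impl Drop for Batch<'_> {
    fn drop(&mut self) {
        self.index.rebuild();
    }
}

fn main() {
    let mut index = Index { items: vec![], rebuilt: 0 };
    {
        let mut batch = index.batch_update();
        batch.insert(3);
        batch.insert(1);
        batch.flush(); // one rebuild for the whole batch
    }
    assert_eq!(index.items, vec![1, 3]);
    assert_eq!(index.rebuilt, 1);
}

While the batch value is alive, the borrow checker forbids reads of the index, which mirrors the comment on layer_map above.
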
@@ -281,125 +201,91 @@ where
/// 'open' and 'frozen' layers!
///
pub fn search(&self, key: Key, end_lsn: Lsn) -> Option<SearchResult<L>> {
- // Find the latest image layer that covers the given key
-        let mut latest_img: Option<Arc<L>> = None;
-        let mut latest_img_lsn: Option<Lsn> = None;
- let envelope = AABB::from_corners(
- [IntKey::from(key.to_i128()), IntKey::from(0i128)],
- [
- IntKey::from(key.to_i128()),
- IntKey::from(end_lsn.0 as i128 - 1),
- ],
- );
- for e in self
- .historic_layers
- .locate_in_envelope_intersecting(&envelope)
- {
- let l = &e.layer;
- if l.is_incremental() {
- continue;
- }
- assert!(l.get_key_range().contains(&key));
- let img_lsn = l.get_lsn_range().start;
- assert!(img_lsn < end_lsn);
- if Lsn(img_lsn.0 + 1) == end_lsn {
- // found exact match
- return Some(SearchResult {
- layer: Arc::clone(l),
- lsn_floor: img_lsn,
- });
- }
- if img_lsn > latest_img_lsn.unwrap_or(Lsn(0)) {
- latest_img = Some(Arc::clone(l));
- latest_img_lsn = Some(img_lsn);
- }
- }
+ let version = self.historic.get().unwrap().get_version(end_lsn.0 - 1)?;
+ let latest_delta = version.delta_coverage.query(key.to_i128());
+ let latest_image = version.image_coverage.query(key.to_i128());
- // Search the delta layers
-        let mut latest_delta: Option<Arc<L>> = None;
- for e in self
- .historic_layers
- .locate_in_envelope_intersecting(&envelope)
- {
- let l = &e.layer;
- if !l.is_incremental() {
- continue;
+ match (latest_delta, latest_image) {
+ (None, None) => None,
+ (None, Some(image)) => {
+ let lsn_floor = image.get_lsn_range().start;
+ Some(SearchResult {
+ layer: image,
+ lsn_floor,
+ })
}
- assert!(l.get_key_range().contains(&key));
- if l.get_lsn_range().start >= end_lsn {
- info!(
- "Candidate delta layer {}..{} is too new for lsn {}",
- l.get_lsn_range().start,
- l.get_lsn_range().end,
- end_lsn
- );
+ (Some(delta), None) => {
+ let lsn_floor = delta.get_lsn_range().start;
+ Some(SearchResult {
+ layer: delta,
+ lsn_floor,
+ })
}
- assert!(l.get_lsn_range().start < end_lsn);
- if l.get_lsn_range().end >= end_lsn {
- // this layer contains the requested point in the key/lsn space.
- // No need to search any further
- trace!(
- "found layer {} for request on {key} at {end_lsn}",
- l.short_id(),
- );
- latest_delta.replace(Arc::clone(l));
- break;
- }
- if l.get_lsn_range().end > latest_img_lsn.unwrap_or(Lsn(0)) {
- // this layer's end LSN is smaller than the requested point. If there's
- // nothing newer, this is what we need to return. Remember this.
- if let Some(old_candidate) = &latest_delta {
- if l.get_lsn_range().end > old_candidate.get_lsn_range().end {
- latest_delta.replace(Arc::clone(l));
- }
+ (Some(delta), Some(image)) => {
+ let img_lsn = image.get_lsn_range().start;
+ let image_is_newer = image.get_lsn_range().end >= delta.get_lsn_range().end;
+ let image_exact_match = img_lsn + 1 == end_lsn;
+ if image_is_newer || image_exact_match {
+ Some(SearchResult {
+ layer: image,
+ lsn_floor: img_lsn,
+ })
} else {
- latest_delta.replace(Arc::clone(l));
+ let lsn_floor =
+ std::cmp::max(delta.get_lsn_range().start, image.get_lsn_range().start + 1);
+ Some(SearchResult {
+ layer: delta,
+ lsn_floor,
+ })
}
}
}
- if let Some(l) = latest_delta {
- trace!(
- "found (old) layer {} for request on {key} at {end_lsn}",
- l.short_id(),
- );
- let lsn_floor = std::cmp::max(
- Lsn(latest_img_lsn.unwrap_or(Lsn(0)).0 + 1),
- l.get_lsn_range().start,
- );
- Some(SearchResult {
- lsn_floor,
- layer: l,
- })
- } else if let Some(l) = latest_img {
- trace!("found img layer and no deltas for request on {key} at {end_lsn}");
- Some(SearchResult {
- lsn_floor: latest_img_lsn.unwrap(),
- layer: l,
- })
- } else {
- trace!("no layer found for request on {key} at {end_lsn}");
- None
- }
+ }
+
+ /// Start a batch of updates, applied on drop
+ pub fn batch_update(&mut self) -> BatchedUpdates<'_, L> {
+ BatchedUpdates { layer_map: self }
}
///
/// Insert an on-disk layer
///
-    pub fn insert_historic(&mut self, layer: Arc<L>) {
- if layer.get_key_range() == (Key::MIN..Key::MAX) {
- self.l0_delta_layers.push(layer.clone());
+ /// Helper function for BatchedUpdates::insert_historic
+ ///
+    pub(self) fn insert_historic_noflush(&mut self, layer: Arc<L>) {
+ let kr = layer.get_key_range();
+ let lr = layer.get_lsn_range();
+ self.historic.insert(
+ historic_layer_coverage::LayerKey {
+ key: kr.start.to_i128()..kr.end.to_i128(),
+ lsn: lr.start.0..lr.end.0,
+ is_image: !layer.is_incremental(),
+ },
+ Arc::clone(&layer),
+ );
+
+ if Self::is_l0(&layer) {
+ self.l0_delta_layers.push(layer);
}
- self.historic_layers.insert(LayerRTreeObject::new(layer));
+
NUM_ONDISK_LAYERS.inc();
}
///
/// Remove an on-disk layer from the map.
///
- /// This should be called when the corresponding file on disk has been deleted.
+ /// Helper function for BatchedUpdates::remove_historic
///
-    pub fn remove_historic(&mut self, layer: Arc<L>) {
- if layer.get_key_range() == (Key::MIN..Key::MAX) {
+    pub fn remove_historic_noflush(&mut self, layer: Arc<L>) {
+ let kr = layer.get_key_range();
+ let lr = layer.get_lsn_range();
+ self.historic.remove(historic_layer_coverage::LayerKey {
+ key: kr.start.to_i128()..kr.end.to_i128(),
+ lsn: lr.start.0..lr.end.0,
+ is_image: !layer.is_incremental(),
+ });
+
+ if Self::is_l0(&layer) {
let len_before = self.l0_delta_layers.len();
// FIXME: ptr_eq might fail to return true for 'dyn'
@@ -411,98 +297,57 @@ where
.retain(|other| !Arc::ptr_eq(other, &layer));
assert_eq!(self.l0_delta_layers.len(), len_before - 1);
}
- assert!(self
- .historic_layers
- .remove(&LayerRTreeObject::new(layer))
- .is_some());
+
NUM_ONDISK_LAYERS.dec();
}
+ /// Helper function for BatchedUpdates::drop.
+ pub(self) fn flush_updates(&mut self) {
+ self.historic.rebuild();
+ }
+
/// Is there a newer image layer for given key- and LSN-range? Or a set
/// of image layers within the specified lsn range that cover the entire
/// specified key range?
///
/// This is used for garbage collection, to determine if an old layer can
/// be deleted.
- pub fn image_layer_exists(
- &self,
-        key_range: &Range<Key>,
-        lsn_range: &Range<Lsn>,
-    ) -> Result<bool> {
-        let mut range_remain = key_range.clone();
+    pub fn image_layer_exists(&self, key: &Range<Key>, lsn: &Range<Lsn>) -> Result<bool> {
+ if key.is_empty() {
+            // Vacuously true. There's a newer image for all 0 of the keys in the range.
+ return Ok(true);
+ }
- loop {
- let mut made_progress = false;
- let envelope = AABB::from_corners(
- [
- IntKey::from(range_remain.start.to_i128()),
- IntKey::from(lsn_range.start.0 as i128),
- ],
- [
- IntKey::from(range_remain.end.to_i128() - 1),
- IntKey::from(lsn_range.end.0 as i128 - 1),
- ],
- );
- for e in self
- .historic_layers
- .locate_in_envelope_intersecting(&envelope)
- {
- let l = &e.layer;
- if l.is_incremental() {
- continue;
- }
- let img_lsn = l.get_lsn_range().start;
- if l.get_key_range().contains(&range_remain.start) && lsn_range.contains(&img_lsn) {
- made_progress = true;
- let img_key_end = l.get_key_range().end;
+ let version = match self.historic.get().unwrap().get_version(lsn.end.0 - 1) {
+ Some(v) => v,
+ None => return Ok(false),
+ };
- if img_key_end >= range_remain.end {
- return Ok(true);
- }
- range_remain.start = img_key_end;
- }
- }
+ let start = key.start.to_i128();
+ let end = key.end.to_i128();
- if !made_progress {
+        let layer_covers = |layer: Option<Arc<L>>| match layer {
+ Some(layer) => layer.get_lsn_range().start >= lsn.start,
+ None => false,
+ };
+
+ // Check the start is covered
+ if !layer_covers(version.image_coverage.query(start)) {
+ return Ok(false);
+ }
+
+ // Check after all changes of coverage
+ for (_, change_val) in version.image_coverage.range(start..end) {
+ if !layer_covers(change_val) {
return Ok(false);
}
}
+
+ Ok(true)
}
pub fn iter_historic_layers(&self) -> impl '_ + Iterator<Item = Arc<L>> {
- self.historic_layers.iter().map(|e| e.layer.clone())
- }
-
- /// Find the last image layer that covers 'key', ignoring any image layers
- /// newer than 'lsn'.
-    fn find_latest_image(&self, key: Key, lsn: Lsn) -> Option<Arc<L>> {
- let mut candidate_lsn = Lsn(0);
- let mut candidate = None;
- let envelope = AABB::from_corners(
- [IntKey::from(key.to_i128()), IntKey::from(0)],
- [IntKey::from(key.to_i128()), IntKey::from(lsn.0 as i128)],
- );
- for e in self
- .historic_layers
- .locate_in_envelope_intersecting(&envelope)
- {
- let l = &e.layer;
- if l.is_incremental() {
- continue;
- }
-
- assert!(l.get_key_range().contains(&key));
- let this_lsn = l.get_lsn_range().start;
- assert!(this_lsn <= lsn);
- if this_lsn < candidate_lsn {
- // our previous candidate was better
- continue;
- }
- candidate_lsn = this_lsn;
- candidate = Some(Arc::clone(l));
- }
-
- candidate
+ self.historic.iter()
}
///
@@ -518,94 +363,288 @@ where
key_range: &Range<Key>,
lsn: Lsn,
) -> Result<Vec<(Range<Key>, Option<Arc<L>>)>> {
- let mut points = vec![key_range.start];
- let envelope = AABB::from_corners(
- [IntKey::from(key_range.start.to_i128()), IntKey::from(0)],
- [
- IntKey::from(key_range.end.to_i128()),
- IntKey::from(lsn.0 as i128),
- ],
- );
- for e in self
- .historic_layers
- .locate_in_envelope_intersecting(&envelope)
- {
- let l = &e.layer;
- assert!(l.get_lsn_range().start <= lsn);
- let range = l.get_key_range();
- if key_range.contains(&range.start) {
- points.push(l.get_key_range().start);
- }
- if key_range.contains(&range.end) {
- points.push(l.get_key_range().end);
- }
+ let version = match self.historic.get().unwrap().get_version(lsn.0) {
+ Some(v) => v,
+ None => return Ok(vec![]),
+ };
+
+ let start = key_range.start.to_i128();
+ let end = key_range.end.to_i128();
+
+ // Initialize loop variables
+        let mut coverage: Vec<(Range<Key>, Option<Arc<L>>)> = vec![];
+ let mut current_key = start;
+ let mut current_val = version.image_coverage.query(start);
+
+ // Loop through the change events and push intervals
+ for (change_key, change_val) in version.image_coverage.range(start..end) {
+ let kr = Key::from_i128(current_key)..Key::from_i128(change_key);
+ coverage.push((kr, current_val.take()));
+ current_key = change_key;
+ current_val = change_val.clone();
}
- points.push(key_range.end);
- points.sort();
- points.dedup();
+ // Add the final interval
+ let kr = Key::from_i128(current_key)..Key::from_i128(end);
+ coverage.push((kr, current_val.take()));
- // Ok, we now have a list of "interesting" points in the key space
-
- // For each range between the points, find the latest image
- let mut start = *points.first().unwrap();
- let mut ranges = Vec::new();
- for end in points[1..].iter() {
- let img = self.find_latest_image(start, lsn);
-
- ranges.push((start..*end, img));
-
- start = *end;
- }
- Ok(ranges)
+ Ok(coverage)
}
- /// Count the height of the tallest stack of deltas in this 2d region.
+ pub fn is_l0(layer: &L) -> bool {
+ range_eq(&layer.get_key_range(), &(Key::MIN..Key::MAX))
+ }
+
+ /// This function determines which layers are counted in `count_deltas`:
+ /// layers that should count towards deciding whether or not to reimage
+ /// a certain partition range.
+ ///
+ /// There are two kinds of layers we currently consider reimage-worthy:
+ ///
+ /// Case 1: Non-L0 layers are currently reimage-worthy by default.
+ /// TODO Some of these layers are very sparse and cover the entire key
+ /// range. Replacing 256MB of data (or less!) with terabytes of
+ /// images doesn't seem wise. We need a better heuristic, possibly
+ /// based on some of these factors:
+ /// a) whether this layer has any wal in this partition range
+ /// b) the size of the layer
+ /// c) the number of images needed to cover it
+ /// d) the estimated time until we'll have to reimage over it for GC
+ ///
+ /// Case 2: Since L0 layers by definition cover the entire key space, we consider
+ /// them reimage-worthy only when the entire key space can be covered by very few
+ /// images (currently 1).
+ /// TODO The optimal number should probably be slightly higher than 1, but to
+ /// implement that we need to plumb a lot more context into this function
+ /// than just the current partition_range.
+    pub fn is_reimage_worthy(layer: &L, partition_range: &Range<Key>) -> bool {
+ // Case 1
+ if !Self::is_l0(layer) {
+ return true;
+ }
+
+ // Case 2
+ if range_eq(partition_range, &(Key::MIN..Key::MAX)) {
+ return true;
+ }
+
+ false
+ }
+
+ /// Count the height of the tallest stack of reimage-worthy deltas
+ /// in this 2d region.
+ ///
+ /// If `limit` is provided we don't try to count above that number.
///
/// This number is used to compute the largest number of deltas that
/// we'll need to visit for any page reconstruction in this region.
/// We use this heuristic to decide whether to create an image layer.
- ///
- /// TODO currently we just return the total number of deltas in the
- /// region, no matter if they're stacked on top of each other
- /// or next to each other.
-    pub fn count_deltas(&self, key_range: &Range<Key>, lsn_range: &Range<Lsn>) -> Result<usize> {
- let mut result = 0;
- if lsn_range.start >= lsn_range.end {
+ pub fn count_deltas(
+ &self,
+        key: &Range<Key>,
+        lsn: &Range<Lsn>,
+        limit: Option<usize>,
+    ) -> Result<usize> {
+ // We get the delta coverage of the region, and for each part of the coverage
+ // we recurse right underneath the delta. The recursion depth is limited by
+ // the largest result this function could return, which is in practice between
+ // 3 and 10 (since we usually try to create an image when the number gets larger).
+
+ if lsn.is_empty() || key.is_empty() || limit == Some(0) {
return Ok(0);
}
- let envelope = AABB::from_corners(
- [
- IntKey::from(key_range.start.to_i128()),
- IntKey::from(lsn_range.start.0 as i128),
- ],
- [
- IntKey::from(key_range.end.to_i128() - 1),
- IntKey::from(lsn_range.end.0 as i128 - 1),
- ],
- );
- for e in self
- .historic_layers
- .locate_in_envelope_intersecting(&envelope)
- {
- let l = &e.layer;
- if !l.is_incremental() {
- continue;
- }
- assert!(range_overlaps(&l.get_lsn_range(), lsn_range));
- assert!(range_overlaps(&l.get_key_range(), key_range));
- // We ignore level0 delta layers. Unless the whole keyspace fits
- // into one partition
- if !range_eq(key_range, &(Key::MIN..Key::MAX))
- && range_eq(&l.get_key_range(), &(Key::MIN..Key::MAX))
- {
- continue;
+ let version = match self.historic.get().unwrap().get_version(lsn.end.0 - 1) {
+ Some(v) => v,
+ None => return Ok(0),
+ };
+
+ let start = key.start.to_i128();
+ let end = key.end.to_i128();
+
+ // Initialize loop variables
+ let mut max_stacked_deltas = 0;
+ let mut current_key = start;
+ let mut current_val = version.delta_coverage.query(start);
+
+ // Loop through the delta coverage and recurse on each part
+ for (change_key, change_val) in version.delta_coverage.range(start..end) {
+ // If there's a relevant delta in this part, add 1 and recurse down
+ if let Some(val) = current_val {
+ if val.get_lsn_range().end > lsn.start {
+ let kr = Key::from_i128(current_key)..Key::from_i128(change_key);
+ let lr = lsn.start..val.get_lsn_range().start;
+ if !kr.is_empty() {
+ let base_count = Self::is_reimage_worthy(&val, key) as usize;
+ let new_limit = limit.map(|l| l - base_count);
+ let max_stacked_deltas_underneath =
+ self.count_deltas(&kr, &lr, new_limit)?;
+ max_stacked_deltas = std::cmp::max(
+ max_stacked_deltas,
+ base_count + max_stacked_deltas_underneath,
+ );
+ }
+ }
}
- result += 1;
+ current_key = change_key;
+ current_val = change_val.clone();
}
- Ok(result)
+
+ // Consider the last part
+ if let Some(val) = current_val {
+ if val.get_lsn_range().end > lsn.start {
+ let kr = Key::from_i128(current_key)..Key::from_i128(end);
+ let lr = lsn.start..val.get_lsn_range().start;
+
+ if !kr.is_empty() {
+ let base_count = Self::is_reimage_worthy(&val, key) as usize;
+ let new_limit = limit.map(|l| l - base_count);
+ let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit)?;
+ max_stacked_deltas = std::cmp::max(
+ max_stacked_deltas,
+ base_count + max_stacked_deltas_underneath,
+ );
+ }
+ }
+ }
+
+ Ok(max_stacked_deltas)
+ }
+
+ /// Count how many reimage-worthy layers we need to visit for given key-lsn pair.
+ ///
+ /// The `partition_range` argument is used as context for the reimage-worthiness decision.
+ ///
+ /// Used as a helper for correctness checks only. Performance not critical.
+    pub fn get_difficulty(&self, lsn: Lsn, key: Key, partition_range: &Range<Key>) -> usize {
+ match self.search(key, lsn) {
+ Some(search_result) => {
+ if search_result.layer.is_incremental() {
+ (Self::is_reimage_worthy(&search_result.layer, partition_range) as usize)
+ + self.get_difficulty(search_result.lsn_floor, key, partition_range)
+ } else {
+ 0
+ }
+ }
+ None => 0,
+ }
+ }
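
A rough, self-contained illustration of the descent above, assuming a single key and a simplified "search" that just picks the layer with the greatest lsn.start at or below the requested LSN (the real search and the reimage-worthiness filter are not modeled here):

use std::ops::Range;

// Toy stand-in for a layer at one key: its LSN range and whether it is an image.
struct ToyLayer {
    lsn: Range<u64>,
    is_image: bool,
}

// Count deltas visited top-down until an image layer (or a miss) ends the descent.
// Assumes lsn.start > 0 for every toy layer so the `- 1` step cannot underflow.
fn toy_difficulty(layers: &[ToyLayer], lsn: u64) -> usize {
    let top = layers
        .iter()
        .filter(|l| l.lsn.start <= lsn)
        .max_by_key(|l| l.lsn.start);
    match top {
        Some(l) if !l.is_image => 1 + toy_difficulty(layers, l.lsn.start - 1),
        _ => 0,
    }
}

fn main() {
    let layers = vec![
        ToyLayer { lsn: 10..20, is_image: true },  // image at the bottom
        ToyLayer { lsn: 20..30, is_image: false }, // two deltas stacked above it
        ToyLayer { lsn: 30..40, is_image: false },
    ];
    // Reconstruction at lsn 35 visits 2 deltas before reaching the image.
    assert_eq!(toy_difficulty(&layers, 35), 2);
}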
+
+ /// Used for correctness checking. Results are expected to be identical to
+ /// self.get_difficulty_map. Assumes self.search is correct.
+ pub fn get_difficulty_map_bruteforce(
+ &self,
+ lsn: Lsn,
+ partitioning: &KeyPartitioning,
+ ) -> Vec<usize> {
+ // Looking at the difficulty as a function of key, it could only increase
+ // when a delta layer starts or an image layer ends. Therefore it's sufficient
+ // to check the difficulties at:
+ // - the key.start for each non-empty part range
+ // - the key.start for each delta
+ // - the key.end for each image
+ let keys_iter: Box<dyn Iterator<Item = Key>> = {
+ let mut keys: Vec<Key> = self
+ .iter_historic_layers()
+ .map(|layer| {
+ if layer.is_incremental() {
+ layer.get_key_range().start
+ } else {
+ layer.get_key_range().end
+ }
+ })
+ .collect();
+ keys.sort();
+ Box::new(keys.into_iter())
+ };
+ let mut keys_iter = keys_iter.peekable();
+
+ // Iter the partition and keys together and query all the necessary
+ // keys, computing the max difficulty for each part.
+ partitioning
+ .parts
+ .iter()
+ .map(|part| {
+ let mut difficulty = 0;
+ // Partition ranges are assumed to be sorted and disjoint
+ // TODO assert it
+ for range in &part.ranges {
+ if !range.is_empty() {
+ difficulty =
+ std::cmp::max(difficulty, self.get_difficulty(lsn, range.start, range));
+ }
+ while let Some(key) = keys_iter.peek() {
+ if key >= &range.end {
+ break;
+ }
+ let key = keys_iter.next().unwrap();
+ if key < range.start {
+ continue;
+ }
+ difficulty =
+ std::cmp::max(difficulty, self.get_difficulty(lsn, key, range));
+ }
+ }
+ difficulty
+ })
+ .collect()
+ }
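
The key-sampling argument in the comment above ("difficulty can only increase where a delta starts or an image ends") can be shown in isolation. A small sketch of just the candidate-key collection for one partition range, using a hypothetical ToyLayer type rather than the real layer trait:

use std::ops::Range;

// Toy layer: its key range and whether it is an image layer.
struct ToyLayer {
    key: Range<i128>,
    is_image: bool,
}

// Keys worth probing inside `range`: the range start, every delta start and
// every image end that falls inside it (sorted and deduplicated).
fn candidate_keys(layers: &[ToyLayer], range: &Range<i128>) -> Vec<i128> {
    let mut keys = vec![range.start];
    for l in layers {
        let k = if l.is_image { l.key.end } else { l.key.start };
        if range.contains(&k) {
            keys.push(k);
        }
    }
    keys.sort();
    keys.dedup();
    keys
}

fn main() {
    let layers = vec![
        ToyLayer { key: 2..6, is_image: false }, // delta: difficulty may rise at key 2
        ToyLayer { key: 0..4, is_image: true },  // image: difficulty may rise at key 4
    ];
    assert_eq!(candidate_keys(&layers, &(0..10)), vec![0, 2, 4]);
}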
+
+ /// For each part of a keyspace partitioning, return the maximum number of layers
+ /// that would be needed for page reconstruction in that part at the given LSN.
+ ///
+ /// If `limit` is provided we don't try to count above that number.
+ ///
+ /// This method is used to decide where to create new image layers. Computing the
+ /// result for the entire partitioning at once allows this function to be more
+ /// efficient, and further optimization is possible by using iterators instead,
+ /// to allow early return.
+ ///
+ /// TODO actually use this method instead of count_deltas. Currently we only use
+ /// it for benchmarks.
+ pub fn get_difficulty_map(
+ &self,
+ lsn: Lsn,
+ partitioning: &KeyPartitioning,
+ limit: Option<usize>,
+ ) -> Vec<usize> {
+ // TODO This is a naive implementation. Perf improvements to do:
+ // 1. Instead of calling self.image_coverage and self.count_deltas,
+ // iterate the image and delta coverage only once.
+ partitioning
+ .parts
+ .iter()
+ .map(|part| {
+ let mut difficulty = 0;
+ for range in &part.ranges {
+ if limit == Some(difficulty) {
+ break;
+ }
+ for (img_range, last_img) in self
+ .image_coverage(range, lsn)
+ .expect("why would this err?")
+ {
+ if limit == Some(difficulty) {
+ break;
+ }
+ let img_lsn = if let Some(last_img) = last_img {
+ last_img.get_lsn_range().end
+ } else {
+ Lsn(0)
+ };
+
+ if img_lsn < lsn {
+ let num_deltas = self
+ .count_deltas(&img_range, &(img_lsn..lsn), limit)
+ .expect("why would this err lol?");
+ difficulty = std::cmp::max(difficulty, num_deltas);
+ }
+ }
+ }
+ difficulty
+ })
+ .collect()
}
/// Return all L0 delta layers
@@ -629,8 +668,8 @@ where
}
println!("historic_layers:");
- for e in self.historic_layers.iter() {
- e.layer.dump(verbose)?;
+ for layer in self.iter_historic_layers() {
+ layer.dump(verbose)?;
}
println!("End dump LayerMap");
Ok(())
diff --git a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
new file mode 100644
index 0000000000..46821aef15
--- /dev/null
+++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
@@ -0,0 +1,583 @@
+use std::collections::BTreeMap;
+use std::ops::Range;
+
+use tracing::info;
+
+use super::layer_coverage::LayerCoverageTuple;
+
+/// Layers in this module are identified and indexed by this data.
+///
+/// This is a helper struct to enable sorting layers by lsn.start.
+///
+/// These three values are enough to uniquely identify a layer, since
+/// a layer is obligated to contain all contents within range, so two
+/// deltas (or images) with the same range have identical content.
+#[derive(Debug, PartialEq, Eq, Clone)]
+pub struct LayerKey {
+ // TODO I use i128 and u64 because it was easy for prototyping,
+ // testing, and benchmarking. If we can use the Lsn and Key
+ // types without overhead that would be preferable.
+ pub key: Range<i128>,
+ pub lsn: Range<u64>,
+ pub is_image: bool,
+}
+
+impl PartialOrd for LayerKey {
+ fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+ Some(self.cmp(other))
+ }
+}
+
+impl Ord for LayerKey {
+ fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+ // NOTE we really care about comparing by lsn.start first
+ self.lsn
+ .start
+ .cmp(&other.lsn.start)
+ .then(self.lsn.end.cmp(&other.lsn.end))
+ .then(self.key.start.cmp(&other.key.start))
+ .then(self.key.end.cmp(&other.key.end))
+ .then(self.is_image.cmp(&other.is_image))
+ }
+}
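
The NOTE above is the reason for the hand-written cmp: with chained .then(...) calls the first non-Equal comparison decides, so lsn.start dominates and the remaining fields only break ties. A tiny illustration of that chaining behaviour on plain integers:

use std::cmp::Ordering;

fn main() {
    let a = (10u64, 99u64); // (lsn.start, lsn.end)
    let b = (20u64, 1u64);
    // Decided by the first comparison alone, despite a.1 > b.1
    assert_eq!(a.0.cmp(&b.0).then(a.1.cmp(&b.1)), Ordering::Less);

    let c = (10u64, 1u64);
    // lsn.start ties, so the second comparison breaks the tie
    assert_eq!(a.0.cmp(&c.0).then(a.1.cmp(&c.1)), Ordering::Greater);
}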
+
+/// Efficiently queryable layer coverage for each LSN.
+///
+/// Allows answering layer map queries very efficiently,
+/// but doesn't allow retroactive insertion, which is
+/// sometimes necessary. See BufferedHistoricLayerCoverage.
+pub struct HistoricLayerCoverage<Value> {
+ /// The latest state
+ head: LayerCoverageTuple<Value>,
+
+ /// All previous states
+ historic: BTreeMap<u64, LayerCoverageTuple<Value>>,
+}
+
+impl<Value: Clone> Default for HistoricLayerCoverage<Value> {
+ fn default() -> Self {
+ Self::new()
+ }
+}
+
+impl<Value: Clone> HistoricLayerCoverage<Value> {
+ pub fn new() -> Self {
+ Self {
+ head: LayerCoverageTuple::default(),
+ historic: BTreeMap::default(),
+ }
+ }
+
+ /// Add a layer
+ ///
+ /// Panics if new layer has older lsn.start than an existing layer.
+ /// See BufferedHistoricLayerCoverage for a more general insertion method.
+ pub fn insert(&mut self, layer_key: LayerKey, value: Value) {
+ // It's only a persistent map, not a retroactive one
+ if let Some(last_entry) = self.historic.iter().next_back() {
+ let last_lsn = last_entry.0;
+ if layer_key.lsn.start < *last_lsn {
+ panic!("unexpected retroactive insert");
+ }
+ }
+
+ // Insert into data structure
+ if layer_key.is_image {
+ self.head
+ .image_coverage
+ .insert(layer_key.key, layer_key.lsn.clone(), value);
+ } else {
+ self.head
+ .delta_coverage
+ .insert(layer_key.key, layer_key.lsn.clone(), value);
+ }
+
+ // Remember history. Clone is O(1)
+ self.historic.insert(layer_key.lsn.start, self.head.clone());
+ }
+
+ /// Query at a particular LSN, inclusive
+ pub fn get_version(&self, lsn: u64) -> Option<&LayerCoverageTuple<Value>> {
+ match self.historic.range(..=lsn).next_back() {
+ Some((_, v)) => Some(v),
+ None => None,
+ }
+ }
+
+ /// Remove all entries after a certain LSN (inclusive)
+ pub fn trim(&mut self, begin: &u64) {
+ self.historic.split_off(begin);
+ self.head = self
+ .historic
+ .iter()
+ .rev()
+ .next()
+ .map(|(_, v)| v.clone())
+ .unwrap_or_default();
+ }
+}
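
The "Clone is O(1)" remark in insert only works out if LayerCoverageTuple is backed by a persistent, structurally shared map; layer_coverage.rs is not part of this excerpt, so treat the crate choice below as an assumption. It only demonstrates the snapshot-per-insert pattern that makes `self.historic.insert(layer_key.lsn.start, self.head.clone())` affordable:

use im::OrdMap; // persistent map: clones share structure, so clone() is effectively O(1)

fn main() {
    let mut head: OrdMap<i128, &str> = OrdMap::new();
    let mut history: Vec<OrdMap<i128, &str>> = Vec::new();
    for (k, v) in [(0, "a"), (5, "b"), (3, "c")] {
        head.insert(k, v);
        history.push(head.clone()); // cheap snapshot after every insert
    }
    // Earlier snapshots are unaffected by later inserts
    assert_eq!(history[0].get(&3), None);
    assert_eq!(history[2].get(&3), Some(&"c"));
}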
+
+/// This is the most basic test that demonstrates intended usage.
+/// All layers in this test have height 1.
+#[test]
+fn test_persistent_simple() {
+ let mut map = HistoricLayerCoverage::<String>::new();
+ map.insert(
+ LayerKey {
+ key: 0..5,
+ lsn: 100..101,
+ is_image: true,
+ },
+ "Layer 1".to_string(),
+ );
+ map.insert(
+ LayerKey {
+ key: 3..9,
+ lsn: 110..111,
+ is_image: true,
+ },
+ "Layer 2".to_string(),
+ );
+ map.insert(
+ LayerKey {
+ key: 5..6,
+ lsn: 120..121,
+ is_image: true,
+ },
+ "Layer 3".to_string(),
+ );
+
+ // After Layer 1 insertion
+ let version = map.get_version(105).unwrap();
+ assert_eq!(version.image_coverage.query(1), Some("Layer 1".to_string()));
+ assert_eq!(version.image_coverage.query(4), Some("Layer 1".to_string()));
+
+ // After Layer 2 insertion
+ let version = map.get_version(115).unwrap();
+ assert_eq!(version.image_coverage.query(4), Some("Layer 2".to_string()));
+ assert_eq!(version.image_coverage.query(8), Some("Layer 2".to_string()));
+ assert_eq!(version.image_coverage.query(11), None);
+
+ // After Layer 3 insertion
+ let version = map.get_version(125).unwrap();
+ assert_eq!(version.image_coverage.query(4), Some("Layer 2".to_string()));
+ assert_eq!(version.image_coverage.query(5), Some("Layer 3".to_string()));
+ assert_eq!(version.image_coverage.query(7), Some("Layer 2".to_string()));
+}
+
+/// Cover simple off-by-one edge cases
+#[test]
+fn test_off_by_one() {
+ let mut map = HistoricLayerCoverage::<String>::new();
+ map.insert(
+ LayerKey {
+ key: 3..5,
+ lsn: 100..110,
+ is_image: true,
+ },
+ "Layer 1".to_string(),
+ );
+
+ // Check different LSNs
+ let version = map.get_version(99);
+ assert!(version.is_none());
+ let version = map.get_version(100).unwrap();
+ assert_eq!(version.image_coverage.query(4), Some("Layer 1".to_string()));
+ let version = map.get_version(110).unwrap();
+ assert_eq!(version.image_coverage.query(4), Some("Layer 1".to_string()));
+
+ // Check different keys
+ let version = map.get_version(105).unwrap();
+ assert_eq!(version.image_coverage.query(2), None);
+ assert_eq!(version.image_coverage.query(3), Some("Layer 1".to_string()));
+ assert_eq!(version.image_coverage.query(4), Some("Layer 1".to_string()));
+ assert_eq!(version.image_coverage.query(5), None);
+}
+
+/// Cover edge cases where layers begin or end on the same key
+#[test]
+fn test_key_collision() {
+ let mut map = HistoricLayerCoverage::<String>::new();
+
+ map.insert(
+ LayerKey {
+ key: 3..5,
+ lsn: 100..110,
+ is_image: true,
+ },
+ "Layer 10".to_string(),
+ );
+ map.insert(
+ LayerKey {
+ key: 5..8,
+ lsn: 100..110,
+ is_image: true,
+ },
+ "Layer 11".to_string(),
+ );
+ map.insert(
+ LayerKey {
+ key: 3..4,
+ lsn: 200..210,
+ is_image: true,
+ },
+ "Layer 20".to_string(),
+ );
+
+ // Check after layer 11
+ let version = map.get_version(105).unwrap();
+ assert_eq!(version.image_coverage.query(2), None);
+ assert_eq!(
+ version.image_coverage.query(3),
+ Some("Layer 10".to_string())
+ );
+ assert_eq!(
+ version.image_coverage.query(5),
+ Some("Layer 11".to_string())
+ );
+ assert_eq!(
+ version.image_coverage.query(7),
+ Some("Layer 11".to_string())
+ );
+ assert_eq!(version.image_coverage.query(8), None);
+
+ // Check after layer 20
+ let version = map.get_version(205).unwrap();
+ assert_eq!(version.image_coverage.query(2), None);
+ assert_eq!(
+ version.image_coverage.query(3),
+ Some("Layer 20".to_string())
+ );
+ assert_eq!(
+ version.image_coverage.query(5),
+ Some("Layer 11".to_string())
+ );
+ assert_eq!(
+ version.image_coverage.query(7),
+ Some("Layer 11".to_string())
+ );
+ assert_eq!(version.image_coverage.query(8), None);
+}
+
+/// Test when rectangles have nontrivial height and possibly overlap
+#[test]
+fn test_persistent_overlapping() {
+ let mut map = HistoricLayerCoverage::<String>::new();
+
+ // Add 3 key-disjoint layers with varying LSN ranges
+ map.insert(
+ LayerKey {
+ key: 1..2,
+ lsn: 100..200,
+ is_image: true,
+ },
+ "Layer 1".to_string(),
+ );
+ map.insert(
+ LayerKey {
+ key: 4..5,
+ lsn: 110..200,
+ is_image: true,
+ },
+ "Layer 2".to_string(),
+ );
+ map.insert(
+ LayerKey {
+ key: 7..8,
+ lsn: 120..300,
+ is_image: true,
+ },
+ "Layer 3".to_string(),
+ );
+
+ // Add wide and short layer
+ map.insert(
+ LayerKey {
+ key: 0..9,
+ lsn: 130..199,
+ is_image: true,
+ },
+ "Layer 4".to_string(),
+ );
+
+ // Add wide layer taller than some
+ map.insert(
+ LayerKey {
+ key: 0..9,
+ lsn: 140..201,
+ is_image: true,
+ },
+ "Layer 5".to_string(),
+ );
+
+ // Add wide layer taller than all
+ map.insert(
+ LayerKey {
+ key: 0..9,
+ lsn: 150..301,
+ is_image: true,
+ },
+ "Layer 6".to_string(),
+ );
+
+ // After layer 4 insertion
+ let version = map.get_version(135).unwrap();
+ assert_eq!(version.image_coverage.query(0), Some("Layer 4".to_string()));
+ assert_eq!(version.image_coverage.query(1), Some("Layer 1".to_string()));
+ assert_eq!(version.image_coverage.query(2), Some("Layer 4".to_string()));
+ assert_eq!(version.image_coverage.query(4), Some("Layer 2".to_string()));
+ assert_eq!(version.image_coverage.query(5), Some("Layer 4".to_string()));
+ assert_eq!(version.image_coverage.query(7), Some("Layer 3".to_string()));
+ assert_eq!(version.image_coverage.query(8), Some("Layer 4".to_string()));
+
+ // After layer 5 insertion
+ let version = map.get_version(145).unwrap();
+ assert_eq!(version.image_coverage.query(0), Some("Layer 5".to_string()));
+ assert_eq!(version.image_coverage.query(1), Some("Layer 5".to_string()));
+ assert_eq!(version.image_coverage.query(2), Some("Layer 5".to_string()));
+ assert_eq!(version.image_coverage.query(4), Some("Layer 5".to_string()));
+ assert_eq!(version.image_coverage.query(5), Some("Layer 5".to_string()));
+ assert_eq!(version.image_coverage.query(7), Some("Layer 3".to_string()));
+ assert_eq!(version.image_coverage.query(8), Some("Layer 5".to_string()));
+
+ // After layer 6 insertion
+ let version = map.get_version(155).unwrap();
+ assert_eq!(version.image_coverage.query(0), Some("Layer 6".to_string()));
+ assert_eq!(version.image_coverage.query(1), Some("Layer 6".to_string()));
+ assert_eq!(version.image_coverage.query(2), Some("Layer 6".to_string()));
+ assert_eq!(version.image_coverage.query(4), Some("Layer 6".to_string()));
+ assert_eq!(version.image_coverage.query(5), Some("Layer 6".to_string()));
+ assert_eq!(version.image_coverage.query(7), Some("Layer 6".to_string()));
+ assert_eq!(version.image_coverage.query(8), Some("Layer 6".to_string()));
+}
+
+/// Wrapper for HistoricLayerCoverage that allows us to hack around the lack
+/// of support for retroactive insertion by rebuilding the map since the
+/// change.
+///
+/// Why is this needed? We most often insert new layers with newer LSNs,
+/// but during compaction we create layers with non-latest LSN, and during
+/// GC we delete historic layers.
+///
+/// Even though rebuilding is an expensive (N log N) solution to the problem,
+/// it's not critical since we do something equally expensive just to decide
+/// whether or not to create new image layers.
+/// TODO It's not expensive but it's not great to hold a layer map write lock
+/// for that long.
+///
+/// If this becomes an actual bottleneck, one solution would be to build a
+/// segment tree that holds PersistentLayerMaps. Though this would mean that
+/// we take an additional log(N) performance hit for queries, which will probably
+/// still be more critical.
+///
+/// See this for more on persistent and retroactive techniques:
+/// https://www.youtube.com/watch?v=WqCWghETNDc&t=581s
+pub struct BufferedHistoricLayerCoverage<Value> {
+ /// A persistent layer map that we rebuild when we need to retroactively update
+ historic_coverage: HistoricLayerCoverage<Value>